In [1]:
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import copy

import sklearn
import numpy as np 
import pandas as pd 
import os
import scipy
from scipy.stats import chi2_contingency, ttest_ind, f_oneway, levene
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import seaborn
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import hist
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.decomposition import PCA
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from sklearn.metrics import classification_report, balanced_accuracy_score, roc_curve


# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option("max_colwidth", None)

import imblearn
# import optunity

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.feature_selection import f_classif, chi2, SelectKBest, RFECV


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold, RandomizedSearchCV, cross_validate , RepeatedKFold


from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB, CategoricalNB

from scipy.stats import randint

from sklearn.svm import SVC

In [2]:
# Load the training and test sets.
# The files are looked up by name because os.listdir returns entries in
# arbitrary order, so positional unpacking of its result is not reliable.
path = "../input/hr-analytics-job-change-of-data-scientists"
dir_entries = sorted(os.listdir(path))
# NOTE(review): assumes the file names contain "train" / "test" - confirm against the dataset.
train = next((name for name in dir_entries if "train" in name.lower()), None)
test = next((name for name in dir_entries if "test" in name.lower()), None)
if train is None or test is None:
    raise FileNotFoundError("Could not find the train/test files in " + path + ": " + str(dir_entries))
train_df = pd.read_csv(os.path.join(path, train))
test_df = pd.read_csv(os.path.join(path, test))
print("Size of training set: ", train_df.shape[0])
train_df.head(3)

In [3]:
''' 
city                     Categorical Nominal Variable    (many possible values)
city_development_index   Quantitative Continuous Variable (many possible values)
gender                   Categorical Nominal Variable    (3 possible values)    (the 4th value is nan!!)
relevent_experience      Categorical Nominal Variable    (2 possible values)
enrolled_university      Categorical Ordinal Variable    (3 possible values)    (the 4th value is nan!!)
education_level          Categorical Ordinal Variable    (5 possible values)    (the 6th value is nan!!)
major_discipline         Categorical Nominal Variable    (6 possible values)    (the 7th value is nan!!)
experience               Quantitative Discrete Variable                         (has nan!!)
company_size             Categorical Ordinal Variable                           (has nan!!)
company_type             Categorical Nominal Variable    (6 possible values)    (the 7th value is nan!!)
last_new_job             Categorical Ordinal Variable    (6 possible values)    (the 7th value is nan!!)
training_hours           Quantitative Continuous Variable (many possible values)
target                   Binary Output Variable 0/1
'''

# Print the domain of every column and whether it contains missing values.
for column in train_df.columns:
    uniques = train_df[column].unique()
    print("Column: '" + column + "'")
    print("Domain of '" + column + "':", uniques)
    print("Number of unique values:", len(uniques))
    # isna() is used instead of `np.nan in list`: membership tests rely on
    # identity/equality, and NaN never compares equal to itself.
    has_nan = bool(train_df[column].isna().any())
    print(column + " has nan: " + ("Yes" if has_nan else "No"))
    print()

In [4]:
# Number of missing values per column.
train_df.isnull().sum()

In [5]:
# Column groups used throughout the notebook.
quantitative = ['city_development_index', 'experience', 'training_hours']
categorical = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'company_size',
               'company_type', 'last_new_job', 'target']
# Missing values of the categorical columns become the explicit category "miss".
# A named mapping is used so that IPython's special `_` variable is not overwritten,
# and the frame is rebound instead of being mutated in place.
missing_fill = dict.fromkeys(categorical, "miss")
train_df = train_df.fillna(value=missing_fill)

In [6]:
# Pairwise chi-square tests of independence between the categorical variables.
# H_0: the two variables are independent.
# Rows whose value is the "miss" placeholder are left out of each contingency table.
for i, first_name in enumerate(categorical):
    for j in range(i + 1, len(categorical)):
        second_name = categorical[j]
        first_known = train_df.loc[train_df[first_name] != 'miss', first_name]
        second_known = train_df.loc[train_df[second_name] != 'miss', second_name]
        contingency_table = pd.crosstab(first_known, second_known)
        chi_statistic, p_value, df, expected_table = chi2_contingency(contingency_table)
        alpha = 0.05
        verdict = "we reject H_0" if p_value < alpha else "we DON'T reject H_0"
        print(first_name + " and " + second_name + ": " + verdict)

In [7]:
# Inference on 2 population means, sigmas not known.
# An F-test on the two sample variances decides between the pooled t-test
# (equal variances) and Welch's t-test (unequal variances).
alpha = 0.05
for column_1 in ("relevent_experience", "target"):
    for column_2 in ("training_hours", "city_development_index"):
        uniques = train_df[column_1].unique()
        # Both samples are 1-D Series so that scipy returns scalar statistics.
        sample_pop_1 = train_df.loc[train_df[column_1] == uniques[0], column_2]
        sample_pop_2 = train_df.loc[train_df[column_1] == uniques[1], column_2]
        print("Sample size from first population: " + str(len(sample_pop_1)))
        print("Sample size from second population: " + str(len(sample_pop_2)))
        print("The samples are independent simple random samples ✔")
        print("The populations are normally distributed or sample size is big enough (CLT applies) ✔")
        var_1 = float(sample_pop_1.var())
        var_2 = float(sample_pop_2.var())
        df1 = len(sample_pop_1) - 1
        df2 = len(sample_pop_2) - 1
        # The larger variance goes in the numerator; the degrees of freedom
        # must follow the variance they belong to.
        if var_1 < var_2:
            var_1, var_2 = var_2, var_1
            df1, df2 = df2, df1
        F_statistic = var_1 / var_2
        # Two-sided p-value: with the larger variance on top only the upper
        # tail is observed, so the tail probability is doubled.
        p_value = min(1.0, 2 * scipy.stats.f.sf(F_statistic, df1, df2))
        variances_differ = p_value < alpha
        if variances_differ:
            print("Population variances are not equal ✖")
        else:
            print("Population variances are equal ✔")
        # equal_var=True -> pooled t-test, equal_var=False -> Welch's t-test
        t_test = ttest_ind(sample_pop_1, sample_pop_2, equal_var=not variances_differ)
        if t_test.pvalue < alpha:
            print(column_1 + ", " + column_2 + ": Population means are not equal ✖", end="\n\n")
        else:
            print(column_1 + ", " + column_2 + ": Population means are equal ✔", end="\n\n")

In [8]:
# Inference on 3 or more population means, sigmas not known.
# Levene's test checks the equal-variance assumption; one-way ANOVA is only
# run when that assumption is not rejected.
alpha = 0.05  # defined here so the cell does not depend on a variable leaked by an earlier cell
for column_1 in ('training_hours', 'city_development_index'):
    for column_2 in ('gender', 'enrolled_university', 'education_level', 'major_discipline', 'company_type', 'last_new_job'):
        # Ignore the "miss" placeholder; filtering (instead of `del`) also works
        # when a column has no missing values.
        uniques = [value for value in train_df[column_2].unique().tolist() if value != 'miss']
        variants = [train_df[column_1][train_df[column_2] == value] for value in uniques]
        print("Sample sizes from our populations: ", [len(sample) for sample in variants])
        print("The samples are independent simple random samples ✔")
        print("The populations are normally distributed or sample size is big enough (CLT applies) ✔")
        levene_test = levene(*variants)
        if levene_test.pvalue < alpha:
            print(column_1 + ", " + column_2 + ": Population variances are not equal ✖", end="\n\n")
        else:
            print("Population variances are equal ✔")
            anova_oneway = f_oneway(*variants)
            if anova_oneway.pvalue < alpha:
                print(column_1 + ", " + column_2 + ": Population means are not equal ✖", end="\n\n")
            else:
                print(column_1 + ", " + column_2 + ": Population means are equal ✔", end="\n\n")
    

In [9]:
# Descriptive statistics of the continuous variables, extended with median and mode.
continuous_pair = train_df[['city_development_index', 'training_hours']]
stats = continuous_pair.describe()
stats.loc['median'] = continuous_pair.median()
stats.loc['mod'] = continuous_pair.mode().loc[0]
stats

In [10]:
# Correlation matrix of the two continuous variables.
corr_columns = ["city_development_index", "training_hours"]
f = plt.figure(figsize=(5, 5))
plt.matshow(train_df[corr_columns].corr(), fignum=f.number)
tick_positions = range(0, 2)
plt.xticks(tick_positions, corr_columns, fontsize=14, rotation=10)
plt.yticks(tick_positions, corr_columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

In [11]:
# simple histograms
# One Axes per column: handing pandas a single Axes for two histograms makes
# it clear the figure and emit a warning.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
_ = train_df.hist(column=["city_development_index", "training_hours"], bins=10, ax=axes)

In [12]:
# simple boxplots ('experience' is skipped)
plt.figure(figsize=(15, 5)).subplots_adjust(hspace=.5)  # extra vertical space between subplots
boxplot_columns = [name for name in quantitative if name != 'experience']
nr = 0
for name in boxplot_columns:
    nr += 1
    plt.subplot(1, 2, nr)
    plt.boxplot(x=train_df[name].tolist())
    plt.title(name)
plt.show()

In [13]:
# conditioned histograms: one subplot per (continuous variable, categorical variable) pair,
# with one overlaid histogram per category ('experience' and 'city' are skipped)
plt.figure(figsize=(25, 25)).subplots_adjust(hspace=.5)  # extra vertical space between subplots
bins = 10
nr = 0
for quant_name in quantitative:
    if quant_name == 'experience':
        continue
    for categ_name in categorical:
        if categ_name == 'city':
            continue
        nr += 1
        plt.subplot(6, 3, nr)
        labels = train_df[categ_name].unique().tolist()
        hist_list = [train_df[quant_name][train_df[categ_name] == label] for label in labels]
        for label, values in zip(labels, hist_list):
            plt.hist(values, bins, alpha=0.5, label=label)
        plt.title(quant_name + " conditioned by:" + categ_name)
        plt.legend(loc='best')
plt.show()

In [14]:
# scatter plot of the two continuous variables, coloured by target
fig = plt.figure(figsize=(15, 5))
ax = fig.gca()
for target_value, colour in ((1, "DarkBlue"), (0, "LightGreen")):
    subset = train_df[train_df['target'] == target_value]
    _ = subset.plot.scatter(x="city_development_index", y="training_hours", c=colour, ax=ax,
                            label="target=" + str(target_value))

In [15]:
# 3D scatter plot; the third coordinate is Gaussian noise (mean 0, std 5).
fig = plt.figure(figsize=(15, 5))
ax = plt.axes(projection="3d")
noise_heights = np.random.normal(0, 5, (1, len(train_df)))
ax.scatter3D(xs=train_df['city_development_index'], ys=train_df['training_hours'], zs=noise_heights, color="red")
plt.title('simple 3D scatter plot')
plt.show()

In [16]:
'''
# conditioned boxplot
fig = plt.figure(figsize = (15,5))
ax = fig.gca()
_ = train_df.boxplot(column='training_hours', ax=ax, by='gender')
'''

# conditioned boxplots: one figure per (continuous variable, categorical variable) pair
# ('experience' and 'city' are skipped)
for quant_name in quantitative:
    if quant_name == 'experience':
        continue
    for categ_name in categorical:
        if categ_name == 'city':
            continue
        fig = plt.figure(figsize=(17, 5))
        ax = fig.gca()
        _ = train_df.boxplot(column=quant_name, ax=ax, by=categ_name)

In [17]:
# pie chart of the category frequencies of every categorical variable except 'city'
plt.figure(figsize=(20, 20)).subplots_adjust(hspace=.5)  # extra vertical space between subplots
for i, column_name in enumerate(categorical):
    if column_name == 'city':
        continue
    # the position of the column in `categorical` doubles as the subplot index
    plt.subplot(3, 3, i)
    labels = train_df[column_name].unique().tolist()
    groupby_obj = train_df.groupby(by=column_name)
    pie_list = [len(groupby_obj.get_group(label)) for label in labels]
    patches, texts = plt.pie(pie_list, startangle=90, frame=True)
    plt.legend(patches, labels, loc='best')
    plt.title(column_name)
    plt.axis('equal')
    plt.tight_layout()
plt.show()

In [18]:
# bar plots of the category frequencies of every categorical variable except 'city'
plt.figure(figsize=(20, 20)).subplots_adjust(hspace=.5)  # extra vertical space between subplots
for i, column_name in enumerate(categorical):
    if column_name == 'city':
        continue
    # the position of the column in `categorical` doubles as the subplot index
    plt.subplot(3, 3, i)
    labels = train_df[column_name].unique().tolist()
    bar_list = [int((train_df[column_name] == label).sum()) for label in labels]
    plt.bar(labels, bar_list)
    plt.title(column_name)
    plt.xticks(rotation=45)  # rotate the tick labels so that they don't overlap
plt.show()

In [19]:
# t-SNE embedding of the two continuous variables, coloured by target.
tsne_features = ['city_development_index', 'training_hours']
df_tsne_x = train_df[tsne_features]
df_tsne_y = train_df['target']

tsne = TSNE(n_components=2, n_iter=300)
tsne_results = tsne.fit_transform(X=df_tsne_x, y=df_tsne_y)

first_dim, second_dim = tsne_results[:, 0], tsne_results[:, 1]
sns.scatterplot(x=first_dim, y=second_dim, hue=train_df['target'])

In [20]:
# Project the two continuous variables on their principal components.
pca_data = train_df[['city_development_index', 'training_hours']]
pca = PCA(n_components=2)
pca_res = pca.fit_transform(pca_data)

component_names = ['principal component 1', 'principal component 2']
pca_df = pd.DataFrame(data=pca_res, columns=component_names)

pca_df.tail()

In [21]:
# Share of the total variance captured by each principal component.
print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')

In [22]:
# Scatter plot of the PCA projection, one colour per target class.
# A single figure is created: the extra bare plt.figure() call only produced
# an empty figure in the output.
plt.figure(figsize=(10, 10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1', fontsize=20)
plt.ylabel('Principal Component - 2', fontsize=20)
plt.title("Principal Component Analysis", fontsize=20)
targets = [0.0, 1.0]
colors = ['r', 'g']

for target, color in zip(targets, colors):
    # boolean mask built on train_df; pca_df has one row per train_df row
    indicesToKeep = train_df['target'] == target
    plt.scatter(pca_df.loc[indicesToKeep, 'principal component 1'],
                pca_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)

plt.legend(targets, prop={'size': 10})

In [23]:
# Prepare a balanced training set for the association rules.
# The target is turned into the strings '0' / '1' so that get_dummies treats it
# as a categorical field. Going through float -> int -> str keeps the cell
# idempotent: re-running it on the already converted column gives the same result.
# NOTE(review): assumes the target column has no missing values - confirm.
train_df['target'] = train_df['target'].astype(float).astype(int).astype(str)
positive_rows = train_df[train_df['target'] == '1']
# Under-sample the '0' class down to the size of the '1' class.
negative_rows = train_df[train_df['target'] == '0'].sample(len(positive_rows), replace=False)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
train_df_asssoc_rules = pd.concat([negative_rows, positive_rows])
train_df_asssoc_rules

In [24]:
# One-hot encode the categorical fields used for association-rule mining.
assoc_columns = ['gender', 'relevent_experience', 'enrolled_university', 'education_level',
                 'major_discipline', 'company_type', 'last_new_job', 'target']
experiment = pd.get_dummies(train_df_asssoc_rules[assoc_columns])
experiment.head(5)

In [25]:
# Frequent itemsets with support of at least 0.3, most frequent first.
support_table = apriori(experiment, min_support=0.3, use_colnames=True)
support_table = support_table.sort_values(by='support', ascending=False)
support_table

In [26]:
# Association rules with confidence of at least 0.3, derived from the frequent itemsets.
apriori_table = association_rules(
    support_table,
    metric="confidence",
    min_threshold=0.3,
)

In [27]:
# Rules ordered by decreasing support.
apriori_table.sort_values('support', ascending=False)

In [28]:
# Rules ordered by decreasing confidence.
apriori_table.sort_values('confidence', ascending=False)

In [29]:
# Rules ordered by decreasing lift.
apriori_table.sort_values('lift', ascending=False)

In [30]:
# Association rules for the output column:
# keep the rules whose consequent is exactly one item, either 'target_0' or 'target_1'.
# A boolean mask replaces the row-by-row DataFrame.append loop (append no longer
# exists in pandas 2 and grew the frame quadratically); it also keeps the
# numeric dtypes of the metric columns.
target_items = {'target_0', 'target_1'}
is_target_rule = apriori_table['consequents'].apply(
    lambda items: len(items) == 1 and not target_items.isdisjoint(items)
).astype(bool)

apriori_table_target = apriori_table[is_target_rule].sort_values(by='confidence', axis=0, ascending=False)
# tuples instead of frozensets for the antecedents / consequents columns
apriori_table_target['antecedents'] = apriori_table_target['antecedents'].apply(tuple)
apriori_table_target['consequents'] = apriori_table_target['consequents'].apply(tuple)
apriori_table_target

In [31]:
# The rules with the highest confidence in relation to the output column
'''

Short commentary about interesting findings:

The majority of people (57%) who are not currently enrolled in any school/university course and who have relevant experience don't want
to change their job. However, these people are a minority in our training set (only 31% of the records/observations have the
characteristics mentioned above).

Most of the men prefer stability, so they don't want to change their job.

In general, the people who want to change their job have a BSc diploma, and the subject of the BSc is STEM related. Having a BSc in
STEM and wanting to change your job to Data Science is not surprising, but the fact that most of the respondents are not willing to
specialize by doing an MSc or a PhD only proves that people still don't understand the proper requirements a field like DS has. Probably,
most of the participants in this sample are focusing on APIs, rather than learning more math.

'''

# Target rules with confidence above 0.5.
apriori_table_target.loc[apriori_table_target['confidence'] > 0.5]

In [32]:
# Visualisation of the association rules in relation to the output column:
# one antecedents x consequents heatmap per metric.
heatmap_assoc = {
    metric: apriori_table_target.pivot(index='antecedents', columns='consequents', values=metric)
    for metric in ('support', 'confidence', 'lift')
}

for metric, pivoted in heatmap_assoc.items():
    fig = plt.figure(figsize=(15, 5))
    ax = fig.gca()
    seaborn.heatmap(pivoted, ax=ax).set_title("By " + metric)

# ------------------------------------------------------------------------------------------------------------------------------------------------------
# Classification
# ------------------------------------------------------------------------------------------------------------------------------------------------------

In [33]:
# Drop the identifier column. errors='ignore' and rebinding (instead of
# inplace=True) let the cell be re-run without raising a KeyError.
train_df = train_df.drop(columns=['enrollee_id'], errors='ignore')

In [34]:
from IPython.display import Image

# Show the explanatory picture about feature selection.
Image(
    filename="../input/poze-explicatii/feature_selection.png",
    width=700,
    height=500,
)

In [35]:
# ANOVA F-test between each continuous feature and the label (last column of train_df).
X = train_df[[quantitative[0], quantitative[2]]]
Y = train_df.iloc[:, -1]
select_k_best = SelectKBest(f_classif, k=2)
select_k_best.fit(X, Y)
# F-value for each column:
# BIG F-value => SMALL p-value => REJECT H_0 (i.e. the means are not equal)
# => the split of the feature's values according to the label is significant
print(select_k_best.scores_)
# keep only the k features with the largest F-values
X = select_k_best.transform(X)
print(X)

In [36]:
# Copy of the training frame in which every categorical feature is replaced by its integer category code.
train_df_chi2 = copy.deepcopy(train_df)
chi2_features = categorical[:-1]
train_df_chi2 = train_df_chi2.astype({name: 'category' for name in chi2_features})
cat = train_df_chi2.select_dtypes(['category']).columns
train_df_chi2[chi2_features] = train_df_chi2[chi2_features].apply(lambda column: column.cat.codes)

In [37]:
# Chi-square scores between each encoded categorical feature and the label (last column);
# the 5 best-scoring features are kept.
X = train_df_chi2[categorical[:-1]]
Y = train_df_chi2.iloc[:, -1]
select_k_best = SelectKBest(chi2, k=5)
select_k_best.fit(X, Y)
print(select_k_best.scores_)
X = select_k_best.transform(X)
print(X)

In [38]:
# Keep an untouched copy of the frame, then cast the categorical features to
# 'category' and the target to int.
copie_original = copy.deepcopy(train_df)

column_dtypes = {name: 'category' for name in categorical[:-1]}
column_dtypes['target'] = 'int'
train_df = train_df.astype(column_dtypes)

In [39]:
# One-hot encode the categorical features and 'experience'.
# dtype=np.uint8 pins numeric indicator columns: pandas >= 2 would otherwise
# create bool columns, and a frame mixing bool and float columns turns into an
# object array when converted with `.values`.
train_df = pd.get_dummies(train_df, columns=categorical[:-1] + ['experience'], dtype=np.uint8)

# Standardise the two continuous features.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before any
# train/validation split done by the later cross-validation cells.
numeric_columns = ['city_development_index', 'training_hours']
scaled_values = StandardScaler().fit_transform(train_df[numeric_columns])
train_df['city_development_index'] = scaled_values[:, 0]
train_df['training_hours'] = scaled_values[:, 1]

train_df

In [40]:
# https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680

# Randomised hyper-parameter search for a class-balanced decision tree.
# min_samples_split starts at 2: an integer value of 1 is not accepted by
# DecisionTreeClassifier, so every candidate that sampled it failed to fit.
DT_params = RandomizedSearchCV(estimator=DecisionTreeClassifier(class_weight='balanced'),
                               param_distributions={'max_depth':         randint(5, 20),
                                                    'min_samples_split': randint(2, 40),
                                                    'min_samples_leaf':  randint(1, 20)},
                               n_iter=1000, n_jobs=-1, cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=5))

filt = [column for column in train_df.columns if column != 'target']
result_DT = DT_params.fit(train_df[filt], train_df['target'])
# cross-validation results of the best candidate
pd.DataFrame(result_DT.cv_results_).loc[[result_DT.best_index_]]

In [41]:
# help(sklearn.tree._tree.Tree)

In [42]:
# https://machinelearningmastery.com/rfe-feature-selection-in-python/

# Recursive feature elimination with cross-validation, driven by the tuned decision tree.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5)
filt = [column for column in train_df.columns if column != 'target']
selector = RFECV(result_DT.best_estimator_, step=1, min_features_to_select=10, cv=cv).fit(train_df[filt], train_df['target'])
# `grid_scores_` was removed from recent scikit-learn releases in favour of
# `cv_results_`; support both so the cell runs on either version.
if hasattr(selector, 'cv_results_'):
    print(selector.cv_results_['mean_test_score'])
else:
    print(selector.grid_scores_)
print(selector.support_)
# Drop every feature that was not selected, in a single operation.
rejected_columns = [column for column, keep in zip(filt, selector.support_) if not keep]
train_df = train_df.drop(columns=rejected_columns)

In [43]:

class ANN:
    """Small fully-connected feed-forward classifier written with NumPy.

    Samples are stored column-wise: every input matrix has shape
    (n_features, n_samples) and every label matrix is one-hot encoded with
    shape (nr_labels, n_samples). All hidden layers share one activation
    ('sigmoid', 'tanh' or 'relu'); the output layer is always a softmax.
    Layers are pure matrix products (there are no bias terms).
    """

    _ACTIVATIONS = ('sigmoid', 'tanh', 'relu')
    _OPTIMIZERS = ('sgd', 'momentum', 'adagrad', 'rmsprop')

    def __init__(self, train, valid, test, batch_size, epochs, learning_rate, nr_labels, neurons_per_layer: list, activation,
                 optimizer, regularization, lambd, dropout):
        """Store the data and hyper-parameters and initialise the weights.

        Parameters
        ----------
        train, valid, test : tuple
            (X, y) pairs; X has shape (n_features, n_samples) and y is a
            sequence of integer class labels.
        batch_size : int
            Number of samples per mini-batch.
        epochs : int
            Number of passes over the training batches.
        learning_rate : float
            Step size of the weight updates.
        nr_labels : int
            Number of classes (rows of the one-hot label matrices).
        neurons_per_layer : list
            Layer sizes, input layer first, output layer last.
        activation : str
            'sigmoid', 'tanh' or 'relu' (hidden layers).
        optimizer : str
            'sgd', 'momentum', 'adagrad' or 'rmsprop' (case-insensitive).
        regularization : str or None
            'L1', 'L2', or anything else for no weight penalty.
        lambd : float
            Strength of the weight penalty.
        dropout : dict or None
            Maps a hidden-layer index to the fraction of that layer's neurons
            whose incoming and outgoing weights are kept frozen during
            training. The neurons are drawn once, here, with replacement.
            The dict passed by the caller is not modified.

        Raises
        ------
        ValueError
            If the activation or the optimizer is not supported.
        """
        self.batch_size = batch_size
        self.epochs = epochs
        self.etha = learning_rate
        self.nr_labels = nr_labels
        self.neurons_per_layer = neurons_per_layer

        self.activation = activation
        self.optimizer = optimizer.lower()
        if self.activation not in self._ACTIVATIONS:
            raise ValueError("Unsupported activation: " + repr(activation))
        if self.optimizer not in self._OPTIMIZERS:
            raise ValueError("Unsupported optimizer: " + repr(optimizer))
        self.regularization = regularization
        self.lambd = lambd

        self.X_train = train[0]
        self.Y_train = self.reshape_Y(train[1])
        self.X_valid = valid[0]
        self.Y_valid = self.reshape_Y(valid[1])
        self.X_test = test[0]
        self.Y_test = self.reshape_Y(test[1])

        if dropout is not None:
            # Build a new dict so the caller's argument is left untouched.
            # randint's upper bound is exclusive, hence indices 0 .. n-1.
            dropout = {
                layer: np.random.randint(low=0, high=self.neurons_per_layer[layer],
                                         size=int(fraction * self.neurons_per_layer[layer]))
                for layer, fraction in dropout.items()
            }
        self.dropout = dropout

        self.batches = self.split_in_batches(self.X_train, self.Y_train, self.batch_size)

        # a[i] holds the activations of layer i; W[i] maps layer i-1 to layer i
        # (W[0] is a placeholder so that both lists share the same indexing).
        self.a = [None for _ in range(0, len(self.neurons_per_layer))]
        self.W = [np.random.normal(0, np.sqrt(1 / self.neurons_per_layer[i]),
                                   (self.neurons_per_layer[i + 1], self.neurons_per_layer[i]))
                  for i in range(0, len(self.neurons_per_layer) - 1)]
        self.W.insert(0, None)

        if self.optimizer == 'momentum':
            self.gamma, self.history = self.auxiliar_vars('momentum')
        elif self.optimizer == 'adagrad':
            self.epsilon, self.history = self.auxiliar_vars('adagrad')
        elif self.optimizer == 'rmsprop':
            self.epsilon, self.beta, self.history = self.auxiliar_vars('rmsprop')

    def auxiliar_vars(self, optimizer):
        """Return the constants and the per-layer state needed by `optimizer`.

        Returns (gamma, history) for 'momentum', (epsilon, beta, history) for
        'rmsprop' and (epsilon, history) otherwise.
        """
        history = [0 for _ in range(0, len(self.W))]
        epsilon = 0.01
        if optimizer == 'momentum':
            gamma = 0.9
            return (gamma, history)
        if optimizer == 'rmsprop':
            beta = 0.99
            return (epsilon, beta, history)
        return (epsilon, history)

    def sigmoid(self, z):
        """Logistic function applied element-wise."""
        return 1 / (1 + np.exp(-z))

    def sigmoid_deriv(self, a):
        """Derivative of the sigmoid expressed through its output `a`."""
        return np.multiply(a, (1 - a))

    def tanh(self, z):
        """Hyperbolic tangent applied element-wise (np.tanh does not overflow for large |z|)."""
        return np.tanh(z)

    def tanh_deriv(self, z):
        """Derivative of tanh evaluated at the pre-activation `z`."""
        return 1 - self.tanh(z) ** 2

    def relu(self, z):
        """Rectified linear unit applied element-wise."""
        return np.where(z <= 0, 0, z)

    def relu_deriv(self, a):
        """Derivative of ReLU expressed through its output `a` (0 where a == 0, else 1)."""
        return np.where(a == 0, 0, 1)

    def softmax(self, z):
        """Column-wise softmax; the column maximum is subtracted first to avoid overflow."""
        exponentials = np.exp(z - np.max(z, axis=0, keepdims=True))
        return exponentials / np.sum(exponentials, axis=0)

    def softmax_deriv(self, a):
        """Diagonal of the softmax Jacobian expressed through its output `a`."""
        return np.multiply(a, 1 - a)

    def logistic_loss(self, last_layer, real_outputs):
        """Summed binary cross-entropy between predictions and one-hot labels."""
        rez = -(np.multiply(real_outputs, np.log(last_layer))
                + np.multiply((1 - real_outputs), np.log(1 - last_layer)))
        return np.sum(rez)

    def logistic_loss_deriv(self, last_layer, real_outputs):
        """Derivative of the logistic loss with respect to the predictions."""
        return (last_layer - real_outputs) / np.multiply(last_layer, 1 - last_layer)

    def _activate(self, z):
        """Apply the configured hidden-layer activation to `z`."""
        if self.activation == 'sigmoid':
            return self.sigmoid(z)
        if self.activation == 'tanh':
            return self.tanh(z)
        return self.relu(z)

    def _activation_deriv(self, a):
        """Derivative of the hidden-layer activation, computed from its output `a`."""
        if self.activation == 'sigmoid':
            return self.sigmoid_deriv(a)
        if self.activation == 'tanh':
            # d/dz tanh(z) = 1 - tanh(z)^2 = 1 - a^2; tanh_deriv expects z, not a.
            return 1 - a ** 2
        return self.relu_deriv(a)

    def _penalty_gradient(self, weights, n_samples):
        """Gradient of the weight penalty (0 when no regularization is configured)."""
        if self.regularization == 'L2':
            return self.lambd * (1 / n_samples) * weights
        if self.regularization == 'L1':
            return self.lambd * (1 / n_samples) * np.sign(weights)
        return 0

    def forward_propagation(self, X):
        """Compute and store the activations of every layer for the inputs `X`."""
        self.a[0] = X
        last = len(self.a) - 1
        for i in range(1, last):
            self.a[i] = self._activate(np.dot(self.W[i], self.a[i - 1]))
        # The output layer is addressed explicitly so that networks without
        # hidden layers work as well.
        self.a[last] = self.softmax(np.dot(self.W[last], self.a[last - 1]))

    def back_propagation(self, real_outputs):
        """Update the weights from the activations stored by the last forward pass.

        `real_outputs` is the one-hot label matrix of the batch that was just
        propagated.
        """
        # The real number of samples is used (the last batch may be smaller than batch_size).
        n_samples = real_outputs.shape[1]
        # The previous weights are only needed to restore frozen (dropout) neurons.
        W_before = copy.deepcopy(self.W) if self.dropout is not None else None

        last = len(self.a) - 1
        gradients = [None] * len(self.a)
        # softmax + cross-entropy: the output error is (prediction - label)
        error = (1 / n_samples) * (self.a[last] - real_outputs)
        gradients[last] = np.dot(error, self.a[last - 1].T) + self._penalty_gradient(self.W[last], n_samples)

        for i in range(last - 1, 0, -1):
            error = np.multiply(np.dot(error.T, self.W[i + 1]).T, self._activation_deriv(self.a[i]))
            gradients[i] = np.dot(error, self.a[i - 1].T) + self._penalty_gradient(self.W[i], n_samples)

        for i in range(1, len(self.a)):
            if self.optimizer == 'momentum':
                self.history[i] = self.gamma * self.history[i] + self.etha * gradients[i]
                self.W[i] -= self.history[i]
            elif self.optimizer == 'adagrad':
                # accumulate the squared gradient first, then scale the step with it
                self.history[i] = self.history[i] + gradients[i] ** 2
                self.W[i] -= (self.etha / np.sqrt(self.history[i] + self.epsilon)) * gradients[i]
            elif self.optimizer == 'rmsprop':
                self.history[i] = self.beta * self.history[i] + (1 - self.beta) * gradients[i] ** 2
                self.W[i] -= (self.etha / np.sqrt(self.history[i] + self.epsilon)) * gradients[i]
            else:  # plain gradient descent ('sgd')
                self.W[i] -= self.etha * gradients[i]

        if self.dropout is not None:
            # Undo the update for the weights entering and leaving the frozen neurons.
            for i in range(1, len(self.a) - 1):
                if i in self.dropout.keys():
                    for neuron in self.dropout[i]:
                        self.W[i + 1][:, neuron] = copy.deepcopy(W_before[i + 1][:, neuron])
                        self.W[i][neuron, :] = copy.deepcopy(W_before[i][neuron, :])

    def fit(self):
        """Train for `epochs` passes and print train/validation accuracy after each pass."""
        for epoch in range(0, self.epochs):
            for batch in self.batches:
                self.forward_propagation(batch[0])
                self.back_propagation(batch[1])
            self.accuracy(self.X_train, self.Y_train, "Train accuracy: ")
            self.accuracy(self.X_valid, self.Y_valid, "Valid accuracy: ")

    def accuracy(self, X, Y, msg):
        """Print `msg` followed by the share of correctly classified columns of `X`, and return that share."""
        self.forward_propagation(X)
        real = np.argmax(Y, axis=0)
        predicted = np.argmax(self.a[-1], axis=0)
        score = int(np.count_nonzero(real == predicted)) / X.shape[1]
        print(msg + str(score))
        return score

    def reshape_Y(self, vec):
        """One-hot encode the integer labels `vec` into a (nr_labels, len(vec)) matrix."""
        labels = np.asarray(vec, dtype=int)
        new_Y = np.zeros((self.nr_labels, len(labels)))
        new_Y[labels, np.arange(len(labels))] = 1
        return new_Y

    def split_in_batches(self, X, Y, batch_size):
        """Split the columns of `X` and `Y` into consecutive (X_batch, Y_batch) pairs."""
        return [(X[:, start:start + batch_size], Y[:, start:start + batch_size])
                for start in range(0, X.shape[1], batch_size)]

    
if __name__ == "__main__":

    # Feature matrix as a plain float array: a list of column names is used
    # (as in the other cells) instead of indexing with a filter iterator, and
    # the explicit cast guarantees a numeric dtype for the NumPy network.
    feature_columns = [column for column in train_df.columns if column != 'target']
    X_ANN = train_df[feature_columns].values.astype(float)
    Y_ANN = train_df['target'].values
    X_ANN_train, X_ANN_test, Y_ANN_train, Y_ANN_test = train_test_split(X_ANN, Y_ANN, train_size=0.80, test_size=0.20, shuffle=True, stratify=Y_ANN)
    # The network expects one sample per column.
    X_ANN_train = X_ANN_train.T
    X_ANN_test = X_ANN_test.T

    # NOTE(review): the held-out split is passed both as validation and as test set.
    var = ANN((X_ANN_train, Y_ANN_train), (X_ANN_test, Y_ANN_test), (X_ANN_test, Y_ANN_test),
              batch_size=10, epochs=10, learning_rate=0.1, nr_labels=2, neurons_per_layer=[X_ANN_train.shape[0], 50, 50, 2], activation='relu',
              optimizer='adagrad', regularization='L2', lambd=0.01, dropout=None)
    var.fit()

In [44]:
# Gaussian Naive Bayes on the two continuous features, 10-fold stratified CV;
# the estimator of the fold with the best test accuracy is kept.
X_gaussian_NB = train_df[['city_development_index', 'training_hours']]
Y_gaussian_NB = train_df['target']
scores = cross_validate(GaussianNB(), X_gaussian_NB, Y_gaussian_NB, scoring=['accuracy'],
                        cv=StratifiedKFold(n_splits=10), return_estimator=True)
best_fold = np.argmax(scores['test_accuracy'])
gaussian_NB = scores['estimator'][best_fold]

In [45]:
# Refit on the full training data and predict on that same data.
gaussian_NB_predictions = gaussian_NB.fit(X_gaussian_NB, Y_gaussian_NB).predict(X_gaussian_NB)

In [46]:
# Training-set metrics of the Gaussian Naive Bayes model
# (the message used to name the wrong model).
print("Balanced accuracy for Gaussian Naive Bayes is: {bal_acc}".format(bal_acc=balanced_accuracy_score(Y_gaussian_NB, gaussian_NB_predictions)))
print(classification_report(Y_gaussian_NB, gaussian_NB_predictions))

In [47]:
# Categorical Naive Bayes on the ordinal-encoded categorical features, 10-fold stratified CV;
# the estimator of the fold with the best test accuracy is kept.
categorical_set = copy.deepcopy(copie_original)
excluded_columns = ['city_development_index', 'training_hours', 'target', 'experience']
filt = [column for column in copie_original.columns if column not in excluded_columns]
categorical_set[filt] = OrdinalEncoder().fit_transform(categorical_set[filt])

X_categorical_NB = categorical_set[filt]
Y_categorical_NB = categorical_set['target'].astype('int')
scores = cross_validate(CategoricalNB(), X_categorical_NB, Y_categorical_NB, scoring=['accuracy'],
                        cv=StratifiedKFold(n_splits=10), return_estimator=True)
best_fold = np.argmax(scores['test_accuracy'])
categorical_NB = scores['estimator'][best_fold]

In [48]:
# Refit on the full training data and predict on that same data.
categorical_NB_predictions = categorical_NB.fit(X_categorical_NB, Y_categorical_NB).predict(X_categorical_NB)

In [49]:
# Training-set metrics of the Categorical Naive Bayes model
# (the message used to name the wrong model).
print("Balanced accuracy for Categorical Naive Bayes is: {bal_acc}".format(bal_acc=balanced_accuracy_score(Y_categorical_NB, categorical_NB_predictions)))
print(classification_report(Y_categorical_NB, categorical_NB_predictions))

In [50]:
# L2-regularised logistic regression, repeated 10-fold stratified CV.
# The extra fit of a throw-away model that used to precede the
# cross-validation was removed: its result was overwritten immediately.
filt = [column for column in train_df.columns if column != 'target']

X_logistic = train_df[filt]
Y_logistic = train_df['target']
logistic_regression = LogisticRegression(penalty='l2', n_jobs=-1)
scores = cross_validate(logistic_regression, X_logistic, Y_logistic, scoring=['accuracy'], cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=5), return_estimator=True, error_score='raise')
# keep the estimator of the fold with the best test accuracy
logistic_regression = scores['estimator'][np.argmax(scores['test_accuracy'])]

# Refit on the full training data and predict on that same data.
logistic_regression.fit(X_logistic, Y_logistic)
logistic_regression_predictions = logistic_regression.predict(X_logistic)

In [51]:
# Training-set metrics of the logistic regression model.
logistic_bal_acc = balanced_accuracy_score(Y_logistic, logistic_regression_predictions)
print(f"Balanced accuracy for Logistic Regression is: {logistic_bal_acc}")
print(classification_report(Y_logistic, logistic_regression_predictions))

In [52]:
from IPython.display import Image

# Show the explanatory picture about decision trees.
Image(
    filename="../input/poze-explicatii/dt_questions.png",
    width=700,
    height=500,
)

In [53]:
# https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680

# Randomised hyper-parameter search for a class-balanced decision tree on the selected features.
# min_samples_split starts at 2: an integer value of 1 is not accepted by
# DecisionTreeClassifier, so every candidate that sampled it failed to fit.
DT_params = RandomizedSearchCV(estimator=DecisionTreeClassifier(class_weight='balanced'),
                               param_distributions={'min_samples_split': randint(2, 40),
                                                    'min_samples_leaf':  randint(1, 20)},
                               n_iter=10, n_jobs=-1, cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=1))

filt = [column for column in train_df.columns if column != 'target']
dt_hyper_params = DT_params.fit(train_df[filt], train_df['target'])
# cross-validation results of the best candidate
pd.DataFrame(dt_hyper_params.cv_results_).loc[[dt_hyper_params.best_index_]]

In [54]:
# Inspect the current training frame.
train_df

In [55]:
# Draw the best decision tree found by the search.
fig = plt.figure(figsize=(20, 20))
ax = fig.gca()
best_tree = dt_hyper_params.best_estimator_
_ = sklearn.tree.plot_tree(best_tree, ax=ax, fontsize=10)

In [56]:
# Refit the best decision tree on the full training data and predict on that same data.
X_dt, Y_dt = train_df[filt], train_df['target']
best_dt = dt_hyper_params.best_estimator_
dt_predictions = best_dt.fit(X_dt, Y_dt).predict(X_dt)

In [57]:
# Training-set metrics of the decision tree.
dt_bal_acc = balanced_accuracy_score(Y_dt, dt_predictions)
print(f"Balanced accuracy for Decision Tree is: {dt_bal_acc}")
print(classification_report(Y_dt, dt_predictions))

In [58]:
#max_features=sqrt recommended for classification

# Randomized search (a single sampled candidate, 10-fold stratified CV repeated 5 times)
# over forest size and split/leaf size limits of a class-weighted random forest.
# min_samples_split is sampled from [2, 40): scikit-learn rejects an integer value of 1,
# and with n_iter=1 drawing it would make every fit of the only candidate fail.
RF_params = RandomizedSearchCV(
    estimator=RandomForestClassifier(max_features='sqrt', class_weight='balanced', bootstrap=True, n_jobs=-1),
    param_distributions={
        'n_estimators': randint(100, 500),
        'min_samples_split': randint(2, 40),
        'min_samples_leaf': randint(1, 20),
    },
    n_iter=1,
    n_jobs=-1,
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=5),
)

# Every column except the label is used as a feature.
filt = [column for column in train_df.columns if column != 'target']
result_RF = RF_params.fit(train_df[filt], train_df['target'])

# Show only the cv_results_ row that belongs to the best candidate.
pd.DataFrame(result_RF.cv_results_).loc[[result_RF.best_index_]]

In [59]:
# Features / label used for the random forest (all non-target columns of train_df).
X_rf = train_df[filt]
Y_rf = train_df['target']

# Fit the forest selected by the search on the full frame and predict the same rows,
# so rf_predictions are training-set predictions.
best_rf = RF_params.best_estimator_
rf_predictions = best_rf.fit(X_rf, Y_rf).predict(X_rf)

In [60]:
# Compare the random forest predictions with the labels they were fitted on.
rf_bal_acc = balanced_accuracy_score(Y_rf, rf_predictions)
print("Balanced accuracy for Random Forest is: {bal_acc}".format(bal_acc=rf_bal_acc))

rf_report = classification_report(Y_rf, rf_predictions)
print(rf_report)

In [61]:
from sklearn.ensemble import AdaBoostClassifier

# Features / label used for AdaBoost (all non-target columns of train_df).
X_ada = train_df[filt]
Y_ada = train_df['target']

# AdaBoost with 1000 estimators and otherwise default settings (no hyper-parameter search
# is run for it here); fitted on the full frame and used to predict the same rows.
best_adaboost = AdaBoostClassifier(n_estimators=1000)
ada_predictions = best_adaboost.fit(X_ada, Y_ada).predict(X_ada)

In [62]:
from IPython.display import Image

# Render the bundled explanatory picture as this cell's output.
Image(
    filename="../input/poze-explicatii/adaboost.png",
    width=700,
    height=500,
)

In [63]:
# Compare the AdaBoost predictions with the labels they were fitted on.
ada_bal_acc = balanced_accuracy_score(Y_ada, ada_predictions)
print("Balanced accuracy for Adaboost is: {bal_acc}".format(bal_acc=ada_bal_acc))

ada_report = classification_report(Y_ada, ada_predictions)
print(ada_report)

In [64]:
from IPython.display import Image

# Render the first bundled SVM picture as this cell's output.
Image(
    filename="../input/poze-explicatii/svm_1.png",
    width=700,
    height=500,
)

In [65]:
from IPython.display import Image

# Render the second bundled SVM picture as this cell's output.
Image(
    filename="../input/poze-explicatii/svm_2.png",
    width=700,
    height=500,
)

In [66]:
# Randomized search (a single sampled candidate, 10-fold stratified CV) over kernel type,
# C, polynomial degree and gamma of a class-weighted SVC with probability estimates enabled.
svm_estimator = SVC(
    coef0=1,
    probability=True,
    cache_size=1000,
    class_weight='balanced',
    decision_function_shape='ovo',
    random_state=42,
)
svm_param_distributions = {
    'C': scipy.stats.expon(scale=100),
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [2, 3],
    'gamma': scipy.stats.expon(scale=.1),
}
SVM_params = RandomizedSearchCV(
    estimator=svm_estimator,
    param_distributions=svm_param_distributions,
    n_iter=1,
    n_jobs=-1,
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=1),
)

# Every column except the label is used as a feature.
filt = [column for column in train_df.columns if column != 'target']

# The search only sees the rows whose index label is <= 1000 (.loc slicing is label-based
# and inclusive) -- presumably to keep SVM training time manageable; confirm with the author.
svm_hyper_params = SVM_params.fit(
    train_df[filt].loc[:1000],
    train_df['target'].loc[:1000],
)
best_svm = svm_hyper_params.best_estimator_

# Show only the cv_results_ row that belongs to the best candidate.
pd.DataFrame(svm_hyper_params.cv_results_).loc[[svm_hyper_params.best_index_]]

In [67]:
# Features / label used for the SVM (all non-target columns of train_df).
X_svm = train_df[filt]
Y_svm = train_df['target']

# Fit the selected SVM on the rows whose index label is <= 1000 only (label-based,
# inclusive slice), then predict every row of X_svm.
best_svm = svm_hyper_params.best_estimator_
svm_predictions = best_svm.fit(X_svm.loc[:1000], Y_svm.loc[:1000]).predict(X_svm)

In [68]:
# Compare the SVM predictions with the labels in Y_svm.
# The message names the SVM: these scores come from svm_predictions, not from the random forest.
svm_bal_acc = balanced_accuracy_score(Y_svm, svm_predictions)
print("Balanced accuracy for SVM is: {bal_acc}".format(bal_acc=svm_bal_acc))
print(classification_report(Y_svm, svm_predictions))

In [69]:

print("------------------------------- ROC Curve --------------------------------------------------------------")

# One ROC curve per fitted model, each built from column 1 of predict_proba evaluated on
# that model's own X / Y pair.
gaussian_NB_proba = gaussian_NB.predict_proba(X_gaussian_NB)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_gaussian_NB, gaussian_NB_proba)

categorical_NB_proba = categorical_NB.predict_proba(X_categorical_NB)[:, 1]
fpr2, tpr2, thresholds2 = roc_curve(Y_categorical_NB, categorical_NB_proba)

logistic_proba = logistic_regression.predict_proba(X_logistic)[:, 1]
fpr3, tpr3, thresholds3 = roc_curve(Y_logistic, logistic_proba)

dt_proba = best_dt.predict_proba(X_dt)[:, 1]
fpr4, tpr4, thresholds4 = roc_curve(Y_dt, dt_proba)

rf_proba = best_rf.predict_proba(X_rf)[:, 1]
fpr5, tpr5, thresholds5 = roc_curve(Y_rf, rf_proba)

svm_proba = best_svm.predict_proba(X_svm)[:, 1]
fpr6, tpr6, tresholds6 = roc_curve(Y_svm, svm_proba)

ada_proba = best_adaboost.predict_proba(X_ada)[:, 1]
fpr7, tpr7, tresholds7 = roc_curve(Y_ada, ada_proba)

# Draw the curves in the same order as the legend entries below.
roc_points = [
    (fpr, tpr),
    (fpr2, tpr2),
    (fpr3, tpr3),
    (fpr4, tpr4),
    (fpr5, tpr5),
    (fpr6, tpr6),
    (fpr7, tpr7),
]
for false_positive_rate, true_positive_rate in roc_points:
    plt.plot(false_positive_rate, true_positive_rate, label = "ROC curve")

# The legend is created before the diagonal is drawn, so the dashed reference line
# gets no legend entry.
plt.legend(["Gaussian Naive Bayes", "Categorical Naive Bayes", "Logistic Regression", "Decision Trees", "Random Forest", "SVM", "Adaboost"])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("1 - specificity")
plt.ylabel("Sensitivity")
plt.show()
# print("AUC for Gaussian Naive Bayes: %f" % (roc_auc_score(y_test_nb, gnb.predict_proba(x_test_nb)[:, 1])))
# print("AUC for Logistic Regression: %f" % (roc_auc_score(y_test_general, logreg.predict_proba(x_test_general)[:, 1])))
# print("AUC for Random Forest: %f" % (roc_auc_score(y_test_general, RF.predict_proba(x_test_general)[:, 1])))
# print("AUC for Decision Tree: %f" % (roc_auc_score(y_test_general, DT.predict_proba(x_test_general)[:, 1])))
# print("AUC for DT&SVM: %f" % (roc_auc_score(y_test_for_both, np.concatenate((y_pred_SVM_proba[:, 1], DT.predict_proba(x_test_for_DT)[:, 1])))))