In [None]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time

from pandas import DataFrame, Series
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE

%matplotlib inline

In [None]:
# Read the cleaned speed-dating table, then show its dimensions and first ten rows.
data = pd.read_csv(filepath_or_buffer = "../Data/Speed_Dating_Clean.csv")
print(data.shape)
data.head(n = 10)

In [None]:
# Target is the 'match' column; `labels` is a second name for the same Series
# and is what downsample_data() stratifies on.
data_Y = data['match']
labels = data_Y

# Features are everything except the target, the two decision columns and
# 'Unnamed: 0' (presumably a saved row index -- TODO confirm).
data_X = data.drop(labels = ['match', 'decision', 'decision_o', 'Unnamed: 0'], axis = 'columns')
print(data_X.shape)
data_X.head(n = 10)

In [None]:
def downsample_data(data, n_splits = 100, random_state = None):
    """Return a stratified random subsample holding roughly 1/n_splits of `data`.

    The subsample is the test fold of the first split produced by a shuffled
    StratifiedKFold. Stratification uses the notebook-level `labels` Series
    (not a column of `data`), matched to `data` by row position.

    Parameters
    ----------
    data : DataFrame
        Rows to sample from. Must have as many rows as `labels`, in the same order.
    n_splits : int, default 100
        Number of folds; one fold (about 1/n_splits of the rows) is returned.
    random_state : int or None, default None
        Seed for the shuffle. None keeps the original unseeded behaviour;
        pass an integer to make the subsample reproducible.

    Returns
    -------
    DataFrame
        The selected rows of `data`. Its shape is also printed.
    """
    kfold = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = random_state)

    # Only the first split is needed: its test fold is the subsample and the
    # training fold is discarded.
    _, subsample_index = next(kfold.split(data, labels))
    downData = data.iloc[subsample_index]

    print(downData.shape)
    return downData

In [None]:
# Subsample features and target together (target appended as a 'match' column)
# so the rows stay aligned, then separate them again.
downData = downsample_data(data_X.assign(match = data_Y))
data_Y1 = downData['match']
data_X1 = downData.drop(labels = ['match'], axis = 'columns')

print(data_X1.shape)
data_X1.head(n = 10)

In [None]:
# Random under-sampling: keep every match and an equally sized random draw of
# non-matches. This assumes matches are the minority class.
data_X2 = data_X.copy()
data_X2['match'] = data_Y

match_indices = data_X2[data_X2.match == 1].index
no_match_indices = data_X2[data_X2.match == 0].index
no_match = len(no_match_indices)

# Draw only as many non-matches as there are matches. The previous code drew
# `no_match` rows from the non-matches without replacement, i.e. all of them,
# so the class imbalance was never reduced.
n_sampled = min(len(match_indices), no_match)
# NOTE(review): the draw is unseeded, so the selected rows differ between runs.
random_indices = np.random.choice(no_match_indices, n_sampled, replace = False)

under_sample_indices = np.concatenate([match_indices, random_indices])
under_sample = data_X2.loc[under_sample_indices]

under_X = under_sample.drop(['match'], axis = 1)
# 1-d Series rather than a one-column DataFrame: scikit-learn expects y with
# shape (n_samples,) and otherwise emits a DataConversionWarning.
under_Y = under_sample['match']

In [None]:
# Dimensions and a ten-row preview of the under-sampled feature matrix.
print(under_X.shape)
under_X.head(n = 10)

In [None]:
# Over-sample the minority class with SMOTE; sampling_strategy = 1.0 asks for a
# minority/majority ratio of 1 after resampling.
# `sampling_strategy` and `fit_resample` replace `ratio` and `fit_sample`, which
# were deprecated in imbalanced-learn 0.4 and removed in later releases.
sm = SMOTE(random_state = 12, sampling_strategy = 1.0)
smote_X, smote_Y = sm.fit_resample(data_X, data_Y)

In [None]:
print(smote_X.shape)
# The resampler may hand back a plain array, in which case the feature names
# are lost. Wrap it in a DataFrame and restore the names positionally from
# data_X (resampling adds rows, not columns, so the column count matches).
smote_X = DataFrame(smote_X)
smote_X.columns = data_X.columns
smote_X.head(10)

In [None]:
# Neural network without PCA: standardise the features, then fit an MLP.
clf = MLPClassifier()
scaler = StandardScaler()
pipe = Pipeline(steps = [('scaler', scaler), ('clf', clf)])

# Search space: a single hidden layer of 10, 20, ..., 100 units, crossed with
# the four available activation functions.
param_grid = dict(
    clf__hidden_layer_sizes = [(units,) for units in range(10, 101, 10)],
    clf__activation = ['identity', 'logistic', 'tanh', 'relu'],
)

# Exhaustive search with 5-fold cross-validation.
# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# drop the argument when running on a newer release.
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, iid = False, cv = 5)

In [None]:
# Neural network (no PCA) tuned on the original data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (no PCA) tuned on the stratified downsample.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (no PCA) tuned on the under-sampled data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (no PCA) tuned on the SMOTE over-sampled data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network with a fixed PCA step: standardise, keep enough principal
# components to explain 95% of the variance, then fit an MLP. The number of
# components is not part of the search grid here.
clf = MLPClassifier()
pca = PCA(n_components = 0.95)
scaler = StandardScaler()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

# Search space: a single hidden layer of 10, 20, ..., 100 units, crossed with
# the four available activation functions.
param_grid = dict(
    clf__hidden_layer_sizes = [(units,) for units in range(10, 101, 10)],
    clf__activation = ['identity', 'logistic', 'tanh', 'relu'],
)

# Exhaustive search with 5-fold cross-validation.
# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# drop the argument when running on a newer release.
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, iid = False, cv = 5)

In [None]:
# Neural network (PCA at 95% variance) tuned on the original data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (PCA at 95% variance) tuned on the stratified downsample.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (PCA at 95% variance) tuned on the under-sampled data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (PCA at 95% variance) tuned on the SMOTE over-sampled data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural Networks
scaler = StandardScaler()
pca = PCA()
clf = MLPClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

param_grid = {
    'pca__n_components': list(range(1, 63)),
    'clf__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,), (70,), (80,), (90,), (100,)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu']
}

#grid_search = GridSearchCV(pipe, param_grid, iid = False, cv = 3)
grid_search = GridSearchCV(pipe, param_grid, iid = False, cv = 5)

In [None]:
# Neural network (tuned PCA) searched on the original data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (tuned PCA) searched on the stratified downsample.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (tuned PCA) searched on the under-sampled data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

In [None]:
# Neural network (tuned PCA) searched on the SMOTE over-sampled data.
# best_score_ is the mean cross-validated score of the best parameter combination.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)