In [None]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time

from pandas import DataFrame, Series
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE

%matplotlib inline

In [None]:
# Load the cleaned speed-dating table, then show its dimensions and first rows.
data = pd.read_csv(filepath_or_buffer = "../Data/Speed_Dating_Clean.csv")
print(data.shape)
data.head(n = 10)

In [None]:
# Target column; `labels` is a second name for the same Series.
labels = data_Y = data['match']
# Features: everything except the target, the two decision columns and
# 'Unnamed: 0' (presumably a row index written by an earlier to_csv - confirm).
data_X = data.drop(columns = ['match', 'decision', 'decision_o', 'Unnamed: 0'])
print(data_X.shape)
data_X.head(n = 10)

In [None]:
def downsample_data(data, target = 'match', n_splits = 100, random_state = None):
    """Return a stratified random subsample of about ``1 / n_splits`` of ``data``.

    The test fold of the first StratifiedKFold split is used as the subsample,
    so the class proportions of the stratification labels are preserved.

    Parameters
    ----------
    data : DataFrame
        Rows to subsample.
    target : str, default 'match'
        Column of ``data`` to stratify on. When ``data`` has no such column the
        notebook-level ``labels`` Series is used instead (the previous
        behaviour); it must then be row-aligned with ``data``.
    n_splits : int, default 100
        Number of folds; one fold (about 1 / n_splits of the rows) is returned.
    random_state : int or None, default None
        Seed for the shuffle; pass an int for a reproducible subsample.

    Returns
    -------
    DataFrame
        The selected rows of ``data`` with all of its columns.
    """
    # Prefer the labels carried by the frame itself so that rows and labels can
    # never get out of step; fall back to the global for frames without target.
    strata = data[target] if target in data.columns else labels

    kfold = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = random_state)
    # Only the first split is needed: its test fold is the subsample.
    _, subsample_index = next(kfold.split(data, strata))
    downData = data.iloc[subsample_index]

    print(downData.shape)
    return downData

In [None]:
# Attach the target so it travels with the rows through downsample_data(),
# then split the subsample back into features and target.
downData = downsample_data(data_X.assign(match = data_Y))
data_Y1 = downData['match']
data_X1 = downData.drop(columns = ['match'])

print(data_X1.shape)
data_X1.head(n = 10)

In [None]:
# Random undersampling: keep every match (class 1) and an equally sized random
# sample of non-matches (class 0) so that both classes are balanced.
data_X2 = data_X.copy()
data_X2['match'] = data_Y

# Number of matches = size of the non-match sample to draw. (This used to count
# the non-matches, so the "sample" contained every row and nothing was balanced.)
no_match = len(data_X2[data_X2['match'] == 1])
no_match_indices = data_X2[data_X2.match == 0].index
# Assumes matches are the minority class; with replace = False the draw raises
# if more rows are requested than there are non-matches.
random_indices = np.random.choice(no_match_indices, no_match, replace = False)

match_indices = data_X2[data_X2.match == 1].index
under_sample_indices = np.concatenate([match_indices, random_indices])
under_sample = data_X2.loc[under_sample_indices]

under_X = under_sample.loc[:, under_sample.columns != 'match']
# 1-D Series rather than a one-column DataFrame: estimators expect 1-D labels.
under_Y = under_sample['match']

In [None]:
# Size and first rows of the undersampled feature matrix.
print(under_X.shape)
under_X.head(n = 10)

In [None]:
# Oversample the minority class with SMOTE until both classes have the same size.
# `sampling_strategy` and `fit_resample` replace the old `ratio` and `fit_sample`
# spellings, which have been removed from imbalanced-learn.
# NOTE(review): resampling before cross-validation puts synthetic points derived
# from validation rows into the training folds, so scores on smote_X are optimistic.
sm = SMOTE(random_state = 12, sampling_strategy = 1.0)
smote_X, smote_Y = sm.fit_resample(data_X, data_Y)

In [None]:
print(smote_X.shape)
# Keep the original feature names: when the resampler returns a bare ndarray a
# plain DataFrame(...) would label the columns 0..n-1.
smote_X = DataFrame(smote_X, columns = data_X.columns)
smote_X.head(10)

# Decision Trees

In [None]:
# Decision Trees
# Pipeline: standardise the features, keep the principal components that explain
# 95% of the variance, then fit a decision tree.
scaler = StandardScaler()
pca = PCA(0.95)
clf = tree.DecisionTreeClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

# Exhaustive grid: 2 * 55 * 2 * 11 = 2420 combinations.
param_grid = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': list(range(5, 60)),
    'clf__max_features': ['sqrt', 'log2'],
    'clf__min_samples_leaf': list(range(5, 60, 5))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# Decision Trees - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Decision Trees - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Decision Trees - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Decision Trees - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Decision Trees
# Same pipeline as the fixed-variance variant, but the number of principal
# components is tuned as part of the grid instead of being set to 95% variance.
scaler = StandardScaler()
pca = PCA()
clf = tree.DecisionTreeClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

# Exhaustive grid: 62 * 2 * 55 * 2 * 11 = 150040 combinations - very slow.
# 1..62 components presumably spans the number of feature columns - confirm.
param_grid = {
    'pca__n_components': list(range(1, 63)),
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': list(range(5, 60)),
    'clf__max_features': ['sqrt', 'log2'],
    'clf__min_samples_leaf': list(range(5, 60, 5))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# Decision Trees - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Decision Trees - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Decision Trees - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Decision Trees - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

# Naive Bayes

In [None]:
# Naive Bayes - Original
clf = GaussianNB()

# 10-fold cross-validated accuracy.
scores = cross_val_score(clf, data_X, data_Y, cv = 10)
print("Accuracy:", scores.mean() * 100)

# Out-of-fold predictions for the confusion matrix and the per-class report.
pred_Y = cross_val_predict(clf, data_X, data_Y, cv = 10)
print(metrics.confusion_matrix(data_Y, pred_Y))

print('\n', metrics.classification_report(data_Y, pred_Y))

# Hold out 20% for the ROC curve. Stratify so the test set keeps the class
# proportions: an unstratified split of skewed or small data can leave it with a
# single class, which makes roc_auc_score raise.
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size = 0.20, stratify = data_Y)

clf.fit(train_X, train_Y)

# Column 1 holds the probability of the positive class (labels assumed to be 0/1).
preds = clf.predict_proba(test_X)

fpr, tpr, thresholds = metrics.roc_curve(test_Y, preds[:, 1], pos_label = 1)

print(metrics.roc_auc_score(test_Y, preds[:, 1]))

# Do not change this code! This plots the ROC curve.
# Just replace the fpr and tpr above with the values from your roc_curve
plt.plot([0,1],[0,1],'k--') #plot the diagonal line
plt.plot(fpr, tpr, label='NB') #plot the ROC curve
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve Naive Bayes')
plt.show()

In [None]:
# Naive Bayes - Downsample
clf = GaussianNB()

# 10-fold cross-validated accuracy.
scores = cross_val_score(clf, data_X1, data_Y1, cv = 10)
print("Accuracy:", scores.mean() * 100)

# Out-of-fold predictions for the confusion matrix and the per-class report.
pred_Y = cross_val_predict(clf, data_X1, data_Y1, cv = 10)
print(metrics.confusion_matrix(data_Y1, pred_Y))

print('\n', metrics.classification_report(data_Y1, pred_Y))

# Hold out 20% for the ROC curve. Stratify so the test set keeps the class
# proportions: an unstratified split of this small sample can leave it with a
# single class, which makes roc_auc_score raise.
train_X, test_X, train_Y, test_Y = train_test_split(data_X1, data_Y1, test_size = 0.20, stratify = data_Y1)

clf.fit(train_X, train_Y)

# Column 1 holds the probability of the positive class (labels assumed to be 0/1).
preds = clf.predict_proba(test_X)

fpr, tpr, thresholds = metrics.roc_curve(test_Y, preds[:, 1], pos_label = 1)

print(metrics.roc_auc_score(test_Y, preds[:, 1]))

# Do not change this code! This plots the ROC curve.
# Just replace the fpr and tpr above with the values from your roc_curve
plt.plot([0,1],[0,1],'k--') #plot the diagonal line
plt.plot(fpr, tpr, label='NB') #plot the ROC curve
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve Naive Bayes')
plt.show()

In [None]:
# Naive Bayes - Undersampling
clf = GaussianNB()

# 10-fold cross-validated accuracy.
scores = cross_val_score(clf, under_X, under_Y, cv = 10)
print("Accuracy:", scores.mean() * 100)

# Out-of-fold predictions for the confusion matrix and the per-class report.
pred_Y = cross_val_predict(clf, under_X, under_Y, cv = 10)
print(metrics.confusion_matrix(under_Y, pred_Y))

print('\n', metrics.classification_report(under_Y, pred_Y))

# Hold out 20% for the ROC curve. Stratify so the test set keeps the class
# proportions of the undersampled data instead of leaving them to chance.
train_X, test_X, train_Y, test_Y = train_test_split(under_X, under_Y, test_size = 0.20, stratify = under_Y)

clf.fit(train_X, train_Y)

# Column 1 holds the probability of the positive class (labels assumed to be 0/1).
preds = clf.predict_proba(test_X)

fpr, tpr, thresholds = metrics.roc_curve(test_Y, preds[:, 1], pos_label = 1)

print(metrics.roc_auc_score(test_Y, preds[:, 1]))

# Do not change this code! This plots the ROC curve.
# Just replace the fpr and tpr above with the values from your roc_curve
plt.plot([0,1],[0,1],'k--') #plot the diagonal line
plt.plot(fpr, tpr, label='NB') #plot the ROC curve
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve Naive Bayes')
plt.show()

In [None]:
# Naive Bayes - SMOTE
clf = GaussianNB()

# 10-fold cross-validated accuracy.
scores = cross_val_score(clf, smote_X, smote_Y, cv = 10)
print("Accuracy:", scores.mean() * 100)

# Out-of-fold predictions for the confusion matrix and the per-class report.
pred_Y = cross_val_predict(clf, smote_X, smote_Y, cv = 10)
print(metrics.confusion_matrix(smote_Y, pred_Y))

print('\n', metrics.classification_report(smote_Y, pred_Y))

# Hold out 20% for the ROC curve. Stratify so the test set keeps the class
# proportions of the oversampled data instead of leaving them to chance.
train_X, test_X, train_Y, test_Y = train_test_split(smote_X, smote_Y, test_size = 0.20, stratify = smote_Y)

clf.fit(train_X, train_Y)

# Column 1 holds the probability of the positive class (labels assumed to be 0/1).
preds = clf.predict_proba(test_X)

fpr, tpr, thresholds = metrics.roc_curve(test_Y, preds[:, 1], pos_label = 1)

print(metrics.roc_auc_score(test_Y, preds[:, 1]))

# Do not change this code! This plots the ROC curve.
# Just replace the fpr and tpr above with the values from your roc_curve
plt.plot([0,1],[0,1],'k--') #plot the diagonal line
plt.plot(fpr, tpr, label='NB') #plot the ROC curve
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve Naive Bayes')
plt.show()

# K Nearest Neighbors

In [None]:
# K Nearest Neighbors
# Pipeline: rescale every feature with MinMaxScaler, keep the principal components
# that explain 95% of the variance, then classify with k-nearest neighbours.
scaler = MinMaxScaler()
pca = PCA(0.95)
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('knn', knn)])

# Grid: k = 1..49 (49 combinations).
param_grid = {
    'knn__n_neighbors': list(range(1, 50))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# K Nearest Neighbors - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# K Nearest Neighbors - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# K Nearest Neighbors - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# K Nearest Neighbors - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# K Nearest Neighbors
# Same pipeline as the fixed-variance variant, but the number of principal
# components is tuned as part of the grid instead of being set to 95% variance.
scaler = MinMaxScaler()
pca = PCA()
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('knn', knn)])

# Grid: 62 * 49 = 3038 combinations.
# 1..62 components presumably spans the number of feature columns - confirm.
param_grid = {
    'pca__n_components': list(range(1, 63)),
    'knn__n_neighbors': list(range(1, 50))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# K Nearest Neighbors - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# K Nearest Neighbors - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# K Nearest Neighbors - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# K Nearest Neighbors - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

# SVM

In [None]:
# SVM
# Pipeline: standardise the features, then fit a support vector classifier
# (no PCA step in this section).
scaler = StandardScaler()
clf = SVC()
pipe = Pipeline(steps = [('scaler', scaler), ('clf', clf)])

# Grid: 4 kernels * 19 values of C (10, 20, ..., 190) = 76 combinations.
param_grid = {
    'clf__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'clf__C': list(range(10, 200, 10))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# SVM - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# SVM - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# SVM - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# SVM - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

# Ensembles

In [None]:
# Ensembles - Random Forests
# Pipeline: standardise the features, keep the principal components that explain
# 95% of the variance, then fit a random forest.
scaler = StandardScaler()
pca = PCA(0.95)
clf = RandomForestClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

# Exhaustive grid: 2 * 55 * 2 * 11 = 2420 combinations.
param_grid = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': list(range(5, 60)),
    'clf__max_features': ['sqrt', 'log2'],
    'clf__min_samples_leaf': list(range(5, 60, 5))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# Ensembles - Random Forests - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - Random Forests - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - Random Forests - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - Random Forests - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - Random Forests
# Same pipeline as the fixed-variance variant, but the number of principal
# components is tuned as part of the grid instead of being set to 95% variance.
scaler = StandardScaler()
pca = PCA()
clf = RandomForestClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

# Exhaustive grid: 62 * 2 * 55 * 2 * 11 = 150040 combinations - very slow.
# 1..62 components presumably spans the number of feature columns - confirm.
param_grid = {
    'pca__n_components': list(range(1, 63)),
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': list(range(5, 60)),
    'clf__max_features': ['sqrt', 'log2'],
    'clf__min_samples_leaf': list(range(5, 60, 5))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# Ensembles - Random Forests - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - Random Forests - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - Random Forests - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - Random Forests - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost
# Pipeline: standardise the features, keep the principal components that explain
# 95% of the variance, then fit an AdaBoost classifier.
scaler = StandardScaler()
pca = PCA(0.95)
clf = AdaBoostClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

# Grid: 18 ensemble sizes (50, 75, ..., 475).
param_grid = {
    'clf__n_estimators': list(range(50, 500, 25))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# Ensembles - AdaBoost - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost
# Same pipeline as the fixed-variance variant, but the number of principal
# components is tuned as part of the grid instead of being set to 95% variance.
scaler = StandardScaler()
pca = PCA()
clf = AdaBoostClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('pca', pca), ('clf', clf)])

# Grid: 62 * 18 = 1116 combinations.
# 1..62 components presumably spans the number of feature columns - confirm.
param_grid = {
    'pca__n_components': list(range(1, 63)),
    'clf__n_estimators': list(range(50, 500, 25))
}

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError; fold scores are now always averaged as with iid = False.
# Use cv = 3 instead of 5 for a quicker, coarser search.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

In [None]:
# Ensembles - AdaBoost - Original
# Run the grid search on the original, un-resampled data and report the best setting.
grid_search.fit(X = data_X, y = data_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost - Downsample
# Run the grid search on the stratified downsample and report the best setting.
grid_search.fit(X = data_X1, y = data_Y1)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, data_X1, data_Y1, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost - Undersampling
# Run the grid search on the randomly undersampled data and report the best setting.
grid_search.fit(X = under_X, y = under_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, under_X, under_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

In [None]:
# Ensembles - AdaBoost - SMOTE
# Run the grid search on the SMOTE-oversampled data and report the best setting.
grid_search.fit(X = smote_X, y = smote_Y)
print(grid_search.best_params_)
print("Accuracy:", 100 * grid_search.best_score_)

# Nested cross-validation (slow), kept for reference:
#nested_score = cross_val_score(grid_search, smote_X, smote_Y, cv = 5)
#print("Accuracy:", nested_score.mean() * 100)

# Unsupervised Learning

In [None]:
# Unsupervised Learning (Optional to compare)
# Clustering with DBSCAN, Clustering with K-means
# Hierarchical Clustering, Anomaly Detection