In [1]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
import os
import datetime
# Run timestamp (YYYYMMDDHHMMSS), appended to the Darwin model name further down.
ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

from IPython.display import Image
from time import sleep
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from pandas import DataFrame, Series
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from amb_sdk.sdk import DarwinSdk
from sklearn.metrics import r2_score

%matplotlib inline

# Feature Engineering

In [2]:
# Feature engineering: load the raw export and turn it into a numeric frame.
# Missing values are filled or dropped for every column handled below.
#
# Every fill below uses plain assignment (`data[col] = data[col].fillna(...)`)
# rather than `data[col].fillna(..., inplace=True)`: the in-place form acts on
# a column selection and does not update `data` when pandas Copy-on-Write is on.
data = pd.read_csv("../Data/speed-dating_raw.csv")

# Categorical columns that are kept and one-hot encoded further down.
categorical_cols = ['gender', 'race', 'race_o', 'field']

# Deleting Bins: every other object-typed column is dropped.
bin_cols = [col for col in data.columns
            if col not in categorical_cols and data[col].dtype.name == 'object']
data = data.drop(columns=bin_cols)

# Deleting useless columns
data = data.drop(columns=['has_null',
                          'wave',
                          'd_age',
                          'samerace',
                          'expected_happy_with_sd_people',
                          'expected_num_interested_in_me',
                          'expected_num_matches',
                          'like',
                          'guess_prob_liked',
                          'decision',
                          'decision_o'])

# Replace age NA with the (rounded) column mean
for col in ['age', 'age_o']:
    data[col] = data[col].fillna(round(data[col].mean()))

# Make sure difference in age is correct (recomputed from the filled ages)
data['age_d'] = (data['age'] - data['age_o'])
data['age_d_abs'] = data['age_d'].abs()

# Replace race NA with other
for col in ['race', 'race_o']:
    data[col] = data[col].fillna('other')

# Verifying that same_race is correct with replaced race
data['same_race'] = (data['race'] == data['race_o'])

# Replace NA with 0 for preferences
preferences = ['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
               'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']
data[preferences] = data[preferences].fillna(0)

# Renaming column names. The keys must match the raw CSV header exactly, so the
# misspelt ones ('intellicence_important', 'ambtition_important', 'sinsere_o',
# 'ambitous_o') are intentional and must not be "corrected" here.
data = data.rename(columns = {'importance_same_race':'same_race_i',
                              'importance_same_religion': 'same_religion_i',
                              'pref_o_attractive':'attractive_o_i',
                              'pref_o_sincere':'sincere_o_i',
                              'pref_o_intelligence':'intelligence_o_i',
                              'pref_o_funny':'funny_o_i',
                              'pref_o_ambitious':'ambitious_o_i',
                              'pref_o_shared_interests':'shared_interests_o_i',
                              'attractive_important':'attractive_i',
                              'sincere_important': 'sincere_i',
                              'intellicence_important': 'intelligence_i',
                              'funny_important':'funny_i',
                              'ambtition_important':'ambitious_i',
                              'shared_interests_important':'shared_interests_i',
                              'ambition':'ambitious',
                              'sinsere_o': 'sincere_o',
                              'ambitous_o':'ambitious_o',
                              'ambition_partner':'ambitious_partner'})

# Making sure that importance columns add up to 1 within each group of six:
# each weight is divided by the row total of its group. A row whose total is 0
# (all six weights 0) or NaN (any weight missing) yields NaN for all six
# columns; those are replaced by the uniform weight 1/6 right after.
partner_importance = ['attractive_o_i', 'sincere_o_i', 'intelligence_o_i',
                      'funny_o_i', 'ambitious_o_i', 'shared_interests_o_i']
own_importance = ['attractive_i', 'sincere_i', 'intelligence_i',
                  'funny_i', 'ambitious_i', 'shared_interests_i']
for group in (partner_importance, own_importance):
    # Explicit left-to-right addition so that a single NaN propagates to the total.
    total = data[group[0]]
    for col in group[1:]:
        total = total + data[col]
    data[group] = data[group].div(total, axis = 0)

# Filling in data that are empty
importance_cols = partner_importance + own_importance
data[importance_cols] = data[importance_cols].fillna(1.0 / 6.0)

# Fill NA's with the rounded column mean (importance of same race / religion,
# the rating columns and `met`)
rounded_mean_cols = ['same_race_i', 'same_religion_i',
                     'attractive_o', 'sincere_o', 'intelligence_o',
                     'funny_o', 'ambitious_o', 'shared_interests_o',
                     'attractive', 'sincere', 'intelligence',
                     'funny', 'ambitious',
                     'attractive_partner', 'sincere_partner', 'intelligence_partner',
                     'funny_partner', 'ambitious_partner', 'shared_interests_partner',
                     'met']
for col in rounded_mean_cols:
    data[col] = data[col].fillna(round(data[col].mean()))

# One Hot Encoding: the four categorical columns are replaced by
# `<column>_<value>` indicator columns appended at the end of the frame.
data = pd.get_dummies(data, columns = categorical_cols)

# Label Encoding (boolean same_race -> integer codes)
le = preprocessing.LabelEncoder()
data['same_race'] = le.fit_transform(data['same_race'])

# Delete rows with NA's for interests correlate
data = data.dropna(axis = 0, subset = ['interests_correlate'])

print(data.shape)
data.head()

(8220, 287)


Unnamed: 0,age,age_o,same_race_i,same_religion_i,attractive_o_i,sincere_o_i,intelligence_o_i,funny_o_i,ambitious_o_i,shared_interests_o_i,...,field_tc [health ed],field_teaching of english,field_tesol,field_theater,field_theatre management & producing,field_theory,field_undergrad - gs,field_urban planning,field_working,field_writing: literary nonfiction
0,21.0,27.0,2.0,4.0,0.35,0.2,0.2,0.2,0.0,0.05,...,0,0,0,0,0,0,0,0,0,0
1,21.0,22.0,2.0,4.0,0.6,0.0,0.0,0.4,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,21.0,22.0,2.0,4.0,0.19,0.18,0.19,0.18,0.14,0.12,...,0,0,0,0,0,0,0,0,0,0
3,21.0,23.0,2.0,4.0,0.3,0.05,0.15,0.4,0.05,0.05,...,0,0,0,0,0,0,0,0,0,0
4,21.0,24.0,2.0,4.0,0.3,0.1,0.2,0.1,0.1,0.2,...,0,0,0,0,0,0,0,0,0,0


# SMOTE

In [3]:
# Balance the classes by oversampling the minority `match` class with SMOTE,
# then persist the balanced frame for the Darwin section.
#
# NOTE(review): every model below is cross-validated on this already
# oversampled frame, so synthetic samples end up in the validation folds and
# the reported accuracies are likely optimistic. Oversampling inside each
# training fold (e.g. an imblearn Pipeline) would avoid that.
data_Y = data['match']
data_X = data.drop(columns = ['match'])

# `fit_resample` replaces the removed `fit_sample`; the seed makes the
# synthetic samples reproducible across runs.
sm = SMOTE(sampling_strategy = 'minority', random_state = 42)
smote_X, smote_Y = sm.fit_resample(data_X, data_Y)

print(smote_X.shape)
print(smote_Y.shape)

# The sampler may return arrays or pandas objects depending on the installed
# version; rebuild both with a fresh positional index so the concat aligns.
smote_X = DataFrame(smote_X, columns = data_X.columns).reset_index(drop = True)
smote_Y = Series(smote_Y, name = 'match').reset_index(drop = True)
data = pd.concat([smote_X, smote_Y], axis = 1)

data.to_csv('Speed_Dating_Clean.csv', index = False)

(13740, 286)
(13740,)


# Features and Label

In [4]:
# Split the balanced frame into the label (`match`) and the feature matrix.
data_Y = data['match']
data_X = data.drop(columns = ['match'])
print(data_X.shape)
data_X.head()

(13740, 286)


Unnamed: 0,age,age_o,same_race_i,same_religion_i,attractive_o_i,sincere_o_i,intelligence_o_i,funny_o_i,ambitious_o_i,shared_interests_o_i,...,field_tc [health ed],field_teaching of english,field_tesol,field_theater,field_theatre management & producing,field_theory,field_undergrad - gs,field_urban planning,field_working,field_writing: literary nonfiction
0,21.0,27.0,2.0,4.0,0.35,0.2,0.2,0.2,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21.0,22.0,2.0,4.0,0.6,0.0,0.0,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,21.0,22.0,2.0,4.0,0.19,0.18,0.19,0.18,0.14,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21.0,23.0,2.0,4.0,0.3,0.05,0.15,0.4,0.05,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,21.0,24.0,2.0,4.0,0.3,0.1,0.2,0.1,0.1,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Preprocessing

In [None]:
# Standardise every feature, then project onto the principal components that
# together explain 95% of the variance.
# NOTE(review): scaler and PCA are fitted on the full data set before any
# cross-validation split — confirm this is intended.
scaled_X = StandardScaler().fit_transform(data_X)

pca = PCA(n_components = 0.95)
pca_X = pca.fit_transform(scaled_X)

In [None]:
# Shape of the PCA projection and a preview of its first five rows.
print(pca_X.shape)
pd.DataFrame(pca_X).head()

(13740, 234)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,224,225,226,227,228,229,230,231,232,233
0,0.08014,0.448233,0.494691,-3.7614,-2.814237,0.804044,0.465357,-0.215916,-0.399509,0.365404,...,0.156988,-1.178416,0.005001,-0.448598,-0.430335,-0.65265,0.485292,-0.127141,0.810934,0.453341
1,-0.746024,-0.216072,-0.312979,-4.650605,-3.186895,0.909309,1.370074,1.059597,-1.042069,1.121485,...,-0.845577,-0.939525,-1.791157,1.782628,-0.37227,0.224847,1.525,1.317544,1.983727,0.684671
2,0.442522,-2.703964,-0.922161,-2.934378,-4.239209,-0.386319,-0.190687,-0.422419,1.533997,-0.491614,...,0.661426,0.11733,-0.837959,1.133739,0.845606,0.875591,-0.427985,0.18736,0.547344,0.827998
3,-0.197814,-0.749901,-0.313529,-3.631189,-3.203978,0.489273,0.799443,0.430955,-0.177497,0.925532,...,-0.024812,-1.713644,-1.630116,0.870336,-0.725677,-1.004685,1.896075,-0.385753,0.326999,0.996398
4,0.740412,0.456389,0.778752,-2.480459,-3.354329,0.196792,-0.227965,-0.11645,0.130112,-0.818731,...,-1.362877,-1.045173,-0.64473,1.051525,-1.471961,0.192777,0.323514,-0.197241,0.40691,1.140018


# Decision Tree

In [None]:
# Decision tree tuned by exhaustive grid search with 10-fold cross-validation
# on the PCA-projected features.
# `max_features` makes the tree draw random feature subsets at each split, so
# the estimator is seeded to make the search reproducible.
clf = tree.DecisionTreeClassifier(random_state = 42)

# Desired Parameters
params = {"criterion" : ['gini', 'entropy'],
          "max_depth": [5, 10, 15, 20],
          "max_features": ['sqrt', 'log2'],
          "min_samples_leaf": [5, 10, 15, 20]
         }

# Test Parameters
grid_search = GridSearchCV(clf, params, cv = 10, scoring = 'accuracy')
grid_search.fit(pca_X, data_Y)

# Print Results: best parameter combination and its mean cross-validated accuracy (%)
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

In [None]:
# Per-class precision/recall of the search result on the same data that was
# passed to `grid_search.fit` (i.e. this is not a held-out evaluation).
train_pred = grid_search.predict(pca_X)
print(classification_report(data_Y, train_pred))

# Random Forest

In [None]:
# Random forest tuned by exhaustive grid search with 10-fold cross-validation
# on the PCA-projected features.
# The forest is seeded so that repeated runs select the same parameters and
# report the same score.
clf = RandomForestClassifier(random_state = 42)

# Desired Parameters
params = {"criterion" : ['gini', 'entropy'],
          "max_depth": [5, 10, 15, 20],
          "max_features": ['sqrt', 'log2'],
          "min_samples_leaf": [5, 10, 15, 20]
         }

# Test Parameters
grid_search = GridSearchCV(clf, params, cv = 10, scoring = 'accuracy')
grid_search.fit(pca_X, data_Y)

# Print Results: best parameter combination and its mean cross-validated accuracy (%)
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

In [None]:
# Per-class precision/recall of the search result on the same data that was
# passed to `grid_search.fit` (i.e. this is not a held-out evaluation).
train_pred = grid_search.predict(pca_X)
print(classification_report(data_Y, train_pred))

# SVM

In [None]:
# Support vector classifier evaluated with cross-validation.
# Scaling lives inside the pipeline, so it is re-fitted on each training fold.
scaler = StandardScaler()

# SVC with its default settings (no kernel specified) rather than a linear
# kernel, as mentioned on the piazza post.
svc_clf = SVC()

pipeline_steps = [('sca', scaler), ('svc', svc_clf)]
pipe = Pipeline(steps = pipeline_steps)

# Cross-validate the whole pipeline (library-default number of folds)
scores = cross_val_score(pipe, data_X, data_Y)

# Printing the average accuracy in percent
mean_accuracy = scores.mean() * 100
print('Average Accuracy:', mean_accuracy)

# Naive Bayes

In [None]:
# Gaussian Naive Bayes: one 80/20 hold-out evaluation plus 10-fold cross-validation.
# The split is seeded so the hold-out numbers are reproducible.
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y,
                                                    test_size = 0.20,
                                                    random_state = 42)

# Create NB clf and fit it on the training part only
clf_nb = GaussianNB()
clf_nb.fit(train_X, train_Y)
pred_Y = clf_nb.predict(test_X)

# Accuracy of this single model on the held-out 20%
print("Accuracy is ", accuracy_score(test_Y, pred_Y))

# Cross validation on the full data set
scores = cross_val_score(clf_nb, data_X, data_Y, cv = 10)
print("Accuracy with 10-fold cross validation:", scores.mean() * 100)

# Hold-out diagnostics reuse `pred_Y` instead of predicting the test set again.
print('The confusion matrix is:\n', confusion_matrix(test_Y, pred_Y))
print(classification_report(test_Y, pred_Y))

# Neural Networks

In [None]:
# Multi-layer perceptron: grid search over hidden-layer width and activation,
# followed by a nested cross-validation estimate of the tuned model.
scaler = StandardScaler()
# Seeded so that weight initialisation (and therefore the scores) is reproducible.
clf = MLPClassifier(random_state = 42)
pipe = Pipeline(steps = [('scaler', scaler), ('clf', clf)])

param_grid = {
    # One hidden layer with 10, 20, ..., 100 units
    'clf__hidden_layer_sizes': [(width,) for width in range(10, 101, 10)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu']
}

# The former `iid = False` argument is gone: current scikit-learn rejects it
# with a TypeError, and the score is the plain mean over folds.
grid_search = GridSearchCV(pipe, param_grid, cv = 5)

# Inner search on the full data: best parameters and their mean CV accuracy (%)
grid_search.fit(data_X, data_Y)
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

# Nested CV: the whole grid search is re-run inside each of the 5 outer folds,
# so this is by far the most expensive statement of the cell.
nested_score = cross_val_score(grid_search, data_X, data_Y, cv = 5)
print("Accuracy:", nested_score.mean() * 100)

# Darwin

In [None]:
# Connect to the Darwin API and log in.
# Credentials are read from the environment instead of being stored in the
# notebook. The password that used to be hard-coded here has been exposed and
# should be rotated.
ds = DarwinSdk()
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')

darwin_user = os.environ.get('DARWIN_USER')
darwin_password = os.environ.get('DARWIN_PASSWORD')
if not darwin_user or not darwin_password:
    raise RuntimeError('Set the DARWIN_USER and DARWIN_PASSWORD environment '
                       'variables before running the Darwin section.')

status, msg = ds.auth_login_user(darwin_user, darwin_password)
if not status:
    # Surface the SDK's error instead of failing later in an unrelated cell.
    print(msg)

In [None]:
# Name under which the balanced CSV (written by the SMOTE cell) is referenced
# in the Darwin calls below.
dataset_name = 'Speed_Dating_Clean.csv'

In [None]:
# Show the result of the dataset lookup as the cell output
# (presumably the datasets already stored for this account — TODO confirm).
ds.lookup_dataset()
# Uncomment to delete a previous upload of the same file before re-uploading:
# ds.delete_dataset("Speed_Dating_Clean.csv")

In [None]:
# Upload dataset to Darwin.
# Uses `dataset_name` (not a repeated literal) so the file uploaded here is
# always the one the cleaning / training / scoring cells refer to.
status, dataset = ds.upload_dataset(dataset_name)

if not status:
    # Same error reporting as the job cells below: print what the SDK returned.
    print(dataset)

In [None]:
# Start Darwin's data-cleaning job for the uploaded dataset, with `match` as
# the target, and block until the job has finished.
target = "match"
status, job_id = ds.clean_data(dataset_name, target = target)

if not status:
    # On failure the second return value is printed as-is.
    print(job_id)
else:
    ds.wait_for_job(job_id['job_name'])

In [None]:
# Train a Darwin model on the cleaned dataset. The run timestamp is appended
# to the model name, so every notebook run creates a new model.
model = f"{target}_model{ts}"
status, job_id = ds.create_model(dataset_names = dataset_name,
                                 model_name = model,
                                 max_train_time = '00:05')

if not status:
    # On failure the second return value is printed as-is.
    print(job_id)
else:
    ds.wait_for_job(job_id['job_name'])

# Best model decided by Darwin

In [None]:
# Ask Darwin to analyse the trained model and download the resulting artifact
# as `feature_importance`.
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
    # Only download when the analysis job was actually started; previously the
    # download was attempted even after a failure had been printed.
    status, feature_importance = ds.download_artifact(artifact['artifact_name'])
else:
    print(artifact)

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact
# (presumably already sorted by importance — TODO confirm against the SDK).
feature_importance[:20].plot.bar()

In [None]:
# Score the uploaded dataset with the trained model and compare the
# predictions with the true `match` labels.
status, artifact = ds.run_model(dataset_name, model)
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
# "\n" (not "/n") so that a real line break precedes the preview.
print("\n", prediction.head())

actual = data[target]
predicted = prediction[target]

# Actual-vs-predicted scatter. The identity line spans the observed label
# range instead of a hard-coded 0..2.3e7, which dwarfed the data.
plt.plot(actual, predicted, '.')
low, high = actual.min(), actual.max()
plt.plot([low, high], [low, high], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

# Encode each distinct predicted label as an integer code (its position in
# `unq`) for both series, so they can be drawn on a common y axis.
unq = predicted.unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i, q in enumerate(unq):
    p += i * (predicted == q).values
    a += i * (actual == q).values

# Plot predictions vs actual in a separate figure so the two series are not
# drawn on top of the scatter plot.
plt.figure()
plt.plot(a)
plt.plot(p)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(unq))), list(unq));
print("\n", classification_report(actual, predicted))

In [None]:
# Look the model up by name and print the 'best_genome' entry of its description.
status, model_type = ds.lookup_model_name(model)
best_genome = model_type['description']['best_genome']
print(best_genome)

# DeepNeuralNetwork

In [None]:
# Ask Darwin to analyse the 'DeepNeuralNetwork' model type and download the
# resulting artifact as `feature_importance`.
status, artifact = ds.analyze_model(model, model_type = 'DeepNeuralNetwork')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
    # Only download when the analysis job was actually started; previously the
    # download was attempted even after a failure had been printed.
    status, feature_importance = ds.download_artifact(artifact['artifact_name'])
else:
    print(artifact)

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact
# (presumably already sorted by importance — TODO confirm against the SDK).
feature_importance[:20].plot.bar()

In [None]:
# Score the uploaded dataset with the 'DeepNeuralNetwork' model type and
# compare the predictions with the true `match` labels.
status, artifact = ds.run_model(dataset_name, model, model_type = 'DeepNeuralNetwork')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
# "\n" (not "/n") so that a real line break precedes the preview.
print("\n", prediction.head())

actual = data[target]
predicted = prediction[target]

# Actual-vs-predicted scatter. The identity line spans the observed label
# range instead of a hard-coded 0..2.3e7, which dwarfed the data.
plt.plot(actual, predicted, '.')
low, high = actual.min(), actual.max()
plt.plot([low, high], [low, high], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

# Encode each distinct predicted label as an integer code (its position in
# `unq`) for both series, so they can be drawn on a common y axis.
unq = predicted.unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i, q in enumerate(unq):
    p += i * (predicted == q).values
    a += i * (actual == q).values

# Plot predictions vs actual in a separate figure so the two series are not
# drawn on top of the scatter plot.
plt.figure()
plt.plot(a)
plt.plot(p)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(unq))), list(unq));
print("\n", classification_report(actual, predicted))

# RandomForest

In [None]:
# Ask Darwin to analyse the 'RandomForest' model type and download the
# resulting artifact as `feature_importance`.
status, artifact = ds.analyze_model(model, model_type = 'RandomForest')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
    # Only download when the analysis job was actually started; previously the
    # download was attempted even after a failure had been printed.
    status, feature_importance = ds.download_artifact(artifact['artifact_name'])
else:
    print(artifact)

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact
# (presumably already sorted by importance — TODO confirm against the SDK).
feature_importance[:20].plot.bar()

In [None]:
# Score the uploaded dataset with the 'RandomForest' model type and compare
# the predictions with the true `match` labels.
status, artifact = ds.run_model(dataset_name, model, model_type = 'RandomForest')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
# "\n" (not "/n") so that a real line break precedes the preview.
print("\n", prediction.head())

actual = data[target]
predicted = prediction[target]

# Actual-vs-predicted scatter. The identity line spans the observed label
# range instead of a hard-coded 0..2.3e7, which dwarfed the data.
plt.plot(actual, predicted, '.')
low, high = actual.min(), actual.max()
plt.plot([low, high], [low, high], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

# Encode each distinct predicted label as an integer code (its position in
# `unq`) for both series, so they can be drawn on a common y axis.
unq = predicted.unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i, q in enumerate(unq):
    p += i * (predicted == q).values
    a += i * (actual == q).values

# Plot predictions vs actual in a separate figure so the two series are not
# drawn on top of the scatter plot.
plt.figure()
plt.plot(a)
plt.plot(p)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(unq))), list(unq));
print("\n", classification_report(actual, predicted))

# GradientBoosted

In [None]:
# Ask Darwin to analyse the 'GradientBoosted' model type and download the
# resulting artifact as `feature_importance`.
status, artifact = ds.analyze_model(model, model_type = 'GradientBoosted')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
    # Only download when the analysis job was actually started; previously the
    # download was attempted even after a failure had been printed.
    status, feature_importance = ds.download_artifact(artifact['artifact_name'])
else:
    print(artifact)

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact
# (presumably already sorted by importance — TODO confirm against the SDK).
feature_importance[:20].plot.bar()

In [None]:
# Score the uploaded dataset with the 'GradientBoosted' model type and compare
# the predictions with the true `match` labels.
status, artifact = ds.run_model(dataset_name, model, model_type = 'GradientBoosted')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
# "\n" (not "/n") so that a real line break precedes the preview.
print("\n", prediction.head())

actual = data[target]
predicted = prediction[target]

# Actual-vs-predicted scatter. The identity line spans the observed label
# range instead of a hard-coded 0..2.3e7, which dwarfed the data.
plt.plot(actual, predicted, '.')
low, high = actual.min(), actual.max()
plt.plot([low, high], [low, high], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

# Encode each distinct predicted label as an integer code (its position in
# `unq`) for both series, so they can be drawn on a common y axis.
unq = predicted.unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i, q in enumerate(unq):
    p += i * (predicted == q).values
    a += i * (actual == q).values

# Plot predictions vs actual in a separate figure so the two series are not
# drawn on top of the scatter plot.
plt.figure()
plt.plot(a)
plt.plot(p)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(unq))), list(unq));
print("\n", classification_report(actual, predicted))