In [None]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
import os
import datetime
# Timestamp (YYYYmmddHHMMSS) taken when this cell runs; it is appended to the Darwin model name further down.
ts = '{:%Y%m%d%H%M%S}'.format(datetime.datetime.now())

from IPython.display import Image
from time import sleep
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from pandas import DataFrame, Series
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from amb_sdk.sdk import DarwinSdk
from sklearn.metrics import r2_score

%matplotlib inline

# Cleaning Data

In [None]:
# Load the raw export and clean it into a fully numeric frame.
# Every fill below uses `data[col] = data[col].fillna(...)` instead of the chained
# `data[col].fillna(..., inplace=True)`: the chained in-place form operates on a
# temporary under pandas Copy-on-Write and would silently leave the NaNs in place.
data = pd.read_csv("speed-dating_raw.csv")

# Categorical (object-dtype) columns that are kept and one-hot encoded below
x = ['gender', 'race', 'race_o', 'field']
columns = list(data)

# Deleting bins: every other object-dtype column is dropped
for column in columns:
    if column not in x and data[str(column)].dtype.name == 'object':
        del data[str(column)]

# Deleting useless columns
for column in ['has_null', 'wave', 'd_age', 'samerace',
               'expected_happy_with_sd_people', 'expected_num_interested_in_me',
               'expected_num_matches', 'like', 'guess_prob_liked',
               'decision', 'decision_o']:
    del data[column]

# Replace age NA with the column mean rounded to a whole number
for column in ['age', 'age_o']:
    mean = round(data[column].mean())
    data[column] = data[column].fillna(mean)

# Make sure difference in age is correct (recomputed from the filled ages)
data['age_d'] = data['age'] - data['age_o']
data['age_d_abs'] = data['age_d'].abs()

# Replace race NA with other
for column in ['race', 'race_o']:
    data[column] = data[column].fillna('other')

# Recompute same_race from the filled race columns
data['same_race'] = (data['race'] == data['race_o'])

# Replace NA with 0 for preferences
preferences = ['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
               'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']
for pref in preferences:
    data[pref] = data[pref].fillna(0)

# Renaming column names (also fixes the misspelled source headers)
data = data.rename(columns = {'importance_same_race': 'same_race_i',
                              'importance_same_religion': 'same_religion_i',
                              'pref_o_attractive': 'attractive_o_i',
                              'pref_o_sincere': 'sincere_o_i',
                              'pref_o_intelligence': 'intelligence_o_i',
                              'pref_o_funny': 'funny_o_i',
                              'pref_o_ambitious': 'ambitious_o_i',
                              'pref_o_shared_interests': 'shared_interests_o_i',
                              'attractive_important': 'attractive_i',
                              'sincere_important': 'sincere_i',
                              'intellicence_important': 'intelligence_i',
                              'funny_important': 'funny_i',
                              'ambtition_important': 'ambitious_i',
                              'shared_interests_important': 'shared_interests_i',
                              'ambition': 'ambitious',
                              'sinsere_o': 'sincere_o',
                              'ambitous_o': 'ambitious_o',
                              'ambition_partner': 'ambitious_partner'})

# Rescale each group of six importance weights so a row's weights sum to 1.
# The total is added left to right (exactly like a chained `+`) so a NaN in any
# weight makes the whole row NaN; rows whose total is 0 also become NaN (0 / 0).
# Both cases are filled with 1/6 below.
opposite_importance = ['attractive_o_i', 'sincere_o_i', 'intelligence_o_i',
                       'funny_o_i', 'ambitious_o_i', 'shared_interests_o_i']
my_importance = ['attractive_i', 'sincere_i', 'intelligence_i',
                 'funny_i', 'ambitious_i', 'shared_interests_i']
for group in (opposite_importance, my_importance):
    total = sum((data[column] for column in group[1:]), data[group[0]])
    for column in group:
        data[column] = data[column] / total

# Filling in data that are empty with an equal share of the six weights
temp = opposite_importance + my_importance
for t in temp:
    data[t] = data[t].fillna(1.0 / 6.0)

# Replacing same_race_i & same_religion_i with rounded mean (importance)
for column in ['same_race_i', 'same_religion_i']:
    mean = data[column].mean()
    data[column] = data[column].fillna(round(mean))

# One Hot Encoding of categorical data (dummies are appended in the order of `x`)
for column in x:
    data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis = 1)
for column in x:
    del data[column]

# Label Encoding of the boolean same_race flag
le = LabelEncoder()
data['same_race'] = le.fit_transform(data['same_race'])

# Fill NA's in the rating columns (and `met`) with the rounded column mean
for column in ['attractive_o', 'sincere_o', 'intelligence_o', 'funny_o',
               'ambitious_o', 'shared_interests_o',
               'attractive', 'sincere', 'intelligence', 'funny', 'ambitious',
               'attractive_partner', 'sincere_partner', 'intelligence_partner',
               'funny_partner', 'ambitious_partner', 'shared_interests_partner',
               'met']:
    mean = data[column].mean()
    data[column] = data[column].fillna(round(mean))

# Delete rows with NA's for interests correlate
data = data.dropna(axis = 0, subset = ['interests_correlate'])

print(data.shape)
data.head()

# Feature Engineering

In [None]:
# Derive difference / normalised features from the cleaned columns, then write the
# un-resampled dataset and its train/test split to CSV.
# Columns are created in exactly the same order as before, so the CSV layout is unchanged.
traits = ['attractive', 'sincere', 'intelligence', 'funny', 'ambitious', 'shared_interests']
activities = ['sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking',
              'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
              'music', 'shopping', 'yoga']


def _row_total(frame, names):
    """Add the named columns left to right (same NaN propagation as a chained `+`)."""
    total = frame[names[0]]
    for name in names[1:]:
        total = total + frame[name]
    return total


# Difference between opposite's and my importance
for trait in traits:
    data[trait + '_i_d'] = data[trait + '_i'] - data[trait + '_o_i']

# Absolute difference of importance
for trait in traits:
    data[trait + '_i_d_abs'] = data[trait + '_i_d'].abs()

# Difference between opposite's and my rating of me.
# shared_interests is computed from `shared_interests_partner`, as in the original
# formula (no plain `shared_interests` column is referenced anywhere in this notebook).
for trait in traits:
    own = 'shared_interests_partner' if trait == 'shared_interests' else trait
    data[trait + '_d'] = data[own] - data[trait + '_o']

# Absolute difference of rating
for trait in traits:
    data[trait + '_d_abs'] = data[trait + '_d'].abs()

# Changing from [1-10] scale to a share of the row total for opposite's rating
o_total = _row_total(data, [trait + '_o' for trait in traits])
for trait in traits:
    data[trait + '_o_n'] = data[trait + '_o'] / o_total

# Changing from [1-10] scale to a share of the row total for my rating of opposite
p_total = _row_total(data, [trait + '_partner' for trait in traits])
for trait in traits:
    data[trait + '_p_n'] = data[trait + '_partner'] / p_total

# Filling in blanks with 0. Assignment form is used because the chained
# `data[col].fillna(0, inplace=True)` is a silent no-op under pandas Copy-on-Write.
preferences = [trait + '_o_n' for trait in traits] + [trait + '_p_n' for trait in traits]
for pref in preferences:
    data[pref] = data[pref].fillna(0)

# Difference of rating as a share of the row's total absolute difference.
# NOTE(review): a row whose six absolute differences are all 0 yields NaN here and is not filled.
d_total = _row_total(data, [trait + '_d_abs' for trait in traits])
for trait in traits:
    data[trait + '_d_n'] = data[trait + '_d'] / d_total

# Absolute difference of rating percentage
for trait in traits:
    data[trait + '_d_n_abs'] = data[trait + '_d_n'].abs()

# Difference between opposite's importance and their rating of me
for trait in traits:
    data[trait + '_oi_o_d_n'] = data[trait + '_o_i'] - data[trait + '_o_n']

# Absolute difference of opposite's importance and their rating of me
for trait in traits:
    data[trait + '_oi_o_d_n_abs'] = data[trait + '_oi_o_d_n'].abs()

# Difference between my importance and my rating of opposite
for trait in traits:
    data[trait + '_i_p_d_n'] = data[trait + '_i'] - data[trait + '_p_n']

# Absolute difference of my importance and my rating of opposite
for trait in traits:
    data[trait + '_i_p_d_n_abs'] = data[trait + '_i_p_d_n'].abs()

# Changing from [1-10] scale to a share of the row total for activities
a_total = _row_total(data, activities)
for activity in activities:
    data[activity + '_n'] = data[activity] / a_total

data.to_csv('Speed_Dating_Clean_noSMOTE.csv', index = False)

# Create train and test data for Darwin (no SMOTE)
data_Y = data['match']
data_X = data.drop(['match'], axis = 1)

train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size = 0.20, random_state = 11)
train_data = pd.concat([train_X, train_Y], axis = 1)
train_data.to_csv('Speed_Dating_Clean_noSMOTE_train.csv', index = False)
test_data = pd.concat([test_X, test_Y], axis = 1)
test_data.to_csv('Speed_Dating_Clean_noSMOTE_test.csv', index = False)

# SMOTE

In [None]:
# Oversample the minority class with SMOTE, then write the balanced dataset and its
# train/test split to CSV.
# NOTE: this cell overwrites `data` with the resampled frame, so it is not idempotent —
# re-run the cleaning and feature-engineering cells before running it again.
# NOTE(review): SMOTE is applied before the train/test split, so synthetic training rows
# can be interpolated from rows that end up in the test file — confirm this is intended.
data_Y = data['match']
data_X = data.drop(['match'], axis = 1)

sm = SMOTE(sampling_strategy = 'minority', random_state = 11)
# `fit_resample` replaces `fit_sample`, which is no longer available in current
# imbalanced-learn releases.
smote_X, smote_Y = sm.fit_resample(data_X, data_Y)

print(smote_X.shape)
print(smote_Y.shape)

# Rebuild labelled pandas objects (older imbalanced-learn versions return plain arrays)
smote_X = DataFrame(smote_X, columns = data_X.columns)
smote_Y = Series(smote_Y, name = 'match')
data = pd.concat([smote_X, smote_Y], axis = 1)

data.to_csv('Speed_Dating_Clean_SMOTE.csv', index = False)

# Create train and test data for Darwin (SMOTE)
data_Y = data['match']
data_X = data.drop(['match'], axis = 1)

train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size = 0.20, random_state = 11)
train_data = pd.concat([train_X, train_Y], axis = 1)
train_data.to_csv('Speed_Dating_Clean_SMOTE_train.csv', index = False)
test_data = pd.concat([test_X, test_Y], axis = 1)
test_data.to_csv('Speed_Dating_Clean_SMOTE_test.csv', index = False)

# Features and Label

In [None]:
# Separate the label column from the feature matrix used by the scikit-learn models below.
label_column = 'match'
data_Y = data[label_column]
data_X = data.drop([label_column], axis = 1)
print(data_X.shape)
data_X.head()

# Preprocessing

In [None]:
# Standardise every feature, then project onto principal components.
# A fractional n_components asks PCA to keep enough components for that share of variance.
standardizer = StandardScaler()
scaled_X = standardizer.fit_transform(data_X)

pca = PCA(n_components = 0.95, random_state = 11)
pca_X = pca.fit_transform(scaled_X)

In [None]:
# Show how many components PCA kept and preview the first five projected rows.
print(pca_X.shape)
pd.DataFrame(pca_X).head()

# Decision Tree

In [None]:
%%time

clf = DecisionTreeClassifier(random_state = 11)

# Set parameters
params = {"criterion" : ['gini', 'entropy'],
          "max_depth": [5, 10, 15, 20],
          "max_features": ['sqrt', 'log2'],
          "min_samples_leaf": [5, 10, 15, 20]
         }

# Find best parameters
grid_search = GridSearchCV(clf, params, cv = 10)
grid_search.fit(pca_X, data_Y)

# Print results
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

# Print classification report
print("\n", classification_report(data_Y, grid_search.predict(pca_X)))

# Random Forest

In [None]:
%%time

clf = RandomForestClassifier(random_state = 11)

# Set parameters
params = {"criterion" : ['gini', 'entropy'],
          "max_depth": [5, 10, 15, 20],
          "max_features": ['sqrt', 'log2'],
          "min_samples_leaf": [5, 10, 15, 20]
         }

# Find best parameters
grid_search = GridSearchCV(clf, params, cv = 5)
grid_search.fit(pca_X, data_Y)

# Print results
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

# Print classification report
print("\n", classification_report(data_Y, grid_search.predict(pca_X)))

# AdaBoost

In [None]:
%%time

clf = AdaBoostClassifier(random_state = 11)

# Set parameters
params = {"n_estimators": list(range(50, 150, 10))}

# Find best parameters
grid_search = GridSearchCV(clf, params, cv = 5)
grid_search.fit(pca_X, data_Y)

# Print results
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

# Print classification report
print("\n", classification_report(data_Y, grid_search.predict(pca_X)))

# GradientBoosting

In [None]:
%%time

clf = GradientBoostingClassifier(random_state = 11)

# Set parameters
params = {"n_estimators": list(range(100, 200, 10))}

# Find best parameters
grid_search = GridSearchCV(clf, params, cv = 5)
grid_search.fit(pca_X, data_Y)

# Print results
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

# Print classification report
print("\n", classification_report(data_Y, grid_search.predict(pca_X)))

# KNN

In [None]:
%%time

knn_scaled_X = MinMaxScaler().fit_transform(data_X)

pca = PCA(0.95, random_state = 11)
knn_X = pca.fit_transform(knn_scaled_X)

knn = KNeighborsClassifier()

# Set parameters
params = {'n_neighbors': list(range(1, 50, 2))}

# Find best parameters
grid_search = GridSearchCV(knn, params, cv = 5)
grid_search.fit(knn_X, data_Y)

# Print results
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

# Print classification report
print("\n", classification_report(data_Y, grid_search.predict(knn_X)))

# SVM

In [None]:
%%time

scaler = StandardScaler()

# Creating svc. Here we use the default instead of linear as mentioned on the piazza post. 
svc_clf = SVC(random_state = 11)

# Creating the pipeline
pipe = Pipeline(steps = [('sca', scaler ), ('svc', svc_clf)])

# Pass the pipeline in to a cross_val_score 
scores = cross_val_score(pipe, data_X, data_Y)

# Printing the average accuracy
print('Average Accuracy:', scores.mean() * 100)

# Print classification report
print("\n", classification_report(data_Y, grid_search.predict(data_X)))

# Naive Bayes

In [None]:
%%time

# Create NB clf and fit it
clf_rf = GaussianNB()

# Cross validation
scores = cross_val_score(clf_rf, data_X, data_Y, cv = 10)                                         
print("Accuracy with 10-fold cross validation:", scores.mean() * 100)

# Print confusion matrix and classification report
pred_Y = cross_val_predict(clf_rf, data_X, data_Y, cv = 10)
print('The confusion matrix is:\n', confusion_matrix(data_Y, pred_Y))
print('\n', classification_report(data_Y, pred_Y))

# Neural Networks

In [None]:
%%time

scaler = StandardScaler()
clf = MLPClassifier(random_state = 11)

# Create pipeline
pipe = Pipeline(steps = [('scaler', scaler), ('clf', clf)])

# Set parameters
param_grid = {
    'clf__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,), (70,), (80,), (90,), (100,)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu']
}

grid_search = GridSearchCV(pipe, param_grid, iid = False, cv = 5)

# Fit data and print results
grid_search.fit(data_X, data_Y)
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

# Print classification report
print("\n", classification_report(data_Y, grid_search.predict(data_X)))

# Darwin (SMOTE)

In [None]:
# Open a Darwin SDK session for the SMOTE experiments.
ds = DarwinSdk()
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')

# Credentials must never be committed with the notebook: read them from the environment.
# Set DARWIN_USERNAME and DARWIN_PASSWORD before starting the kernel.
darwin_username = os.environ.get('DARWIN_USERNAME')
darwin_password = os.environ.get('DARWIN_PASSWORD')
if not darwin_username or not darwin_password:
    raise RuntimeError('Set the DARWIN_USERNAME and DARWIN_PASSWORD environment variables before logging in.')

status, msg = ds.auth_login_user(darwin_username, darwin_password)

if not status:
    print(msg)

In [None]:
# File names of the SMOTE train/test CSVs written earlier; passed to the Darwin calls below.
dataset_train = 'Speed_Dating_Clean_SMOTE_train.csv'
dataset_test = 'Speed_Dating_Clean_SMOTE_test.csv'

**Upload datasets**

In [None]:
# Upload train dataset to Darwin
# A local copy is also loaded; later cells compare Darwin's predictions against it.
data_SMOTE_train = pd.read_csv("Speed_Dating_Clean_SMOTE_train.csv")
status, dataset = ds.upload_dataset("Speed_Dating_Clean_SMOTE_train.csv")

# On a falsy status the second return value is printed (presumably an error message — TODO confirm)
if not status:
    print(dataset)

In [None]:
# Upload test dataset to Darwin
# A local copy is also loaded; later cells compare Darwin's predictions against it.
data_SMOTE_test = pd.read_csv("Speed_Dating_Clean_SMOTE_test.csv")
status, dataset = ds.upload_dataset("Speed_Dating_Clean_SMOTE_test.csv")

# On a falsy status the second return value is printed (presumably an error message — TODO confirm)
if not status:
    print(dataset)

In [None]:
# Display the result of the SDK's dataset lookup (presumably the datasets known to this account — TODO confirm).
ds.lookup_dataset()

**Clean datasets**

In [None]:
# Clean train dataset on Darwin
# `target` names the label column and is reused by the model-creation and plotting cells.
target = "match"
status, job_id = ds.clean_data(dataset_train, target = target)

# On success, hand the returned job name to wait_for_job; otherwise print the returned value.
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

In [None]:
# Clean test dataset on Darwin
# `target` names the label column and is reused by the model-creation and plotting cells.
target = "match"
status, job_id = ds.clean_data(dataset_test, target = target)

# On success, hand the returned job name to wait_for_job; otherwise print the returned value.
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

**Create model**

In [None]:
# Create Darwin model
# The model name is "<target>_model<timestamp>", using the `ts` captured in the first cell.
model = target + "_model" + ts
status, job_id = ds.create_model(dataset_names = dataset_train,
                                 model_name = model,
                                 max_train_time = '00:05')

# On success, hand the returned job name to wait_for_job; otherwise print the returned value.
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

In [None]:
# Extra training for Darwin model
# Resumes training of the same `model` on the training dataset with the same time budget string.
status, job_id = ds.resume_training_model(dataset_names = dataset_train, 
                                         model_name = model,
                                         max_train_time = '00:05')

# On success, hand the returned job name to wait_for_job; otherwise print the returned value.
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

# Best model decided by Darwin

**Analyze model**

In [None]:
# Analyze Darwin model
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
# NOTE(review): this download also runs when `status` was falsy, in which case `artifact`
# is the value just printed — confirm it still carries an 'artifact_name' key.
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact.
feature_importance[:20].plot.bar()

**Prediction on training dataset**

In [None]:
# Run the model on the training dataset and download the resulting predictions.
# NOTE(review): `status` is not checked before `artifact` is indexed.
status, artifact = ds.run_model(dataset_train, model)
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to its position in `labels` so actual and predicted share one axis.
labels = prediction[target].unique()[::-1]
row_count = len(prediction)
predicted_pos = np.zeros(row_count)
actual_pos = np.zeros(row_count)
for position, label in enumerate(labels):
    predicted_pos += position * (prediction[target] == label).values
    actual_pos += position * (data_SMOTE_train[target] == label).values

# Plot predictions vs actual, with the original labels as y ticks
plt.plot(actual_pos)
plt.plot(predicted_pos)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(labels))), list(labels));
print("\n", classification_report(data_SMOTE_train[target], prediction[target]))

In [None]:
# Scatter of actual vs. predicted target with a y = x reference line, plus R^2.
actual = data_SMOTE_train[target]
predicted = prediction[target]
plt.plot(actual, predicted, '.')

# Draw the diagonal over the observed value range. The previous hard-coded [0, 2.3e7]
# end points stretched the axes far beyond the values of a class-label target.
lo = min(actual.min(), predicted.min())
hi = max(actual.max(), predicted.max())
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

**Prediction on testing data**

In [None]:
# Run the model on the test dataset and download the resulting predictions.
# NOTE(review): `status` is not checked before `artifact` is indexed.
status, artifact = ds.run_model(dataset_test, model)
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to its position in `labels` so actual and predicted share one axis.
labels = prediction[target].unique()[::-1]
row_count = len(prediction)
predicted_pos = np.zeros(row_count)
actual_pos = np.zeros(row_count)
for position, label in enumerate(labels):
    predicted_pos += position * (prediction[target] == label).values
    actual_pos += position * (data_SMOTE_test[target] == label).values

# Plot predictions vs actual, with the original labels as y ticks
plt.plot(actual_pos)
plt.plot(predicted_pos)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(labels))), list(labels));
print("\n", classification_report(data_SMOTE_test[target], prediction[target]))

In [None]:
# Scatter of actual vs. predicted target with a y = x reference line, plus R^2.
actual = data_SMOTE_test[target]
predicted = prediction[target]
plt.plot(actual, predicted, '.')

# Draw the diagonal over the observed value range. The previous hard-coded [0, 2.3e7]
# end points stretched the axes far beyond the values of a class-label target.
lo = min(actual.min(), predicted.min())
hi = max(actual.max(), predicted.max())
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

**Model Darwin used**

In [None]:
# Look the model up by name and print what the SDK returns for it.
# NOTE(review): the second return value is stored as `model_type`; its structure is not visible here.
status, model_type = ds.lookup_model_name(model)
print(model_type)

# DeepNeuralNetwork

**Analyze DeepNeuralNetwork model**

In [None]:
# Analyze Darwin DeepNeuralNetwork model
status, artifact = ds.analyze_model(model, model_type = 'DeepNeuralNetwork')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact.
feature_importance[:20].plot.bar()

**Prediction on training dataset**

In [None]:
# Run the DeepNeuralNetwork variant on the training dataset and download its predictions.
# `dataset_train` replaces the undefined name `dataset_name` (NameError on a fresh kernel);
# the next cell compares these predictions against data_SMOTE_train.
# NOTE(review): `status` is not checked before `artifact` is indexed.
status, artifact = ds.run_model(dataset_train, model, model_type = 'DeepNeuralNetwork')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to its position in `labels` so actual and predicted share one axis.
labels = prediction[target].unique()[::-1]
row_count = len(prediction)
predicted_pos = np.zeros(row_count)
actual_pos = np.zeros(row_count)
for position, label in enumerate(labels):
    predicted_pos += position * (prediction[target] == label).values
    actual_pos += position * (data_SMOTE_train[target] == label).values

# Plot predictions vs actual, with the original labels as y ticks
plt.plot(actual_pos)
plt.plot(predicted_pos)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(labels))), list(labels));
print("\n", classification_report(data_SMOTE_train[target], prediction[target]))

In [None]:
# Scatter of actual vs. predicted target with a y = x reference line, plus R^2.
actual = data_SMOTE_train[target]
predicted = prediction[target]
plt.plot(actual, predicted, '.')

# Draw the diagonal over the observed value range. The previous hard-coded [0, 2.3e7]
# end points stretched the axes far beyond the values of a class-label target.
lo = min(actual.min(), predicted.min())
hi = max(actual.max(), predicted.max())
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

**Prediction on testing data**

In [None]:
# Run the DeepNeuralNetwork variant on the test dataset and download its predictions.
# NOTE(review): `status` is not checked before `artifact` is indexed.
status, artifact = ds.run_model(dataset_test, model, model_type = 'DeepNeuralNetwork')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to its position in `labels` so actual and predicted share one axis.
labels = prediction[target].unique()[::-1]
row_count = len(prediction)
predicted_pos = np.zeros(row_count)
actual_pos = np.zeros(row_count)
for position, label in enumerate(labels):
    predicted_pos += position * (prediction[target] == label).values
    actual_pos += position * (data_SMOTE_test[target] == label).values

# Plot predictions vs actual, with the original labels as y ticks
plt.plot(actual_pos)
plt.plot(predicted_pos)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(labels))), list(labels));
print("\n", classification_report(data_SMOTE_test[target], prediction[target]))

In [None]:
# Scatter of actual vs. predicted target with a y = x reference line, plus R^2.
actual = data_SMOTE_test[target]
predicted = prediction[target]
plt.plot(actual, predicted, '.')

# Draw the diagonal over the observed value range. The previous hard-coded [0, 2.3e7]
# end points stretched the axes far beyond the values of a class-label target.
lo = min(actual.min(), predicted.min())
hi = max(actual.max(), predicted.max())
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

# RandomForest

**Analyze RandomForest model**

In [None]:
# Analyze Darwin RandomForest model
status, artifact = ds.analyze_model(model, model_type = 'RandomForest')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact.
feature_importance[:20].plot.bar()

**Prediction on training dataset**

In [None]:
# Run the RandomForest variant on the training dataset and download its predictions.
# `dataset_train` replaces the undefined name `dataset_name` (NameError on a fresh kernel);
# the next cell compares these predictions against data_SMOTE_train.
# NOTE(review): `status` is not checked before `artifact` is indexed.
status, artifact = ds.run_model(dataset_train, model, model_type = 'RandomForest')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to its position in `labels` so actual and predicted share one axis.
labels = prediction[target].unique()[::-1]
row_count = len(prediction)
predicted_pos = np.zeros(row_count)
actual_pos = np.zeros(row_count)
for position, label in enumerate(labels):
    predicted_pos += position * (prediction[target] == label).values
    actual_pos += position * (data_SMOTE_train[target] == label).values

# Plot predictions vs actual, with the original labels as y ticks
plt.plot(actual_pos)
plt.plot(predicted_pos)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(labels))), list(labels));
print("\n", classification_report(data_SMOTE_train[target], prediction[target]))

In [None]:
# Scatter of actual vs. predicted target with a y = x reference line, plus R^2.
actual = data_SMOTE_train[target]
predicted = prediction[target]
plt.plot(actual, predicted, '.')

# Draw the diagonal over the observed value range. The previous hard-coded [0, 2.3e7]
# end points stretched the axes far beyond the values of a class-label target.
lo = min(actual.min(), predicted.min())
hi = max(actual.max(), predicted.max())
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

**Prediction on testing data**

In [None]:
# Run the RandomForest variant on the test dataset and download its predictions.
# NOTE(review): `status` is not checked before `artifact` is indexed.
status, artifact = ds.run_model(dataset_test, model, model_type = 'RandomForest')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to its position in `labels` so actual and predicted share one axis.
labels = prediction[target].unique()[::-1]
row_count = len(prediction)
predicted_pos = np.zeros(row_count)
actual_pos = np.zeros(row_count)
for position, label in enumerate(labels):
    predicted_pos += position * (prediction[target] == label).values
    actual_pos += position * (data_SMOTE_test[target] == label).values

# Plot predictions vs actual, with the original labels as y ticks
plt.plot(actual_pos)
plt.plot(predicted_pos)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(labels))), list(labels));
print("\n", classification_report(data_SMOTE_test[target], prediction[target]))

In [None]:
# Scatter of actual vs. predicted target with a y = x reference line, plus R^2.
actual = data_SMOTE_test[target]
predicted = prediction[target]
plt.plot(actual, predicted, '.')

# Draw the diagonal over the observed value range. The previous hard-coded [0, 2.3e7]
# end points stretched the axes far beyond the values of a class-label target.
lo = min(actual.min(), predicted.min())
hi = max(actual.max(), predicted.max())
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual, predicted))

# GradientBoosted

**Analyze GradientBoosted model**

In [None]:
# Analyze Darwin GradientBoosted model
status, artifact = ds.analyze_model(model, model_type = 'GradientBoosted')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature-importance artifact.
feature_importance[:20].plot.bar()

**Prediction on training dataset**

In [None]:
# Run the GradientBoosted variant on the training dataset and download its predictions.
# `dataset_train` replaces the undefined name `dataset_name` (NameError on a fresh kernel);
# the next cell compares these predictions against data_SMOTE_train.
# NOTE(review): `status` is not checked before `artifact` is indexed.
status, artifact = ds.run_model(dataset_train, model, model_type = 'GradientBoosted')
sleep(1)
ds.wait_for_job(artifact['job_name'])

status, prediction = ds.download_artifact(artifact['artifact_name'])
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to its position in `labels` so actual and predicted share one axis.
labels = prediction[target].unique()[::-1]
row_count = len(prediction)
predicted_pos = np.zeros(row_count)
actual_pos = np.zeros(row_count)
for position, label in enumerate(labels):
    predicted_pos += position * (prediction[target] == label).values
    actual_pos += position * (data_SMOTE_train[target] == label).values

# Plot predictions vs actual, with the original labels as y ticks
plt.plot(actual_pos)
plt.plot(predicted_pos)
plt.legend(['Actual', 'Predicted'])
plt.yticks(list(range(len(labels))), list(labels));
print("\n", classification_report(data_SMOTE_train[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_SMOTE_train[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

**Prediction on testing data**

In [None]:
# Score the test dataset with the Darwin GradientBoosted model.
status, artifact = ds.run_model(dataset_test, model, model_type = 'GradientBoosted')
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_SMOTE_test[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_SMOTE_test[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_SMOTE_test[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

# Darwin (noSMOTE) : imbalanced data

In [None]:
# Open a Darwin SDK session against the demo API endpoint.
# Credentials come from the DARWIN_USERNAME / DARWIN_PASSWORD environment
# variables so that no secret is stored in the notebook; a missing variable
# raises KeyError before any request is made.
# NOTE(security): a plaintext password used to be embedded in this cell; treat
# it as compromised and rotate it.
ds = DarwinSdk()
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')
status, msg = ds.auth_login_user(os.environ['DARWIN_USERNAME'],
                                 os.environ['DARWIN_PASSWORD'])

# On a failed login `msg` carries the reason reported by the SDK.
if not status:
    print(msg)

In [None]:
# Names of the no-SMOTE train/test CSVs; later cells pass these to the Darwin
# SDK calls (clean_data, create_model, run_model).
dataset_train, dataset_test = (
    'Speed_Dating_Clean_noSMOTE_train.csv',
    'Speed_Dating_Clean_noSMOTE_test.csv',
)

**Upload datasets**

In [None]:
# Upload train dataset to Darwin
data_noSMOTE_train = pd.read_csv("Speed_Dating_Clean_noSMOTE_train.csv")
status, dataset = ds.upload_dataset("Speed_Dating_Clean_noSMOTE_train.csv")

if not status:
    print(dataset)

In [None]:
# Upload test dataset to Darwin
data_noSMOTE_test = pd.read_csv("Speed_Dating_Clean_noSMOTE_test.csv")
status, dataset = ds.upload_dataset("Speed_Dating_Clean_noSMOTE_test.csv")

if not status:
    print(dataset)

In [None]:
# Display whatever the SDK returns for a dataset lookup (the cell's last
# expression is rendered as its output) — presumably the datasets available to
# this account, which should now include the two uploads; confirm against the SDK.
ds.lookup_dataset()

**Clean datasets**

In [None]:
# Clean train dataset on Darwin
target = "match"
status, job_id = ds.clean_data(dataset_train, target = target)

if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

In [None]:
# Clean test dataset on Darwin
target = "match"
status, job_id = ds.clean_data(dataset_test, target = target)

if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

**Create model**

In [None]:
# Create Darwin model
model = target + "_model" + ts
status, job_id = ds.create_model(dataset_names = dataset_train,
                                 model_name = model,
                                 max_train_time = '00:05')

if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

In [None]:
# Extra training for Darwin model
status, job_id = ds.resume_training_model(dataset_names = dataset_train, 
                                         model_name = model,
                                         max_train_time = '00:05')

if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

# Best model decided by Darwin

**Analyze model**

In [None]:
# Analyze Darwin model
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature importances.
feature_importance.head(20).plot(kind='bar')

**Prediction on training dataset**

In [None]:
# Score the no-SMOTE training dataset with the Darwin model (no explicit
# model_type is requested).
status, artifact = ds.run_model(dataset_train, model)
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_train[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_train[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_train[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

**Prediction on testing data**

In [None]:
# Score the no-SMOTE test dataset with the Darwin model (no explicit
# model_type is requested).
status, artifact = ds.run_model(dataset_test, model)
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_test[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_test[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_test[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

**Model Darwin used**

In [None]:
# Look the model up by name and print the second element of the SDK's reply.
# NOTE(review): `status` is not checked, so on a failed lookup the printed
# value is presumably the error message rather than model details — confirm.
status, model_type = ds.lookup_model_name(model)
print(model_type)

# DeepNeuralNetwork

**Analyze DeepNeuralNetwork model**

In [None]:
# Analyze Darwin DeepNeuralNetwork model
status, artifact = ds.analyze_model(model, model_type = 'DeepNeuralNetwork')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature importances.
feature_importance.head(20).plot(kind='bar')

**Prediction on training dataset**

In [None]:
# Score the no-SMOTE training dataset with the Darwin DeepNeuralNetwork model.
# `dataset_train` is the training dataset named in this section, and the cells
# that follow compare the predictions against data_noSMOTE_train; the
# `dataset_name` variable is never assigned in the no-SMOTE section.
status, artifact = ds.run_model(dataset_train, model, model_type = 'DeepNeuralNetwork')
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_train[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_train[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_train[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

**Prediction on testing data**

In [None]:
# Score the no-SMOTE test dataset with the Darwin DeepNeuralNetwork model.
status, artifact = ds.run_model(dataset_test, model, model_type = 'DeepNeuralNetwork')
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_test[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_test[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_test[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

# RandomForest

**Analyze RandomForest model**

In [None]:
# Analyze Darwin RandomForest model
status, artifact = ds.analyze_model(model, model_type = 'RandomForest')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature importances.
feature_importance.head(20).plot(kind='bar')

**Prediction on training dataset**

In [None]:
# Score the no-SMOTE training dataset with the Darwin RandomForest model.
# `dataset_train` is the training dataset named in this section, and the cells
# that follow compare the predictions against data_noSMOTE_train; the
# `dataset_name` variable is never assigned in the no-SMOTE section.
status, artifact = ds.run_model(dataset_train, model, model_type = 'RandomForest')
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_train[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_train[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_train[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

**Prediction on testing data**

In [None]:
# Score the no-SMOTE test dataset with the Darwin RandomForest model.
status, artifact = ds.run_model(dataset_test, model, model_type = 'RandomForest')
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_test[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_test[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_test[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

# GradientBoosted

**Analyze GradientBoosted model**

In [None]:
# Analyze Darwin GradientBoosted model
status, artifact = ds.analyze_model(model, model_type = 'GradientBoosted')
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])

In [None]:
# Bar chart of the first 20 entries of the downloaded feature importances.
feature_importance.head(20).plot(kind='bar')

**Prediction on training dataset**

In [None]:
# Score the no-SMOTE training dataset with the Darwin GradientBoosted model.
# `dataset_train` is the training dataset named in this section, and the cells
# that follow compare the predictions against data_noSMOTE_train; the
# `dataset_name` variable is never assigned in the no-SMOTE section.
status, artifact = ds.run_model(dataset_train, model, model_type = 'GradientBoosted')
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_train[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_train[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_train[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))

**Prediction on testing data**

In [None]:
# Score the no-SMOTE test dataset with the Darwin GradientBoosted model.
status, artifact = ds.run_model(dataset_test, model, model_type = 'GradientBoosted')
sleep(1)
if not status:
    # Without job metadata there is nothing to wait for or download.
    raise RuntimeError('run_model failed: {}'.format(artifact))
ds.wait_for_job(artifact['job_name'])

# Fetch the predictions produced by the job.
status, prediction = ds.download_artifact(artifact['artifact_name'])
if not status:
    raise RuntimeError('download_artifact failed: {}'.format(prediction))
print("\n", prediction.head())

**Create plots comparing predictions with actual target**

In [None]:
# Map every class label to an integer position so actual and predicted labels
# share one axis. The label set is the union of both series: taking it from the
# predictions alone would draw any class the model never predicted at position 0.
actual_labels = data_noSMOTE_test[target].values
predicted_labels = prediction[target].values
unq = np.unique(np.concatenate([actual_labels, predicted_labels]))
a = np.searchsorted(unq, actual_labels)      # positions of the actual labels
p = np.searchsorted(unq, predicted_labels)   # positions of the predicted labels
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks(range(len(unq)), unq);
print("\n", classification_report(data_noSMOTE_test[target], prediction[target]))

In [None]:
# Scatter actual vs. predicted values with a dashed y = x reference line.
# The line spans the observed value range rather than a fixed 0..2.3e7 extent,
# which has no relation to the values of the target column.
actual_labels = data_noSMOTE_test[target]
predicted_labels = prediction[target]
lo = min(actual_labels.min(), predicted_labels.min())
hi = max(actual_labels.max(), predicted_labels.max())
plt.plot(actual_labels, predicted_labels, '.')
plt.plot([lo, hi], [lo, hi], '--k')
print('\nR^2 : ', r2_score(actual_labels, predicted_labels))