In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [7]:
humane = pd.read_csv("C:/Users/zhang/Desktop/NYU/2019 Fall/Humane/processed_data.csv", index_col = 0)

In [8]:
# x_train, x_test, y_train, y_test = train_test_split(humane.drop('label', axis = 1), humane['label'], test_size = 0.2)

In [20]:
# train = pd.merge(x_train, y_train, left_index = True, right_index = True)
# test = pd.merge(x_test, y_test, left_index = True, right_index =  True)

In [21]:
# train.to_csv('train.csv')
# test.to_csv('test.csv')

In [2]:
train = pd.read_csv('train.csv', index_col = 0)
test = pd.read_csv('test.csv', index_col = 0)

In [41]:
x_train = train.drop('label', axis = 1)
y_train = train['label']
x_test = test.drop('label', axis = 1)
y_test = test['label']

## Random Forest

In [32]:
parameters_rf = {'n_estimators' : [50, 100, 200, 500], 'max_depth' : [10, 50, 100, 200], 'min_samples_leaf' : [10, 20, 50, 100]}
rf = RandomForestClassifier(criterion = 'entropy')
clf_rf = GridSearchCV(rf, parameters_rf, cv = 5, scoring = 'roc_auc')
clf_rf.fit(x_train.drop('id', axis = 1), y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='entropy',
                                              max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                

In [33]:
clf_rf.best_estimator_ 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
print('Validation auc is ' + str(clf_rf.best_score_ ))

Validation auc is 0.7778191882173653


In [43]:
test_auc_rf = roc_auc_score(y_test, clf_rf.predict_proba(x_test.drop('id', axis = 1))[:, 1])
print('Test auc is ' + str(test_auc_rf))

Test auc is 0.7761794399123041


## Elastic Net

In [43]:
scaler = StandardScaler()
scaler.fit(x_train.drop('id', axis = 1))
scaled_x_train = pd.DataFrame(scaler.transform(x_train.drop('id', axis = 1)), columns = x_train.columns[1:])
scaled_x_test = pd.DataFrame(scaler.transform(x_test.drop('id', axis = 1)), columns = x_test.columns[1:])

In [45]:
parameters_lr = {'C' : [0.01, 0.1, 1, 10, 100], 'l1_ratio' : [0.2, 0.5, 0.8]}
# parameters_en = {'C' : [0.01], 'l1_ratio' : [0.2]}
lr = LogisticRegression(penalty = 'elasticnet', max_iter = 5000, solver = 'saga')
clf_lr = GridSearchCV(lr, parameters_lr, cv = 5, scoring = 'roc_auc')
clf_lr.fit(scaled_x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=5000, multi_class='warn',
                                          n_jobs=None, penalty='elasticnet',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'l1_ratio': [0.2, 0.5, 0.8]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [46]:
clf_lr.best_estimator_ 

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.8, max_iter=5000,
                   multi_class='warn', n_jobs=None, penalty='elasticnet',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
print('Validation auc is ' + str(clf_lr.best_score_ ))

Validation auc is 0.764521551818262


In [81]:
test_auc_lr = roc_auc_score(y_test, clf_lr.predict_proba(scaled_x_test)[:, 1])
print('Test auc is ' + str(test_auc_lr))

Test auc is 0.7666441542232386


## Gradient Boosting

In [48]:
parameters_gb = {'n_estimators' : [100, 500], 'max_depth' : [10, 100]}
gb = GradientBoostingClassifier()
clf_gb = GridSearchCV(gb, parameters_gb, cv = 5, scoring = 'roc_auc')
clf_gb.fit(x_train.drop('id', axis = 1), y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  presort=

In [49]:
clf_gb.best_estimator_ 

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=10,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [50]:
print('Validation auc is ' + str(clf_gb.best_score_ ))

Validation auc is 0.7598858765541996


In [51]:
test_auc_gb = roc_auc_score(y_test, clf_gb.predict_proba(x_test.drop('id', axis = 1))[:, 1])
print('Test auc is ' + str(test_auc_gb))

Test auc is 0.7563087872430401


## MLP

In [11]:
parameters_mlp = {'alpha' : [0.01, 0.1, 1, 10]}
mlp = MLPClassifier(hidden_layer_sizes = (100, 30), batch_size = 32)
clf_mlp = GridSearchCV(mlp, parameters_mlp, cv = 5, scoring = 'roc_auc')
clf_mlp.fit(x_train.drop('id', axis = 1), y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size=32, beta_1=0.9, beta_2=0.999,
                                     early_stopping=False, epsilon=1e-08,
                                     hidden_layer_sizes=(100, 30),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='warn', n_jobs=None, param_grid={'alpha': [0.01, 0.1, 1, 10]},
             pre_dispatch

In [12]:
clf_mlp.best_estimator_ 

MLPClassifier(activation='relu', alpha=0.1, batch_size=32, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100, 30), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [13]:
print('Validation auc is ' + str(clf_mlp.best_score_ ))

Validation auc is 0.697464295527056


In [14]:
test_auc_mlp = roc_auc_score(y_test, clf_mlp.predict_proba(x_test.drop('id', axis = 1))[:, 1])
print('Test auc is ' + str(test_auc_mlp))

Test auc is 0.7413635869096642
