In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from statsmodels.api import datasets
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr

%matplotlib inline

  from numpy.core.umath_tests import inner1d
  from pandas.core import datetools


In [2]:
Features = pd.read_csv('train.csv')
Labels = pd.read_csv('classification_labels.csv')
new_test = pd.read_csv('test.csv')
Labels = Labels.reshape(-1,)

In [3]:
print(Features.shape)
print(Labels.shape)

(16404, 26)
(16404,)


In [5]:
nr.seed(234)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(432)
outside = ms.KFold(n_splits=10, shuffle = True)

In [6]:
param_grid = {"max_features": [2, 3, 5, 10, 15], "min_samples_leaf":[3, 5, 10, 20]}
rf_clf = RandomForestClassifier(class_weight='balanced')
nr.seed(5678)
rf_clf = ms.GridSearchCV(estimator=rf_clf, param_grid=param_grid, scoring= 'roc_auc',
                         cv = inside, return_train_score = True)
rf_clf.fit(Features, Labels)
print(rf_clf.best_estimator_.max_features)
print(rf_clf.best_estimator_.min_samples_leaf)

10
20


In [7]:
nr.seed(354)
cv_estimate = ms.cross_val_score(rf_clf, Features, Labels, cv = outside )
print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean performance metric = 0.859
SDT of the metric       = 0.008
Outcomes by cv fold
Fold  1    0.853
Fold  2    0.863
Fold  3    0.851
Fold  4    0.852
Fold  5    0.873
Fold  6    0.861
Fold  7    0.867
Fold  8    0.857
Fold  9    0.865
Fold 10    0.846


In [3]:
nr.seed(1115)
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 5000)
X_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
X_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

In [4]:
#Delete
nr.seed(1115)
rf_mod = RandomForestClassifier(class_weight = "balanced", 
                                max_features = 10, 
                                min_samples_leaf = 20) 
rf_mod.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=10,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [33]:
X_train.shape, X_test.shape

((11519, 26), (5000, 26))

In [9]:
nr.seed(1115)
rf_mod = RandomForestClassifier(class_weight = "balanced", 
                                max_features = rf_clf.best_estimator_.max_features, 
                                min_samples_leaf = rf_clf.best_estimator_.min_samples_leaf) 
rf_mod.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=10,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [10]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[1,1] + '             %5d' % conf[1,0])
    print('Actual negative    %6d' % conf[0,1] + '             %5d' % conf[0,0])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = rf_mod.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5)     

                 Confusion matrix
                 Score positive    Score negative
Actual positive      1309               407
Actual negative       730              2554

Accuracy        0.77
AUC             0.86
Macro precision 0.75
Macro recall    0.77
 
           Positive      Negative
Num case     3284          1716
Precision    0.86          0.64
Recall       0.78          0.76
F1           0.82          0.70


In [14]:
new_test = np.array(new_test)

In [40]:
new_score.shape

(500,)

In [15]:
new_test_probs = rf_mod.predict_proba(new_test)

In [16]:
new_score = score_model(new_test_probs, 0.5)

In [37]:
pd.DataFrame(new_score).to_csv('testPredictions.csv', index = False)

In [17]:
for i in new_score:
    print(i)

0
1
0
0
0
0
1
1
1
0
1
0
0
0
0
0
0
0
1
1
0
1
1
1
0
0
0
1
1
0
1
1
0
0
1
1
1
0
0
0
1
1
0
0
0
0
0
1
1
1
0
1
1
0
0
1
0
0
0
0
0
0
0
1
0
1
1
1
0
1
1
0
0
0
0
1
1
1
0
0
1
1
0
1
0
0
1
1
1
0
0
1
0
1
0
1
0
0
0
0
1
0
0
1
1
0
0
0
0
0
1
0
1
0
0
1
0
0
0
1
1
1
0
0
1
0
0
1
1
0
0
1
1
0
0
1
1
0
1
0
0
0
1
0
1
0
0
0
0
0
1
0
0
0
0
0
1
0
1
1
0
0
1
1
1
0
1
1
0
0
1
1
0
0
1
0
1
0
0
1
0
0
1
0
0
0
1
1
0
1
0
0
0
0
0
1
1
0
0
1
0
0
0
0
1
0
1
1
0
1
1
0
1
0
1
1
0
0
1
1
0
1
1
1
0
0
0
1
1
0
1
0
0
0
1
0
1
0
0
0
1
0
1
0
1
0
0
1
1
0
1
0
1
0
1
1
0
1
1
0
0
0
1
0
1
1
0
1
1
0
1
1
0
0
1
0
1
0
1
0
0
0
0
0
1
0
0
0
0
1
1
0
0
1
0
1
1
0
0
0
0
0
1
1
0
0
1
1
1
1
0
1
1
0
0
0
0
0
1
0
0
0
0
0
1
0
0
1
0
0
1
0
1
0
1
1
1
0
1
0
0
1
1
1
0
0
1
0
1
0
0
1
0
1
0
1
1
1
0
0
1
1
0
1
1
0
1
0
0
0
1
0
0
0
0
0
1
0
0
1
1
1
0
0
0
0
1
1
0
0
0
0
1
0
0
1
1
0
0
0
0
1
1
1
1
0
1
0
1
0
0
0
0
0
1
0
1
1
0
0
0
0
0
1
0
0
0
0
0
1
0
1
0
1
1
0
1
1
0
0
0
1
0
0
0
1
0
1
0
1
0
1
0
0
0
0
1
0
0
0
0
0
1
0
1
1
0
1
1
0
0
0
1
0
0
0
0
1
1
1
1
1
1
0
0
0
1
0
0
1
1
1
1
0
0
1
1
1
0
0
