In [1]:
# Importing the library
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import matplotlib.pyplot as plt # plotting library
from IPython.display import display # Manage multiple output per cell

In [2]:
# Manage UTF-8 issue
# Python 2 only: force the interpreter-wide default encoding to UTF-8.
# Python 3 already defaults to UTF-8 and has neither a builtin reload() nor
# sys.setdefaultencoding(), so the hack is skipped there instead of raising.
import sys
if sys.version_info[0] < 3:
    reload(sys)  # re-exposes setdefaultencoding, which site.py removes at startup
    sys.setdefaultencoding('utf8')

In [3]:
# Load the dataset, discard its first column, then separate the
# 'isSelling' target (y) from every remaining column (X).
df = (
    pd.read_csv("../swissmarket-model-vaud-v1-1-out.csv")
    .pipe(lambda frame: frame.drop(frame.columns[0], axis=1))
)
y = df['isSelling']
X = df.drop('isSelling', axis=1)

In [4]:
# Hold out part of the data as a test set; the rest is used for training
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20 % of the rows go to the test set
    random_state=0    # fixed seed so the split is repeatable
)

In [5]:
# Class balance of the target in the training split (count per label)
y_train.value_counts()

False    91716
True      9452
Name: isSelling, dtype: int64

In [6]:
# Class balance of the target in the test split (count per label)
y_test.value_counts()

False    22846
True      2446
Name: isSelling, dtype: int64

In [7]:
# Imputation transformer for completing missing values (NaN) with the column mean.
# The means are learned from the training set ONLY and then re-used for the
# test set: refitting on X_test would fill the two sets with different values
# and leak test-set statistics into the evaluation.
try:
    # scikit-learn >= 0.20 (the legacy Imputer class was removed in 0.22)
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    # Older scikit-learn releases only ship the legacy transformer
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp = imp.fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)

In [8]:
# Feature Scaling
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)  # reuse the scaler fitted on X_train; never refit on the test set

# Random Forest

In [9]:
# Importing Random Forest Classifier
# `classifier` is the untuned base estimator that is handed to the grid search;
# random_state=0 fixes the seed so repeated runs build the same forest.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0)

In [10]:
# Applying Grid Search (4-fold CV on the training set) to find the best
# hyper-parameters for our Random Forest model
from sklearn.model_selection import GridSearchCV
# Public import path; the search itself no longer needs these two names, but
# they stay imported because other cells of the notebook reference them.
from sklearn.metrics import log_loss, make_scorer
# Full grid, kept for reference (expensive to run):
# parameters = [{'n_estimators': [3, 10, 30, 100, 300, 1000, 3000, 10000],
#                'criterion': ['gini', 'entropy'],
#                'max_features': ['sqrt', 'log2', None]
#               },
#              {'n_estimators': [3, 10, 30, 100, 300, 1000, 3000, 10000],
#                'criterion': ['gini', 'entropy'],
#                'max_features': [3, 10, 30, 60]
#               }]
parameters = [{'n_estimators': [100],
               'criterion': ['gini'],
               'max_features': ['sqrt']
              }]
# scoring='neg_log_loss' is used instead of make_scorer(log_loss) because the
# latter (a) defaults to greater_is_better=True, so the search would select the
# combination with the HIGHEST loss, and (b) scores hard predict() labels
# rather than predict_proba() probabilities, which is not a meaningful log loss.
# With 'neg_log_loss' higher (closer to 0) is better, so best_score_ is negative.
clf = GridSearchCV(estimator=classifier,
                   param_grid=parameters,
                   scoring='neg_log_loss',
                   cv=4,
                   n_jobs=-1)
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Extract best score calculated with the GridSearchCV
# (value of the search's `scoring` metric for the winning parameter combination);
# the bare name on the last line displays it as the cell output.
best_score = clf.best_score_
best_score

In [None]:
# Extract best hyper-parameters calculated with the GridSearchCV
# (a dict keyed by parameter name, e.g. 'n_estimators', 'criterion', 'max_features');
# the bare name on the last line displays it as the cell output.
best_params = clf.best_params_
best_params

In [None]:
# Evaluating scoring parameters with 4-fold cross-validation on the training set
from sklearn.model_selection import cross_val_score


def _report_cv_score(label, scoring):
    """Cross-validate `clf` on the training data for one metric and display it.

    Parameters
    ----------
    label : str
        Text shown in front of the result (a ':' is appended).
    scoring : str
        Name of a scikit-learn scorer, e.g. 'accuracy' or 'neg_log_loss'.

    Returns
    -------
    numpy.ndarray
        The per-fold scores (one value per fold, 4 folds).
    """
    # `clf`, `X_train` and `y_train` are read from the notebook namespace at call time
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=4, n_jobs=-1, scoring=scoring)
    display(label + ':', scores.mean(), scores.std())
    return scores


# Result names are kept unchanged so other cells that read them still work.
# Accuracy
accuracy_score = _report_cv_score('accuracy_score', 'accuracy')
# precision
precision_score = _report_cv_score('precision_score', 'precision')
# recall
recall_score = _report_cv_score('recall_score', 'recall')
# f1
f1_score = _report_cv_score('f1_score', 'f1')
# Cross entropy: 'neg_log_loss' is evaluated on predict_proba() output and is
# negated (higher is better). The previous make_scorer(log_loss, ...) scored
# hard 0/1 predictions, which does not measure the quality of the probabilities.
log_loss_score = _report_cv_score('log_loss_score', 'neg_log_loss')

In [None]:
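# Fitting Random Forest to the training set with fine-tuned hyper-parameters:
# nothing to run here - GridSearchCV (refit=True by default) has already refit the
# best model on the full training set, so `clf` can be used for prediction directly.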


In [None]:
# Predicting the Test set result: one predicted class label per row of X_test.
# The bare name on the last line displays the predictions as the cell output.
y_pred = clf.predict(X_test)
y_pred

In [None]:
# Calculate the predicted class probabilities for the test set
# (probabilities instead of hard 1/0 labels). The result is 2-D;
# column 1 is the one fed to the ROC curve further down.
y_probs = clf.predict_proba(X_test)
y_probs

In [None]:
# Confusion matrix as a cross-tabulation: actual labels in the rows,
# predicted labels in the columns, with totals added via margins=True
df_confusion = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True
)
df_confusion

In [None]:
# ROC curve for the test set, built from column 1 of the predicted probabilities
from sklearn.metrics import roc_curve, auc

false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test.ravel(), y_probs[:, 1].ravel())
roc_auc = auc(false_positive_rate, true_positive_rate)

# Same figure as before, drawn through the explicit Axes interface
fig, ax = plt.subplots()
ax.set_title('ROC')
ax.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
ax.set_xlim([-0.1, 1.2])
ax.set_ylim([-0.1, 1.2])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Check Important Feature
# GridSearchCV (refit=True by default) has already refit a forest with the best
# hyper-parameters on the whole training set. It is reused here instead of
# training an identical model again and overwriting `clf`, which would break
# the earlier cells that read clf.best_score_ / clf.best_params_ on a re-run.
feature_names = df.columns.drop('isSelling')
importances = clf.best_estimator_.feature_importances_
# zip() silently truncates to the shorter input; fail loudly if a preprocessing
# step changed the number of columns, because names and importances would
# otherwise be paired incorrectly.
if len(feature_names) != len(importances):
    raise ValueError('Expected %d feature importances but the model has %d; '
                     'feature names no longer line up with the training columns.'
                     % (len(feature_names), len(importances)))
list(zip(feature_names, importances))