In [None]:
%pylab inline

In [None]:
import os
import sys
from collections import defaultdict, Counter
from pprint import pprint

import joblib
#from sklearn.externals import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split #sklearn.cross_validation for lower python versions

Next, tune the pandas display options so that wide tables are not truncated:

In [None]:
# Raise pandas' display limits (console width, rows and columns shown)
# before any frame is rendered below.
for option_name, option_value in (
    ('display.width', 3000),
    ('display.max_rows', 200),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)

In [None]:
# Display the current working directory; the training CSV path used later
# in this notebook is built relative to it.
os.getcwd()

In [None]:
# For local use, set the current working directory:
# os.chdir('<path>')

# The current working directory must contain all of the required input files

In [None]:
# Location of the training input, expected in the current working directory.
# os.path.join builds the path with the platform's separator, and the
# directory name is used verbatim (no whitespace stripping, which would
# corrupt a directory whose name starts or ends with a space).
path = os.path.join(os.getcwd(), 'training_input (IQVIA derivation dataset).csv') #Input for training

## 2. Importing Model Input File

**Reading dataset from defined path:**

In [None]:
# Load the training data from `path` and report its dimensions.
model_input = pd.read_csv(path)
# `shape` is a (rows, columns) tuple, so it feeds both placeholders directly.
print ('Base data has %i rows and %i columns' % model_input.shape)
print ('Displaying first 5 rows')
model_input.head()

**Checking if there are any empty cells in model_input**

In [None]:
# Abort early when any cell of the input is missing.
# The truth value of `.any()` is tested directly: comparing it with the
# string 'True' is always False, which would make this check a no-op.
if model_input.isnull().values.any():
    raise ValueError('Model Input File has null values')
else:
    print ('There are no empty values in the dataset')

**Level of data is patient_id. Each row uniquely identifies a patient**

In [None]:
# Every row must describe a distinct patient: compare the number of rows
# with the number of distinct patient_id values and stop if they differ.
row_count = len(model_input)
patient_count = len(model_input['patient_id'].unique())

if row_count != patient_count:
    raise ValueError('Model Input File is not at required level of data (patient_id)')

print ('Row Count: ', row_count)
print ('Patient Count: ', patient_count)

**ATTR WT patients are matched 1:1 with HF patients using propensity score matching, giving 817 patients in each cohort (ATTR WT and HF)**

In [None]:
# Number of distinct patients for each (cohort_type, cohort_flag) pair.
model_input.groupby(['cohort_type', 'cohort_flag'])['patient_id'].nunique()

`cohort_flag` is 1 for ATTR WT patients. We will drop `cohort_type` and use `cohort_flag` as the target for training.

## 3. Pre-processing

**Dropping columns not required for training the model**

In [None]:
# Remove the identifier and the textual cohort label; neither is used as a
# model feature. Note that this rebinds `model_input`.
model_input = model_input.drop(columns=['patient_id', 'cohort_type'])
model_input.head()

**We are now going to handle the target variable (cohort_flag) and store it in a new variable:**

In [None]:
# Translate the string form of cohort_flag into the integer target.
target_map = {u'1': 1, u'0': 0}
mapped_target = model_input['cohort_flag'].map(str).map(target_map)

# Series.map returns NaN for any value that is not a key of target_map
# (for example '1.0' when the column was parsed as float). Fail loudly
# instead of carrying NaN targets into training.
if mapped_target.isnull().any():
    unexpected_values = sorted(set(model_input.loc[mapped_target.isnull(), 'cohort_flag'].map(str)))
    raise ValueError(
        'cohort_flag contains values that cannot be mapped to a target %s: %s'
        % (sorted(target_map), unexpected_values)
    )

model_input['__target__'] = mapped_target
model_input = model_input.drop(['cohort_flag'], axis = 1)

# Per-class row counts (one count per remaining column).
model_input.groupby(['__target__']).count()

In [None]:
# Columns treated as potential target leaks; they are excluded from the
# feature set used for training.
potential_target_leaks = [
    'cardiomyopathy_in_diseases_classified_elsewhere',
    'other_forms_of_heart_disease',
]

model_input_flt_leaks = model_input.drop(columns=potential_target_leaks)
model_input_flt_leaks.head()

**Splitting model_input into test and train**

The dataset needs to be split into 2 new sets, one that will be used for training the model (train set)
and another that will be used to test its generalization capability (test set)

In [None]:
# Separate the features (X) from the target (y).
X = model_input_flt_leaks.drop(columns='__target__')
y = np.array(model_input_flt_leaks['__target__'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print ('Train data has %i rows and %i columns' % train_X.shape)
print ('Test data has %i rows and %i columns' % test_X.shape)

In [None]:
#pd.DataFrame(train_X).to_csv('train_X.csv')

**Displaying all features**

In [None]:
# Column labels of the training frame, i.e. the model's input features.
features = list(train_X.columns)
print (len(features), ' features')
display(features)

## 4. Creating model object

In [None]:
 from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 500, num = 4)]

# Number of features to consider at every split
max_features = ['auto']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 20, num = 4)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2,3,5,7]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2,3,4,5]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
total_combinations = len(n_estimators)*len(max_features)*len(max_depth)*len(min_samples_split)*len(min_samples_leaf)*len(bootstrap)
print('Trying out total combinations: ', total_combinations)
print(param_grid)

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=10000)

# Random search of parameters, using 3 fold cross validation, 
# search across different combinations
%time rf_grid = GridSearchCV(estimator = clf, param_grid = param_grid, cv = 3, verbose=2, n_jobs = -1)

# Fit the random search model
%time rf_grid.fit(train_X, train_y)

In [None]:
# Hyper-parameter combination selected by the grid search.
rf_grid.best_params_

In [None]:
# Keep the selected hyper-parameters under their own name; they are used
# below to build the final classifier.
bestparams_clf=rf_grid.best_params_
bestparams_clf

In [None]:
# Final classifier built from ALL hyper-parameters selected by the grid
# search. Passing only a subset (e.g. n_estimators and min_samples_leaf)
# would leave max_depth, min_samples_split, bootstrap and max_features at
# their defaults, so this model would no longer match the configuration
# that the search selected and that is evaluated on the test split.
clf = RandomForestClassifier(random_state=10000, **bestparams_clf)
clf

In [None]:
# Fit the final classifier on the training split; %time reports how long
# the fit takes.
%time clf.fit(train_X, train_y)

In [None]:
# Score the held-out test split with the estimator chosen by the grid search.
best_grid = rf_grid.best_estimator_
_clf_base_probas = best_grid.predict_proba(test_X)
_clf_base_predictions = best_grid.predict(test_X)

# True labels, re-indexed like the test features so the joins below align.
test_y_series = pd.Series(test_y, index=test_X.index, name='cohort_flag')
test_y_labels = test_y_series.to_frame()

clf_base_predictions = pd.Series(_clf_base_predictions, index=test_X.index, name='predicted_value')

# One probability column per target label, ordered by the label's integer code.
# NOTE(review): assumes predict_proba returns its columns in that same
# order - confirm against best_grid.classes_.
cols = [
    u'probability_of_value_%s' % label
    for label in sorted(target_map, key=lambda key: (int(target_map[key]), key))
]

clf_base_probabilities = pd.DataFrame(_clf_base_probas, index=test_X.index, columns=cols)

# Assemble truth, prediction and class probabilities side by side.
clf_base_results_test = (
    test_y_labels
    .join(clf_base_predictions, how='left')
    .join(clf_base_probabilities, how='left')
)

In [None]:
# Model performance metrics

print ('Accuracy:'),print(metrics.accuracy_score(test_y, _clf_base_predictions))
print ('\nF1 Score:'),print(metrics.f1_score(test_y, _clf_base_predictions))
print ('\nRecall:'),print(metrics.recall_score(test_y, _clf_base_predictions))
print ('\nPrecision:'),print(metrics.precision_score(test_y, _clf_base_predictions))
print ('\nClassification report:'),print(metrics.classification_report(test_y, _clf_base_predictions))
print ('\nConfusion matrix:'),print(metrics.confusion_matrix(test_y, _clf_base_predictions))

In [None]:
# Per-row test results: true cohort_flag, predicted value and the
# probability columns built above.
clf_base_results_test

In [None]:
# ROC AUC computed from the true labels and the predicted probability of
# the positive class (value 1).
print (
    'Area under curve:',
    metrics.roc_auc_score(
        clf_base_results_test['cohort_flag'],
        clf_base_results_test['probability_of_value_1'],
    ),
)

In [None]:
# Pair every training column with the importance reported by the fitted
# classifier `clf`.
features = train_X.columns
feature_importances_data = [
    {'feature': feature_name, 'importance': feature_importance}
    for feature_name, feature_importance in zip(features, clf.feature_importances_)
]

# Plot the results: sort ascending and keep the last ten rows, i.e. the ten
# largest importances. The trailing semicolon hides the Axes repr.
(
    pd.DataFrame(feature_importances_data)
    .set_index('feature')
    .sort_values(by='importance')
    .tail(10)
    .plot(title='Top 10 most important variables',
          kind='barh',
          figsize=(10, 6),
          color='#348ABD',
          alpha=0.6,
          lw=1,  # line width is numeric, not the string '1'
          edgecolor='#348ABD',
          grid=False)
);

In [None]:
# Save the feature importances (one row per feature) to
# 'feature_importance.csv'. The path is relative, so the file is written to
# the current working directory; the DataFrame index is written too because
# `index` is left at its default.

pd.DataFrame(feature_importances_data).to_csv('feature_importance.csv')

In [None]:
#handle = dataiku.Folder("0h3MrWHS")
# NOTE: `path` is rebound here to the working directory; earlier in the
# notebook it holds the training CSV location, so re-run that cell before
# re-reading the input.
path = os.getcwd()

# Persist the fitted classifier next to the notebook. os.path.join builds
# the file name with the platform's separator instead of a hard-coded '/'.
filename = os.path.join(path, 'rf_best_excl_1_CM_3_6.sav')
joblib.dump(clf, filename)