In [1]:
import sys
sys.path.append("../../")

In [6]:
import numpy as np
import pandas as pd
from trialexplorer import studysimilarity as ssim
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

### Loading the labeled data

In [10]:
df = pd.read_pickle('../../training_data/all.p')
df.shape

(100142, 11)

In [11]:
df.head()

Unnamed: 0,cond1,cond2,label,full_fuzzy_ratio,noun_fuzzy_ratio,bing_bagoword_dist,bing_link_sim,same_wiki,stage_dist,adj_dist,vb_dist
0,Parkinson's Disease,Parkinson Disease,same,94.0,100.0,14.0,0.25,1.0,0.0,0.0,0.0
1,Parkinson's Disease,Idiopathic Parkinson's Disease,same_diff_qual,78.0,100.0,12.0,0.08,1.0,0.0,1.0,0.0
2,Parkinson's Disease,Parkinson's Disease (PD),same,88.0,92.0,10.0,0.380952,1.0,0.0,0.0,0.0
3,Parkinson's Disease,Parkinson,same,64.0,69.0,2.0,0.142857,1.0,0.0,0.0,0.0
4,Parkinson's Disease,Idiopathic Parkinson Disease,same_diff_qual,72.0,100.0,18.0,0.08,1.0,0.0,1.0,0.0


In [12]:
df.columns

Index(['cond1', 'cond2', 'label', 'full_fuzzy_ratio', 'noun_fuzzy_ratio',
       'bing_bagoword_dist', 'bing_link_sim', 'same_wiki', 'stage_dist',
       'adj_dist', 'vb_dist'],
      dtype='object')

In [25]:
feat_columns = ['full_fuzzy_ratio',
       'noun_fuzzy_ratio',
       'bing_bagoword_dist',
       'bing_link_sim',
       'same_wiki',
       'stage_dist', 
       'adj_dist',
       'vb_dist']

no_bing_feats = ['full_fuzzy_ratio',
       'noun_fuzzy_ratio',
       'stage_dist', 
       'adj_dist',
       'vb_dist']

In [26]:
X = df[feat_columns].values
X_no_bing = df[no_bing_feats].values

X.shape

(100142, 8)

In [27]:
y = df['label'].values
y.shape

(100142,)

In [28]:
df.groupby('label').size()

label
notsame           56992
same               2702
same_diff_qual    40448
dtype: int64

### Train Test Split

In [17]:
test_size = X.shape[0] // 3
test_size

33380

In [18]:
all_idx = np.arange(X.shape[0])
test_idx = sorted(np.random.choice(all_idx, test_size, replace=False))
train_idx = np.delete(all_idx, test_idx)

len(test_idx), len(train_idx)

(33380, 66762)

In [29]:
X_train = X[train_idx]
X_train_no_bing = X_no_bing[train_idx]
y_train = y[train_idx]
X_test = X[test_idx]
X_test_no_bing = X_no_bing[test_idx]
y_test = y[test_idx]

In [30]:
len(X_train), len(y_train), len(X_test), len(y_test)

(66762, 66762, 33380, 33380)

## Fitting Model

In [21]:
rfc = RandomForestClassifier(n_estimators=100)

In [22]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Training Accuracy

In [23]:
rfc.score(X_train, y_train)

0.9892154219466164

# Test Accuracy

In [24]:
rfc.score(X_test, y_test)

0.9699520671060515

## No Bing Features

In [31]:
rfc_nb = RandomForestClassifier(n_estimators=100)

In [32]:
rfc_nb.fit(X_train_no_bing, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
rfc_nb.score(X_train_no_bing, y_train)

0.926619933495102

In [34]:
rfc_nb.score(X_test_no_bing, y_test)

0.9069203115638107

In [42]:
preds = rfc.predict(X_test)

In [57]:
recall_score(y_test, preds, average='weighted')

0.9687837028160575

In [59]:
precision_score(y_test, preds, average='weighted')

0.9688002315858766

## Examples of Misclassification

In [43]:
misclassified = np.argwhere(preds != y_test)

In [44]:
preds[misclassified]

array([['notsame'],
       ['same_diff_qual'],
       ['same_diff_qual'],
       ...,
       ['same_diff_qual'],
       ['same_diff_qual'],
       ['same_diff_qual']], dtype=object)

In [45]:
y_test[misclassified]

array([['same_diff_qual'],
       ['notsame'],
       ['same'],
       ...,
       ['notsame'],
       ['notsame'],
       ['notsame']], dtype=object)

In [46]:
misclassified_idx = np.array(test_idx)[misclassified].ravel()
misclassified_idx

array([    47,    213,    231, ...,  99311,  99681, 100138])

In [47]:
df_mis = df.reset_index().reindex(misclassified_idx)

In [48]:
df_mis['pred'] = preds[misclassified]

In [49]:
df_mis.columns

Index(['index', 'cond1', 'cond2', 'label', 'full_fuzzy_ratio',
       'noun_fuzzy_ratio', 'bing_bagoword_dist', 'bing_link_sim', 'same_wiki',
       'stage_dist', 'adj_dist', 'vb_dist', 'pred'],
      dtype='object')

In [51]:
df_mis[['cond1', 'cond2', 'label', 'pred'] + feat_columns].to_excel('training_data/misclassified.xls')

## Examples of correct classification

In [60]:
rightclassified = np.argwhere(preds == y_test)

In [61]:
rightclassified_idx = np.array(test_idx)[rightclassified].ravel()
rightclassified_idx

array([     2,      5,      7, ..., 100133, 100136, 100137])

In [62]:
df_right = df.reset_index().reindex(rightclassified_idx)

In [64]:
df_right['pred'] = preds[rightclassified]

In [65]:
df_right[['cond1', 'cond2', 'label', 'pred'] + feat_columns].to_excel('training_data/rightclassified.xls')