Train random forest classifier on combination of 2-type and 3-type output confidence scores to simulate final label dependency section. 

Results saved to csv for submission.

In [0]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [0]:
#load in data and merge data frames 
train_2_type = pd.read_csv('drive/My Drive/ICH/Inference/train_2type_conf_scores.csv')
train_3_type = pd.read_csv('drive/My Drive/ICH/Inference/train_3type_conf_scores.csv')
train_2_type['label'] = train_2_type['label'].apply(lambda x: x + '.png')

train = pd.merge(left=train_2_type, right=train_3_type, left_on='label', right_on='file')

train_scores = train[['file', 'injured', 'epidural', 'intraparenchymal', 'subarachnoid']]

train_labels = pd.read_csv('drive/My Drive/ICH/train_labels.csv')
train_labels['ID'] = train_labels['ID'].apply(lambda x: x + '.png')
train_labels.columns = [i + '-label' for i in train_labels.columns]

train_data = pd.merge(left=train_labels, right=train_scores, left_on='ID-label', right_on='file')
X = np.array(train_data[['injured',	'epidural',	'intraparenchymal',	'subarachnoid']])
y = np.array(train_data[['epidural-label',	'intraparenchymal-label',	'subarachnoid-label']])

In [104]:
#test simple cutoff method
def cutoff(row):
  if row[0] < 0.16:
    return [0,0,0]
  pred = []
  for r in range(1,4):
    if row[r] >= 0.5:
      pred.append(1)
    else: pred.append(0)
  return pred

preds = [cutoff(scores) for scores in X]

print(accuracy_score(y, preds))

0.6959728400842894


To model some further label interdependency in the absence of a 3 type RNN model, I will run finalk predictions through a random forest model.



In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
accuracy_score(y_test, preds)

0.7509755232351898

In [106]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.67      0.34      0.45       102
           1       0.88      0.84      0.86      1059
           2       0.79      0.72      0.76       995

   micro avg       0.84      0.76      0.80      2156
   macro avg       0.78      0.64      0.69      2156
weighted avg       0.83      0.76      0.79      2156
 samples avg       0.47      0.46      0.46      2156



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model clearly struggles with epidural, likely due to the class imbalances in the dataset.

Train on whole train dataset ready for final test set inferences.

In [107]:
clf = RandomForestClassifier()
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
test_2_type = pd.read_csv('drive/My Drive/ICH/Inference/test_2type_conf_scores.csv')
test_3_type = pd.read_csv('drive/My Drive/ICH/Inference/test_3type_conf_scores.csv')
test_2_type['label'] = test_2_type['label'].apply(lambda x: x + '.png')

In [0]:
test_data = pd.merge(left=test_2_type, right=test_3_type, left_on='label', right_on='file')
test_X = np.array(test_data[['injured', 'epidural', 'intraparenchymal', 'subarachnoid']])

In [0]:
test_preds = clf.predict(test_X)
test_cls = pd.DataFrame(test_preds, columns=['epidural', 'intraparenchymal', 'subarachnoid'])
test_cls['filename'] = test_data['file']
test_cls.to_csv('drive/My Drive/ICH/Inference/predictions.csv')

In [0]:
test_probs = test_data[['epidural', 'intraparenchymal', 'subarachnoid']]
test_probs.to_csv('drive/My Drive/ICH/Inference/prediction_probabilities.csv')