In [1]:
# Reload imported project modules automatically so edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import pickle
import os
import re
import sys
import warnings

warnings.filterwarnings('ignore')

sys.path.insert(0,'../..')

from methods.bag_of_ngrams.processing import cleanSplit, getTrainedVectorizer, STRIPCHARS
from methods.extraction.general import getCounter, sampleTrain
from methods.extraction.token import getX, getY
from methods.sklearn_calibration import *
from random import sample
from scipy.sparse import csr_matrix, vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from pyfunctions.general import *
from pyfunctions.pathology import getProstateStageInverseMapping, label_correctness

# 1. Set up: data cleaning and preprocessing

In [None]:
# Run configuration shared by the helper functions used in this notebook
args = dict(
    domain='prostate',
    target_fields=['ProstateWeight', 'TumorVolume', 'TStage', 'NStage', 'MStage'],
    sample=20000,  # Number of positive and negative tokens to sample for training
    N=3,           # N in N-grams
    k=5,           # Size of context
)

# Read the reports for the configured domain, then process them with cleanSplit
path = f"../../data/{args['domain']}.json"
data = cleanSplit(readJson(path), STRIPCHARS)

# Cleaned text of every training report; the vectorizer and counter are built from it
trainReports = extractListFromDic(data['train'], 'clean_document')

# Store the text helpers and the stage mapping alongside the other arguments.
# The vectorizer maps text to vectors based on counts of words.
args.update(
    vectorizer=getTrainedVectorizer(trainReports, args['N'], 1),
    counter=getCounter(trainReports),
    stage_mapping=getProstateStageInverseMapping(),
)

# 2. Save document matrices and label arrays

- Represent each document as a matrix in which each row corresponds to a word in the report and each column to a feature of that word (context words and word type)
- The label is 1 if the word matches the ground truth and 0 otherwise

## 2.1 Training data

In [None]:
# Build one sparse feature matrix per training report (X_train) and, for every
# target field, one flat list of per-token labels (y_train[field]).
X_train = []
y_train = {field: [] for field in args['target_fields']}

# Ground-truth labels for each field, extracted once up front. This lookup does
# not depend on the patient, so repeating it inside the loop was quadratic work.
labels_by_field = {field: extractListFromDic(data['train'], 'labels', field)
                   for field in args['target_fields']}

for i, patient in enumerate(data['train']):
    try:
        document = patient['clean_document']
        X = csr_matrix(getX(document, args))

        # Compute the labels of every field before storing anything: if any
        # step fails, the whole report is skipped, so the rows of X_train stay
        # aligned with the entries of every y_train[field].
        y_patient = {}
        for field in args['target_fields']:
            stage = 'stage' in field.lower()
            y = getY(document, labels_by_field[field][i], args, stage)
            y_patient[field] = y.tolist()
    except Exception as e:
        # Best effort: report the problem and move on to the next report
        print(e)
        continue

    X_train.append(X)
    for field, y in y_patient.items():
        y_train[field].extend(y)

## 2.2 Validation and test data
- Calculate these matrices for validation and test data

In [None]:
# Feature matrices and the corresponding patients for the validation and test
# splits. A patient is kept only when getX succeeds on its report, so each
# matrix list stays index-aligned with its patient list.
X_val, X_test = [], []
val_set, test_set = [], []

for split, matrices, patients in (('val', X_val, val_set),
                                  ('test', X_test, test_set)):
    for patient in data[split]:
        try:
            matrices.append(getX(patient['clean_document'], args))
            patients.append(patient)
        except Exception as e:
            # Skip reports that cannot be converted, but say why
            print(e)

# 3. Train token extraction model

In [None]:
"""
* Set parameters and models
"""
params = {'bootstrap': [True, False],
          'max_depth': [10, 20, 30, 40, 50, None],
          'max_features': ['auto', 'sqrt'],
          'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64],
          'n_estimators': [200, 400, 600, 800, 1000]}

args['key'] = 'ProstateWeight'

"""
* Run model and return probabilities of tokens
"""
X_train_sampled, y_train_sampled = sampleTrain(vstack(X_train), np.array(y_train[args['key']]), args)

clf = RandomForestClassifier(class_weight = 'balanced')
#random_search = RandomizedSearchCV(clf, param_distributions=params, n_iter=40, cv=3, n_jobs=40)
random_search = clf
random_search.fit(X_train_sampled, y_train_sampled)

# 4. Extract predictions

## a) Val set

In [None]:
# One row per validation patient: ground-truth label, highest-scoring token,
# its score, and its position in the cleaned report.
val_predictions = pd.DataFrame(columns=['label', 'predicted_token', 'y_prob', 'word_ind'])

for row, patient in enumerate(val_set):
    # Column 1 of predict_proba: one score per token of this report
    token_scores = random_search.predict_proba(X_val[row])[:, 1]

    # Token indices ordered from highest to lowest score
    ranking = np.argsort(token_scores)[::-1]
    top = ranking[0]
    tokens = patient['clean_document'].split()

    val_predictions.loc[row] = [
        patient['labels'][args['key']],
        tokens[top],
        token_scores[top],
        top,
    ]

## b) Test set

In [None]:
# One row per test patient: ground-truth label, highest-scoring token,
# its score, and its position in the cleaned report.
prediction_columns = ['label', 'predicted_token', 'y_prob', 'word_ind']
test_predictions = pd.DataFrame(columns=prediction_columns)

for row, patient in enumerate(test_set):
    # Keep only column 1 of predict_proba: one score per token of this report
    scores = random_search.predict_proba(X_test[row])[:, 1]

    # Reverse the ascending argsort so the best-scoring token index comes first
    best_ind = np.argsort(scores)[::-1][0]
    best_token = patient['clean_document'].split()[best_ind]

    truth = patient['labels'][args['key']]
    test_predictions.loc[row] = [truth, best_token, scores[best_ind], best_ind]

# 5. Label correctness

In [None]:
# Run label_correctness on both prediction tables for the field being extracted.
# NOTE(review): the calibration cell reads 'correct' and 'final_prediction'
# columns, presumably added here; confirm against label_correctness.
target_field = args['key']
val_predictions = label_correctness(val_predictions, target_field)
test_predictions = label_correctness(test_predictions, target_field)

# 6. Calibration and expected calibration error

In [None]:
# Fit an isotonic calibrator on the validation set (raw top-token score ->
# whether the prediction was correct), apply it to the test scores, and report
# the expected calibration error.
# The scores are cast to float explicitly so the comparisons and the regression
# do not depend on the dtype the prediction tables happen to have.
X_val_cal = np.asarray(val_predictions['y_prob'], dtype=float)
y_val_cal = val_predictions['correct']

if len(np.unique(y_val_cal)) > 1:
    reg = IsotonicRegression()
    reg.fit(X_val_cal, y_val_cal)

    y_test_cal = test_predictions['correct']
    # Clip test scores into the range seen during fitting; with the default
    # settings the regressor does not extrapolate outside that range.
    X_test_cal = np.clip(np.asarray(test_predictions['y_prob'], dtype=float),
                         reg.X_min_, reg.X_max_)

    # Predict once and reuse the result for both the table and the ECE
    calibrated_scores = reg.predict(X_test_cal)
    test_predictions['calibrated_score'] = calibrated_scores

    ece_error = ece_mce_error(calibrated_scores, test_predictions['final_prediction'], test_predictions['label'],
                              num_bins=10, plot=None)

    print('expected calibration error:', ece_error[0])
else:
    # Nothing to do here (cannot calibrate with only 1 class):
    # fall back to the uncalibrated scores
    test_predictions['calibrated_score'] = test_predictions['y_prob']