In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import numpy as np
import pandas as pd
import pickle
import sys
import torch
import warnings

warnings.filterwarnings('ignore')

basedir = "/media/pathologyhd/path_nlp/pathparsing/"
sys.path.append(basedir + "prostate-open-source/")

from methods.bag_of_ngrams.processing import (cleanReport, cleanReports, cleanSplit, getCounter, 
                                              getTrainedVectorizer, STRIPCHARS, unkReports)
from methods.sklearn_calibration import *
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
from pyfunctions.general import *

# 1. Set up

In [3]:
args = {'domain': 'prostate',
        'epochs': 20,
        'embeddingDim': 300,
        'maxDocLength': 1346,
        'path': basedir + "prostate-open-source/",
        'target_fields': ['TreatmentEffect','TumorType','PrimaryGleason','SecondaryGleason','TertiaryGleason',
                          'SeminalVesicleNone','LymphNodesNone','MarginStatusNone','ExtraprostaticExtension',
                          'PerineuralInfiltration','RbCribriform','BenignMargins'],
        'n_tries': 20 # Number of random search candidates
        }

# Read in data
path = args['path'] + "data/splits/" + args['domain'] + ".json"
data = readJson(path)

# Process reports
data = cleanSplit(data, STRIPCHARS)

# Unk rare words
counter = getCounter(data['train'])
data['train'] = unkReports(data['train'], counter)
data['val'] = unkReports(data['val'], counter)
data['test'] = unkReports(data['test'], counter)

# 2. Vectorize text and train model

In [4]:
# Random search parameters
params = { 'C': np.logspace(-6,6,1000)}

field = args['target_fields'][3]
N = 3
    
# Extract labels and reports
corpus_train = extractListFromDic(data['train'], 'clean_document_unked')
y_train = extractListFromDic(data['train'], 'labels', field)

corpus_val = extractListFromDic(data['val'], 'clean_document_unked')
y_val = extractListFromDic(data['val'], 'labels', field)

corpus_test = extractListFromDic(data['test'], 'clean_document_unked')
y_test = extractListFromDic(data['test'], 'labels', field)

# Vectorizer documents
vectorizer = getTrainedVectorizer(corpus_train, N, 1)  
X_train = vectorizer.transform(corpus_train)
X_val = vectorizer.transform(corpus_val)
X_test = vectorizer.transform(corpus_test)

model = LogisticRegression(penalty = 'l1', class_weight = 'balanced',solver = 'liblinear')
clf = RandomizedSearchCV(model, params, cv=3, n_iter = 20, n_jobs = 20)
clf.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=20, n_jobs=20,
          param_distributions={'C': array([1.00000e-06, 1.02804e-06, ..., 9.72720e+05, 1.00000e+06])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

# 3. Get output

In [5]:
pred_val = pd.DataFrame({'label': np.array(y_val).astype(str), 
                         'prediction': clf.predict(X_val), 
                         'probability': np.max(clf.predict_proba(X_val), axis=1)})

pred_test = pd.DataFrame({'label': np.array(y_test).astype(str), 
                          'prediction': clf.predict(X_test), 
                          'probability': np.max(clf.predict_proba(X_test), axis=1)})

In [6]:
pred_val.head()

Unnamed: 0,label,prediction,probability
0,3,3,0.996676
1,3,3,0.999946
2,4,4,0.999949
3,4,4,0.999988
4,4,4,0.999999


# 4. Calibrate predictions

In [7]:
classes = clf.classes_

calibrated_scores = np.zeros((pred_test.shape[0], len(classes)))

pred_test['correct'] = pred_test['label'] == pred_test['prediction']
pred_val['correct'] = pred_val['label'] == pred_val['prediction']

probs_val = clf.predict_proba(X_val)
probs_test = clf.predict_proba(X_test)

# Multiclass calibration
for p in range(len(classes)):
    pred_val['correct'] = pred_val['label'] == classes[p]
    pred_val['correct'] = pred_val['correct'].astype(int)

    X = probs_val[:,p].reshape(-1,)
    y = pred_val['correct']

    reg = IsotonicRegression()
    reg.fit(X,y)

    X_eval = probs_test[:,p].reshape(-1,)
    X_eval[X_eval < reg.X_min_] = reg.X_min_
    X_eval[X_eval > reg.X_max_] = reg.X_max_

    calibrated_scores[:,p] = reg.predict(X_eval)

# Normalize calibrated scores
row_sums = calibrated_scores.sum(axis=1)
calibrated_scores = calibrated_scores / row_sums[:, np.newaxis]
calibrated_scores = np.max(calibrated_scores, axis = 1)

# Calculate expected calibration error
ece = ece_mce_error(calibrated_scores, pred_test['prediction'].astype(str), 
                            pred_test['label'].astype(str), num_bins = 10, plot = None)

pred_test['calibrated_score'] = calibrated_scores

[(0.0, 0.1), (0.1, 0.2), (0.2, 0.30000000000000004), (0.30000000000000004, 0.4), (0.4, 0.5), (0.5, 0.6000000000000001), (0.6000000000000001, 0.7000000000000001), (0.7000000000000001, 0.8), (0.8, 0.9), (0.9, 1.0)]
{0.9904697427351317, 0.9874887650548265, 0.5685660613650595, 0.983117737338303, 0.9788227600996259, 0.9918032786885246, 0.9778085991678225, 0.9455460580552784, 0.8, 0.9861432590733963, 0.826164290363952, 0.9916963226571768, 0.9769163721281765, 1.0, 0.9629651388676911, 0.9675851227558498, 0.9764744001344694, 0.5714285714285714, 0.8864192416081454, 0.9360904253705534, 0.6960048088433503, 0.997498159131251, 0.5454545454545454, 0.9725961893963668, 0.5968448729184925, 0.9955565814530556, 0.9387483355525964, 0.8705303674531102, 0.8727652355474357, 0.6277561608300909, 0.9782446919185197, 0.627906976744186, 0.9873262299175892, 0.9559960448425063, 0.9491525423728814, 0.7580880818200794, 0.947017180016858, 0.9824786324786324, 0.9789571445566319, 0.9402910740966188, 0.6724256235460819, 0

In [8]:
print('expected calibration error:', ece[0])

expected calibration error: 0.018677780379608787
