In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import copy
import numpy as np
import pandas as pd
import pickle
import sys
import torch
import warnings

warnings.filterwarnings('ignore')

basedir = "/media/pathologyhd/path_nlp/pathparsing/"
sys.path.append(basedir + "prostate-open-source/")

from methods.bag_of_ngrams.processing import (cleanReport, cleanReports, cleanSplit, getCounter, 
                                              getTrainedVectorizer, STRIPCHARS, unkReports)
from methods.sklearn_calibration import *
from methods.torch.evaluation import getPredsLabels, getScores
from methods.torch.modeling import runModel
from methods.torch.models import CnnClassifier
from methods.torch.processing import encodeLabels, getEncoder, getTorchLoader, getVocab, reSample
from sklearn.metrics import f1_score, recall_score
from pyfunctions.general import *

# 1. Set up

In [None]:
# Experiment configuration shared by the loaders, the model and the search loop.
# Hyperparameters such as 'epochs' are overwritten later by the random search.
args = dict(
    domain='prostate',
    epochs=20,
    embeddingDim=300,
    maxDocLength=1346,  # NOTE(review): presumably the longest report length in tokens - confirm
    path=basedir + "prostate-open-source/",
    target_fields=[
        'TreatmentEffect', 'TumorType', 'PrimaryGleason', 'SecondaryGleason',
        'TertiaryGleason', 'SeminalVesicleNone', 'LymphNodesNone',
        'MarginStatusNone', 'ExtraprostaticExtension', 'PerineuralInfiltration',
        'RbCribriform', 'BenignMargins',
    ],
    n_tries=20,  # Number of random search candidates
)

# Load the pre-split data set for the configured domain and clean every report
path = "{}data/splits/{}.json".format(args['path'], args['domain'])
data = cleanSplit(readJson(path), STRIPCHARS)

# Word counts come from the training split only and are then applied to all
# three splits, so val/test never influence which words are treated as rare.
counter = getCounter(data['train'])
for split in ('train', 'val', 'test'):
    data[split] = unkReports(data[split], counter)

# Build the token -> index mapping from the training vocabulary; '<unk>' is
# given the index len(vocab), hence the embedding table has len(vocab) + 1 rows.
vocab = getVocab(data['train'])
token_ids = dict(zip(vocab, range(len(vocab))))
token_ids['<unk>'] = len(vocab)
args['word2idx'] = token_ids
args['wordDim'] = len(vocab) + 1

# 2. Main

In [None]:
# Random search space. 'filters' holds lists of different lengths, which
# np.random.choice cannot sample as a 1-D array, so a candidate is picked
# through the parallel index list 'filter_ind'.
params = {
    'lr': np.logspace(-6, -1, 1000),
    'filterNum': [50, 100, 150, 200, 250, 300, 400],
    'dropOut': [0, 0.125, 0.25, 0.5],
    'filters': [[3], [3,4], [4], [4,5], [5], [5,6], [6]],
    'filter_ind': [0,1,2,3,4,5,6],
    'batch_size': [16, 32],
    'epochs': [50],
}

# Hyperparameters sampled per candidate and copied back into `args` afterwards
SEARCH_KEYS = ('lr', 'dropOut', 'filterNum', 'batchSize', 'filters', 'epochs')

# Seed the RNGs so the sampled configurations can be reproduced on a fresh
# kernel. GPU training may still not be bit-for-bit deterministic.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Target field for this run (index 2 of target_fields is 'PrimaryGleason')
field = args['target_fields'][2]

# Encode labels into 0, 1, 2 values; the encoder is fitted on all splits so
# every class seen anywhere gets an id.
encoder = getEncoder(data['train'] + data['val'] + data['test'], field)

data['train'] = encodeLabels(data['train'], encoder, field)
data['val'] = encodeLabels(data['val'], encoder, field)
data['test'] = encodeLabels(data['test'], encoder, field)
# NOTE(review): 'dev_test' is filled from the (already encoded) 'test' split,
# not from a separate dev_test split - confirm this is intended.
data['dev_test'] = encodeLabels(data['test'], encoder, field)
args['classSize'] = len(encoder.classes_)

# Extract labels and reports
corpus_train = extractListFromDic(data['train'], 'clean_document_unked')
labels_train = extractListFromDic(data['train'], 'encoded_labels', field)

corpus_val = extractListFromDic(data['val'], 'clean_document_unked')
labels_val = extractListFromDic(data['val'], 'encoded_labels', field)

corpus_test = extractListFromDic(data['test'], 'clean_document_unked')
labels_test = extractListFromDic(data['test'], 'encoded_labels', field)

# Upsample minority classes (training split only)
corpus_train, labels_train = reSample(corpus_train, labels_train)

# Start below any achievable score so the first evaluated candidate is always
# recorded, even when its weighted F1 is exactly 0.
best_args = {'score': -np.inf}

# Loop over number of random search tries
for i in range(args['n_tries']):
    print(i)
    # Set random search parameter configuration
    args['lr'] = np.random.choice(params['lr'])
    args['dropOut'] = np.random.choice(params['dropOut'])
    args['filters'] = params['filters'][np.random.choice(params['filter_ind'])]
    args['batchSize'] = int(np.random.choice(params['batch_size']))
    # Split the sampled filter budget evenly across the chosen filter widths
    args['filterNum'] = [int(np.random.choice(params['filterNum'])/len(args['filters']))]*len(args['filters'])
    # Cast to a plain int, consistent with batchSize
    args['epochs'] = int(np.random.choice(params['epochs']))

    # Initialize torch loaders. testLoader is not used inside the loop but is
    # consumed by the evaluation cell further down, so it is still built here.
    trainLoader = getTorchLoader(corpus_train, labels_train, args, shuffle = True)
    valLoader = getTorchLoader(corpus_val, labels_val, args, shuffle = False)
    testLoader = getTorchLoader(corpus_test, labels_test, args, shuffle = False)

    # Train model and score it on the validation split
    model = CnnClassifier(args)
    model = runModel(model, trainLoader, valLoader, args)
    val_scores = getScores(model, valLoader, cuda=True)

    # Keep the configuration with the best weighted F1 seen so far
    if val_scores['f1_weighted'] > best_args['score']:
        best_args['score'] = val_scores['f1_weighted']
        best_args.update({key: args[key] for key in SEARCH_KEYS})

# Fail with a clear message instead of a KeyError when nothing was recorded
# (n_tries == 0, or every validation score was NaN).
missing = [key for key in SEARCH_KEYS if key not in best_args]
if missing:
    raise RuntimeError(
        "Random search recorded no candidate (missing: {}); "
        "check args['n_tries'] and the validation scores.".format(missing))

# Read in best parameters so the next cell retrains with them
for key in SEARCH_KEYS:
    args[key] = best_args[key]

In [None]:
# Rebuild the training/validation loaders for the selected configuration.
# The loaders left over from the search loop were created for the LAST sampled
# candidate, so (presumably via args['batchSize'] - verify in getTorchLoader)
# they may not use the batch size that was chosen as best.
# testLoader is reused as-is: it is only used for inference below.
trainLoader = getTorchLoader(corpus_train, labels_train, args, shuffle = True)
valLoader = getTorchLoader(corpus_val, labels_val, args, shuffle = False)

# Train model with best parameters
model = CnnClassifier(args)
model = runModel(model, trainLoader, valLoader, args)

# Get predictions and labels on the held-out test split.
# Note: this rebinds labels_test to the labels returned by getPredsLabels.
preds_test, labels_test, probs_test = getPredsLabels(model, testLoader, probs=True, cuda=True)

In [None]:
# Decode the integer class ids back to their original label values and show
# the first five test rows side by side.
decoded_columns = {
    'labels': encoder.inverse_transform(labels_test),
    'predictions': encoder.inverse_transform(preds_test),
}
pd.DataFrame(decoded_columns).head(n=5)