In [1]:
# https://arxiv.org/abs/1909.02027
import json
import pandas as pd
import numpy as np
import os
from glob import glob
import tensorflow_hub as hub
from tqdm import tqdm
from time import time

In [2]:
DATA_FOLDER = "../data/clinic/"

In [3]:
files = glob(DATA_FOLDER+ '*/*.csv*')
files

['../data/clinic/data_imbalanced/oos_train_with_use_emb.csv',
 '../data/clinic/data_imbalanced/test_with_use_emb.csv',
 '../data/clinic/data_imbalanced/val_with_use_emb.csv',
 '../data/clinic/data_imbalanced/val.csv',
 '../data/clinic/data_imbalanced/oos_test.csv',
 '../data/clinic/data_imbalanced/test.csv',
 '../data/clinic/data_imbalanced/oos_train.csv',
 '../data/clinic/data_imbalanced/train_with_use_emb.csv',
 '../data/clinic/data_imbalanced/oos_test_with_use_emb.csv',
 '../data/clinic/data_imbalanced/oos_val.csv',
 '../data/clinic/data_imbalanced/train.csv',
 '../data/clinic/data_imbalanced/oos_val_with_use_emb.csv',
 '../data/clinic/data_small/oos_train_with_use_emb.csv',
 '../data/clinic/data_small/test_with_use_emb.csv',
 '../data/clinic/data_small/val_with_use_emb.csv',
 '../data/clinic/data_small/val.csv',
 '../data/clinic/data_small/oos_test.csv',
 '../data/clinic/data_small/test.csv',
 '../data/clinic/data_small/oos_train.csv',
 '../data/clinic/data_small/train_with_use_emb

In [5]:
df_train = pd.read_csv('../data/clinic/data_full/train.csv')
df_train['intent'].unique()

array(['change_accent', 'who_do_you_work_for', 'bill_balance',
       'next_song', 'calories', 'change_user_name', 'confirm_reservation',
       'jump_start', 'card_declined', 'cook_time', 'nutrition_info',
       'greeting', 'calendar', 'schedule_maintenance', 'balance',
       'tire_pressure', 'shopping_list', 'ingredients_list',
       'whisper_mode', 'meal_suggestion', 'travel_alert', 'lost_luggage',
       'weather', 'pin_change', 'pto_request', 'change_speed', 'no',
       'user_name', 'taxes', 'book_flight', 'yes', 'timezone', 'fun_fact',
       'order', 'traffic', 'pay_bill', 'report_fraud', 'vaccines',
       'recipe', 'report_lost_card', 'transfer', 'redeem_rewards',
       'exchange_rate', 'expiration_date', 'order_status',
       'reset_settings', 'cancel_reservation', 'goodbye',
       'restaurant_reviews', 'tell_joke', 'current_location', 'pto_used',
       'international_visa', 'restaurant_suggestion', 'pto_balance',
       'payday', 'flight_status', 'distance', 'routing

In [6]:
labels = {k:i for i,k in enumerate(set(df_train['intent'].tolist()))}

In [27]:
labels

{'meaning_of_life': 0,
 'who_do_you_work_for': 1,
 'pto_request': 2,
 'restaurant_reservation': 3,
 'gas_type': 4,
 'update_playlist': 5,
 'restaurant_suggestion': 6,
 'book_flight': 7,
 'date': 8,
 'mpg': 9,
 'play_music': 10,
 'new_card': 11,
 'timezone': 12,
 'jump_start': 13,
 'schedule_meeting': 14,
 'flight_status': 15,
 'weather': 16,
 'transfer': 17,
 'recipe': 18,
 'reset_settings': 19,
 'schedule_maintenance': 20,
 'international_fees': 21,
 'income': 22,
 'cook_time': 23,
 'redeem_rewards': 24,
 'calories': 25,
 'shopping_list': 26,
 'change_language': 27,
 'alarm': 28,
 'flip_coin': 29,
 'no': 30,
 'whisper_mode': 31,
 'insurance_change': 32,
 'payday': 33,
 'what_are_your_hobbies': 34,
 'damaged_card': 35,
 'definition': 36,
 'insurance': 37,
 'share_location': 38,
 'next_song': 39,
 'report_fraud': 40,
 'vaccines': 41,
 'next_holiday': 42,
 'carry_on': 43,
 'cancel_reservation': 44,
 'what_can_i_ask_you': 45,
 'how_old_are_you': 46,
 'what_song': 47,
 'where_are_you_from'

In [7]:
labels['oos']=150

## ADD USE EMBEDDINGS

In [8]:
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [9]:
def get_guse_embedings_with_batch(sentences,batch_size=64):
    embeddings = []
    for i in tqdm(range(0,len(sentences),batch_size)):
      embeddings_batch = embed(sentences[i:i+batch_size])
      embeddings.extend(embeddings_batch)
    return embeddings

def add_embeddings(dt, column= 'text'):
    embeddings = get_guse_embedings_with_batch(dt[column])
    embs = np.array(embeddings).tolist()
    df = pd.DataFrame([pd.Series(x) for x in embs])
    df.columns = ['emb_{}'.format(x+1) for x in df.columns]
    dt = pd.concat([dt,df], axis=1).reindex(dt.index)
    return dt

In [10]:
dt['label'] = dt['intent'].apply(lambda x: labels.get(x,150))
dt = add_embeddings(dt)
dt.head()

NameError: name 'dt' is not defined

In [31]:
start_time = time()
for file_name in files:
    dt = pd.read_csv(file_name)
    dt['label'] = dt['intent'].apply(lambda x: labels.get(x,150))
    add_embeddings(dt).to_csv(file_name.replace('.csv','_with_use_emb.csv'))
time()-start_time

100%|██████████| 47/47 [00:07<00:00,  6.41it/s]
100%|██████████| 16/16 [00:02<00:00,  6.06it/s]
100%|██████████| 71/71 [00:12<00:00,  5.85it/s]
100%|██████████| 2/2 [00:00<00:00,  8.81it/s]
100%|██████████| 2/2 [00:00<00:00,  8.63it/s]
100%|██████████| 165/165 [00:24<00:00,  6.79it/s]
100%|██████████| 47/47 [00:07<00:00,  6.57it/s]
100%|██████████| 16/16 [00:02<00:00,  6.07it/s]
100%|██████████| 71/71 [00:11<00:00,  6.07it/s]
100%|██████████| 2/2 [00:00<00:00,  8.89it/s]
100%|██████████| 2/2 [00:00<00:00,  7.64it/s]
100%|██████████| 118/118 [00:18<00:00,  6.54it/s]
100%|██████████| 47/47 [00:06<00:00,  7.01it/s]
100%|██████████| 16/16 [00:02<00:00,  6.69it/s]
100%|██████████| 71/71 [00:10<00:00,  6.69it/s]
100%|██████████| 2/2 [00:00<00:00,  6.17it/s]
100%|██████████| 2/2 [00:00<00:00,  8.28it/s]
100%|██████████| 235/235 [00:32<00:00,  7.24it/s]
100%|██████████| 47/47 [00:06<00:00,  7.00it/s]
100%|██████████| 16/16 [00:02<00:00,  6.86it/s]
100%|██████████| 71/71 [00:10<00:00,  6.95it/s

2703.8341739177704

In [11]:
start_time = time()
for file_name in files:
    if file_name.find('with_use_emb')>-1:
        continue
    dt = pd.read_csv(file_name)
    dt['label'] = dt['intent'].apply(lambda x: labels.get(x,150))
    add_embeddings(dt).to_csv(file_name.replace('.csv','_with_use_emb_not_large.csv'))
time()-start_time

100%|██████████| 47/47 [00:00<00:00, 51.96it/s]
100%|██████████| 16/16 [00:00<00:00, 66.22it/s]
100%|██████████| 71/71 [00:00<00:00, 86.84it/s]
100%|██████████| 2/2 [00:00<00:00, 14.21it/s]
100%|██████████| 2/2 [00:00<00:00, 38.25it/s]
100%|██████████| 165/165 [00:02<00:00, 65.82it/s]
100%|██████████| 47/47 [00:00<00:00, 64.50it/s]
100%|██████████| 16/16 [00:00<00:00, 65.59it/s]
100%|██████████| 71/71 [00:00<00:00, 71.11it/s]
100%|██████████| 2/2 [00:00<00:00, 68.00it/s]
100%|██████████| 2/2 [00:00<00:00, 119.85it/s]
100%|██████████| 118/118 [00:01<00:00, 82.21it/s]
100%|██████████| 47/47 [00:00<00:00, 86.25it/s]
100%|██████████| 16/16 [00:00<00:00, 75.70it/s]
100%|██████████| 71/71 [00:00<00:00, 92.11it/s]
100%|██████████| 2/2 [00:00<00:00, 88.70it/s]
100%|██████████| 2/2 [00:00<00:00, 101.01it/s]
100%|██████████| 235/235 [00:02<00:00, 87.07it/s]
100%|██████████| 47/47 [00:00<00:00, 93.29it/s]
100%|██████████| 16/16 [00:00<00:00, 56.27it/s]
100%|██████████| 71/71 [00:00<00:00, 72.83it

2839.24116396904

In [12]:
files = glob(DATA_FOLDER+ '*/*_with_use_emb_not_large.csv')
files

['../data/clinic/data_imbalanced/oos_test_with_use_emb_not_large.csv',
 '../data/clinic/data_imbalanced/train_with_use_emb_not_large.csv',
 '../data/clinic/data_imbalanced/oos_val_with_use_emb_not_large.csv',
 '../data/clinic/data_imbalanced/val_with_use_emb_not_large.csv',
 '../data/clinic/data_imbalanced/test_with_use_emb_not_large.csv',
 '../data/clinic/data_imbalanced/oos_train_with_use_emb_not_large.csv',
 '../data/clinic/data_small/oos_test_with_use_emb_not_large.csv',
 '../data/clinic/data_small/train_with_use_emb_not_large.csv',
 '../data/clinic/data_small/oos_val_with_use_emb_not_large.csv',
 '../data/clinic/data_small/val_with_use_emb_not_large.csv',
 '../data/clinic/data_small/test_with_use_emb_not_large.csv',
 '../data/clinic/data_small/oos_train_with_use_emb_not_large.csv',
 '../data/clinic/data_full/oos_test_with_use_emb_not_large.csv',
 '../data/clinic/data_full/train_with_use_emb_not_large.csv',
 '../data/clinic/data_full/oos_val_with_use_emb_not_large.csv',
 '../data/c

In [39]:
import zipfile
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED


In [41]:
from zipfile import ZipFile

for file_name in files:
    with ZipFile(file_name.replace('.csv','.zip'),'w') as zip:
           zip.write(file_name, compress_type=compression)

In [None]:
from sklearn.neural_network import MLPClassifier

In [15]:
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow import keras 

In [None]:
mlp_full = MLPClassifier(hidden_layer_sizes=(400), 
                         max_iter=300,
                         activation = 'relu',
                         solver='adam',
                         random_state=1)

In [16]:
def generate_MLP_Model(num_labels=2, 
                         dense_dropout=0.5, 
                         input_size = 512, 
                         hidden_size = 512,
                        hidden_activation = 'tanh',
                         output_activation = 'softmax',
                         num_layers = 2):
    features = Input(shape=(input_size,), name="first")
    hidden = Dropout(dense_dropout)(features)

    for i in range(num_layers):
        name = 'dense{}'.format(i) if i != num_layers - 1 else 'hidden'
        if dense_dropout>0:
            hidden = Dense(units=hidden_size, activation="relu", name=name)(hidden)
            hidden = Dropout(dense_dropout)(hidden)
        else:
            hidden = Dense(units=hidden_size, activation="relu", name=name)(features)

    logits = hidden
    outputs = Dense(units=num_labels, activation=output_activation, name="output_1")(logits)
    model = keras.Model(inputs=features, outputs=outputs)
    model.compile(optimizer='adam',  
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  #optimizer='sgd',  loss='mse',
                  #loss='binary_crossentropy',
                  #metrics=['accuracy'],
                  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
                  #metrics=[keras.metrics.PrecisionAtRecall(recall=0.8)],
                  run_eagerly = False)
    
    return model


In [17]:
mlp_full = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 400, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0)
mlp_full.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
hidden (Dense)               (None, 400)               205200    
_________________________________________________________________
output_1 (Dense)             (None, 151)               60551     
Total params: 265,751
Trainable params: 265,751
Non-trainable params: 0
_________________________________________________________________


In [18]:
mlp_small = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 200, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0.1)
mlp_small.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
hidden (Dense)               (None, 200)               102600    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
output_1 (Dense)             (None, 151)               30351     
Total params: 132,951
Trainable params: 132,951
Non-trainable params: 0
_________________________________________________________________


In [19]:
mlp_imbalanced = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 200, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0)
mlp_imbalanced.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
hidden (Dense)               (None, 200)               102600    
_________________________________________________________________
output_1 (Dense)             (None, 151)               30351     
Total params: 132,951
Trainable params: 132,951
Non-trainable params: 0
_________________________________________________________________


In [20]:
mlp_oss_plus = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 200, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0.1)
mlp_oss_plus.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
hidden (Dense)               (None, 200)               102600    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
output_1 (Dense)             (None, 151)               30351     
Total params: 132,951
Trainable params: 132,951
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Data Small Experiment

In [23]:
dirs = glob(DATA_FOLDER +"*/")
dirs

['../data/clinic/data_imbalanced/',
 '../data/clinic/data_small/',
 '../data/clinic/data_full/',
 '../data/clinic/data_oos_plus/']

In [25]:
files = glob(DATA_FOLDER+ 'data_small/*_with_use_emb_not_large.csv')
files

['../data/clinic/data_small/oos_test_with_use_emb_not_large.csv',
 '../data/clinic/data_small/train_with_use_emb_not_large.csv',
 '../data/clinic/data_small/oos_val_with_use_emb_not_large.csv',
 '../data/clinic/data_small/val_with_use_emb_not_large.csv',
 '../data/clinic/data_small/test_with_use_emb_not_large.csv',
 '../data/clinic/data_small/oos_train_with_use_emb_not_large.csv']

In [26]:
df_train = pd.concat([ pd.read_csv('../data/clinic/data_small/train_with_use_emb_not_large.csv'),
                        pd.read_csv('../data/clinic/data_small/oos_train_with_use_emb_not_large.csv')]
                    ).reset_index()
df_train.head()

Unnamed: 0.1,index,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,0,tell me the expiration date for my current cre...,expiration_date,10,-0.040155,-0.100772,-0.01397,-0.033594,-0.074708,...,-0.071578,0.071027,0.051464,-0.040543,0.089377,0.053426,0.006784,0.016784,0.013674,0.018969
1,1,1,would you disconnect from my phone,sync_device,7,-0.027262,0.059226,0.03483,-0.007459,0.073154,...,-0.024827,0.034608,-0.022895,0.058841,0.006152,-0.021668,0.052534,-0.002309,-0.016219,-0.020532
2,2,2,could you please track my package,order_status,26,-0.055063,0.01451,-0.011288,0.006235,0.055658,...,-0.048539,0.055937,0.013898,0.021369,-0.053738,-0.047681,0.013829,0.02153,-0.051963,-0.04846
3,3,3,any travel alerts for canada,travel_alert,28,0.022354,0.026428,0.007004,-0.046039,0.045855,...,0.057745,0.057128,0.01982,-0.001276,0.050386,0.043875,0.03154,0.051629,-0.000464,-0.010409
4,4,4,i want to report fraudulent activity on my ame...,report_fraud,105,-0.070291,-0.051348,0.029456,-0.044244,0.04945,...,-0.037034,0.071476,-0.002037,-0.055908,0.028856,0.038823,-0.027568,0.010344,-0.081946,-0.00474


In [101]:
len(df_train)

7600

In [27]:
df_valid = pd.concat([ pd.read_csv('../data/clinic/data_small/val_with_use_emb_not_large.csv'),
                        pd.read_csv('../data/clinic/data_small/oos_val_with_use_emb_not_large.csv')]
                    ).reset_index()
df_valid.head()

Unnamed: 0.1,index,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,0,hey,greeting,18,-0.028977,-0.069464,0.052299,0.033163,0.057072,...,-0.047304,-0.094889,-0.015521,0.05298,0.014295,-0.019171,-0.049758,-0.064394,-0.047327,0.009936
1,1,1,put laundry on my chore list,todo_list_update,109,0.05471,-0.050362,-0.007406,-0.056255,-0.008085,...,0.014306,0.037495,0.05922,0.070494,-0.047755,-0.054949,0.013341,0.027322,0.028261,0.033261
2,2,2,go into whisper mode,whisper_mode,129,-0.014857,0.051608,0.013153,-0.019881,0.013295,...,0.015017,-0.042217,0.073386,-0.002565,0.05192,0.003217,-0.042009,0.070887,0.07549,0.002634
3,3,3,when do i need to change my motor oil again,oil_change_when,46,-0.004057,-0.065462,-0.05703,0.046128,-0.051639,...,0.026361,0.075311,0.016529,-0.046038,0.042125,-0.034304,-0.02336,-0.013787,-0.036103,0.021343
4,4,4,what is the insurance plan i am enrolled in,insurance,23,-0.087778,-0.0734,-0.011642,-0.036968,-0.045147,...,0.034078,0.072586,0.021591,-0.108329,-0.008622,-0.021959,-0.002704,0.079256,0.074996,-0.014043


In [28]:
df_test_inscope = pd.read_csv('../data/clinic/data_small/test_with_use_emb_not_large.csv')
df_test_inscope.head()

Unnamed: 0.1,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,thanks so much ai,thank_you,36,-0.029997,-0.01498,0.061295,-0.030674,0.021879,-0.024469,...,0.013312,-0.00177,0.033333,0.063153,0.062007,0.022718,-0.01332,-0.00757,0.006375,-0.001329
1,1,i will be traveling to lima alert my bank,travel_notification,6,0.036371,-0.010319,0.032173,-0.05923,0.027334,0.054314,...,0.019954,0.076818,0.082983,0.003128,0.034301,0.099494,-0.024311,0.00361,-0.049011,-0.095653
2,2,say again please,repeat,135,-0.038975,-0.006627,-0.043051,0.070269,0.036932,0.040348,...,0.047787,-0.092466,-0.040176,-0.006622,0.040359,-0.041509,0.006387,-0.057521,-0.028157,-0.038858
3,3,what is needed to cook lasagna,ingredients_list,60,-0.053814,-0.061073,0.036123,0.043716,0.050313,0.019721,...,0.000789,0.070016,0.030398,0.05165,-0.026917,0.029005,-0.003356,-0.033767,0.047321,-0.090501
4,4,give me the pressure for the tires on my car,tire_pressure,38,0.012925,-0.014824,0.076381,0.021814,0.054738,-0.006144,...,0.05683,0.071343,0.02966,-0.095485,-0.011102,0.044503,-0.001196,-0.00461,0.017682,0.019733


In [29]:
df_test_oos = pd.read_csv('../data/clinic/data_small/oos_test_with_use_emb_not_large.csv')
df_test_oos.head()

Unnamed: 0.1,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,can i get a sear's appliance repairman,oos,150,0.001829,-0.016941,-0.054595,0.017081,-0.029765,0.070834,...,-0.043084,0.029035,0.03035,0.045751,0.057523,-0.031591,0.023559,-0.036538,0.042396,-0.006072
1,1,what do you do if you can't stop vomiting,oos,150,0.013368,0.040282,0.002469,-0.001204,0.033581,-0.010957,...,-0.006601,0.080698,-0.067142,-0.101796,0.047814,-0.013325,-0.005319,-0.06621,-0.003475,-0.039296
2,2,how many ppm of particulate is in my local water,oos,150,-0.007001,-0.025082,-0.052956,-0.001758,0.020452,-0.012768,...,0.054614,0.085859,0.026262,-0.018662,0.003845,-0.009895,-0.027959,0.063628,-0.027185,-0.058524
3,3,get me a list of divorce attorneys in the new ...,oos,150,0.023987,-0.019148,0.013654,-0.011939,-0.000627,0.023051,...,0.042193,0.062463,0.063075,-0.016516,0.061391,0.053259,0.023495,0.031551,0.062632,0.025739
4,4,clear my search history,oos,150,0.03619,0.066142,-0.01736,0.037186,-0.02008,-0.054559,...,0.08103,-0.034575,0.05074,0.011291,0.019406,0.001155,-0.024586,0.029265,-0.041097,0.017834


In [30]:
emb_cols = ['emb'+'_'+str(i+1) for i in range(512)]
emb_cols

['emb_1',
 'emb_2',
 'emb_3',
 'emb_4',
 'emb_5',
 'emb_6',
 'emb_7',
 'emb_8',
 'emb_9',
 'emb_10',
 'emb_11',
 'emb_12',
 'emb_13',
 'emb_14',
 'emb_15',
 'emb_16',
 'emb_17',
 'emb_18',
 'emb_19',
 'emb_20',
 'emb_21',
 'emb_22',
 'emb_23',
 'emb_24',
 'emb_25',
 'emb_26',
 'emb_27',
 'emb_28',
 'emb_29',
 'emb_30',
 'emb_31',
 'emb_32',
 'emb_33',
 'emb_34',
 'emb_35',
 'emb_36',
 'emb_37',
 'emb_38',
 'emb_39',
 'emb_40',
 'emb_41',
 'emb_42',
 'emb_43',
 'emb_44',
 'emb_45',
 'emb_46',
 'emb_47',
 'emb_48',
 'emb_49',
 'emb_50',
 'emb_51',
 'emb_52',
 'emb_53',
 'emb_54',
 'emb_55',
 'emb_56',
 'emb_57',
 'emb_58',
 'emb_59',
 'emb_60',
 'emb_61',
 'emb_62',
 'emb_63',
 'emb_64',
 'emb_65',
 'emb_66',
 'emb_67',
 'emb_68',
 'emb_69',
 'emb_70',
 'emb_71',
 'emb_72',
 'emb_73',
 'emb_74',
 'emb_75',
 'emb_76',
 'emb_77',
 'emb_78',
 'emb_79',
 'emb_80',
 'emb_81',
 'emb_82',
 'emb_83',
 'emb_84',
 'emb_85',
 'emb_86',
 'emb_87',
 'emb_88',
 'emb_89',
 'emb_90',
 'emb_91',
 'emb_92

In [31]:
from tensorflow.keras import callbacks
from sklearn.metrics import classification_report

earlystopping = callbacks.EarlyStopping(monitor ="val_loss", 
                                            mode ="min", patience = 20, 
                                            restore_best_weights = True)

In [32]:
mlp_small.fit(df_train[emb_cols].values, 
              df_train['label'].values,
              batch_size = 100, 
              epochs=100, 
              validation_data=(df_valid[emb_cols].values, df_valid['label'].values),
              callbacks =[earlystopping],            
              verbose=1)

Train on 7600 samples, validate on 3100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


<tensorflow.python.keras.callbacks.History at 0x7ffa60d4f358>

In [106]:
mlp_small.fit(df_train[emb_cols].values, 
              df_train['label'].values,
              batch_size = 1, 
              epochs=100, 
              validation_data=(df_valid[emb_cols].values, df_valid['label'].values),
              callbacks =[earlystopping],            
              verbose=1)

Train on 7600 samples, validate on 3100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


<tensorflow.python.keras.callbacks.History at 0x7fb5d43a2668>

## Inscope Benchmarks

In [33]:
X,y = df_test_inscope[emb_cols].values, df_test_inscope['label'].values
pred_probs = mlp_small.predict(X)
preds = [np. argmax(p) for p in pred_probs]
print(classification_report(y, preds,digits=4))

              precision    recall  f1-score   support

           0     0.8214    0.7667    0.7931        30
           1     0.8571    1.0000    0.9231        30
           2     0.8929    0.8333    0.8621        30
           3     1.0000    1.0000    1.0000        30
           4     1.0000    0.9667    0.9831        30
           5     1.0000    0.9000    0.9474        30
           6     1.0000    1.0000    1.0000        30
           7     1.0000    0.9667    0.9831        30
           8     1.0000    1.0000    1.0000        30
           9     0.8571    1.0000    0.9231        30
          10     0.9355    0.9667    0.9508        30
          11     0.8966    0.8667    0.8814        30
          12     0.9667    0.9667    0.9667        30
          13     1.0000    1.0000    1.0000        30
          14     0.8529    0.9667    0.9062        30
          15     0.8519    0.7667    0.8070        30
          16     0.9310    0.9000    0.9153        30
          17     1.0000    

  'recall', 'true', average, warn_for)


In [34]:
X,y = pd.concat([df_test_inscope,df_test_oos])[emb_cols].values, pd.concat([df_test_inscope,df_test_oos])['label'].values
pred_probs = mlp_small.predict(X)
preds = [np. argmax(p) for p in pred_probs]
print(classification_report(y, preds,digits=4))

              precision    recall  f1-score   support

           0     0.8214    0.7667    0.7931        30
           1     0.7692    1.0000    0.8696        30
           2     0.6098    0.8333    0.7042        30
           3     0.9677    1.0000    0.9836        30
           4     0.9355    0.9667    0.9508        30
           5     1.0000    0.9000    0.9474        30
           6     0.9375    1.0000    0.9677        30
           7     0.7632    0.9667    0.8529        30
           8     1.0000    1.0000    1.0000        30
           9     0.7317    1.0000    0.8451        30
          10     0.9062    0.9667    0.9355        30
          11     0.8966    0.8667    0.8814        30
          12     0.9667    0.9667    0.9667        30
          13     0.8333    1.0000    0.9091        30
          14     0.7838    0.9667    0.8657        30
          15     0.8519    0.7667    0.8070        30
          16     0.8438    0.9000    0.8710        30
          17     0.8788    

## OutofScope Benchmarks

In [114]:
X,y = df_test_oos[emb_cols].values, df_test_oos['label'].values
pred_probs = mlp_small.predict(X)
preds = [np. argmax(p) for p in pred_probs]
print(classification_report(y, preds,digits=4))

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         0
           2     0.0000    0.0000    0.0000         0
           3     0.0000    0.0000    0.0000         0
           5     0.0000    0.0000    0.0000         0
           6     0.0000    0.0000    0.0000         0
           7     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         0
          10     0.0000    0.0000    0.0000         0
          11     0.0000    0.0000    0.0000         0
          14     0.0000    0.0000    0.0000         0
          16     0.0000    0.0000    0.0000         0
          17     0.0000    0.0000    0.0000         0
          18     0.0000    0.0000    0.0000         0
          20     0.0000    0.0000    0.0000         0
          22     0.0000    0.0000    0.0000         0
          23     0.0000    0.0000    0.0000         0
          25     0.0000    0.0000    0.0000         0
          26     0.0000    

# Automate All

In [123]:
dirs = sorted(glob(DATA_FOLDER +"*/"))
dirs


['../data/clinic/data_full/',
 '../data/clinic/data_imbalanced/',
 '../data/clinic/data_oos_plus/',
 '../data/clinic/data_small/']

In [126]:
models = [mlp_full, mlp_imbalanced, mlp_oss_plus, mlp_small]

In [127]:
batch_sizes =[1, 64,16,1]

In [135]:
def print_clf_report(model, dt):
    X,y = dt[emb_cols].values, dt['label'].values
    pred_probs = model.predict(X)
    preds = [np. argmax(p) for p in pred_probs]
    print(classification_report(y, preds,digits=4))

def train_evaulate_model(directory, model, batch_size=1): 
    print(directory)
    df_train = pd.concat([ pd.read_csv(directory + 'train_with_use_emb.csv'),
                           pd.read_csv(directory + 'oos_train_with_use_emb.csv')]
                    ).reset_index()
    
    df_valid = pd.concat([ pd.read_csv(directory + 'val_with_use_emb.csv'),
                           pd.read_csv(directory + 'oos_val_with_use_emb.csv')]
                    ).reset_index()
    df_test_inscope = pd.read_csv(directory + 'test_with_use_emb.csv')
    df_test_oos = pd.read_csv(directory + 'oos_test_with_use_emb.csv')
    
    model.fit(df_train[emb_cols].values, 
              df_train['label'].values,
              batch_size = batch_size, 
              epochs=100, 
              validation_data=(df_valid[emb_cols].values, df_valid['label'].values),
              callbacks =[earlystopping],            
              verbose=0)
   
    print('Inscope')
    print_clf_report(model, df_test_inscope)
    
    print('Out of Scope')
    print_clf_report(model, df_test_oos)
   


In [136]:
start = time()
train_evaulate_model(dirs[0], models[0], batch_sizes[0])
time()- start

../data/clinic/data_full/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     1.0000    0.9000    0.9474        30
           2     1.0000    0.9000    0.9474        30
           3     0.9667    0.9667    0.9667        30
           4     1.0000    1.0000    1.0000        30
           5     0.9667    0.9667    0.9667        30
           6     1.0000    0.9333    0.9655        30
           7     1.0000    1.0000    1.0000        30
           8     1.0000    0.9000    0.9474        30
           9     0.9091    1.0000    0.9524        30
          10     0.9000    0.9000    0.9000        30
          11     0.8788    0.9667    0.9206        30
          12     1.0000    0.9667    0.9831        30
          13     1.0000    1.0000    1.0000        30
          14     0.8333    1.0000    0.9091        30
          15     1.0000    1.0000    1.0000        30
          16     0.9677    1.0000    0.9836    

  'recall', 'true', average, warn_for)


376.9124836921692

In [137]:
start = time()
train_evaulate_model(dirs[1], models[1], batch_sizes[1])
time()- start

../data/clinic/data_imbalanced/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     0.9286    0.8667    0.8966        30
           2     1.0000    0.8667    0.9286        30
           3     0.9062    0.9667    0.9355        30
           4     1.0000    1.0000    1.0000        30
           5     0.9655    0.9333    0.9492        30
           6     1.0000    0.9000    0.9474        30
           7     1.0000    1.0000    1.0000        30
           8     1.0000    1.0000    1.0000        30
           9     0.9677    1.0000    0.9836        30
          10     0.9032    0.9333    0.9180        30
          11     0.8485    0.9333    0.8889        30
          12     1.0000    0.9667    0.9831        30
          13     1.0000    1.0000    1.0000        30
          14     0.8750    0.9333    0.9032        30
          15     0.9375    1.0000    0.9677        30
          16     0.9630    0.8667    0.91

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         0
           3     0.0000    0.0000    0.0000         0
           5     0.0000    0.0000    0.0000         0
           6     0.0000    0.0000    0.0000         0
           7     0.0000    0.0000    0.0000         0
           8     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         0
          10     0.0000    0.0000    0.0000         0
          11     0.0000    0.0000    0.0000         0
          13     0.0000    0.0000    0.0000         0
          16     0.0000    0.0000    0.0000         0
          17     0.0000    0.0000    0.0000         0
          18     0.0000    0.0000    0.0000         0
          19     0.0000    0.0000    0.0000         0
          20     0.0000    0.0000    0.0000         0
          22     0.0000    0.0000    0.0000         0
          23     0.0000    0.0000    0.0000         0
          24     0.0000    

  'recall', 'true', average, warn_for)


14.265594959259033

In [138]:
start = time()
train_evaulate_model(dirs[2], models[2], batch_sizes[2])
time()- start

../data/clinic/data_oos_plus/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     1.0000    0.9333    0.9655        30
           2     0.9667    0.9667    0.9667        30
           3     0.9667    0.9667    0.9667        30
           4     1.0000    1.0000    1.0000        30
           5     0.9655    0.9333    0.9492        30
           6     1.0000    0.9333    0.9655        30
           7     1.0000    1.0000    1.0000        30
           8     1.0000    0.9667    0.9831        30
           9     0.9677    1.0000    0.9836        30
          10     0.8750    0.9333    0.9032        30
          11     0.8788    0.9667    0.9206        30
          12     1.0000    0.9667    0.9831        30
          13     1.0000    1.0000    1.0000        30
          14     0.8824    1.0000    0.9375        30
          15     0.9375    1.0000    0.9677        30
          16     0.9677    1.0000    0.9836

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         0
           2     0.0000    0.0000    0.0000         0
           3     0.0000    0.0000    0.0000         0
           5     0.0000    0.0000    0.0000         0
           6     0.0000    0.0000    0.0000         0
           7     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         0
          10     0.0000    0.0000    0.0000         0
          11     0.0000    0.0000    0.0000         0
          16     0.0000    0.0000    0.0000         0
          17     0.0000    0.0000    0.0000         0
          18     0.0000    0.0000    0.0000         0
          19     0.0000    0.0000    0.0000         0
          20     0.0000    0.0000    0.0000         0
          22     0.0000    0.0000    0.0000         0
          23     0.0000    0.0000    0.0000         0
          24     0.0000    0.0000    0.0000         0
          25     0.0000    

  'recall', 'true', average, warn_for)


40.016282081604004

In [139]:
start = time()
train_evaulate_model(dirs[3], models[3], batch_sizes[3])
time()- start

../data/clinic/data_small/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     1.0000    0.9000    0.9474        30
           2     1.0000    0.9000    0.9474        30
           3     0.9667    0.9667    0.9667        30
           4     1.0000    1.0000    1.0000        30
           5     0.9643    0.9000    0.9310        30
           6     0.9667    0.9667    0.9667        30
           7     1.0000    0.9667    0.9831        30
           8     1.0000    0.9667    0.9831        30
           9     1.0000    1.0000    1.0000        30
          10     0.8788    0.9667    0.9206        30
          11     0.9375    1.0000    0.9677        30
          12     1.0000    0.9333    0.9655        30
          13     1.0000    1.0000    1.0000        30
          14     0.8108    1.0000    0.8955        30
          15     0.9375    1.0000    0.9677        30
          16     0.9091    1.0000    0.9524   

192.43939208984375

# Summary Results