In [30]:
# https://arxiv.org/abs/1909.02027
import json
import pandas as pd
import numpy as np
import os
from glob import glob
import tensorflow_hub as hub
from tqdm import tqdm
from time import time

In [5]:
DATA_FOLDER = "../data/clinic/"

In [12]:
files = glob(DATA_FOLDER+ '*/*.csv*')
files

['../data/clinic/data_imbalanced/val.csv',
 '../data/clinic/data_imbalanced/oos_test.csv',
 '../data/clinic/data_imbalanced/test.csv',
 '../data/clinic/data_imbalanced/oos_train.csv',
 '../data/clinic/data_imbalanced/oos_val.csv',
 '../data/clinic/data_imbalanced/train.csv',
 '../data/clinic/data_small/val.csv',
 '../data/clinic/data_small/oos_test.csv',
 '../data/clinic/data_small/test.csv',
 '../data/clinic/data_small/oos_train.csv',
 '../data/clinic/data_small/oos_val.csv',
 '../data/clinic/data_small/train.csv',
 '../data/clinic/data_full/val.csv',
 '../data/clinic/data_full/oos_test.csv',
 '../data/clinic/data_full/test.csv',
 '../data/clinic/data_full/oos_train.csv',
 '../data/clinic/data_full/oos_val.csv',
 '../data/clinic/data_full/train.csv',
 '../data/clinic/data_oos_plus/val.csv',
 '../data/clinic/data_oos_plus/oos_test.csv',
 '../data/clinic/data_oos_plus/test.csv',
 '../data/clinic/data_oos_plus/oos_train.csv',
 '../data/clinic/data_oos_plus/oos_val.csv',
 '../data/clinic/

In [25]:
df_train = pd.read_csv('../data/clinic/data_full/train.csv')
df_train['intent'].unique()

array(['change_accent', 'who_do_you_work_for', 'bill_balance',
       'next_song', 'calories', 'change_user_name', 'confirm_reservation',
       'jump_start', 'card_declined', 'cook_time', 'nutrition_info',
       'greeting', 'calendar', 'schedule_maintenance', 'balance',
       'tire_pressure', 'shopping_list', 'ingredients_list',
       'whisper_mode', 'meal_suggestion', 'travel_alert', 'lost_luggage',
       'weather', 'pin_change', 'pto_request', 'change_speed', 'no',
       'user_name', 'taxes', 'book_flight', 'yes', 'timezone', 'fun_fact',
       'order', 'traffic', 'pay_bill', 'report_fraud', 'vaccines',
       'recipe', 'report_lost_card', 'transfer', 'redeem_rewards',
       'exchange_rate', 'expiration_date', 'order_status',
       'reset_settings', 'cancel_reservation', 'goodbye',
       'restaurant_reviews', 'tell_joke', 'current_location', 'pto_used',
       'international_visa', 'restaurant_suggestion', 'pto_balance',
       'payday', 'flight_status', 'distance', 'routing

In [26]:
labels = {k:i for i,k in enumerate(set(df_train['intent'].tolist()))}

In [27]:
labels

{'meaning_of_life': 0,
 'who_do_you_work_for': 1,
 'pto_request': 2,
 'restaurant_reservation': 3,
 'gas_type': 4,
 'update_playlist': 5,
 'restaurant_suggestion': 6,
 'book_flight': 7,
 'date': 8,
 'mpg': 9,
 'play_music': 10,
 'new_card': 11,
 'timezone': 12,
 'jump_start': 13,
 'schedule_meeting': 14,
 'flight_status': 15,
 'weather': 16,
 'transfer': 17,
 'recipe': 18,
 'reset_settings': 19,
 'schedule_maintenance': 20,
 'international_fees': 21,
 'income': 22,
 'cook_time': 23,
 'redeem_rewards': 24,
 'calories': 25,
 'shopping_list': 26,
 'change_language': 27,
 'alarm': 28,
 'flip_coin': 29,
 'no': 30,
 'whisper_mode': 31,
 'insurance_change': 32,
 'payday': 33,
 'what_are_your_hobbies': 34,
 'damaged_card': 35,
 'definition': 36,
 'insurance': 37,
 'share_location': 38,
 'next_song': 39,
 'report_fraud': 40,
 'vaccines': 41,
 'next_holiday': 42,
 'carry_on': 43,
 'cancel_reservation': 44,
 'what_can_i_ask_you': 45,
 'how_old_are_you': 46,
 'what_song': 47,
 'where_are_you_from'

In [28]:
labels['oos']=150

## ADD USE EMBEDDINGS

In [15]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [16]:
def get_guse_embedings_with_batch(sentences,batch_size=64):
    embeddings = []
    for i in tqdm(range(0,len(sentences),batch_size)):
      embeddings_batch = embed(sentences[i:i+batch_size])
      embeddings.extend(embeddings_batch)
    return embeddings

def add_embeddings(dt, column= 'text'):
    embeddings = get_guse_embedings_with_batch(dt[column])
    embs = np.array(embeddings).tolist()
    df = pd.DataFrame([pd.Series(x) for x in embs])
    df.columns = ['emb_{}'.format(x+1) for x in df.columns]
    dt = pd.concat([dt,df], axis=1).reindex(dt.index)
    return dt

In [29]:
dt['label'] = dt['intent'].apply(lambda x: labels.get(x,150))
dt = add_embeddings(dt)
dt.head()

100%|██████████| 71/71 [00:10<00:00,  6.71it/s]


Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,set the alarm now,alarm,28,0.004335,0.04446,-0.049194,-0.041635,-0.031951,-0.015406,0.089107,...,0.107635,0.08121,0.005851,0.016944,-0.046487,0.004882,-0.031242,-0.072224,-0.108159,-0.00989
1,please tell me what subjects you like,what_can_i_ask_you,45,0.059729,0.032617,0.046124,-0.002091,0.007193,0.068866,-0.01735,...,-0.023217,-0.049913,0.011365,0.036991,-0.01794,-0.011743,0.018322,0.031345,-0.029549,0.023903
2,is there an uber that drives to the bank on 5t...,uber,107,0.064134,-0.014766,-0.007532,0.059949,0.043445,0.056851,0.052119,...,-0.041092,0.015019,0.085444,-0.068558,0.004263,0.014379,-0.000778,-0.055524,0.028536,0.04911
3,change to something that's not whisper mode,whisper_mode,31,0.04214,0.01914,0.011681,0.033907,0.013733,-0.03596,-0.013485,...,-0.072704,0.053845,-0.028814,0.015518,-0.088805,0.007782,0.03057,0.069152,0.026542,-0.011627
4,"computer, call alexa",make_call,61,0.016042,0.037868,-0.002721,0.032199,0.036126,-0.087309,0.003662,...,-0.064405,0.019637,-0.066277,-0.019167,0.029189,-0.003916,0.017222,-0.047592,0.011032,-0.02706


In [31]:
start_time = time()
for file_name in files:
    dt = pd.read_csv(file_name)
    dt['label'] = dt['intent'].apply(lambda x: labels.get(x,150))
    add_embeddings(dt).to_csv(file_name.replace('.csv','_with_use_emb.csv'))
time()-start_time

100%|██████████| 47/47 [00:07<00:00,  6.41it/s]
100%|██████████| 16/16 [00:02<00:00,  6.06it/s]
100%|██████████| 71/71 [00:12<00:00,  5.85it/s]
100%|██████████| 2/2 [00:00<00:00,  8.81it/s]
100%|██████████| 2/2 [00:00<00:00,  8.63it/s]
100%|██████████| 165/165 [00:24<00:00,  6.79it/s]
100%|██████████| 47/47 [00:07<00:00,  6.57it/s]
100%|██████████| 16/16 [00:02<00:00,  6.07it/s]
100%|██████████| 71/71 [00:11<00:00,  6.07it/s]
100%|██████████| 2/2 [00:00<00:00,  8.89it/s]
100%|██████████| 2/2 [00:00<00:00,  7.64it/s]
100%|██████████| 118/118 [00:18<00:00,  6.54it/s]
100%|██████████| 47/47 [00:06<00:00,  7.01it/s]
100%|██████████| 16/16 [00:02<00:00,  6.69it/s]
100%|██████████| 71/71 [00:10<00:00,  6.69it/s]
100%|██████████| 2/2 [00:00<00:00,  6.17it/s]
100%|██████████| 2/2 [00:00<00:00,  8.28it/s]
100%|██████████| 235/235 [00:32<00:00,  7.24it/s]
100%|██████████| 47/47 [00:06<00:00,  7.00it/s]
100%|██████████| 16/16 [00:02<00:00,  6.86it/s]
100%|██████████| 71/71 [00:10<00:00,  6.95it/s

2703.8341739177704

In [33]:
files = glob(DATA_FOLDER+ '*/*_with_use_emb.csv')
files

['../data/clinic/data_imbalanced/oos_train_with_use_emb.csv',
 '../data/clinic/data_imbalanced/test_with_use_emb.csv',
 '../data/clinic/data_imbalanced/val_with_use_emb.csv',
 '../data/clinic/data_imbalanced/train_with_use_emb.csv',
 '../data/clinic/data_imbalanced/oos_test_with_use_emb.csv',
 '../data/clinic/data_imbalanced/oos_val_with_use_emb.csv',
 '../data/clinic/data_small/oos_train_with_use_emb.csv',
 '../data/clinic/data_small/test_with_use_emb.csv',
 '../data/clinic/data_small/val_with_use_emb.csv',
 '../data/clinic/data_small/train_with_use_emb.csv',
 '../data/clinic/data_small/oos_test_with_use_emb.csv',
 '../data/clinic/data_small/oos_val_with_use_emb.csv',
 '../data/clinic/data_full/oos_train_with_use_emb.csv',
 '../data/clinic/data_full/test_with_use_emb.csv',
 '../data/clinic/data_full/val_with_use_emb.csv',
 '../data/clinic/data_full/train_with_use_emb.csv',
 '../data/clinic/data_full/oos_test_with_use_emb.csv',
 '../data/clinic/data_full/oos_val_with_use_emb.csv',
 '..

In [39]:
import zipfile
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED


In [41]:
from zipfile import ZipFile

for file_name in files:
    with ZipFile(file_name.replace('.csv','.zip'),'w') as zip:
           zip.write(file_name, compress_type=compression)

In [None]:
from sklearn.neural_network import MLPClassifier

In [45]:
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow import keras 

In [None]:
mlp_full = MLPClassifier(hidden_layer_sizes=(400), 
                         max_iter=300,
                         activation = 'relu',
                         solver='adam',
                         random_state=1)

In [57]:
def generate_MLP_Model(num_labels=2, 
                         dense_dropout=0.5, 
                         input_size = 512, 
                         hidden_size = 512,
                        hidden_activation = 'tanh',
                         output_activation = 'softmax',
                         num_layers = 2):
    features = Input(shape=(input_size,), name="first")
    hidden = Dropout(dense_dropout)(features)

    for i in range(num_layers):
        name = 'dense{}'.format(i) if i != num_layers - 1 else 'hidden'
        if dense_dropout>0:
            hidden = Dense(units=hidden_size, activation="relu", name=name)(hidden)
            hidden = Dropout(dense_dropout)(hidden)
        else:
            hidden = Dense(units=hidden_size, activation="relu", name=name)(features)

    logits = hidden
    outputs = Dense(units=num_labels, activation=output_activation, name="output_1")(logits)
    model = keras.Model(inputs=features, outputs=outputs)
    model.compile(optimizer='adam',  
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  #optimizer='sgd',  loss='mse',
                  #loss='binary_crossentropy',
                  #metrics=['accuracy'],
                  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
                  #metrics=[keras.metrics.PrecisionAtRecall(recall=0.8)],
                  run_eagerly = False)
    
    return model


In [59]:
mlp_full = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 400, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0)
mlp_full.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
hidden (Dense)               (None, 400)               205200    
_________________________________________________________________
output_1 (Dense)             (None, 151)               60551     
Total params: 265,751
Trainable params: 265,751
Non-trainable params: 0
_________________________________________________________________


In [60]:
mlp_small = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 200, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0.1)
mlp_small.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
hidden (Dense)               (None, 200)               102600    
_________________________________________________________________
dropout_10 (Dropout)         (None, 200)               0         
_________________________________________________________________
output_1 (Dense)             (None, 151)               30351     
Total params: 132,951
Trainable params: 132,951
Non-trainable params: 0
_________________________________________________________________


In [61]:
mlp_imbalanced = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 200, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0)
mlp_imbalanced.summary()

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
hidden (Dense)               (None, 200)               102600    
_________________________________________________________________
output_1 (Dense)             (None, 151)               30351     
Total params: 132,951
Trainable params: 132,951
Non-trainable params: 0
_________________________________________________________________


In [125]:
mlp_oss_plus = generate_MLP_Model(num_labels= 151, 
                     num_layers=1, 
                     hidden_size = 200, 
                     hidden_activation = 'tanh',
                     output_activation = 'softmax',
                     dense_dropout=0.1)
mlp_oss_plus.summary()

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 512)               0         
_________________________________________________________________
hidden (Dense)               (None, 200)               102600    
_________________________________________________________________
dropout_13 (Dropout)         (None, 200)               0         
_________________________________________________________________
output_1 (Dense)             (None, 151)               30351     
Total params: 132,951
Trainable params: 132,951
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Data Small Experiment

In [121]:
dirs = glob(DATA_FOLDER +"*/")

['../data/clinic/data_imbalanced/',
 '../data/clinic/data_small/',
 '../data/clinic/data_full/',
 '../data/clinic/data_oos_plus/']

In [63]:
files = glob(DATA_FOLDER+ 'data_small/*_with_use_emb.csv')
files

['../data/clinic/data_small/oos_train_with_use_emb.csv',
 '../data/clinic/data_small/test_with_use_emb.csv',
 '../data/clinic/data_small/val_with_use_emb.csv',
 '../data/clinic/data_small/train_with_use_emb.csv',
 '../data/clinic/data_small/oos_test_with_use_emb.csv',
 '../data/clinic/data_small/oos_val_with_use_emb.csv']

In [102]:
df_train = pd.concat([ pd.read_csv('../data/clinic/data_small/train_with_use_emb.csv'),
                        pd.read_csv('../data/clinic/data_small/oos_train_with_use_emb.csv')]
                    ).reset_index()
df_train.head()

Unnamed: 0.1,index,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,0,tell me the expiration date for my current cre...,expiration_date,66,0.014531,-0.013376,-0.020924,-0.054085,0.047414,...,0.088208,0.012695,0.015281,-0.053102,-0.032048,-0.054841,-0.077261,0.060321,0.012595,0.010952
1,1,1,would you disconnect from my phone,sync_device,64,0.023302,0.037649,0.044546,0.014734,-0.02434,...,0.000245,0.052699,-0.001177,-0.057279,0.054607,0.014677,0.042166,0.024942,-0.028955,0.014179
2,2,2,could you please track my package,order_status,55,0.015873,0.050021,0.086985,0.059008,-0.074861,...,0.044992,-0.026583,0.050293,-0.055511,-0.076384,-0.034246,-0.016082,0.014663,0.010821,0.041905
3,3,3,any travel alerts for canada,travel_alert,128,0.011174,0.054119,-0.004863,0.06358,0.014945,...,0.001516,0.044681,0.155747,0.006182,-0.051993,-0.018311,0.030164,-0.06723,0.000345,-0.005654
4,4,4,i want to report fraudulent activity on my ame...,report_fraud,40,0.020169,0.024297,0.061849,0.087114,0.062906,...,-0.015909,0.014219,0.049547,-0.001585,-0.035708,0.011871,-0.008624,0.025522,-0.034451,-0.011826


In [101]:
len(df_train)

7600

In [103]:
df_valid = pd.concat([ pd.read_csv('../data/clinic/data_small/val_with_use_emb.csv'),
                        pd.read_csv('../data/clinic/data_small/oos_val_with_use_emb.csv')]
                    ).reset_index()
df_valid.head()

Unnamed: 0.1,index,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,0,hey,greeting,57,-0.011563,0.063238,-0.068319,-0.025212,-0.023325,...,-0.046287,-0.008467,-0.021427,0.024754,0.003913,-0.018389,-8.7e-05,-0.032133,-0.013732,-0.048603
1,1,1,put laundry on my chore list,todo_list_update,97,0.015696,-0.021964,0.04478,0.029989,0.043906,...,0.03682,0.043889,-0.055528,-0.007398,-0.068883,-0.04778,-0.01956,-0.013006,0.025329,-0.060718
2,2,2,go into whisper mode,whisper_mode,31,0.014585,0.000869,0.004636,-0.05459,0.010793,...,-0.023386,0.031051,-0.037808,-0.006632,-0.066107,-0.000482,0.026769,0.06402,0.030081,0.008039
3,3,3,when do i need to change my motor oil again,oil_change_when,101,-0.011529,-0.122637,-0.010257,0.088041,0.041229,...,0.049257,0.041907,-0.002899,0.014342,0.029947,0.063522,-0.080513,0.028871,-0.038446,0.005971
4,4,4,what is the insurance plan i am enrolled in,insurance,37,-0.020582,0.007809,0.067927,-0.014423,0.001521,...,-0.064865,-0.003429,0.118396,0.055625,-0.062134,0.03267,-0.065479,0.033913,-0.021524,0.056304


In [104]:
df_test_inscope = pd.read_csv('../data/clinic/data_small/test_with_use_emb.csv')
df_test_inscope.head()

Unnamed: 0.1,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,thanks so much ai,thank_you,102,0.025073,0.015043,0.026919,0.059464,-0.009611,-0.017123,...,0.010038,0.019088,-0.049697,-0.065069,0.010041,-0.050302,-0.04427,-0.045624,-0.065646,0.004654
1,1,i will be traveling to lima alert my bank,travel_notification,108,0.054223,0.063313,0.037354,0.041752,0.04843,-0.083786,...,-0.023788,0.046185,0.053736,-0.017803,-0.025193,-0.027257,-0.018353,-0.00503,-0.039395,-0.03453
2,2,say again please,repeat,80,0.005553,-0.055113,-0.010114,0.025761,-0.040321,-0.043057,...,0.014537,0.025975,0.006717,0.031482,-0.066517,-0.01197,-0.054692,0.013403,-0.029508,0.026939
3,3,what is needed to cook lasagna,ingredients_list,149,0.054684,-0.031352,0.06074,-0.075713,0.053304,0.027382,...,0.018304,0.007477,-0.045377,0.012741,-0.064737,-0.014584,-0.036439,-0.02636,-0.037485,0.022826
4,4,give me the pressure for the tires on my car,tire_pressure,134,-0.035105,-0.106616,0.012198,0.02296,0.012429,-0.003959,...,-0.039068,0.04091,-0.019446,-0.061237,-0.055356,-0.017614,-0.010838,0.033609,-0.015182,-0.06218


In [105]:
df_test_oos = pd.read_csv('../data/clinic/data_small/oos_test_with_use_emb.csv')
df_test_oos.head()

Unnamed: 0.1,Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,0,can i get a sear's appliance repairman,oos,150,0.063763,-0.083389,0.041397,0.029875,0.062481,-0.04817,...,-0.055654,0.009555,0.041753,-0.066075,-0.01099,0.005024,-0.051586,0.021008,0.014396,0.067789
1,1,what do you do if you can't stop vomiting,oos,150,0.06739,-0.049942,0.06052,0.041764,-0.03241,-0.000165,...,0.061177,0.06716,0.002748,0.114446,-0.041907,0.00675,-0.017072,0.002755,0.008661,0.005566
2,2,how many ppm of particulate is in my local water,oos,150,-0.011204,-0.075099,0.056254,0.019079,0.003711,0.032722,...,-0.068538,0.020465,0.032535,-0.04191,-0.042781,0.054235,-0.009878,-0.019642,0.000677,0.033913
3,3,get me a list of divorce attorneys in the new ...,oos,150,-0.067331,0.003313,0.054819,0.050415,0.087959,0.036031,...,-0.051748,0.040872,0.105509,-0.004875,-0.009358,-0.032585,-0.025125,0.038075,-0.066352,-0.024866
4,4,clear my search history,oos,150,-0.040048,-0.057774,0.023027,-0.02853,-0.034821,-0.057833,...,-0.025218,0.004158,-0.024022,-0.011786,0.040685,-0.059307,0.007647,0.015519,-0.001198,0.03813


In [67]:
emb_cols = ['emb'+'_'+str(i+1) for i in range(512)]
emb_cols

['emb_1',
 'emb_2',
 'emb_3',
 'emb_4',
 'emb_5',
 'emb_6',
 'emb_7',
 'emb_8',
 'emb_9',
 'emb_10',
 'emb_11',
 'emb_12',
 'emb_13',
 'emb_14',
 'emb_15',
 'emb_16',
 'emb_17',
 'emb_18',
 'emb_19',
 'emb_20',
 'emb_21',
 'emb_22',
 'emb_23',
 'emb_24',
 'emb_25',
 'emb_26',
 'emb_27',
 'emb_28',
 'emb_29',
 'emb_30',
 'emb_31',
 'emb_32',
 'emb_33',
 'emb_34',
 'emb_35',
 'emb_36',
 'emb_37',
 'emb_38',
 'emb_39',
 'emb_40',
 'emb_41',
 'emb_42',
 'emb_43',
 'emb_44',
 'emb_45',
 'emb_46',
 'emb_47',
 'emb_48',
 'emb_49',
 'emb_50',
 'emb_51',
 'emb_52',
 'emb_53',
 'emb_54',
 'emb_55',
 'emb_56',
 'emb_57',
 'emb_58',
 'emb_59',
 'emb_60',
 'emb_61',
 'emb_62',
 'emb_63',
 'emb_64',
 'emb_65',
 'emb_66',
 'emb_67',
 'emb_68',
 'emb_69',
 'emb_70',
 'emb_71',
 'emb_72',
 'emb_73',
 'emb_74',
 'emb_75',
 'emb_76',
 'emb_77',
 'emb_78',
 'emb_79',
 'emb_80',
 'emb_81',
 'emb_82',
 'emb_83',
 'emb_84',
 'emb_85',
 'emb_86',
 'emb_87',
 'emb_88',
 'emb_89',
 'emb_90',
 'emb_91',
 'emb_92

In [111]:
from tensorflow.keras import callbacks
from sklearn.metrics import classification_report

earlystopping = callbacks.EarlyStopping(monitor ="val_loss", 
                                            mode ="min", patience = 20, 
                                            restore_best_weights = True)

In [112]:
mlp_small.fit(df_train[emb_cols].values, 
              df_train['label'].values,
              batch_size = 100, 
              epochs=100, 
              validation_data=(df_valid[emb_cols].values, df_valid['label'].values),
              callbacks =[earlystopping],            
              verbose=1)

Train on 7600 samples, validate on 3100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


<tensorflow.python.keras.callbacks.History at 0x7fb5d5b85320>

In [106]:
mlp_small.fit(df_train[emb_cols].values, 
              df_train['label'].values,
              batch_size = 1, 
              epochs=100, 
              validation_data=(df_valid[emb_cols].values, df_valid['label'].values),
              callbacks =[earlystopping],            
              verbose=1)

Train on 7600 samples, validate on 3100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


<tensorflow.python.keras.callbacks.History at 0x7fb5d43a2668>

## Inscope Benchmarks

In [113]:
X,y = df_test_inscope[emb_cols].values, df_test_inscope['label'].values
pred_probs = mlp_small.predict(X)
preds = [np. argmax(p) for p in pred_probs]
print(classification_report(y, preds,digits=4))

              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     1.0000    0.8667    0.9286        30
           2     1.0000    0.9000    0.9474        30
           3     0.9355    0.9667    0.9508        30
           4     1.0000    1.0000    1.0000        30
           5     0.9643    0.9000    0.9310        30
           6     0.9667    0.9667    0.9667        30
           7     1.0000    0.9667    0.9831        30
           8     1.0000    0.9667    0.9831        30
           9     1.0000    1.0000    1.0000        30
          10     0.9032    0.9333    0.9180        30
          11     0.9667    0.9667    0.9667        30
          12     1.0000    0.9333    0.9655        30
          13     1.0000    1.0000    1.0000        30
          14     0.8824    1.0000    0.9375        30
          15     1.0000    1.0000    1.0000        30
          16     0.9667    0.9667    0.9667        30
          17     1.0000    

## OutofScope Benchmarks

In [114]:
X,y = df_test_oos[emb_cols].values, df_test_oos['label'].values
pred_probs = mlp_small.predict(X)
preds = [np. argmax(p) for p in pred_probs]
print(classification_report(y, preds,digits=4))

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         0
           2     0.0000    0.0000    0.0000         0
           3     0.0000    0.0000    0.0000         0
           5     0.0000    0.0000    0.0000         0
           6     0.0000    0.0000    0.0000         0
           7     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         0
          10     0.0000    0.0000    0.0000         0
          11     0.0000    0.0000    0.0000         0
          14     0.0000    0.0000    0.0000         0
          16     0.0000    0.0000    0.0000         0
          17     0.0000    0.0000    0.0000         0
          18     0.0000    0.0000    0.0000         0
          20     0.0000    0.0000    0.0000         0
          22     0.0000    0.0000    0.0000         0
          23     0.0000    0.0000    0.0000         0
          25     0.0000    0.0000    0.0000         0
          26     0.0000    

# Automate All

In [123]:
dirs = sorted(glob(DATA_FOLDER +"*/"))
dirs


['../data/clinic/data_full/',
 '../data/clinic/data_imbalanced/',
 '../data/clinic/data_oos_plus/',
 '../data/clinic/data_small/']

In [126]:
models = [mlp_full, mlp_imbalanced, mlp_oss_plus, mlp_small]

In [127]:
batch_sizes =[1, 64,16,1]

In [135]:
def print_clf_report(model, dt):
    X,y = dt[emb_cols].values, dt['label'].values
    pred_probs = model.predict(X)
    preds = [np. argmax(p) for p in pred_probs]
    print(classification_report(y, preds,digits=4))

def train_evaulate_model(directory, model, batch_size=1): 
    print(directory)
    df_train = pd.concat([ pd.read_csv(directory + 'train_with_use_emb.csv'),
                           pd.read_csv(directory + 'oos_train_with_use_emb.csv')]
                    ).reset_index()
    
    df_valid = pd.concat([ pd.read_csv(directory + 'val_with_use_emb.csv'),
                           pd.read_csv(directory + 'oos_val_with_use_emb.csv')]
                    ).reset_index()
    df_test_inscope = pd.read_csv(directory + 'test_with_use_emb.csv')
    df_test_oos = pd.read_csv(directory + 'oos_test_with_use_emb.csv')
    
    model.fit(df_train[emb_cols].values, 
              df_train['label'].values,
              batch_size = batch_size, 
              epochs=100, 
              validation_data=(df_valid[emb_cols].values, df_valid['label'].values),
              callbacks =[earlystopping],            
              verbose=0)
   
    print('Inscope')
    print_clf_report(model, df_test_inscope)
    
    print('Out of Scope')
    print_clf_report(model, df_test_oos)
   


In [136]:
start = time()
train_evaulate_model(dirs[0], models[0], batch_sizes[0])
time()- start

../data/clinic/data_full/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     1.0000    0.9000    0.9474        30
           2     1.0000    0.9000    0.9474        30
           3     0.9667    0.9667    0.9667        30
           4     1.0000    1.0000    1.0000        30
           5     0.9667    0.9667    0.9667        30
           6     1.0000    0.9333    0.9655        30
           7     1.0000    1.0000    1.0000        30
           8     1.0000    0.9000    0.9474        30
           9     0.9091    1.0000    0.9524        30
          10     0.9000    0.9000    0.9000        30
          11     0.8788    0.9667    0.9206        30
          12     1.0000    0.9667    0.9831        30
          13     1.0000    1.0000    1.0000        30
          14     0.8333    1.0000    0.9091        30
          15     1.0000    1.0000    1.0000        30
          16     0.9677    1.0000    0.9836    

  'recall', 'true', average, warn_for)


376.9124836921692

In [137]:
start = time()
train_evaulate_model(dirs[1], models[1], batch_sizes[1])
time()- start

../data/clinic/data_imbalanced/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     0.9286    0.8667    0.8966        30
           2     1.0000    0.8667    0.9286        30
           3     0.9062    0.9667    0.9355        30
           4     1.0000    1.0000    1.0000        30
           5     0.9655    0.9333    0.9492        30
           6     1.0000    0.9000    0.9474        30
           7     1.0000    1.0000    1.0000        30
           8     1.0000    1.0000    1.0000        30
           9     0.9677    1.0000    0.9836        30
          10     0.9032    0.9333    0.9180        30
          11     0.8485    0.9333    0.8889        30
          12     1.0000    0.9667    0.9831        30
          13     1.0000    1.0000    1.0000        30
          14     0.8750    0.9333    0.9032        30
          15     0.9375    1.0000    0.9677        30
          16     0.9630    0.8667    0.91

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         0
           3     0.0000    0.0000    0.0000         0
           5     0.0000    0.0000    0.0000         0
           6     0.0000    0.0000    0.0000         0
           7     0.0000    0.0000    0.0000         0
           8     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         0
          10     0.0000    0.0000    0.0000         0
          11     0.0000    0.0000    0.0000         0
          13     0.0000    0.0000    0.0000         0
          16     0.0000    0.0000    0.0000         0
          17     0.0000    0.0000    0.0000         0
          18     0.0000    0.0000    0.0000         0
          19     0.0000    0.0000    0.0000         0
          20     0.0000    0.0000    0.0000         0
          22     0.0000    0.0000    0.0000         0
          23     0.0000    0.0000    0.0000         0
          24     0.0000    

  'recall', 'true', average, warn_for)


14.265594959259033

In [138]:
start = time()
train_evaulate_model(dirs[2], models[2], batch_sizes[2])
time()- start

../data/clinic/data_oos_plus/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     1.0000    0.9333    0.9655        30
           2     0.9667    0.9667    0.9667        30
           3     0.9667    0.9667    0.9667        30
           4     1.0000    1.0000    1.0000        30
           5     0.9655    0.9333    0.9492        30
           6     1.0000    0.9333    0.9655        30
           7     1.0000    1.0000    1.0000        30
           8     1.0000    0.9667    0.9831        30
           9     0.9677    1.0000    0.9836        30
          10     0.8750    0.9333    0.9032        30
          11     0.8788    0.9667    0.9206        30
          12     1.0000    0.9667    0.9831        30
          13     1.0000    1.0000    1.0000        30
          14     0.8824    1.0000    0.9375        30
          15     0.9375    1.0000    0.9677        30
          16     0.9677    1.0000    0.9836

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         0
           2     0.0000    0.0000    0.0000         0
           3     0.0000    0.0000    0.0000         0
           5     0.0000    0.0000    0.0000         0
           6     0.0000    0.0000    0.0000         0
           7     0.0000    0.0000    0.0000         0
           9     0.0000    0.0000    0.0000         0
          10     0.0000    0.0000    0.0000         0
          11     0.0000    0.0000    0.0000         0
          16     0.0000    0.0000    0.0000         0
          17     0.0000    0.0000    0.0000         0
          18     0.0000    0.0000    0.0000         0
          19     0.0000    0.0000    0.0000         0
          20     0.0000    0.0000    0.0000         0
          22     0.0000    0.0000    0.0000         0
          23     0.0000    0.0000    0.0000         0
          24     0.0000    0.0000    0.0000         0
          25     0.0000    

  'recall', 'true', average, warn_for)


40.016282081604004

In [139]:
start = time()
train_evaulate_model(dirs[3], models[3], batch_sizes[3])
time()- start

../data/clinic/data_small/
Inscope
              precision    recall  f1-score   support

           0     1.0000    0.9667    0.9831        30
           1     1.0000    0.9000    0.9474        30
           2     1.0000    0.9000    0.9474        30
           3     0.9667    0.9667    0.9667        30
           4     1.0000    1.0000    1.0000        30
           5     0.9643    0.9000    0.9310        30
           6     0.9667    0.9667    0.9667        30
           7     1.0000    0.9667    0.9831        30
           8     1.0000    0.9667    0.9831        30
           9     1.0000    1.0000    1.0000        30
          10     0.8788    0.9667    0.9206        30
          11     0.9375    1.0000    0.9677        30
          12     1.0000    0.9333    0.9655        30
          13     1.0000    1.0000    1.0000        30
          14     0.8108    1.0000    0.8955        30
          15     0.9375    1.0000    0.9677        30
          16     0.9091    1.0000    0.9524   

192.43939208984375

# Summary of Results