In [30]:
# https://arxiv.org/abs/1909.02027
import json
import pandas as pd
import numpy as np
import os
from glob import glob
import tensorflow_hub as hub
from tqdm import tqdm
from time import time

In [5]:
DATA_FOLDER = "../data/clinic/"

In [12]:
files = glob(DATA_FOLDER+ '*/*.csv*')
files

['../data/clinic/data_imbalanced/val.csv',
 '../data/clinic/data_imbalanced/oos_test.csv',
 '../data/clinic/data_imbalanced/test.csv',
 '../data/clinic/data_imbalanced/oos_train.csv',
 '../data/clinic/data_imbalanced/oos_val.csv',
 '../data/clinic/data_imbalanced/train.csv',
 '../data/clinic/data_small/val.csv',
 '../data/clinic/data_small/oos_test.csv',
 '../data/clinic/data_small/test.csv',
 '../data/clinic/data_small/oos_train.csv',
 '../data/clinic/data_small/oos_val.csv',
 '../data/clinic/data_small/train.csv',
 '../data/clinic/data_full/val.csv',
 '../data/clinic/data_full/oos_test.csv',
 '../data/clinic/data_full/test.csv',
 '../data/clinic/data_full/oos_train.csv',
 '../data/clinic/data_full/oos_val.csv',
 '../data/clinic/data_full/train.csv',
 '../data/clinic/data_oos_plus/val.csv',
 '../data/clinic/data_oos_plus/oos_test.csv',
 '../data/clinic/data_oos_plus/test.csv',
 '../data/clinic/data_oos_plus/oos_train.csv',
 '../data/clinic/data_oos_plus/oos_val.csv',
 '../data/clinic/

In [25]:
df_train = pd.read_csv('../data/clinic/data_full/train.csv')
df_train['intent'].unique()

array(['change_accent', 'who_do_you_work_for', 'bill_balance',
       'next_song', 'calories', 'change_user_name', 'confirm_reservation',
       'jump_start', 'card_declined', 'cook_time', 'nutrition_info',
       'greeting', 'calendar', 'schedule_maintenance', 'balance',
       'tire_pressure', 'shopping_list', 'ingredients_list',
       'whisper_mode', 'meal_suggestion', 'travel_alert', 'lost_luggage',
       'weather', 'pin_change', 'pto_request', 'change_speed', 'no',
       'user_name', 'taxes', 'book_flight', 'yes', 'timezone', 'fun_fact',
       'order', 'traffic', 'pay_bill', 'report_fraud', 'vaccines',
       'recipe', 'report_lost_card', 'transfer', 'redeem_rewards',
       'exchange_rate', 'expiration_date', 'order_status',
       'reset_settings', 'cancel_reservation', 'goodbye',
       'restaurant_reviews', 'tell_joke', 'current_location', 'pto_used',
       'international_visa', 'restaurant_suggestion', 'pto_balance',
       'payday', 'flight_status', 'distance', 'routing

In [26]:
labels = {k:i for i,k in enumerate(set(df_train['intent'].tolist()))}

In [27]:
labels

{'meaning_of_life': 0,
 'who_do_you_work_for': 1,
 'pto_request': 2,
 'restaurant_reservation': 3,
 'gas_type': 4,
 'update_playlist': 5,
 'restaurant_suggestion': 6,
 'book_flight': 7,
 'date': 8,
 'mpg': 9,
 'play_music': 10,
 'new_card': 11,
 'timezone': 12,
 'jump_start': 13,
 'schedule_meeting': 14,
 'flight_status': 15,
 'weather': 16,
 'transfer': 17,
 'recipe': 18,
 'reset_settings': 19,
 'schedule_maintenance': 20,
 'international_fees': 21,
 'income': 22,
 'cook_time': 23,
 'redeem_rewards': 24,
 'calories': 25,
 'shopping_list': 26,
 'change_language': 27,
 'alarm': 28,
 'flip_coin': 29,
 'no': 30,
 'whisper_mode': 31,
 'insurance_change': 32,
 'payday': 33,
 'what_are_your_hobbies': 34,
 'damaged_card': 35,
 'definition': 36,
 'insurance': 37,
 'share_location': 38,
 'next_song': 39,
 'report_fraud': 40,
 'vaccines': 41,
 'next_holiday': 42,
 'carry_on': 43,
 'cancel_reservation': 44,
 'what_can_i_ask_you': 45,
 'how_old_are_you': 46,
 'what_song': 47,
 'where_are_you_from'

In [28]:
labels['oos']=150

## ADD USE EMBEDDINGS

In [15]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [16]:
def get_guse_embedings_with_batch(sentences,batch_size=64):
    embeddings = []
    for i in tqdm(range(0,len(sentences),batch_size)):
      embeddings_batch = embed(sentences[i:i+batch_size])
      embeddings.extend(embeddings_batch)
    return embeddings

def add_embeddings(dt, column= 'text'):
    embeddings = get_guse_embedings_with_batch(dt[column])
    embs = np.array(embeddings).tolist()
    df = pd.DataFrame([pd.Series(x) for x in embs])
    df.columns = ['emb_{}'.format(x+1) for x in df.columns]
    dt = pd.concat([dt,df], axis=1).reindex(dt.index)
    return dt

In [29]:
dt['label'] = dt['intent'].apply(lambda x: labels.get(x,150))
dt = add_embeddings(dt)
dt.head()

100%|██████████| 71/71 [00:10<00:00,  6.71it/s]


Unnamed: 0,text,intent,label,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,emb_512
0,set the alarm now,alarm,28,0.004335,0.04446,-0.049194,-0.041635,-0.031951,-0.015406,0.089107,...,0.107635,0.08121,0.005851,0.016944,-0.046487,0.004882,-0.031242,-0.072224,-0.108159,-0.00989
1,please tell me what subjects you like,what_can_i_ask_you,45,0.059729,0.032617,0.046124,-0.002091,0.007193,0.068866,-0.01735,...,-0.023217,-0.049913,0.011365,0.036991,-0.01794,-0.011743,0.018322,0.031345,-0.029549,0.023903
2,is there an uber that drives to the bank on 5t...,uber,107,0.064134,-0.014766,-0.007532,0.059949,0.043445,0.056851,0.052119,...,-0.041092,0.015019,0.085444,-0.068558,0.004263,0.014379,-0.000778,-0.055524,0.028536,0.04911
3,change to something that's not whisper mode,whisper_mode,31,0.04214,0.01914,0.011681,0.033907,0.013733,-0.03596,-0.013485,...,-0.072704,0.053845,-0.028814,0.015518,-0.088805,0.007782,0.03057,0.069152,0.026542,-0.011627
4,"computer, call alexa",make_call,61,0.016042,0.037868,-0.002721,0.032199,0.036126,-0.087309,0.003662,...,-0.064405,0.019637,-0.066277,-0.019167,0.029189,-0.003916,0.017222,-0.047592,0.011032,-0.02706


In [31]:
start_time = time()
for file_name in files:
    dt = pd.read_csv(file_name)
    dt['label'] = dt['intent'].apply(lambda x: labels.get(x,150))
    add_embeddings(dt).to_csv(file_name.replace('.csv','_with_use_emb.csv'))
time()-start_time

100%|██████████| 47/47 [00:07<00:00,  6.41it/s]
100%|██████████| 16/16 [00:02<00:00,  6.06it/s]
100%|██████████| 71/71 [00:12<00:00,  5.85it/s]
100%|██████████| 2/2 [00:00<00:00,  8.81it/s]
100%|██████████| 2/2 [00:00<00:00,  8.63it/s]
100%|██████████| 165/165 [00:24<00:00,  6.79it/s]
100%|██████████| 47/47 [00:07<00:00,  6.57it/s]
100%|██████████| 16/16 [00:02<00:00,  6.07it/s]
100%|██████████| 71/71 [00:11<00:00,  6.07it/s]
100%|██████████| 2/2 [00:00<00:00,  8.89it/s]
100%|██████████| 2/2 [00:00<00:00,  7.64it/s]
100%|██████████| 118/118 [00:18<00:00,  6.54it/s]
100%|██████████| 47/47 [00:06<00:00,  7.01it/s]
100%|██████████| 16/16 [00:02<00:00,  6.69it/s]
100%|██████████| 71/71 [00:10<00:00,  6.69it/s]
100%|██████████| 2/2 [00:00<00:00,  6.17it/s]
100%|██████████| 2/2 [00:00<00:00,  8.28it/s]
100%|██████████| 235/235 [00:32<00:00,  7.24it/s]
100%|██████████| 47/47 [00:06<00:00,  7.00it/s]
100%|██████████| 16/16 [00:02<00:00,  6.86it/s]
100%|██████████| 71/71 [00:10<00:00,  6.95it/s

2703.8341739177704

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp_full = MLPClassifier(hidden_layer_sizes=(400), 
                         max_iter=300,
                         activation = 'relu',
                         solver='adam',
                         random_state=1)

In [None]:
def generate_MLPClassifier(hidden_dim_size, drop_out):
    keras_model = keras.models.Sequential([
        keras.layers.InputLayer(input_shape=(512,), name ='input'),
        keras.layers.Dense(512, activation='relu', name ='hidden1'),
        keras.layers.Dense(2,  name ='output')
    ])
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    keras_model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
    return keras_model

keras_model = generate_keras_model_without_drop_out()
keras_model.summary()

In [4]:
model_name = "bert-base-uncased"
max_length = 512

In [7]:
#! pip install ipywidgets

In [6]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [20]:
from transformers import BertTokenizerFast, BertForSequenceClassification

In [29]:
train_embeds = tokenizer(train.text.tolist(), truncation=True, padding=True, max_length=max_length)


In [30]:
valid_embeds = tokenizer(valid.text.tolist(), truncation=True, padding=True, max_length=max_length)


In [31]:
test_embeds = tokenizer(test.text.tolist(), truncation=True, padding=True, max_length=max_length)


In [35]:
model=BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(train.intent.tolist())))

from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    num_train_epochs=10,
    per_device_train_batch_size=16,  # batch size per device during training
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,
    logging_steps=200,
    evaluation_strategy="steps",
    output_dir = 'output_file'
)
training_args = TrainingArguments("test-trainer")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [44]:
#!pip install -U sentence-transformers
!conda install -c conda-forge sentence-transformers


Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/noarch::ptyprocess==0.7.0=pyhd3eb1b0_2
  - defaults/osx-64::entrypoints==0.3=py36_0
  - defaults/noarch::jupyter_client==7.1.0=pyhd3eb1b0_0
  - defaults/osx-64::webencodings==0.5.1=py36_1
  - defaults/osx-64::jsonschema==3.0.2=py36_0
  - defaults/osx-64::pandocfilters==1.4.3=py36hecd8cb5_1
  - defaults/osx-64::python==3.6.13=h88f2d9e_0
  - defaults/osx-64::pyzmq==22.2.1=py36h23ab428_1
  - defaults/osx-64::numpy==1.17.0=py36h926163e_0
  - defaults/noarch::defusedxml==0.7.1=pyhd3eb1b0_0
  - defaults/noarch::decorator==5.1.0=pyhd3e

In [45]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

ImportError: cannot import name 'HF_MODULES_CACHE'

In [37]:
from transformers import AutoTokenizer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
trainer = Trainer(model=model,
                  args=training_args, 
                  train_dataset=train_embeds,
                  eval_dataset=valid_embeds,
                   data_collator=data_collator,
                tokenizer=tokenizer,
                )
trainer.train()

AttributeError: 'list' object has no attribute 'keys'

In [11]:
for k in data.keys():
    print(k)

oos_val
val
train
oos_test
test
oos_train


In [7]:
dt = pd.DataFrame(data['oos_train'], columns = ['text', 'intent'])
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    100 non-null    object
 1   intent  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [8]:
dt.head()

Unnamed: 0,text,intent
0,how much is an overdraft fee for bank,oos
1,why are exponents preformed before multiplicat...,oos
2,what size wipers does this car take,oos
3,where is the dipstick,oos
4,how much is 1 share of aapl,oos


In [9]:
for file_name in files:
    data = json.load(open(DATA_FOLDER + file_name))
    new_path = DATA_FOLDER + file_name.replace('.json','')
    os.makedirs(new_path, exist_ok=True)
    for key in data.keys():
        dt = pd.DataFrame(data[key], columns = ['text', 'intent'])
        print(file_name,key, len(dt))
        dt.sample(frac=1).to_csv(os.path.join(new_path, key +".csv"), index=False)

data_full.json oos_val 100
data_full.json val 3000
data_full.json train 15000
data_full.json oos_test 1000
data_full.json test 4500
data_full.json oos_train 100
data_imbalanced.json oos_val 100
data_imbalanced.json val 3000
data_imbalanced.json train 10525
data_imbalanced.json oos_test 1000
data_imbalanced.json test 4500
data_imbalanced.json oos_train 100
data_oos_plus.json oos_val 100
data_oos_plus.json val 3000
data_oos_plus.json train 15000
data_oos_plus.json oos_test 1000
data_oos_plus.json test 4500
data_oos_plus.json oos_train 250
data_small.json oos_val 100
data_small.json val 3000
data_small.json train 7500
data_small.json oos_test 1000
data_small.json test 4500
data_small.json oos_train 100
