In [None]:
!pip install tqdm tensorflow-hub tensorflow-text

In [None]:
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import defaultdict

In [None]:
def load_jsonl(filepath):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        filepath: Path to a UTF-8 encoded file holding one JSON document per
            line.

    Returns:
        list: The decoded JSON values, in file order. Blank or
        whitespace-only lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []

    with open(filepath, 'r', encoding='utf-8') as json_file:
        # Iterate the file directly so the raw text is never held in memory
        # alongside the parsed objects.
        for line in json_file:
            # A blank line (typically a trailing newline at end of file)
            # carries no record, and json.loads would raise on it.
            if line.strip():
                records.append(json.loads(line))

    return records

In [None]:
# Root directory of the dataset; later cells list the `train_data` and
# `test_data` subdirectories beneath it.
root_dir = "./MASSIVE/"

In [None]:
def _list_data_files(directory):
    """Return the names of the regular, non-hidden files in `directory`, sorted.

    os.listdir returns entries in arbitrary order, and the label indices built
    later in this notebook depend on the order in which examples are first
    seen, so the listing is sorted to make runs reproducible. Hidden entries
    and sub-directories (e.g. `.ipynb_checkpoints`) are dropped because they
    cannot be read as data files.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(directory, name))
    )


train_filelist = _list_data_files(os.path.join(root_dir, "train_data"))
test_filelist = _list_data_files(os.path.join(root_dir, "test_data"))
len(train_filelist), len(test_filelist)

In [None]:
# Gather every training example into three parallel lists: the utterance,
# scenario and intent of one example share the same position.
train_texts, train_scenario, train_labels = [], [], []

# (key read from each record, list that collects it), in append order.
field_targets = (
    ("utt", train_texts),
    ("scenario", train_scenario),
    ("intent", train_labels),
)

for item in train_filelist:
    data = load_jsonl(f"{root_dir}/train_data/{item}")

    # One progress bar per file, labelled with the file name.
    for example in tqdm(data, desc=f"loading {item}"):
        for key, bucket in field_targets:
            bucket.append(example[key])

In [None]:
from tensorflow.keras.utils import to_categorical
from pandas import Series

In [None]:
# One-hot encode the intent strings.
# `unq` holds the distinct intents in order of first appearance; `mapping`
# sends each intent to its position in `unq`, which becomes its class index.
# NOTE: this cell rebinds `train_labels` from a list of intents to a 2-D array,
# so it is meant to run once per freshly loaded `train_labels`.
unq = Series(train_labels).unique()
idx = Series(unq).index  # reuse `unq` instead of recomputing unique()
mapping = Series(idx, index=unq)
# `mapping` is rebound for scenarios in a later cell, so keep the intent lookup
# under its own name; it is what turns a predicted class index back into an
# intent.
intent_mapping = mapping
# Series.map performs the lookup in one vectorised pass rather than one Series
# __getitem__ call per example.
train_labels = to_categorical(Series(train_labels).map(mapping).to_numpy())

In [None]:
# Encode the scenario strings as integer ids.
# `unq` holds the distinct scenarios in order of first appearance; `mapping`
# sends each scenario to its position in `unq`.
unq = Series(train_scenario).unique()
idx = Series(unq).index  # reuse `unq` instead of recomputing unique()
mapping = Series(idx, index=unq)
# Dedicated name for the scenario lookup, so it survives if `mapping` is
# rebound again.
scenario_mapping = mapping
# Vectorised lookup; list(ndarray) keeps `scenarios` a list of NumPy integers,
# the same element type the per-item lookup produced.
scenarios = list(Series(train_scenario).map(mapping).to_numpy())

In [None]:
# Shape of the one-hot intent matrix returned by to_categorical:
# one row per training example, one column per class index.
train_labels.shape

In [None]:
# Raise TensorFlow's native log threshold to quiet its console output
# (presumably '3' silences everything below fatal — TODO confirm against TF docs).
# NOTE(review): an earlier cell already runs
# `from tensorflow.keras.utils import to_categorical`, so TensorFlow is loaded
# before this assignment on a fresh Restart & Run All; the setting may come too
# late to take effect. Consider moving it above the first TensorFlow import.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  # Needed for loading universal-sentence-encoder-cmlm/multilingual-preprocess
import numpy as np

# TF-Hub handles for the multilingual universal-sentence-encoder-cmlm pair:
# the text preprocessor and the encoder it feeds.
PREPROCESSOR_HANDLE = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
ENCODER_HANDLE = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base-br/1"

preprocessor = hub.KerasLayer(PREPROCESSOR_HANDLE)
# trainable=False: the encoder is loaded as a non-trainable layer.
encoder = hub.KerasLayer(ENCODER_HANDLE, trainable=False)

# Usage sketch, for a batch of strings `sentences`:
#   embeds = encoder(preprocessor(sentences))["default"]

In [None]:
def dataset_map(texts, scenario, labels):
    """Repack one `(texts, scenario, labels)` element as `(features, labels)`.

    Args:
        texts: Value stored under the "text" feature key.
        scenario: Value stored under the "scenario" feature key.
        labels: Target value, passed through unchanged.

    Returns:
        tuple: `(features, labels)`, where `features` is a dict with the keys
        "text" and "scenario".
    """
    features = {"text": texts, "scenario": scenario}
    return features, labels

In [None]:
BATCH_SIZE = 64
SHUFFLE_SEED = 42

# Shuffle ONCE, over the whole corpus, before splitting:
#  * buffer_size=len(train_texts) gives a true uniform shuffle; the examples are
#    loaded file by file, so a small buffer would leave them grouped by file.
#  * reshuffle_each_iteration=False (plus a fixed seed) freezes the order, so the
#    take/skip split below selects the same examples on every pass. With the
#    default (True) the data is reshuffled each epoch and validation examples
#    leak into training.
dataset = (
    tf.data.Dataset.from_tensor_slices((train_texts, scenarios, train_labels))
    .map(dataset_map)
    .shuffle(
        buffer_size=len(train_texts),
        seed=SHUFFLE_SEED,
        reshuffle_each_iteration=False,
    )
    .batch(BATCH_SIZE)
)

DATASET_SIZE = len(dataset)  # number of batches, not examples

split = 0.8

train_size = int(split * DATASET_SIZE)
# Remaining batches go to validation. The previous `int(1-split * DATASET_SIZE)`
# parsed as 1 - (split * DATASET_SIZE) and produced a negative number.
val_size = DATASET_SIZE - train_size

# Batch order within the training split is reshuffled every epoch; which
# batches belong to the split stays fixed.
train_ds = dataset.take(train_size).shuffle(buffer_size=1000)
validation_ds = dataset.skip(train_size)

len(train_ds), len(validation_ds)

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from pandas import Series

In [None]:
# Two-input intent classifier: the encoder output for the utterance is
# concatenated with a learned scenario embedding and fed to a softmax layer
# with one unit per intent class.
SCENARIO_VOCAB_SIZE = 60       # rows in the scenario embedding table
SCENARIO_EMBEDDING_DIM = 256   # width of each scenario embedding

# Every scenario id fed to the Embedding layer must be a valid row index.
assert max(scenarios) < SCENARIO_VOCAB_SIZE, (
    "scenario id exceeds SCENARIO_VOCAB_SIZE; enlarge the embedding table"
)

text_input = layers.Input(shape=(), dtype=tf.string, name="text")
# `shape` must be a tuple: `(1)` is just the integer 1, `(1,)` is the
# one-element shape.
scenario_input = layers.Input(shape=(1,), dtype=tf.int32, name="scenario")
scenario_encoder = layers.Embedding(SCENARIO_VOCAB_SIZE, SCENARIO_EMBEDDING_DIM)(scenario_input)
# Collapse the length-1 sequence axis so the embedding can be concatenated
# with the encoder output.
scenario_encoder = layers.Flatten()(scenario_encoder)

x = preprocessor(text_input)
x = encoder(x)["default"]

x = layers.Concatenate()([x, scenario_encoder])

# One output unit per column of the one-hot label matrix.
x = layers.Dense(train_labels.shape[-1], activation='softmax')(x)

model = Model(inputs=[text_input, scenario_input], outputs=x)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [None]:
# ModelCheckpoint: with save_best_only=True the file is only overwritten when
# the monitored metric improves, so it holds the best model seen so far rather
# than a snapshot of every epoch.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoints/model_checkpoint.h5',
    save_best_only=True,
)
# EarlyStopping: stop once val_loss has gone 3 epochs without improving, and
# restore the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Train for up to 5 epochs, validating after each one; the callbacks handle
# checkpointing and early stopping.
history = model.fit(
    train_ds,
    validation_data=validation_ds,
    epochs=5,
    callbacks=[model_checkpoint, early_stopping],
)