# Imports

In [233]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
#from official.nlp import optimization  # to create AdamW optmizer
import torch
from transformers import BertTokenizer, BertModel, TFBertForSequenceClassification

import codecs
import numpy as np
import seaborn as sns
import pandas as pd
import json as js

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Opening files & Conversion

## Opening

In [2]:
with open('dataset/AskUbuntuCorpus.json') as f:
    askubuntu = js.load(f)

with open('dataset/ChatbotCorpus.json') as f:
    chatbot = js.load(f)

with open('dataset/WebApplicationsCorpus.json') as f:
    webapps = js.load(f)

## Conversion & split

In [225]:
def create_train_test(ds):
    new_dict = {}
    for k,v in [(key,d[key]) for d in ds['sentences'] for key in d]:
        if k not in new_dict: 
            new_dict[k]=[v]
        else: new_dict[k].append(v)
            
    # remove unnecessary keys   
    
    for a in ['author', 'url', 'entities', 'answer']:
        new_dict.pop(a, '')
        
    new_df = pd.DataFrame(new_dict)
    
    #### Splitting train and test
    
    train = new_df.loc[new_df['training'] == True].reset_index(drop = True).drop('training', axis=1)
    
    X = train.text
    y = train.intent
    
    ### Label encoding
    print("Original Labels Train/Val")
    print("-------------------------")
    print(Counter(y))
    le = LabelEncoder()
    l = le.fit_transform(y)
    print("\nEncoded Labels Train/Val")
    print("------------------------")
    print(Counter(l))
    
    X_train, X_val, y_train, y_val = train_test_split(X, l, test_size=0.2, random_state=1)
    
    test = new_df.loc[new_df['training'] != True].reset_index(drop = True).drop('training', axis=1)
    
    X_test = test.text
    y_test = test.intent
    
    ### Label encoding
    print("\nOriginal Labels Test")
    print("--------------------")
    print(Counter(y_test))
    le = LabelEncoder()
    y_test = le.fit_transform(y_test)
    print("\nEncoded Labels Test")
    print("-------------------")
    print(Counter(y_test))
    
    #askubuntu_ds = 
    #askubuntu_df = 

    
    return X_train.to_numpy(), X_test.to_numpy(), X_val.to_numpy(), y_train, y_test, y_val

##### askubuntu
print("\nAskUbuntu Labels")
print("----------------\n")
askubuntu_X_train, askubuntu_X_test, askubuntu_X_val, askubuntu_y_test, askubuntu_y_val = create_train_test(askubuntu)

##### chatbot
print("\n----------------\n")
print("\nChatbot Labels")
print("--------------\n")
chatbot_X_train, chatbot_X_test, chatbot_X_val, chatbot_y_train, chatbot_y_test, chatbot_y_val = create_train_test(chatbot)

##### webapps
print("\n----------------\n")
print("\nWebApps Labels")
print("--------------\n")
webapps_X_train, webapps_X_test, webapps_X_val, webapps_y_train, webapps_y_test, webapps_y_val = create_train_test(webapps)

###### create .csv files

# pd.DataFrame({"text":askubuntu_X_train,"intent": askubuntu_y_train}).to_csv('dataset/askubuntu_train.csv', index=False,header=True)
# pd.DataFrame({"text":askubuntu_X_val,"intent": askubuntu_y_val}).to_csv('dataset/askubuntu_val.csv', index=False,header=True)
# pd.DataFrame({"text":askubuntu_X_test,"intent": askubuntu_y_test}).to_csv('dataset/askubuntu_test.csv', index=False,header=True)
# pd.DataFrame({"text":chatbot_X_train,"intent": chatbot_y_train}).to_csv('dataset/chatbot_train.csv', index=False,header=True)
# pd.DataFrame({"text":chatbot_X_val,"intent": chatbot_y_val}).to_csv('dataset/chatbot_val.csv', index=False,header=True)
# pd.DataFrame({"text":chatbot_X_test,"intent": chatbot_y_test}).to_csv('dataset/chatbot_test.csv', index=False,header=True)
# pd.DataFrame({"text":webapps_X_train,"intent": webapps_y_train}).to_csv('dataset/webapps_train.csv', index=False,header=True)
# pd.DataFrame({"text":webapps_X_val,"intent": webapps_y_val}).to_csv('dataset/webapps_val.csv', index=False,header=True)
# pd.DataFrame({"text":webapps_X_test,"intent": webapps_y_test}).to_csv('dataset/webapps_test.csv', index=False,header=True)


AskUbuntu Labels
----------------

Original Labels Train/Val
-------------------------
Counter({'Software Recommendation': 17, 'Shutdown Computer': 13, 'Make Update': 10, 'Setup Printer': 10, 'None': 3})

Encoded Labels Train/Val
------------------------
Counter({4: 17, 3: 13, 0: 10, 2: 10, 1: 3})

Original Labels Test
--------------------
Counter({'Software Recommendation': 40, 'Make Update': 37, 'Shutdown Computer': 14, 'Setup Printer': 13, 'None': 5})

Encoded Labels Test
-------------------
Counter({4: 40, 0: 37, 3: 14, 2: 13, 1: 5})

----------------


Chatbot Labels
--------------

Original Labels Train/Val
-------------------------
Counter({'FindConnection': 57, 'DepartureTime': 43})

Encoded Labels Train/Val
------------------------
Counter({1: 57, 0: 43})

Original Labels Test
--------------------
Counter({'FindConnection': 71, 'DepartureTime': 35})

Encoded Labels Test
-------------------
Counter({1: 71, 0: 35})

----------------


WebApps Labels
--------------

Original Lab

### AskUbuntu

In [226]:
print("Number of training instances\t", len(askubuntu_X_train))
print("Number of validation instances\t", len(askubuntu_X_val))
print("Number of testing instances\t", len(askubuntu_X_test))

Number of training instances	 42
Number of validation instances	 11
Number of testing instances	 109


In [227]:
print(Counter(askubuntu_y_train))
print(Counter(askubuntu_y_val))
print(Counter(askubuntu_y_test))

Counter({4: 12, 3: 10, 2: 9, 0: 8, 1: 3})
Counter({4: 5, 3: 3, 0: 2, 2: 1})
Counter({4: 40, 0: 37, 3: 14, 2: 13, 1: 5})


### Chatbot

In [164]:
print("Number of training instances\t", len(chatbot_X_train))
print("Number of validation instances\t", len(chatbot_X_val))
print("Number of testing instances\t", len(chatbot_X_test))

Number of training instances	 80
Number of validation instances	 20
Number of testing instances	 106


In [165]:
print(Counter(chatbot_y_train))
print(Counter(chatbot_y_val))
print(Counter(chatbot_y_test))

Counter({1: 49, 0: 31})
Counter({0: 12, 1: 8})
Counter({1: 71, 0: 35})


### webapps

In [166]:
print("Number of training instances\t", len(webapps_X_train))
print("Number of validation instances\t", len(webapps_X_val))
print("Number of testing instances\t", len(webapps_X_test))

Number of training instances	 24
Number of validation instances	 6
Number of testing instances	 59


In [167]:
print(Counter(webapps_y_train))
print(Counter(webapps_y_val))
print(Counter(webapps_y_test))

Counter({1: 7, 4: 4, 5: 3, 7: 3, 6: 2, 0: 2, 3: 2, 2: 1})
Counter({5: 4, 4: 2})
Counter({4: 16, 3: 14, 1: 10, 6: 6, 0: 6, 5: 4, 2: 3})


# BERT & Baseline Models

In [234]:
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [235]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

## ASkUbuntu

In [194]:
def build_classifier_model(array):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3", name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3", trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dense(600, activation=tf.nn.relu)(net)
    #net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(len(Counter(array)), activation=tf.nn.softmax, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [195]:
ask_ubuntu_model = build_classifier_model(askubuntu_y_train)

epochs = 5

ask_ubuntu_model.compile(optimizer="adam",
                         loss="sparse_categorical_crossentropy",
                         metrics=["accuracy"])

In [196]:
ask_ubuntu_model.summary()

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_mask': (None 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'default': (None, 7 109482241   preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
___________________________________________________________________________________________

In [197]:
print('Training baseline model AskUbuntu')
history_askubuntu = ask_ubuntu_model.fit(x = askubuntu_X_train, y = askubuntu_y_train,
                               validation_data=(askubuntu_X_val, askubuntu_y_val),
                               epochs=epochs)

Training baseline model AskUbuntu
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [198]:
loss, accuracy = ask_ubuntu_model.evaluate(askubuntu_X_test, askubuntu_y_test, batch_size=128)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 1.739715576171875
Accuracy: 0.3669724762439728


In [229]:
dataset_name = 'askubuntu_baseline'
saved_model_path = 'baselines/{}_bert'.format(dataset_name.replace('/', '_'))

ask_ubuntu_model.save(saved_model_path, include_optimizer=False)



## ChatBot

In [190]:
chatbot_model = build_classifier_model(chatbot_y_train)

loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
metrics = tf.metrics.BinaryAccuracy()

epochs = 5

chatbot_model.compile(optimizer="adam",
                         loss=loss,
                         metrics=metrics)

In [191]:
chatbot_model.summary()

Model: "model_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_type_ids': ( 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'encoder_outputs':  109482241   preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
___________________________________________________________________________________________

In [192]:
print('Training baseline model Chatbot')
history_chatbot = chatbot_model.fit(x = chatbot_X_train, y = chatbot_y_train,
                               validation_data=(chatbot_X_val, chatbot_y_val),
                               epochs=epochs)

Training baseline model Chatbot
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [193]:
loss, accuracy = chatbot_model.evaluate(chatbot_X_test, chatbot_y_test, batch_size=128)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.721656084060669
Accuracy: 0.5


In [230]:
dataset_name = 'chatbot_baseline'
saved_model_path = 'baselines/{}_bert'.format(dataset_name.replace('/', '_'))

chatbot_model.save(saved_model_path, include_optimizer=False)



## WebApps

In [184]:
webapps_model = build_classifier_model(webapps_y_train)

#loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction="auto", name="sparse_categorical_crossentropy")
#metrics = tf.metrics.Accuracy()

epochs = 5

webapps_model.compile(optimizer="adam",
                         loss="sparse_categorical_crossentropy",
                         metrics=["accuracy"])

In [183]:
webapps_model.summary()

Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_mask': (None 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'sequence_output':  109482241   preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
___________________________________________________________________________________________

In [185]:
print('Training baseline model WebApps')
history_webapps = webapps_model.fit(x = webapps_X_train, y = webapps_y_train,
                               validation_data=(webapps_X_val, webapps_y_val),
                               epochs=epochs)

Training baseline model WebApps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [186]:
loss, accuracy = webapps_model.evaluate(webapps_X_test, webapps_y_test, batch_size=128)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 3.0454890727996826
Accuracy: 0.2711864411830902


In [231]:
dataset_name = 'webapps_baseline'
saved_model_path = 'baselines/{}_bert'.format(dataset_name.replace('/', '_'))

webapps_model.save(saved_model_path, include_optimizer=False)

