In [None]:
import math
from multiprocessing import Pool
import os
import sys
import itertools
import time
import tempfile
import pickle

sys.path.append(os.path.abspath(os.pardir))

%matplotlib inline

import keras
import numpy as np
import pandas as pd
import seaborn as sns

# Metrics
from sklearn.metrics import accuracy_score

# Notebook helper methods
from bella import notebook_helper
# Models
from bella.models.tdlstm import LSTM, TDLSTM, TCLSTM
# Tokenisers
from bella.tokenisers import ark_twokenize
# Word Vectors
from bella.word_vectors import PreTrained, GloveCommonCrawl
# Get the data
from bella.parsers import semeval_14, dong, election
from bella.data_types import TargetCollection
from bella.helper import read_config, full_path
from bella.evaluation import evaluation_results
from bella.notebook_helper import get_json_data, write_json_data

In [None]:
# Read every train/test split from the locations registered in the config
# file. The `semeval_14` parser is also used for the YouTuBean and Mitchel
# data, `dong` for the Dong Twitter data, and `election` returns both splits
# from a single folder.
youtubean_train = semeval_14(full_path(read_config('youtubean_train')))
youtubean_test = semeval_14(full_path(read_config('youtubean_test')))

semeval_14_rest_train = semeval_14(full_path(read_config('semeval_2014_rest_train')))
semeval_14_rest_test = semeval_14(full_path(read_config('semeval_2014_rest_test')))

semeval_14_lap_train = semeval_14(full_path(read_config('semeval_2014_lap_train')))
semeval_14_lap_test = semeval_14(full_path(read_config('semeval_2014_lap_test')))

dong_train = dong(full_path(read_config('dong_twit_train_data')))
dong_test = dong(full_path(read_config('dong_twit_test_data')))

election_train, election_test = election(full_path(read_config('election_folder_dir')))

mitchel_train = semeval_14(full_path(read_config('mitchel_train')))
mitchel_test = semeval_14(full_path(read_config('mitchel_test')))

# Display name of each dataset -> (train split, test split)
dataset_train_test = {
    'SemEval 14 Laptop': (semeval_14_lap_train, semeval_14_lap_test),
    'SemEval 14 Restaurant': (semeval_14_rest_train, semeval_14_rest_test),
    'Dong Twitter': (dong_train, dong_test),
    'Election Twitter': (election_train, election_test),
    'YouTuBean': (youtubean_train, youtubean_test),
    'Mitchel': (mitchel_train, mitchel_test),
}

In [None]:
# Word vectors: SSWE is read from the path stored in the config file, GloVe
# is requested through its bella wrapper.
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')
glove_300 = GloveCommonCrawl(version=42)
# Candidate word vectors that every model/dataset pair is searched over
word_vectors = [sswe, glove_300]


# Maps a predicted class index back to its sentiment label. There are three
# classes and one of them is -1; once one hot encoded, -1 sits at index 2, so
# without this mapping index 2 would be reported as the label instead of -1.
sentiment_mapper = {0: 0, 1: 1, 2: -1}

# Parent folder holding one result sub folder per model.
# `os.path.abspath` already resolves relative paths against the current
# working directory.
result_folder = os.path.abspath(os.path.join(os.pardir, 'results', 'TDLstm'))
# Model zoo folder: every saved model is written here
model_dir = os.path.abspath(os.path.join(os.pardir, 'model zoo'))
os.makedirs(model_dir, exist_ok=True)

In [None]:
def dataset_predictions(train, test, dataset_name, model_class,
                        word_vector_file_path, result_file_path,
                        model_folder_path, model_params):
    '''
    Selects the best word vector for one model on one dataset, saves the
    model fitted with that word vector and evaluates it on the test split.

    Every entry of the notebook-level ``word_vectors`` list is scored by the
    highest ``val_acc`` recorded while fitting ``model_class`` on the training
    split. Scores are cached per dataset through ``get_json_data`` /
    ``write_json_data`` so that a word vector that already has a score is not
    fitted again. The model saved and evaluated is always the one fitted with
    the best scoring word vector; if that score came from the cache the model
    is fitted once more, as no fitted instance exists for it.

    :param train: Training split; must provide ``data_dict()`` and \
    ``sentiment_data()`` (presumably a bella ``TargetCollection`` -- confirm).
    :param test: Test split with the same two methods.
    :param dataset_name: Name of the dataset; used as the cache key, in the \
    saved model file names and passed on to ``evaluation_results``.
    :param model_class: Class that is instantiated with ``model_params`` plus \
    an ``embeddings`` entry.
    :param word_vector_file_path: File holding the cached validation scores.
    :param result_file_path: File that ``evaluation_results`` writes to.
    :param model_folder_path: Folder the architecture and weights are saved in.
    :param model_params: Keyword arguments for ``model_class``. The dict is \
    copied before ``embeddings`` is added, the caller's dict is left as is.
    :returns: Whatever ``evaluation_results`` returns for the test predictions.
    :raises ValueError: If no word vector scores above 0, which includes the \
    case of an empty ``word_vectors`` list.
    '''

    print('{} {}'.format(dataset_name, model_params))

    data_train = train.data_dict()
    y_train = train.sentiment_data()
    data_test = test.data_dict()
    y_test = test.sentiment_data()

    def fit_model(word_vector):
        '''Fits a new ``model_class`` instance that uses ``word_vector``.'''
        # Work on a copy so the caller's parameter dict is never mutated
        params = dict(model_params)
        params['embeddings'] = word_vector
        fitted_model = model_class(**params)
        print('{} {}'.format(params, word_vector))
        fit_history = fitted_model.fit(data_train, y_train, validation_size=0.3,
                                       verbose=1, reproducible=True, patience=10,
                                       epochs=300, org_initialisers=True)
        return fitted_model, fit_history

    # Finds the best word vector
    word_vector_data = get_json_data(word_vector_file_path, dataset_name)
    best_score = 0
    best_word_vector = None
    best_model = None
    for word_vector in word_vectors:
        print(word_vector)
        word_vector_name = '{}'.format(word_vector)
        if word_vector_name in word_vector_data:
            # Score is cached, therefore there is no fitted model for it
            word_vec_val_score = word_vector_data[word_vector_name]
            candidate_model = None
        else:
            candidate_model, history = fit_model(word_vector)
            word_vec_val_score = max(history.history['val_acc'])
            word_vector_data[word_vector_name] = word_vec_val_score
            # Save word vector validation score result
            write_json_data(word_vector_file_path, dataset_name, word_vector_data)
        if word_vec_val_score > best_score:
            best_score = word_vec_val_score
            best_word_vector = word_vector
            # Kept in step with the best word vector: it is None when the
            # best score came from the cache, so a model fitted with a worse
            # word vector can never be used by mistake.
            best_model = candidate_model
    if best_word_vector is None:
        raise ValueError('best word vector should not be None')
    if best_model is None:
        best_model, _ = fit_model(best_word_vector)
    # From here on only the model of the best word vector is used, never
    # simply the last model that was fitted in the loop above.
    model = best_model

    # Saves the model to the model zoo
    model_arch_fp = os.path.join(model_folder_path,
                                 '{} {} architecture'.format(model, dataset_name))
    model_weights_fp = os.path.join(model_folder_path,
                                    '{} {} weights'.format(model, dataset_name))
    model.save_model(model_arch_fp, model_weights_fp, verbose=1)

    # Predicts on the test data
    predicted_values = model.predict(data_test)
    # Convert prediction from one hot encoded to category value e.g. -1, 0, 1
    predicted_values_cats = model.prediction_to_cats(y_test, predicted_values,
                                                     mapper=sentiment_mapper)
    # Evaluates the predictions and save the results
    return evaluation_results(predicted_values_cats, test, dataset_name,
                              file_name=result_file_path,
                              save_raw_data=True, re_write=True)

# Mass Evaluation of the LSTM model

In [None]:
# Sub folder that holds everything the LSTM runs produce
lstm_folder = os.path.join(result_folder, 'lstm')
os.makedirs(lstm_folder, exist_ok=True)

# Cached word vector validation scores and the evaluation results
word_vector_file = os.path.join(lstm_folder, 'word vector results.json')
result_file = os.path.join(lstm_folder, 'results file.tsv')

for dataset_name, (train, test) in dataset_train_test.items():
    # A new parameter dict is built for every dataset
    model_params = dict(tokeniser=ark_twokenize, lower=True, pad_size=-1)
    dataset_predictions(train=train, test=test, dataset_name=dataset_name,
                        model_class=LSTM,
                        word_vector_file_path=word_vector_file,
                        result_file_path=result_file,
                        model_folder_path=model_dir,
                        model_params=model_params)

# Mass Evaluation of the TDLSTM model

In [None]:
# Sub folder that holds everything the TDLSTM runs produce
tdlstm_folder = os.path.join(result_folder, 'tdlstm')
os.makedirs(tdlstm_folder, exist_ok=True)

# Cached word vector validation scores and the evaluation results
word_vector_file = os.path.join(tdlstm_folder, 'word vector results.json')
result_file = os.path.join(tdlstm_folder, 'results file.tsv')

for dataset_name, (train, test) in dataset_train_test.items():
    # A new parameter dict is built for every dataset
    model_params = dict(tokeniser=ark_twokenize, lower=True, pad_size=-1)
    dataset_predictions(train=train, test=test, dataset_name=dataset_name,
                        model_class=TDLSTM,
                        word_vector_file_path=word_vector_file,
                        result_file_path=result_file,
                        model_folder_path=model_dir,
                        model_params=model_params)

# Mass Evaluation of the TCLSTM model

In [None]:
# Sub folder that holds everything the TCLSTM runs produce
tclstm_folder = os.path.join(result_folder, 'tclstm')
os.makedirs(tclstm_folder, exist_ok=True)

# Cached word vector validation scores and the evaluation results
word_vector_file = os.path.join(tclstm_folder, 'word vector results.json')
result_file = os.path.join(tclstm_folder, 'results file.tsv')

for dataset_name, (train, test) in dataset_train_test.items():
    # A new parameter dict is built for every dataset
    model_params = dict(tokeniser=ark_twokenize, lower=True, pad_size=-1)
    dataset_predictions(train=train, test=test, dataset_name=dataset_name,
                        model_class=TCLSTM,
                        word_vector_file_path=word_vector_file,
                        result_file_path=result_file,
                        model_folder_path=model_dir,
                        model_params=model_params)