In [2]:
def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Used as a silent replacement for ``warnings.warn`` (it is assigned to
    that attribute right after this definition), so warnings raised through
    that entry point produce no output.
    """
    return None
import warnings
warnings.warn = warn
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

from collections import defaultdict

import pandas as pd
import numpy as np

# Helper functions
from bella.notebook_helper import write_json_data
# Models
from bella.models.tdparse import TDParse, TDParsePlus
# Word Vector methods
from bella.word_vectors import GloveCommonCrawl, PreTrained
# Dependency Parser
from bella import stanford_tools
from bella.dependency_parsers import tweebo, stanford
from bella import tokenisers
# Sentiment lexicons
from bella import lexicons
# Get the data
from bella.parsers import semeval_14, dong, election
from bella.data_types import TargetCollection
from bella.helper import read_config, full_path
# Evaluation methods
from bella.evaluation import evaluation_results, scores, get_results, \
                               save_results, combine_results, get_raw_data

In [4]:
# Load every train/test split. Each dataset location is stored in the project
# configuration under the key passed to `_dataset_path`.
def _dataset_path(config_key):
    """Look up `config_key` with `read_config` and expand it with `full_path`."""
    return full_path(read_config(config_key))


# Datasets read with the `semeval_14` parser
youtubean_train = semeval_14(_dataset_path('youtubean_train'))
youtubean_test = semeval_14(_dataset_path('youtubean_test'))
semeval_14_rest_train = semeval_14(_dataset_path('semeval_2014_rest_train'))
semeval_14_lap_train = semeval_14(_dataset_path('semeval_2014_lap_train'))
semeval_14_rest_test = semeval_14(_dataset_path('semeval_2014_rest_test'))
semeval_14_lap_test = semeval_14(_dataset_path('semeval_2014_lap_test'))

# Datasets read with the `dong` and `election` parsers; `election` is given
# one folder and returns both splits.
dong_train = dong(_dataset_path('dong_twit_train_data'))
dong_test = dong(_dataset_path('dong_twit_test_data'))
election_train, election_test = election(_dataset_path('election_folder_dir'))

mitchel_train = semeval_14(_dataset_path('mitchel_train'))
mitchel_test = semeval_14(_dataset_path('mitchel_test'))

# Dataset name -> (train, test). Insertion order is the order in which the
# datasets are processed further down.
dataset_train_test = {}
dataset_train_test['Mitchel'] = (mitchel_train, mitchel_test)
dataset_train_test['YouTuBean'] = (youtubean_train, youtubean_test)
dataset_train_test['Election Twitter'] = (election_train, election_test)
dataset_train_test['Dong Twitter'] = (dong_train, dong_test)
dataset_train_test['SemEval 14 Restaurant'] = (semeval_14_rest_train,
                                               semeval_14_rest_test)
dataset_train_test['SemEval 14 Laptop'] = (semeval_14_lap_train,
                                           semeval_14_lap_test)

In [5]:
# Word vectors: SSWE is read from the 'vo_zhang' entry of the 'sswe_files'
# configuration value; GloVe Common Crawl is requested with version=42.
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')
glove_300 = GloveCommonCrawl(version=42)


# Sentiment lexicons, restricted to the positive and negative categories and
# created with lower=True.
subset_cats = {'positive', 'negative'}
_lexicon_options = {'subset_cats': subset_cats, 'lower': True}
mpqa_low = lexicons.Mpqa(**_lexicon_options)
nrc_low = lexicons.NRC(**_lexicon_options)
hu_liu_low = lexicons.HuLiu(**_lexicon_options)

# Merge pairwise: MPQA + Hu & Liu first, then add NRC to that combination.
_combine = lexicons.Lexicon.combine_lexicons
mpqa_huliu_low = _combine(mpqa_low, hu_liu_low)
all_three_low = _combine(mpqa_huliu_low, nrc_low)

In [6]:
def dataset_predictions(train, test, dataset_name, model, word_vector, random_state,
                        c_file_path, word_vector_file_path, model_dir,
                        sentiment_lexicon=None, result_file_path=None,
                        re_write=True, save_raw_data=True):
    """Tune, fit, evaluate and save `model` for a single dataset.

    When `re_write` is false and `result_file_path` is given, previously
    stored results are looked up first and returned if they are usable
    (see the first step below). Otherwise the function:

    1. searches for the best ``C`` with ``model.find_best_c`` using
       `word_vector`, the ``tweebo`` parser and the ``ark_twokenize``
       tokeniser (plus `sentiment_lexicon` when supplied);
    2. runs ``model.save_grid_search`` over the word vectors twice, first
       with only the module-level ``sswe`` and then with ``sswe`` and
       ``glove_300``; the value returned by the second search is used;
    3. fits the model with the chosen parameters, predicts on `test` and
       saves the model under `model_dir`;
    4. returns whatever ``evaluation_results`` returns.

    :param train: Object providing ``data()`` and ``sentiment_data()``.
    :param test: Object providing ``data()`` and ``sentiment_data()``.
    :param dataset_name: Name used in cache files, log output and the
                         saved model file name.
    :param model: Model exposing ``find_best_c``, ``save_grid_search``,
                  ``get_params``, ``fit``, ``predict`` and ``save_model``.
    :param word_vector: Word vector setting used for the C search.
    :param random_state: Passed to both searches and the final parameters.
    :param c_file_path: ``save_file`` argument of ``find_best_c``.
    :param word_vector_file_path: File path given to ``save_grid_search``.
    :param model_dir: Directory the fitted model is saved into.
    :param sentiment_lexicon: Optional lexicon added to the searches and
                              the final parameters.
    :param result_file_path: Optional results file; enables the cache
                             lookup and is passed to ``evaluation_results``.
    :param re_write: If true, never reuse stored results.
    :param save_raw_data: Forwarded to ``evaluation_results``; when reusing
                          stored results it also requires ``get_raw_data``
                          to return a truthy value.
    :returns: The stored results, or the return value of
              ``evaluation_results``.
    """
    import time

    # Reuse stored results when allowed and available.
    if result_file_path is not None and not re_write:
        cached_results = get_results(result_file_path, dataset_name)
        if cached_results is not None:
            if not save_raw_data:
                return cached_results
            if get_raw_data(result_file_path, dataset_name, test):
                return cached_results

    # Pull the text and labels out of the collections.
    data_train, y_train = train.data(), train.sentiment_data()
    data_test = test.data()
    # The test labels are fetched as in the original code, although the
    # returned value is not used below.
    test.sentiment_data()

    ark_tokeniser = tokenisers.ark_twokenize
    has_lexicon = sentiment_lexicon is not None

    # Step 1: best C for this dataset.
    c_grid_params = {'word_vectors' : [word_vector],
                     'random_state' : random_state,
                     'parsers' : [tweebo],
                     'tokenisers' : [ark_tokeniser]}
    if has_lexicon:
        c_grid_params['senti_lexicons'] = [sentiment_lexicon]
    best_c, _ = model.find_best_c(data_train, y_train, c_grid_params,
                                  save_file=c_file_path,
                                  dataset_name=dataset_name,
                                  re_write=False, n_jobs=7, cv=5)

    # Step 2: word vector search with the best C. `word_vectors` is the same
    # list object stored in the grid, so appending to it below widens the
    # second search.
    word_vectors = [[sswe]]
    word_vector_grid_params = dict(c_grid_params)
    word_vector_grid_params['C'] = [best_c]
    word_vector_grid_params['word_vectors'] = word_vectors

    def _timed_word_vector_search(n_jobs):
        # Run one grid search and print its result with the elapsed seconds.
        started = time.time()
        winner = model.save_grid_search(data_train, y_train,
                                        word_vector_grid_params,
                                        'word_vectors', dataset_name,
                                        word_vector_file_path,
                                        re_write=False, n_jobs=n_jobs, cv=5)
        print('{} {}'.format(winner, time.time() - started))
        return winner

    _timed_word_vector_search(n_jobs=5)
    # Word Vector is too large to multi-process
    word_vectors.append([glove_300])
    best_word_vector = _timed_word_vector_search(n_jobs=1)

    # Step 3: fit with the selected settings and predict on the test data.
    parameters = {'word_vector' : best_word_vector,
                  'random_state' : random_state,
                  'C' : best_c,
                  'tokeniser' : ark_tokeniser,
                  'parser' : tweebo}
    # The lexicon is added only after this message has been printed.
    print('Best parameters for dataset {} are: {}'.format(dataset_name, parameters))
    if has_lexicon:
        parameters['senti_lexicon'] = sentiment_lexicon
    model.fit(data_train, y_train, params=model.get_params(**parameters))
    predicted_values = model.predict(data_test)

    # Save the model to the model zoo
    model_file_path = os.path.join(model_dir,
                                   '{} {}'.format(model, dataset_name))
    model.save_model(model_file_path, verbose=1)

    # Step 4: evaluate, writing to the results file when one was given.
    if result_file_path is None:
        return evaluation_results(predicted_values, test, dataset_name)
    return evaluation_results(predicted_values, test, dataset_name,
                              file_name=result_file_path,
                              save_raw_data=save_raw_data, re_write=True)
   

In [7]:
# One instance of each model; the list order is relied on when the models are
# paired with their parameter/result files.
tdparse, tdparse_plus = TDParse(), TDParsePlus()
models = [tdparse, tdparse_plus]

In [8]:
# Output locations, resolved relative to the parent of the current working
# directory. Both directories are created if they do not exist yet.
_parent_dir = os.path.join(os.getcwd(), os.pardir)
result_folder = os.path.abspath(os.path.join(_parent_dir, 'results', 'TDParse Models'))
model_dir = os.path.abspath(os.path.join(_parent_dir, 'model zoo'))
for _directory in (result_folder, model_dir):
    os.makedirs(_directory, exist_ok=True)


def _in_result_folder(file_names):
    """Return each of `file_names` joined onto `result_folder`."""
    return [os.path.join(result_folder, file_name) for file_name in file_names]


# One file per model, listed in the same order as `models`.
model_result_files = _in_result_folder(['TDParse.tsv', 'TDParsePlus.tsv'])
C_result_files = _in_result_folder(['TDParse C.json', 'TDParsePlus C.json'])
word_vector_result_files = _in_result_folder(['TDParse word vector.json',
                                              'TDParsePlus word vector.json'])

# Keyword arguments for each model; the second set adds the combined lexicon.
std_model_parameters = {'word_vector' : [sswe], 'random_state' : 42}
all_senti_model_parameters = dict(std_model_parameters,
                                  sentiment_lexicon=all_three_low)
model_parameters = [std_model_parameters, all_senti_model_parameters]

# Each model maps to the tuple
# (parameters, result file, C file, word vector file, model directory).
parameters_files = list(zip(model_parameters,
                            model_result_files,
                            C_result_files,
                            word_vector_result_files,
                            [model_dir, model_dir]))

model_files = dict(zip(models, parameters_files))

In [9]:
# Run every model on every dataset. With re_write=False, stored results are
# reused by `dataset_predictions` where they exist.
for dataset_name, (train, test) in dataset_train_test.items():
    print('Processing dataset {}'.format(dataset_name))
    for model, parameter_file_paths in model_files.items():
        print('Processing model {}'.format(model))
        # Tuple layout: (parameters, result file, C file, word vector file,
        # model directory)
        (parameters, result_file_path, c_fp,
         word_vectors_fp, model_dir) = parameter_file_paths
        dataset_predictions(train, test, dataset_name, model,
                            c_file_path=c_fp,
                            word_vector_file_path=word_vectors_fp,
                            model_dir=model_dir,
                            result_file_path=result_file_path,
                            re_write=False,
                            save_raw_data=True,
                            **parameters)

Processing dataset Mitchel
Processing model TDParse
Processing model TDParse Plus
Processing dataset YouTuBean
Processing model TDParse
Processing model TDParse Plus
Processing dataset Election Twitter
Processing model TDParse
Processing model TDParse Plus
Processing dataset Dong Twitter
Processing model TDParse
Processing model TDParse Plus
Processing dataset SemEval 14 Restaurant
Processing model TDParse
Processing model TDParse Plus
Processing dataset SemEval 14 Laptop
Processing model TDParse
Processing model TDParse Plus
