## 0. Init

In [395]:
import os
import pandas as pd
import numpy as np
import math
from gensim.models import Word2Vec, Doc2Vec, FastText
from gensim.models.doc2vec import TaggedDocument
from skorch import NeuralNetClassifier
from sklearn import utils
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, precision_recall_fscore_support, balanced_accuracy_score, accuracy_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch import utils
from tqdm.auto import tqdm
import re
import time
import warnings
import copy

import nltk
nltk.download('vader_lexicon')
# nltk.download('words')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
from nltk.util import bigrams
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer,word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

warnings.filterwarnings(action='ignore')
path = r"{PATH}"
os.chdir(path)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\yongz\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## 1. Pre-processing

In [4]:
data = pd.read_csv('train.csv',
                   dtype={'Sentence_id':int,'Text':str,'Verdict':int})
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
data['Text_lower'] = data['Text'].apply(lambda x:x.lower())
data['Tokens'] = data['Text_lower'].apply(word_tokenize)
data['Tokens_lem'] = data['Tokens'].apply(lambda x:[lemmatizer.lemmatize(i) for i in x])
data['Tokens_no_stop'] = data['Tokens'].map(lambda x: [w for w in x if not w.lower() in stop_words])
data['Tokens_lem_no_stop'] = data['Tokens_lem'].map(lambda x: [w for w in x if not w.lower() in stop_words])
data['Tokens_2'] = data['Tokens'].apply(generate_bigram_list)
data['POS_tags'] = data['Tokens'].apply(generate_pos_list)
data.head() # 5 seconds

Unnamed: 0,Sentence_id,Text,Verdict,Text_lower,Tokens,Tokens_lem,Tokens_no_stop,Tokens_lem_no_stop
0,1,I think we've seen a deterioration of values.,-1,i think we've seen a deterioration of values.,"[i, think, we, 've, seen, a, deterioration, of...","[i, think, we, 've, seen, a, deterioration, of...","[think, 've, seen, deterioration, values, .]","[think, 've, seen, deterioration, value, .]"
1,2,I think for a while as a nation we condoned th...,-1,i think for a while as a nation we condoned th...,"[i, think, for, a, while, as, a, nation, we, c...","[i, think, for, a, while, a, a, nation, we, co...","[think, nation, condoned, things, condemned, .]","[think, nation, condoned, thing, condemned, .]"
2,3,"For a while, as I recall, it even seems to me ...",-1,"for a while, as i recall, it even seems to me ...","[for, a, while, ,, as, i, recall, ,, it, even,...","[for, a, while, ,, a, i, recall, ,, it, even, ...","[,, recall, ,, even, seems, talk, legalizing, ...","[,, recall, ,, even, seems, wa, talk, legalizi..."
3,4,"So we've seen a deterioration in values, and o...",-1,"so we've seen a deterioration in values, and o...","[so, we, 've, seen, a, deterioration, in, valu...","[so, we, 've, seen, a, deterioration, in, valu...","['ve, seen, deterioration, values, ,, one, thi...","['ve, seen, deterioration, value, ,, one, thin..."
4,5,"We got away, we got into this feeling that val...",-1,"we got away, we got into this feeling that val...","[we, got, away, ,, we, got, into, this, feelin...","[we, got, away, ,, we, got, into, this, feelin...","[got, away, ,, got, feeling, value-, free, edu...","[got, away, ,, got, feeling, value-, free, edu..."


In [218]:
# To try:
    # Vary list of stop words excluded
    # Further experiments with tokenization

In [296]:
def generate_bigram_list(word_list):
    bg_list = list(bigrams(word_list))
    bg_list = [' '.join(bigram) for bigram in bg_list]
    return bg_list

def generate_pos_list(token_list):
    pos_list = nltk.pos_tag(token_list)
    pos_list = [tup[1] for tup in pos_list]
    return pos_list

## 2. Model

In [300]:
# Train-Val Split
train_idx, val_idx = train_test_split(np.arange(len(data)),random_state=1)
train_df = data.loc[train_idx].reset_index(drop=True)
val_df = data.loc[val_idx].reset_index(drop=True)

### 2.1 Naive Bayes

In [471]:
class NaiveBayes:
    def __init__(self,train_data,token_col,k=1,nb_type='unigram'):
        self.train_data = train_data
        self.token_col = token_col
        self.k = k
        self.nb_type = nb_type
        
        # Generate corpus
        corpus = []
        for i in range(len(self.train_data)):
            corpus.extend(self.train_data.at[i,token_col])
        self.corpus = set(corpus)
        self.V = len(self.corpus)
        
        # Generate aggregate likelihood df
        prob_df = train_data['Verdict'].value_counts().reset_index()
        prob_df['prob'] = prob_df['count']/len(train_data)
        prob_df = prob_df.set_index('Verdict')
        self.prob_df = prob_df
        
    def generate_prob(self):
        df = self.train_data
        token_col = self.token_col
        V = self.V
        k = self.k
        corpus = self.corpus
        
        words_df = pd.DataFrame(columns=['word','count_impt','count_unimpt','count_false'])
        if self.nb_type == 'unigram':
            col_to_count = 'Tokens'
        elif self.nb_type == 'bigram':
            col_to_count = 'Tokens_2'
        elif self.nb_type == 'pos':
            col_to_count = 'POS_tags'
        data_impt = df.loc[df['Verdict'] == 1,col_to_count].tolist()
        data_unimpt = df.loc[df['Verdict'] == 0,col_to_count].tolist()
        data_false = df.loc[df['Verdict'] == -1,col_to_count].tolist()
        for word in tqdm(corpus):
            count_impt = sum(word in text for text in data_impt)
            count_unimpt = sum(word in text for text in data_unimpt)
            count_false = sum(word in text for text in data_false)
            words_df.loc[len(words_df)] = [word, count_impt, count_unimpt, count_false]
        N_impt = words_df['count_impt'].sum()
        N_unimpt = words_df['count_unimpt'].sum()
        N_false = words_df['count_false'].sum()
        self.N_impt = N_impt
        self.N_unimpt = N_unimpt
        self.N_false = N_false
        words_df['prob_impt'] = (words_df['count_impt'] + k)/(N_impt + (k*V))
        words_df['prob_unimpt'] = (words_df['count_unimpt'] + k)/(N_unimpt + (k*V))
        words_df['prob_false'] = (words_df['count_false'] + k)/(N_false + (k*V))
        words_df = words_df.set_index('word')
        self.words_df = words_df
        
    def get_pred(self,val_data1):
        val_data = val_data1.copy()
        prob_df = self.prob_df
        words_df = self.words_df
        token_col = self.token_col
        k = self.k
        V = self.V
        N_impt = self.N_impt
        N_unimpt = self.N_unimpt
        N_false = self.N_false
        val_data['Prediction'] = None

        for i in tqdm(range(len(val_data))):
            prob_impt = prob_df.at[1,'prob']
            prob_unimpt = prob_df.at[0,'prob']
            prob_false = prob_df.at[-1,'prob']
            for n in range(len(val_data.at[i,token_col])):
                try:
                    prob_impt += math.log(words_df.at[val_data.at[i,token_col][n],'prob_impt'])
                    prob_unimpt += math.log(words_df.at[val_data.at[i,token_col][n],'prob_unimpt'])
                    prob_false += math.log(words_df.at[val_data.at[i,token_col][n],'prob_false'])
                except:
                    prob_impt += math.log(k/(N_impt + (k*V)))
                    prob_unimpt += math.log(k/(N_unimpt + (k*V)))
                    prob_false += math.log(k/(N_false + (k*V)))
            rank = pd.DataFrame.from_dict({'type':[1,0,-1],'prob':[prob_impt,prob_unimpt,prob_false]})
            rank = rank.sort_values(by='prob',ascending=False).reset_index(drop=True)
            val_data.loc[i,'prob_impt'] = prob_impt
            val_data.loc[i,'prob_unimpt'] = prob_unimpt
            val_data.loc[i,'prob_false'] = prob_false
            val_data.loc[i,'Prediction'] = rank.at[0,'type']
        
        return val_data

#### 2.1.1 Train and evaluate base unigram model

In [453]:
# Note: required for subsequent feature generation
nb1 = NaiveBayes(train_df,'Tokens')
nb1.generate_prob()
val_df_nb1 = nb1.get_pred(val_df)

  0%|          | 0/11105 [00:00<?, ?it/s]

  0%|          | 0/5626 [00:00<?, ?it/s]

In [454]:
nb_control_balanced_acc = balanced_accuracy_score(val_df_nb1['Verdict'].tolist(),val_df_nb1['Prediction'].tolist())
nb_control_acc = accuracy_score(val_df_nb1['Verdict'].tolist(),val_df_nb1['Prediction'].tolist())
print(f'Accuracy:{nb_control_acc}')
print(f'Balanced accuracy:{nb_control_balanced_acc}')
print('\nClassification metrics:\n',classification_report(val_df_nb1['Verdict'].tolist(),val_df_nb1['Prediction'].tolist(),target_names = ['-1','0','1']))

Accuracy:0.7644863135442588
Balanced accuracy:0.6083724747464544

Classification metrics:
               precision    recall  f1-score   support

          -1       0.84      0.88      0.86      3727
           0       0.40      0.32      0.35       565
           1       0.67      0.63      0.65      1334

    accuracy                           0.76      5626
   macro avg       0.63      0.61      0.62      5626
weighted avg       0.75      0.76      0.76      5626



#### 2.1.2 Train and evaluate bigram model

In [324]:
nb2 = NaiveBayes(train_df,'Tokens_2',nb_type='bigram')
nb2.generate_prob()
train_df_nb2 = nb2.get_pred(train_df)
val_df_nb2 = nb2.get_pred(val_df)

  0%|          | 0/93059 [00:00<?, ?it/s]

  0%|          | 0/5626 [00:00<?, ?it/s]

In [436]:
nb2_control_balanced_acc = balanced_accuracy_score(val_df_nb2['Verdict'].tolist(),val_df_nb2['Prediction'].tolist())
nb2_control_acc = accuracy_score(val_df_nb2['Verdict'].tolist(),val_df_nb2['Prediction'].tolist())
print(f'Accuracy:{nb2_control_acc}')
print(f'Balanced accuracy:{nb2_control_balanced_acc}')
print('\nClassification metrics:\n',classification_report(val_df_nb2['Verdict'].tolist(),val_df_nb2['Prediction'].tolist(),target_names = ['-1','0','1']))

Accuracy:0.6837895485247067
Balanced accuracy:0.5799854171800247

Classification metrics:
               precision    recall  f1-score   support

          -1       0.84      0.76      0.80      3727
           0       0.28      0.39      0.32       565
           1       0.55      0.59      0.57      1334

    accuracy                           0.68      5626
   macro avg       0.55      0.58      0.56      5626
weighted avg       0.71      0.68      0.70      5626



#### 2.1.3 Ablation Study for Naive Bayes
Impact analysis for the following variables

**Lemmatization**

Lemmatize individual tokens

In [514]:
nb1_lem = NaiveBayes(train_df,'Tokens_lem')
nb1_lem.generate_prob()
val_df_nb1_lem = nb1_lem.get_pred(val_df)

  0%|          | 0/9912 [00:00<?, ?it/s]

  0%|          | 0/5626 [00:00<?, ?it/s]

In [515]:
nb_lem_acc = accuracy_score(val_df_nb1_lem['Verdict'].tolist(),val_df_nb1_lem['Prediction'].tolist())
nb_lem_balanced_acc = balanced_accuracy_score(val_df_nb1_lem['Verdict'].tolist(),val_df_nb1_lem['Prediction'].tolist())
print(f'Accuracy:{nb_lem_balanced_acc}')
print(f'Balanced accuracy comparison:\ncontrol:{nb_control_balanced_acc}, treatment:{nb_lem_balanced_acc}')
print('\nClassification metrics:\n',classification_report(val_df_nb1_lem['Verdict'].tolist(),val_df_nb1_lem['Prediction'].tolist(),target_names = ['-1','0','1']))

Accuracy:0.6133605123896734
Balanced accuracy comparison:
control:0.6083724747464544, treatment:0.6133605123896734

Classification metrics:
               precision    recall  f1-score   support

          -1       0.85      0.85      0.85      3727
           0       0.34      0.34      0.34       565
           1       0.64      0.65      0.64      1334

    accuracy                           0.75      5626
   macro avg       0.61      0.61      0.61      5626
weighted avg       0.75      0.75      0.75      5626



**Stop Words**

Remove stop words from text

In [516]:
nb1_sw = NaiveBayes(train_df,'Tokens_no_stop')
nb1_sw.generate_prob()
val_df_nb1_sw = nb1_sw.get_pred(val_df)

  0%|          | 0/10976 [00:00<?, ?it/s]

  0%|          | 0/5626 [00:00<?, ?it/s]

In [517]:
nb_sw_acc = accuracy_score(val_df_nb1_sw['Verdict'].tolist(),val_df_nb1_sw['Prediction'].tolist())
nb_sw_balanced_acc = balanced_accuracy_score(val_df_nb1_sw['Verdict'].tolist(),val_df_nb1_sw['Prediction'].tolist())
print(f'Accuracy:{nb_sw_acc}')
print(f'Balanced accuracy comparison:\ncontrol:{nb_control_balanced_acc}, treatment:{nb_sw_balanced_acc}')
print('\nClassification metrics:\n',classification_report(val_df_nb1_sw['Verdict'].tolist(),val_df_nb1_sw['Prediction'].tolist(),target_names = ['-1','0','1']))

Accuracy:0.7488446498400284
Balanced accuracy comparison:
control:0.6083724747464544, treatment:0.5956504110565249

Classification metrics:
               precision    recall  f1-score   support

          -1       0.83      0.86      0.85      3727
           0       0.37      0.31      0.34       565
           1       0.64      0.61      0.62      1334

    accuracy                           0.75      5626
   macro avg       0.61      0.60      0.60      5626
weighted avg       0.74      0.75      0.74      5626



**Case**

Preserve Uppercase letters

In [None]:
train_df['Tokens_case'] = train_df['Text'].apply(word_tokenize)
nb1_case = NaiveBayes(train_df,'Tokens_case')
nb1_case.generate_prob()

In [521]:
val_df['Tokens_case'] = val_df['Text'].apply(word_tokenize)
val_df_nb1_case = nb1_case.get_pred(val_df)

  0%|          | 0/5626 [00:00<?, ?it/s]

In [522]:
nb_case_acc = accuracy_score(val_df_nb1_case['Verdict'].tolist(),val_df_nb1_case['Prediction'].tolist())
nb_case_balanced_acc = balanced_accuracy_score(val_df_nb1_case['Verdict'].tolist(),val_df_nb1_case['Prediction'].tolist())
print(f'Accuracy comparison:{nb_case_acc}')
print(f'Balanced accuracy comparison:\ncontrol:{nb_control_balanced_acc}, treatment:{nb_case_balanced_acc}')
print('\nClassification metrics:\n',classification_report(val_df_nb1_case['Verdict'].tolist(),val_df_nb1_case['Prediction'].tolist(),target_names = ['-1','0','1']))

Accuracy comparison:0.654639175257732
Balanced accuracy comparison:
control:0.6083724747464544, treatment:0.6311573992821279

Classification metrics:
               precision    recall  f1-score   support

          -1       0.89      0.68      0.77      3727
           0       0.24      0.60      0.34       565
           1       0.60      0.61      0.61      1334

    accuracy                           0.65      5626
   macro avg       0.58      0.63      0.57      5626
weighted avg       0.76      0.65      0.69      5626



### 2.2 Logistic Regression
#### 2.2.1 Feature Engineering
*Note that the features engineered here are also applicable to the Neural Network models

**Sandbox Utils**

Experimental code has been removed

In [11]:
def eval_dist(verdict_df):
    total = float(len(verdict_df))
    try:
        impt = float(verdict_df.value_counts()[1])
    except: impt = 0
    try:
        not_impt = float(verdict_df.value_counts()[0])
    except:
        not_impt = 0
    try:
        false = float(verdict_df.value_counts()[-1])
    except:
        false = 0
    print('important %:',impt/total,'not important %:',not_impt/total,'false %:',false/total)

def feature_test(substr):
    if data['Text'].str.contains(substr).any():
        test_0 = data.loc[data['Text'].str.contains(substr),'Text'].to_list()
        verdict_df = data.loc[data['Text'].str.contains(substr),'Verdict']
        test_1 = verdict_df.to_list()
        test = list(zip(test_0,test_1))
        eval_dist(verdict_df)
        return test
    else:
        print('substring not found')    

In [None]:
list(data.loc[data['Verdict']==-1,'Text'].to_numpy()[np.random.choice(10000,20)]) # False examples

**Feature Generation**

In [455]:
# Generate bayesian probabilities for datasets

train_df_likelihood = nb1.get_pred(train_df)
val_df_likelihood = nb1.get_pred(val_df)

  0%|          | 0/16875 [00:00<?, ?it/s]

  0%|          | 0/5626 [00:00<?, ?it/s]

In [278]:
# *Getting top words per category:
def get_top_tokens(nb_obj):
    words_df = nb_obj.words_df
    total_word_count = nb_obj.N_impt + nb_obj.N_unimpt + nb_obj.N_false
    texts_impt_count = train_df['Verdict'].value_counts()[1]
    texts_unimpt_count = train_df['Verdict'].value_counts()[0]
    texts_false_count = train_df['Verdict'].value_counts()[-1]
    total_text_count = len(train_df)
    impt_prior = texts_impt_count/total_text_count
    unimpt_prior = texts_unimpt_count/total_text_count
    false_prior = texts_false_count/total_text_count

    words_df['prob_post_impt'] = words_df.apply(lambda r: r.prob_impt*impt_prior/((r.count_impt+r.count_unimpt+r.count_false)/total_word_count),axis=1)
    words_df['prob_post_unimpt'] = words_df.apply(lambda r: r.prob_unimpt*unimpt_prior/((r.count_impt+r.count_unimpt+r.count_false)/total_word_count),axis=1)
    words_df['prob_post_false'] = words_df.apply(lambda r: r.prob_false*false_prior/((r.count_impt+r.count_unimpt+r.count_false)/total_word_count),axis=1)

    top_impt = words_df.sort_values('prob_post_impt',ascending=False).head(2000).index.tolist()
    top_unimpt = words_df.sort_values('prob_post_unimpt',ascending=False).head(2000).index.tolist()
    top_false = words_df.sort_values('prob_post_false',ascending=False).head(2000).index.tolist()
    
    return top_impt, top_unimpt, top_false

top_impt, top_unimpt, top_false = get_top_tokens(nb1) # unigrams
top_impt_bg, top_unimpt_bg, top_false_bg = get_top_tokens(nb2) # bigrams
        
def count_top_tokens(token_list,target_list):
    count = 0
    for w in token_list:
        if w in target_list:
            count += token_list.count(w)
    return count

In [21]:
# *Named entity recognition

def identify_names(text):
    result = 0
    for chunk in nltk.ne_chunk(nltk.pos_tag(word_tokenize(text))):
        if hasattr(chunk,'label'):
            result += 1
            break
    return result

In [22]:
# *Average of word vectors

ft_size = 100
corpus = train_df['Tokens'].tolist()
ft = FastText(corpus,vector_size=ft_size,epochs=10)

def generate_ft_avg(token_list,ft_size,model):
    vector = np.zeros(ft_size)
    for token in token_list:
        vector += model.wv[token]
    vector /= len(token_list)
    return vector
        
def generate_ft_emb(df,model):
    emb = []
    for i in tqdm(range(len(df))):
        emb.append(generate_ft_avg(df.at[i,'Tokens'],model))
    return np.array(emb)

X_train_ft = generate_ft_emb(train_df,ft)
X_val_ft = generate_ft_emb(val_df,ft)

  0%|          | 0/16875 [00:00<?, ?it/s]

  0%|          | 0/5626 [00:00<?, ?it/s]

In [None]:
# *Sentiment of text

sentiment_analyzer = SentimentIntensityAnalyzer()
def generate_sentiment(text,analyzer,sentiment_type):
    '''
    type: pos | neg | neu | compound 
    '''
    scores = analyzer.polarity_scores(text)
    return scores[sentiment_type]

In [127]:
# *Frequency of stopwords

def generate_stopword_count(token_list,stop_words):
    count = 0
    for token in token_list:
        if token in stop_words:count += 1
    return count

In [477]:
# *POS tags
nb_pos = NaiveBayes(train_df,'POS_tags',nb_type='pos')
nb_pos.generate_prob()
train_df_nb_pos = nb_pos.get_pred(train_df)
val_df_nb_pos = nb_pos.get_pred(val_df)

# Marginal impact

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/16875 [00:00<?, ?it/s]

  0%|          | 0/5626 [00:00<?, ?it/s]

In [457]:
def generate_logreg_features(df,df_likelihood,bg_likelihood,pos_likelihood):
    '''
    Brainstorm:
    Me pronouns - I, You, me
    Collective pronouns - they,
    Presence of numbers (numeric or spelt)
    Filler words - uh or uhh
    ' - ' or '--'
    No. of words with highest bayesian likelihoods per class ** (didn't work)
    Acronyms
    Mr(s) and Senator
    Identify names (NER)
    
    Sentiment scores?
    '''
    df_log = df.copy()
    
    # Features: Hard-coded
    df_log['_1'] = df_log['Text'].str.contains('!').astype(int)
    df_log['_2'] = df_log['Text'].str.contains('\?').astype(int)
    df_log['_3'] = df_log['Text'].map(len) # Try len of token list
    df_log['_4'] = df_log['Text'].str.count('I |me |[Yy]ou ')
    df_log['_5'] = df_log['Text'].str.count('[Ww]e |[Tt]hey |us[. ]')
    df_log['_6'] = df_log['Text'].str.contains('[Uu][h]+ ').astype(int)
    df_log['_7'] = df_log['Text'].str.contains(' [-]+ ').astype(int)
    df_log['_8'] = df_log['Text'].str.contains(num_pattern).astype(int)
    df_log['_9'] = df_log['Text'].str.contains('[A-Z]{2,}').astype(int)
    df_log['_10'] = df_log['Text'].str.contains('Mr[s]?|Senator').astype(int)
    df_log['_11'] = df_log['Text'].map(identify_names)
    df_log['_12'] = df_log['Text'].apply(lambda x: generate_sentiment(x,sentiment_analyzer,'pos'))
    df_log['_13'] = df_log['Text'].apply(lambda x: generate_sentiment(x,sentiment_analyzer,'neu'))
    df_log['_14'] = df_log['Text'].apply(lambda x: generate_sentiment(x,sentiment_analyzer,'neg'))
    df_log['_15'] = df_log['Tokens'].map(lambda x: generate_stopword_count(x,stop_words))
    
#     df_log['_16'] = df_log['Tokens_2'].apply(lambda x:count_top_words(x,top_impt))
#     df_log['_17'] = df_log['Tokens_2'].apply(lambda x:count_top_words(x,top_unimpt))
#     df_log['_18'] = df_log['Tokens_2'].apply(lambda x:count_top_words(x,top_false))

    # Features: Bayesian likelihood
    temp_cols = ['impt','unimpt','false'] 
    temp = list(zip(['_BL1','_BL2','_BL3'],temp_cols))
    for feature,col in temp:
        df_log[feature] = df_likelihood[f'prob_{col}']
        
    temp_bg = list(zip(['_BG1','_BG2','_BG3'],temp_cols))
    for feature,col in temp_bg:
        df_log[feature] = bg_likelihood[f'prob_{col}']
        
    temp_pos = list(zip(['_POS1','_POS2','_POS3'],temp_cols))
    for feature,col in temp_pos:
        df_log[feature] = pos_likelihood[f'prob_{col}']

    return df_log

def select_cols(end_col_num=None,col_list=None,Bayesian=True,Bigram=True,POS=True):
    '''
    col_list: e.g. [1,6,8,...]
    '''
    col_list0 = []
    if col_list == None:
        for i in range(end_col_num):
            col_list0.append(f'_{i+1}')
    else:
        for i in col_list:
            col_list0.append(f'_{i}')
    if Bayesian == True:
        col_list0.extend(['_BL1','_BL2','_BL3'])
    if Bigram == True:
        col_list0.extend(['_BG1','_BG2','_BG3'])
    if POS == True:
        col_list0.extend(['_POS1','_POS2','_POS3'])
    return col_list0

In [458]:
# To re-run this cell after every feature update
train_df_log = generate_logreg_features(train_df,train_df_likelihood,train_df_nb2,train_df_nb_pos)
val_df_log = generate_logreg_features(val_df,val_df_likelihood,val_df_nb2,val_df_nb_pos)

#### 2.2.2 Train and Evaluate Base Model
Features only, without embeddings

In [492]:
scaler = StandardScaler()
X_train_0 = train_df_log[select_cols(end_col_num=15,Bigram=False)].to_numpy() # all features
X_train_0 = scaler.fit_transform(X_train_0)
y_train = train_df_log['Verdict'].to_numpy()
X_val_0 = val_df_log[select_cols(end_col_num=15,Bigram=False)].to_numpy() # all features
X_val_0 = scaler.fit_transform(X_val_0)
y_val = val_df_log['Verdict'].to_numpy()

In [494]:
logreg_clf1 = LogisticRegression(max_iter=500)
logreg_clf1.fit(X_train_0,y_train)
y_pred1 = logreg_clf1.predict(X_val_0)
print(accuracy_score(y_val,y_pred1))
print(balanced_accuracy_score(y_val,y_pred1))
print(classification_report(y_val,y_pred1))

0.7808389619623178
0.5663351825966391
              precision    recall  f1-score   support

          -1       0.81      0.94      0.87      3727
           0       0.55      0.16      0.25       565
           1       0.72      0.60      0.65      1334

    accuracy                           0.78      5626
   macro avg       0.69      0.57      0.59      5626
weighted avg       0.76      0.78      0.75      5626



In [353]:
balanced_accuracy_score(y_val,y_pred1)

0.5511084706156364

#### 2.2.3 Ablation Study

Features + Doc Embeddings:

In [27]:
# Features: Document embeddings
def generate_taggeddocs(df):
    base_df = df[['Tokens','Verdict']]
    d2v_df_temp = base_df.apply(lambda r: TaggedDocument(words=r['Tokens'],
                                        tags=[r.Verdict]),axis=1)
    d2v_values = d2v_df_temp.tolist()
    return d2v_values

d2v_train_values = generate_taggeddocs(train_df_log)
d2v = Doc2Vec(vector_size=300, epochs=40) # Tune
d2v.build_vocab(d2v_train_values)
for epoch in tqdm(range(50)):
    d2v.train(utils.shuffle(d2v_train_values),total_examples=len(d2v_train_values),epochs=1)
    d2v.alpha -= 0.002 # can tune
    d2v.min_alpha = d2v.alpha

  0%|          | 0/50 [00:00<?, ?it/s]

In [408]:
X_train_d2v = np.array([d2v.infer_vector(doc.words) for doc in d2v_train_values]) # Embeddings only
X_train = np.concatenate((X_train_0,X_train_d2v),axis=1) # all features + emb
d2v_val_values = generate_taggeddocs(val_df_log)
X_val_d2v = np.array([d2v.infer_vector(doc.words) for doc in d2v_val_values]) # Embeddings only
X_val = np.concatenate((X_val_0,X_val_d2v),axis=1) # all features + emb

In [491]:
logreg_clf2 = LogisticRegression(max_iter=1000)
logreg_clf2.fit(X_train,y_train)
y_pred2 = logreg_clf2.predict(X_val)
print(accuracy_score(y_val,y_pred2))
print(balanced_accuracy_score(y_val,y_pred2))
print(classification_report(y_val,y_pred2))

0.7579097049413438
0.5506263748057397
              precision    recall  f1-score   support

          -1       0.80      0.91      0.85      3727
           0       0.46      0.16      0.24       565
           1       0.65      0.58      0.61      1334

    accuracy                           0.76      5626
   macro avg       0.64      0.55      0.57      5626
weighted avg       0.73      0.76      0.73      5626



Just Doc Embeddings:

In [410]:
logreg_clf3 = LogisticRegression(max_iter=500)
logreg_clf3.fit(X_train_d2v,y_train)
y_pred3 = logreg_clf3.predict(X_val_d2v)
print(classification_report(y_val,y_pred3))

              precision    recall  f1-score   support

          -1       0.68      0.98      0.80      3727
           0       0.00      0.00      0.00       565
           1       0.55      0.09      0.16      1334

    accuracy                           0.67      5626
   macro avg       0.41      0.36      0.32      5626
weighted avg       0.58      0.67      0.57      5626



Just Word Embeddings:

In [411]:
logreg_clf4 = LogisticRegression(max_iter=500)
logreg_clf4.fit(X_train_ft,y_train)
y_pred4 = logreg_clf4.predict(X_val_ft)
print(classification_report(y_val,y_pred4))

              precision    recall  f1-score   support

          -1       0.79      0.92      0.85      3727
           0       0.45      0.13      0.20       565
           1       0.65      0.54      0.59      1334

    accuracy                           0.75      5626
   macro avg       0.63      0.53      0.55      5626
weighted avg       0.72      0.75      0.72      5626



Word Embeddings + Features:

In [432]:
X_train_ft1 = np.concatenate((X_train_0,X_train_ft),axis=1) # all features + FT emb
X_val_ft1 = np.concatenate((X_val_0,X_val_ft),axis=1) # all features + FT emb
logreg_clf5 = LogisticRegression()
logreg_clf5.fit(X_train_ft1,y_train)
y_pred5 = logreg_clf5.predict(X_val_ft1)
print(accuracy_score(y_val,y_pred5) )
print(balanced_accuracy_score(y_val,y_pred5))
print(classification_report(y_val,y_pred5))

0.7619978670458585
0.5562256077144367
              precision    recall  f1-score   support

          -1       0.80      0.92      0.86      3727
           0       0.48      0.18      0.26       565
           1       0.67      0.57      0.62      1334

    accuracy                           0.76      5626
   macro avg       0.65      0.56      0.58      5626
weighted avg       0.74      0.76      0.74      5626



Test different feature combinations to get best val results:

In [499]:
temp_col_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
X_train_2 = train_df_log[select_cols(col_list=temp_col_list,Bigram=False)].to_numpy()
# X_train_2 = train_df_log[select_cols(end_col_num=15,Bigram=False)].to_numpy()
X_train_2 = scaler.fit_transform(X_train_2)
X_val_2 = val_df_log[select_cols(col_list=temp_col_list,Bigram=False)].to_numpy()
# X_val_2 = val_df_log[select_cols(end_col_num=15,Bigram=False)].to_numpy()
X_val_2 = scaler.fit_transform(X_val_2)
X_train_ft2 = np.concatenate((X_train_2,X_train_ft),axis=1) # all features + FT emb
X_val_ft2 = np.concatenate((X_val_2,X_val_ft),axis=1) # all features + FT emb
logreg_clf6 = LogisticRegression()
logreg_clf6.fit(X_train_ft2,y_train)
y_pred6 = logreg_clf6.predict(X_val_ft2)
print(accuracy_score(y_val,y_pred6))
print(balanced_accuracy_score(y_val,y_pred6))
print(classification_report(y_val,y_pred6))

# 0.7842161393530039
# 0.5941908865394395

0.7842161393530039
0.5941908865394395
              precision    recall  f1-score   support

          -1       0.82      0.93      0.87      3727
           0       0.52      0.24      0.32       565
           1       0.71      0.62      0.66      1334

    accuracy                           0.78      5626
   macro avg       0.68      0.59      0.62      5626
weighted avg       0.76      0.78      0.77      5626



In [513]:
np.array(logreg_clf6.coef_)[1,22:]

array([-0.25792505, -0.25041781,  0.56375018,  0.02001205,  0.04213407,
        0.78444148, -0.21239551,  0.1095592 ,  0.12627011, -0.14101416,
        0.49108066, -0.25837697, -0.46287509,  0.03085525,  0.30895309,
        0.49858017,  0.14578567,  0.20938721,  0.08791701,  0.41204225,
        0.25806333,  0.19489786, -0.37420876,  0.25441852, -0.20194479,
       -0.34077031,  0.55715071,  0.09016396,  0.58312911,  0.07695484,
        0.11397774,  0.20077722,  0.19239341,  0.10101745,  0.22540026,
        0.52902543, -0.89207404, -0.49265925,  0.10659941, -0.12813442,
       -0.05903449, -0.069048  , -0.14622801,  0.62300087, -0.05370399,
       -0.2370373 , -0.37053659,  0.05917016, -0.16815537, -0.14304047,
       -0.89829032, -0.55927079,  0.2026039 , -0.21964684,  0.19578068,
        0.50884602, -0.00353546, -0.06262991, -0.80183266,  0.34847094,
       -0.23624475,  0.0373517 , -0.38931693,  0.32732837,  0.00699078,
       -0.69703807, -0.46258959,  0.1837601 , -0.34539193, -0.16

### Artificial Neural Networks

#### Utils

In [384]:
def convert_tensor(X,dtype):
    X = torch.from_numpy(X)
    if dtype =='float':
        X = X.type(torch.FloatTensor)
    elif dtype == 'long':
        X = X.type(torch.LongTensor)
    return X

def convert_y(y):
    y_nn = np.zeros([y.shape[0],3])
    for i in range(len(y)):
        if y[i] == -1:
            y_nn[i,0] = 1
        elif y[i] == 0:
            y_nn[i,1] = 1
        elif y[i] == 1: 
            y_nn[i,2] = 1
    return y_nn

# Troubleshooting
def check_dist(arr):
    unique, counts = np.unique(arr,return_counts=True)
    print(list(zip(unique,counts)))

y_train_nn = convert_tensor(convert_y(y_train),'float')
y_val_nn = convert_tensor(convert_y(y_val),'float')

#### Basic Neural Network

In [149]:
# Init

class ANN(nn.Module):
    def __init__(self,input_size,
                 num_units1,num_units2,
#                  num_units3,
                 act1,act2,
#                  act3
                ):
        super().__init__()
        self.hidden1 = nn.Linear(input_size,num_units1)
        self.act1 = act1
#         self.dropout1 = nn.Dropout(0.1)
#         self.hidden2 = nn.Linear(num_units1,num_units2) # tune
#         self.act2 = act2 # tune
#         self.dropout2 = nn.Dropout(0.1)
#         self.hidden3 = nn.Linear(num_units2,num_units3) # tune
#         self.act3 = act3 # tune        
        self.output = nn.Linear(num_units1,3) # tune
        self.softmax = nn.Softmax(dim=1) # tune
    def forward(self,x):
        x = self.hidden1(x)
        x = self.act1(x)
#         x = self.dropout1(x)
#         x = self.hidden2(x)
#         x = self.act2(x)
#         x = self.dropout2(x)
#         x = self.hidden3(x)
#         x = self.act3(x)
        x = self.output(x)
        x = self.softmax(x)
        return x

# Set up datasets
# # Choose datasets based on feature choice
X_train_base = scaler.fit_transform(train_df_log[select_cols(end_col_num=11)].to_numpy()) # selected features
X_val_base = scaler.fit_transform(val_df_log[select_cols(end_col_num=11)].to_numpy())
X_train_chosen = X_train_ft
X_val_chosen = X_val_ft
# X_train_chosen = X_train_base
# X_val_chosen = X_val_base
# X_train_chosen = np.concatenate((X_train_base,X_train_d2v),axis=1)
# X_val_chosen = np.concatenate((X_val_base,X_val_d2v),axis=1)
# X_train_chosen = X_train_d2v
# X_val_chosen = X_val_d2v

# # Process datasets
input_size = X_train_chosen.shape[1]
X_train_nn = convert_tensor(X_train_chosen,'float')
X_val_nn = convert_tensor(X_val_chosen,'float')

In [389]:
# Train model
def train_ann_model(model,file_name,loss_fn,optimizer,loader,cat=False,**kwargs):
    
    X_val_text_nn, X_val_ftr_nn = kwargs.get('X_val_text_nn', None),kwargs.get('X_val_ftr_nn', None)
    n_epochs = 40
    results_df = pd.DataFrame(columns=['epoch','bal_accuracy','precision','recall','fscore','support'])
    epoch_loss = []
    best_loss = float('inf')
    
    for epoch in tqdm(range(n_epochs)):
        model.train()
        if cat==False:
            for X_batch, y_batch in loader:
                y_pred_batch = model(X_batch)
                loss = loss_fn(y_pred_batch,y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        elif cat==True:
            for X_text_batch,X_ftr_batch,y_batch in loader:
                y_pred_batch = model(X_text_batch,X_ftr_batch)
                loss = loss_fn(y_pred_batch,y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        model.eval()
        
        with torch.no_grad():
            if cat==False:
                y_pred_nn = model(X_val_nn)
            elif cat==True:
                y_pred_nn = model(X_val_text_nn,X_val_ftr_nn)
                        
            y_pred = torch.argmax(y_pred_nn,1)
            y_pred = y_pred.numpy()
            y_pred[y_pred==0] = -1
            y_pred[y_pred==1] = 0
            y_pred[y_pred==2] = 1
            check_dist(y_pred)
            bal_accuracy = balanced_accuracy_score(y_val,y_pred)
            epoch_loss.append(loss)
            precision, recall, fscore, support = precision_recall_fscore_support(y_val,y_pred,average='weighted')
            results_df.loc[len(results_df)] = [epoch+1,bal_accuracy, precision,recall,fscore,support]
            val_loss = loss_fn(y_pred_nn,y_val_nn)
            tqdm.write(f"Epoch{epoch+1}: val loss={val_loss}, balanced accuracy={bal_accuracy}, precision={precision}, recall={recall}, fscore={fscore}")
            
            # Early Stopping
            if val_loss < best_loss:
                best_loss = val_loss
                best_model_weights = copy.deepcopy(model.state_dict())
                patience = 10
            else:
                patience -= 1
                if patience == 0:
                    break
    
    torch.save(best_model_weights,file_name)
    return model

In [None]:
# Model Init
# # Parameter experimentation (pre-tuning)
num_units1 = 80
num_units2 = 60
num_units3 = 5
act1 = nn.ELU()
act2 = nn.Sigmoid()
act3 = nn.Sigmoid()

# # Core Init
ann0 = ANN(input_size,
           num_units1,num_units2,
#            num_units3,
           act1,act2
#            ,act3
          )
optimizer = optim.Adam(ann0.parameters(),lr=0.004) # tune
loss_fn = nn.CrossEntropyLoss()
loader = utils.data.DataLoader(utils.data.TensorDataset(X_train_nn,y_train_nn),batch_size=2)
ann0 = train_ann_model(ann0,'ann1.pt',loss_fn,optimizer,loader)

#### Neural Network with Concat Structure

In [540]:
# Init

class ANN_Cat(nn.Module):
    def __init__(self,
                 input_size_text,input_size_ftr,
                 num_units_text1,num_units_text2,num_units_ftr1,num_units_ftr2,num_units_comb1,num_units_comb2,
                 act_text1,act_ftr1,act_comb1 
                ):
        super().__init__()
        self.hidden_text1 = nn.Linear(input_size_text,num_units_text1)
        self.act_text1 = act_text1
# #         self.dropout1 = nn.Dropout(0.1)
#         self.hidden_text2 = nn.Linear(num_units_text1,num_units_text2)
#         self.act_text2 = act_text2
        
        self.hidden_ftr1 = nn.Linear(input_size_ftr,num_units_ftr1)
        self.act_ftr1 = act_ftr1
#         self.dropout2 = nn.Dropout(0.1)
        self.hidden_ftr2 = nn.Linear(num_units_ftr1,num_units_ftr2)
        self.act_ftr2 = act_ftr2
        
        self.hidden_comb1 = nn.Linear(num_units_text1 + num_units_ftr2,num_units_comb1) # To update based on layer_num
        self.act_comb1 = act_comb1
        self.dropout3 = nn.Dropout(0.2)
#         self.hidden_comb2 = nn.Linear(num_units_comb1,num_units_comb2)
#         self.act_comb2 = act_comb2        
        self.output = nn.Linear(num_units_comb1,3) # To update based on layer_num
        self.softmax = nn.Softmax(dim=1) # tune
    
    def forward(self,x,y):
        '''
        x: text input
        y: feature input
        '''
        x = self.hidden_text1(x)
        x = self.act_text1(x)
#         x = self.dropout1(x)
#         x = self.hidden_text2(x)
#         x = self.act_text2(x)
        
        y = self.hidden_ftr1(y)
        y = self.act_ftr1(y)
#         y = self.dropout2(y)
        y = self.hidden_ftr2(y)
        y = self.act_ftr2(y)
        
        z = torch.cat((x,y),1)
        z = self.hidden_comb1(z)
        z = self.act_comb1(z)
        z = self.dropout3(z)
#         z = self.hidden_comb2(z)
#         z = self.act_comb2(z)
        z = self.output(z)
        z = self.softmax(z)
        return z

# Set up datasets
# *KIV: scale embeddnigs
X_train_text_chosen = X_train_ft
X_val_text_chosen = X_val_ft

# # Choose datasets based on feature choice
X_train_ftr = scaler.fit_transform(train_df_log[select_cols(end_col_num=15)].to_numpy()) # selected features
X_val_ftr = scaler.fit_transform(val_df_log[select_cols(end_col_num=15)].to_numpy())
X_train_ftr_chosen = X_train_ftr
X_val_ftr_chosen = X_val_ftr

# # Process datasets
input_size_text = X_train_text_chosen.shape[1]
input_size_ftr = X_train_ftr_chosen.shape[1]

X_train_text_nn = convert_tensor(X_train_text_chosen,'float')
X_val_text_nn = convert_tensor(X_val_text_chosen,'float')
X_train_ftr_nn = convert_tensor(X_train_ftr_chosen,'float')
X_val_ftr_nn = convert_tensor(X_val_ftr_chosen,'float')

In [541]:
# Initializing model
# # Parameter experimentation (pre-tuning)
num_units_text1 = 80
num_units_text2 = 60

num_units_ftr1 = 20
num_units_ftr2 = 8

num_units_comb1 = 70
num_units_comb2 = 50

act_text1 = nn.LeakyReLU()
act_text2 = nn.LeakyReLU()

act_ftr1 = nn.LeakyReLU()
act_ftr2 = nn.LeakyReLU()

act_comb1 = nn.LeakyReLU()
act_comb2 = nn.Sigmoid()

# # Initialization
ann_cat = ANN_Cat(
    input_size_text,input_size_ftr,
    num_units_text1,num_units_text2,num_units_ftr1,num_units_ftr2,num_units_comb1,num_units_comb2,
    act_text1,act_ftr1,act_comb1 
)
optimizer_cat = optim.Adam(ann_cat.parameters(),lr=0.0003) # tune
loss_fn = nn.CrossEntropyLoss()
loader_cat = utils.data.DataLoader(utils.data.TensorDataset(X_train_text_nn,X_train_ftr_nn,y_train_nn),batch_size=2)
ann_cat = train_ann_model(ann_cat,'ann_cat4.pt',loss_fn,optimizer_cat,loader_cat,cat=True,
                          X_val_text_nn=X_val_text_nn,X_val_ftr_nn=X_val_ftr_nn)

  0%|          | 0/40 [00:00<?, ?it/s]

[(-1, 4503), (0, 8), (1, 1115)]
Epoch1: val loss=0.7959750890731812, balanced accuracy=0.49462233677261347, precision=0.7175107396212932, recall=0.7500888730892286, fscore=0.7035676057240886
[(-1, 3847), (0, 1092), (1, 687)]
Epoch2: val loss=0.8455769419670105, balanced accuracy=0.5489628916104146, precision=0.734346930489069, recall=0.6919658727337362, fscore=0.6966929476449822
[(-1, 3669), (0, 1471), (1, 486)]
Epoch3: val loss=0.8752524852752686, balanced accuracy=0.5406623922213809, precision=0.7476417431953283, recall=0.6615712762175613, fscore=0.6715190987623922
[(-1, 3489), (0, 1636), (1, 501)]
Epoch4: val loss=0.8942480683326721, balanced accuracy=0.5468919335526835, precision=0.7374648499171755, recall=0.6450408816210451, fscore=0.6592340033745832
[(-1, 3780), (0, 1524), (1, 322)]
Epoch5: val loss=0.8841254711151123, balanced accuracy=0.5267929563501453, precision=0.7634600894700937, recall=0.6564166370423036, fscore=0.6564542105597174
[(-1, 3180), (0, 1950), (1, 496)]
Epoch6: 

In [542]:
# Sense Check
weights_1 = torch.load('ann_cat4.pt')
ann_cat.load_state_dict(weights_1)
y_pred_nn = ann_cat(X_val_text_nn,X_val_ftr_nn)
y_pred = torch.argmax(y_pred_nn,1)
y_pred = y_pred.numpy()
y_pred[y_pred==0] = -1
y_pred[y_pred==1] = 0
y_pred[y_pred==2] = 1
accuracy1 = accuracy_score(y_val,y_pred) # = weighted recall
bal_accuracy1 = balanced_accuracy_score(y_val,y_pred)
print(accuracy1,bal_accuracy1)

0.7500888730892286 0.49462233677261347


KIV: Hyperparameter Tuning

In [60]:
ann_hp = NeuralNetClassifier(
    ANN,
    criterion = nn.CrossEntropyLoss,
    max_epochs = 10,
    optimizer = optim.Adam,
    train_split = None,
    module__input_size = input_size
    
)
params = {
    'optimizer__lr':list(np.linspace(0.001,0.005,5)),
    'module__num_units1':list(np.arange(100,300,50)),
    'module__num_units2':list(np.arange(50,250,50)),
    'module__act1':[nn.ELU(),nn.LeakyReLU(),nn.Sigmoid(),nn.Tanh()],
    'module__act2':[nn.ELU(),nn.LeakyReLU(),nn.Sigmoid(),nn.Tanh()],
    'batch_size': [int(i) for i in list(np.arange(1,5))]
}
gs = GridSearchCV(ann_hp,param_grid=params,cv=3,scoring='balanced_accuracy',verbose=1)
gs.fit(X_train_nn,y_train_nn)

# Issues:
# # Raytune doesn't support Windows
# # GridSearch takes too long

Fitting 3 folds for each of 5120 candidates, totalling 15360 fits
  epoch    train_loss      dur
-------  ------------  -------
      1        [36m0.9027[0m  23.1220
      2        [36m0.9017[0m  22.5110
      3        [36m0.9017[0m  21.9067
      4        0.9017  22.2528
      5        0.9017  22.1930
      6        0.9017  22.9600
      7        0.9017  22.4477
      8        0.9017  23.0299
      9        0.9017  22.7174
     10        0.9017  22.1013
  epoch    train_loss      dur
-------  ------------  -------
      1        [36m0.9013[0m  22.3828
      2        [36m0.9002[0m  22.6431
      3        [36m0.9002[0m  21.7757
      4        0.9002  22.4238
      5        0.9002  22.8037
      6        0.9002  23.3393
      7        0.9002  23.4128
      8        0.9002  22.5803
      9        0.9002  22.8700
     10        0.9002  22.2768
  epoch    train_loss      dur
-------  ------------  -------
      1        [36m0.9054[0m  22.6206
      2        [36m0.9044[0m  22

KeyboardInterrupt: 

## Generate Test Predictions

In [183]:
def generate_test_csv(y_pred,filename):
    '''
    y_pred: array
    '''
    test_predictions = pd.DataFrame(y_pred,columns=['Verdict'])
    test_predictions['Sentence_id'] = test_data['Sentence_id']
    test_predictions.to_csv(filename,index=False)

### Neural Network

In [341]:
test_data = pd.read_csv('test.csv',
                   dtype={'Sentence_id':int,'Text':str,'Verdict':int})
test_data['Text_lower'] = test_data['Text'].apply(lambda x:x.lower())
test_data['Tokens'] = test_data['Text_lower'].apply(word_tokenize)
test_data['Tokens_lem'] = test_data['Tokens'].apply(lambda x:[lemmatizer.lemmatize(i) for i in x])
test_data['Tokens_no_stop'] = test_data['Tokens'].map(lambda x: [w for w in x if not w.lower() in stop_words])
test_data['Tokens_lem_no_stop'] = test_data['Tokens_lem'].map(lambda x: [w for w in x if not w.lower() in stop_words])
test_data['Tokens_2'] = test_data['Tokens'].apply(generate_bigram_list)
test_data['POS_tags'] = test_data['Tokens'].apply(generate_pos_list)

In [472]:
nb_all = NaiveBayes(data,'Tokens') # Try with lemmatization and including stop words
nb_all.generate_prob()
all_df_likelihood = nb_all.get_pred(data)
test_df_likelihood = nb_all.get_pred(test_data)

  0%|          | 0/12410 [00:00<?, ?it/s]

  0%|          | 0/22501 [00:00<?, ?it/s]

  0%|          | 0/1032 [00:00<?, ?it/s]

In [475]:
nb_pos_all = NaiveBayes(data,'POS_tags',nb_type='pos')
nb_pos_all.generate_prob()
df_nb_pos_all = nb_pos_all.get_pred(data)
df_nb_pos_test = nb_pos_all.get_pred(test_data)

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/22501 [00:00<?, ?it/s]

  0%|          | 0/1032 [00:00<?, ?it/s]

In [346]:
nb2_all = NaiveBayes(data,'Tokens_2',nb_type='bigram')
nb2_all.generate_prob()
df_nb2_all = nb2_all.get_pred(data)
df_nb2_test = nb2_all.get_pred(test_data)

  0%|          | 0/112242 [00:00<?, ?it/s]

  0%|          | 0/22501 [00:00<?, ?it/s]

  0%|          | 0/1032 [00:00<?, ?it/s]

Retrain on all data

In [480]:
# # Prepare all_dataset (required for other models)
features_df = generate_logreg_features(data,all_df_likelihood,df_nb2_all,df_nb_pos_all)
corpus_all = data['Tokens'].tolist()
ft = FastText(corpus_all,vector_size=ft_size,epochs=10)
X_all_ft = generate_ft_emb(data,ft) # word embeddings only
X_all_0 = features_df[select_cols(end_col_num=15)].to_numpy() # features only
X_all_0 = scaler.fit_transform(X_all_0)

  0%|          | 0/22501 [00:00<?, ?it/s]

In [523]:
temp_df = train_df
temp_df = temp_df[['Tokens','Tokens_lem']]
temp_df

Unnamed: 0,Tokens,Tokens_lem
0,"[and, i, want, to, come, back, to, something, ...","[and, i, want, to, come, back, to, something, ..."
1,"[one, of, the, overwhelming, results, that, i,...","[one, of, the, overwhelming, result, that, i, ..."
2,"[i, 'm, confident, that, it, can, be, done, an...","[i, 'm, confident, that, it, can, be, done, an..."
3,"[but, here, 's, the, point, he, misses, .]","[but, here, 's, the, point, he, miss, .]"
4,"[i, mean, ,, this, is, the, president, who, sa...","[i, mean, ,, this, is, the, president, who, sa..."
...,...,...
16870,"[as, the, result, in, the, last, uh, -, two, y...","[a, the, result, in, the, last, uh, -, two, ye..."
16871,"[they, will, know, whether, we, used, those, w...","[they, will, know, whether, we, used, those, w..."
16872,"[i, 'm, awfully, glad, you, ge-, got, that, qu...","[i, 'm, awfully, glad, you, ge-, got, that, qu..."
16873,"[and, china, 's, a, got, a, lot, of, influence...","[and, china, 's, a, got, a, lot, of, influence..."


In [370]:
# # Process datasets for nn

X_text_chosen = X_val_ft
X_ftr_chosen = X_val_ftr
input_size_text = X_text_chosen.shape[1]
input_size_ftr = X_ftr_chosen.shape[1]

X_text_nn = convert_tensor(X_text_chosen,'float')
X_ftr_nn = convert_tensor(X_ftr_chosen,'float')
y_chosen = y_val
y_chosen_nn = convert_tensor(convert_y(y_chosen),'float')

# # Train NN model
loader_cat_final = utils.data.DataLoader(utils.data.TensorDataset(X_text_nn,X_ftr_nn,y_chosen_nn),batch_size=2)
ann_cat_final = train_ann_model(ann_cat,'ann_cat_final.pt',loss_fn,optimizer_cat,loader_cat_final,cat=True,
                          X_val_text_nn=X_val_text_nn,X_val_ftr_nn=X_val_ftr_nn)

  0%|          | 0/40 [00:00<?, ?it/s]

[(-1, 4550), (0, 68), (1, 1008)]
Epoch1: balanced accuracy=0.5046161140765908, precision=0.7258179248346225, recall=0.7488446498400284, fscore=0.7091478627311959
[(-1, 4360), (0, 555), (1, 711)]
Epoch2: balanced accuracy=0.5503606971632847, precision=0.7386383058620923, recall=0.7410238179879133, fscore=0.7232824680507491
[(-1, 4553), (0, 319), (1, 754)]
Epoch3: balanced accuracy=0.5222296344627181, precision=0.7182486405608337, recall=0.7403128332740846, fscore=0.7110298520796047
[(-1, 4596), (0, 81), (1, 949)]
Epoch4: balanced accuracy=0.510236533065175, precision=0.7348399437848964, recall=0.754888019907572, fscore=0.7155310706510672
[(-1, 5173), (0, 28), (1, 425)]
Epoch5: balanced accuracy=0.42198913222926276, precision=0.7106565757599018, recall=0.7163170991823676, fscore=0.6436426005745558
[(-1, 3999), (0, 39), (1, 1588)]
Epoch6: balanced accuracy=0.5348925507328325, precision=0.7315538695818725, recall=0.7468894418769997, fscore=0.7154637438594404
[(-1, 4295), (0, 41), (1, 1290)

In [481]:
# Prepare test dataset (required for other models)
features_test_df = generate_logreg_features(test_data,test_df_likelihood,df_nb2_test,df_nb_pos_test)
# ft.train(test_data['Tokens'].tolist(),total_examples=ft.corpus_count,epochs=10)
X_test_ft = generate_ft_emb(test_data,ft) # word embeddings only
X_test_0 = features_test_df[select_cols(end_col_num=15)].to_numpy() # features only
X_test_0 = scaler.fit_transform(X_test_0)
X_test_text_nn = convert_tensor(X_test_ft,'float')
X_test_ftr_nn = convert_tensor(X_test_0,'float')

  0%|          | 0/1032 [00:00<?, ?it/s]

In [375]:
y_pred_nn = ann_cat(X_test_text_nn,X_test_ftr_nn) #try without training
y_pred = torch.argmax(y_pred_nn,1)
y_pred = y_pred.numpy()
y_pred[y_pred==0] = -1
y_pred[y_pred==1] = 0
y_pred[y_pred==2] = 1
generate_test_csv(y_pred,'test_predictions_ann.csv')

### Naive Bayes

In [470]:
nb_test_pred = nb_all.get_pred(test_data)
y_pred = nb_test_pred['Prediction'].tolist()
generate_test_csv(y_pred,'test_predictions_nb.csv')

  0%|          | 0/1032 [00:00<?, ?it/s]

### Logistic Regression

In [482]:
X_all_01 = features_df[select_cols(end_col_num=15,Bigram=False)].to_numpy()  
X_all_01 = scaler.fit_transform(X_all_01)
X_all_ft1 = np.concatenate((X_all_01,X_all_ft),axis=1)

X_test_01 = features_test_df[select_cols(end_col_num=15,Bigram=False)].to_numpy()
X_test_01 = scaler.fit_transform(X_test_01)
X_test_ft1 = np.concatenate((X_test_01,X_test_ft),axis=1)

logreg_all_clf = LogisticRegression(max_iter=500)
logreg_all_clf.fit(X_all_ft1,y_all)
y_pred = logreg_all_clf.predict(X_test_ft1)
generate_test_csv(y_pred,'test_predictions_LR.csv')

# Best result so far: Bigram 0 POS 1 Bayesian 1