# Word Embeddings + various classification algorithms

In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import gensim
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer

import time
import csv
from tqdm import tqdm
from bs4 import BeautifulSoup
import re, string
import nltk
from nltk.corpus import stopwords
import pickle
from tabulate import tabulate

import xgboost as xgb

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [31]:
fake = pd.read_csv('../../data/Fake.csv')
true = pd.read_csv('../../data/True.csv')

fake["label"] = 0
true["label"] = 1

df = pd.concat([fake, true], ignore_index = True)

df['text'] = df['title'] + " " + df['text']
df.drop(columns=['title', 'date', 'subject'], inplace = True)

In [32]:
%%time
nltk.download('stopwords')
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)


def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    text = re.sub('\[[^]]*\]', '', text)
    return re.sub(r'http\S+', '', text)

def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)
    
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_stopwords(text)
    return text

df['text']=df['text'].apply(denoise_text)

[nltk_data] Downloading package stopwords to /home/szymon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 7.64 s, sys: 83.9 ms, total: 7.73 s
Wall time: 7.75 s


---

Reduce dataset for testing purposes

In [33]:
# df_original = df.copy()
# df = df.sample(frac=1).reset_index(drop=True)[:1000]

---

# Embedding

In [34]:
redo_embedding = False # recalculate embeddings
fast = True # True if use reduced dataset (1000 obs) vs. False if full dataset (40000 obs)

In [35]:
%%time
# Load data

if fast:
    df_original = df.copy()
    df = df.sample(frac=1).reset_index(drop=True)[:1000]

if redo_embedding:
    X = df['text'].tolist()
    y = df['label'].tolist()

    with open("X", "wb") as fp:
      pickle.dump(X, fp)
    with open("y", "wb") as fp:
      pickle.dump(y, fp)
elif fast:
    with open("../../data/small/X", "rb") as fp:
      X = pickle.load(fp)
    with open("../../data/small/y", "rb") as fp:
      y = pickle.load(fp)
else:
    with open("../../data/X", "rb") as fp:
      X = pickle.load(fp)
    with open("../../data/y", "rb") as fp:
      y = pickle.load(fp)
    
if redo_embedding:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    with open("X_train", "wb") as fp:
      pickle.dump(X_train, fp)
    with open("X_test", "wb") as fp:
      pickle.dump(X_test, fp)
    with open("y_train", "wb") as fp:
      pickle.dump(y_train, fp)
    with open("y_test", "wb") as fp:
      pickle.dump(y_test, fp)
elif fast:
    with open("../../data/small/X_train", "rb") as fp:
      X_train = pickle.load(fp)
    with open("../../data/small/X_test", "rb") as fp:
      X_test = pickle.load(fp)   
    with open("../../data/small/y_train", "rb") as fp:
      y_train = pickle.load(fp)
    with open("../../data/small/y_test", "rb") as fp:
      y_test = pickle.load(fp)
else:
    with open("../../data/X_train", "rb") as fp:
      X_train = pickle.load(fp)
    with open("../../data/X_test", "rb") as fp:
      X_test = pickle.load(fp)   
    with open("../../data/y_train", "rb") as fp:
      y_train = pickle.load(fp)
    with open("../../data/y_test", "rb") as fp:
      y_test = pickle.load(fp)

CPU times: user 20.3 ms, sys: 13 µs, total: 20.3 ms
Wall time: 19.5 ms


## BERT Embedding

In [36]:
if redo_embedding:
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert.to(device)

    def _get_bert_embedding(text):
        input_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)
        input_ids = np.array(input_ids)
        input_ids = np.expand_dims(input_ids, axis=0)
        input_ids = torch.tensor(input_ids).to(device)

        with torch.no_grad():
            outputs = bert(input_ids)
            last_hidden_state = outputs.last_hidden_state
            last_hidden_state = last_hidden_state[:, 0, :].cpu().numpy()

        return last_hidden_state

    print("TRAIN")
    X_train_embeddings = []
    for text in tqdm(X_train):
        embedding = _get_bert_embedding(text)
        X_train_embeddings.append(embedding)
    X_train_embeddings = np.array(X_train_embeddings)
    X_train_embeddings_bert = np.squeeze(X_train_embeddings, axis=1)

    print("TEST")
    X_test_embeddings = []
    for text in tqdm(X_test):
        embedding = _get_bert_embedding(text)
        X_test_embeddings.append(embedding)
    X_test_embeddings = np.array(X_test_embeddings)
    X_test_embeddings_bert = np.squeeze(X_test_embeddings, axis=1)
    
    if fast:
        pd.DataFrame(X_train_embeddings_bert).to_csv("../../data/small/embeddings/X_train_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_bert).to_csv("../../data/small/embeddings/X_test_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    else:
        pd.DataFrame(X_train_embeddings_bert).to_csv("../../data/embeddings/X_train_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_bert).to_csv("../../data/embeddings/X_test_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    
elif fast:
    X_train_embeddings_bert = pd.read_csv('../../data/small/embeddings/X_train_embeddings_bert.csv', sep=',', header=None).values
    X_test_embeddings_bert = pd.read_csv('../../data/small/embeddings/X_test_embeddings_bert.csv', sep=',', header=None).values
else:
    X_train_embeddings_bert = pd.read_csv('../../data/embeddings/X_train_embeddings_bert.csv', sep=',', header=None).values
    X_test_embeddings_bert = pd.read_csv('../../data/embeddings/X_test_embeddings_bert.csv', sep=',', header=None).values

## GloVe Embedding

In [37]:
if redo_embedding:

    def load_glove_embeddings(filename):
        embeddings_index = {}
        with open(filename) as f:
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                if len(values[1:]) == 300:
                    coefs = np.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
        return embeddings_index

    glove_embeddings = load_glove_embeddings('../../glove/glove.840B.300d.txt')

    def text_to_glove_embeddings(text, embeddings_index, embedding_dim):
        embeddings = []
        for sentence in text:
            sentence_embeddings = []
            for word in sentence.split():
                if word in embeddings_index:
                    sentence_embeddings.append(embeddings_index[word])
            if len(sentence_embeddings) > 0:
                embeddings.append(np.mean(sentence_embeddings, axis=0))
            else:
                embeddings.append(np.zeros(embedding_dim))
        return np.array(embeddings)

    X_train_embeddings_glove = text_to_glove_embeddings(X_train, glove_embeddings, embedding_dim=300)
    X_test_embeddings_glove = text_to_glove_embeddings(X_test, glove_embeddings, embedding_dim=300)
    
    if fast:
        pd.DataFrame(X_train_embeddings_glove).to_csv("../../data/small/embeddings/X_train_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_glove).to_csv("../../data/small/embeddings/X_test_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    else:
        pd.DataFrame(X_train_embeddings_glove).to_csv("../../data/embeddings/X_train_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_glove).to_csv("../../data/embeddings/X_test_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)

elif fast:
    X_train_embeddings_glove = pd.read_csv('../../data/small/embeddings/X_train_embeddings_glove.csv', sep=',', header=None).values
    X_test_embeddings_glove = pd.read_csv('../../data/small/embeddings/X_test_embeddings_glove.csv', sep=',', header=None).values

else:
    X_train_embeddings_glove = pd.read_csv('../../data/embeddings/X_train_embeddings_glove.csv', sep=',', header=None).values
    X_test_embeddings_glove = pd.read_csv('../../data/embeddings/X_test_embeddings_glove.csv', sep=',', header=None).values

## Word2Vec

In [38]:
if redo_embedding:
    model = gensim.models.KeyedVectors.load_word2vec_format('../../word2vec/GoogleNews-vectors-negative300.bin.gz', binary=True)

def get_word2vec_embeddings(text):
    embeddings = []
    for sentence in tqdm(text):
        tokens = sentence.split()
        doc_vecs = [model[token] for token in tokens if token in model.key_to_index]
        if len(doc_vecs) > 0:
            doc_vec = np.mean(doc_vecs, axis=0)
            embeddings.append(doc_vec)
    return np.array(embeddings)


if redo_embedding:
    # noinspection PyUnboundLocalVariable
    X_train_embeddings_word2vec = get_word2vec_embeddings(X_train)
    X_test_embeddings_word2vec = get_word2vec_embeddings(X_test)

    if fast:
        pd.DataFrame(X_train_embeddings_word2vec).to_csv("../../data/small/embeddings/X_train_embeddings_word2vec_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_word2vec).to_csv("../../data/small/embeddings/X_test_embeddings_word2vec_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    else:
        pd.DataFrame(X_train_embeddings_word2vec).to_csv("../../data/embeddings/X_train_embeddings_word2vec_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_word2vec).to_csv("../../data/embeddings/X_test_embeddings_word2vec_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)

elif fast:
    X_train_embeddings_word2vec = pd.read_csv('../../data/small/embeddings/X_train_embeddings_word2vec.csv', sep=',', header=None).values
    X_test_embeddings_word2vec = pd.read_csv('../../data/small/embeddings/X_test_embeddings_word2vec.csv', sep=',', header=None).values

else:
    X_train_embeddings_word2vec = pd.read_csv('../../data/embeddings/X_train_embeddings_word2vec.csv', sep=',', header=None).values
    X_test_embeddings_word2vec = pd.read_csv('../../data/embeddings/X_test_embeddings_word2vec.csv', sep=',', header=None).values

## GPT2

In [39]:
if redo_embedding:
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    gpt2 = GPT2LMHeadModel.from_pretrained("gpt2", output_hidden_states=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gpt2.to(device)

    def _get_gpt2_embedding(text):
        input_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=1024)
        input_ids = torch.tensor(input_ids).unsqueeze(0).to(device).long()

        with torch.no_grad():
            embeddings = gpt2.transformer.wte(input_ids)
            mean_embedding = embeddings.mean(dim=1)
        #     outputs = bert(input_ids)
        #     last_hidden_state = outputs.last_hidden_state
        #     last_hidden_state = last_hidden_state[:, 0, :].cpu().numpy()

            #vector = gpt2.transformer.wte.weight[input_ids,:]
        mean_embedding = mean_embedding.cpu().numpy()
        return mean_embedding

    print("TRAIN")
    X_train_embeddings = []
    for text in tqdm(X_train):
        embedding = _get_gpt2_embedding(text)
        X_train_embeddings.append(embedding)
    X_train_embeddings = np.array(X_train_embeddings)
    X_train_embeddings_gpt2 = np.squeeze(X_train_embeddings, axis=1)

    print("TEST")
    X_test_embeddings = []
    for text in tqdm(X_test):
        embedding = _get_gpt2_embedding(text)
        X_test_embeddings.append(embedding)
    X_test_embeddings = np.array(X_test_embeddings)
    X_test_embeddings_gpt2 = np.squeeze(X_test_embeddings, axis=1)

    if fast:
        pd.DataFrame(X_train_embeddings_gpt2).to_csv("../../data/small/embeddings/X_train_embeddings_gpt2_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_gpt2).to_csv("../../data/small/embeddings/X_test_embeddings_gpt2_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    else:
        pd.DataFrame(X_train_embeddings_gpt2).to_csv("../../data/embeddings/X_train_embeddings_gpt2_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_gpt2).to_csv("../../data/embeddings/X_test_embeddings_gpt2_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)

elif fast:
    X_train_embeddings_gpt2 = pd.read_csv('../../data/small/embeddings/X_train_embeddings_gpt2.csv', sep=',', header=None).values
    X_test_embeddings_gpt2 = pd.read_csv('../../data/small/embeddings/X_test_embeddings_gpt2.csv', sep=',', header=None).values
else:
    X_train_embeddings_gpt2 = pd.read_csv('../../data/embeddings/X_train_embeddings_gpt2.csv', sep=',', header=None).values
    X_test_embeddings_gpt2 = pd.read_csv('../../data/embeddings/X_test_embeddings_gpt2.csv', sep=',', header=None).values

## RoBERTa

In [40]:
if redo_embedding:
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    roberta = AutoModel.from_pretrained("roberta-base", output_hidden_states=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    roberta.to(device)

    def _get_roberta_embedding(text):
        input_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)
        input_ids = np.array(input_ids)
        input_ids = np.expand_dims(input_ids, axis=0)
        input_ids = torch.tensor(input_ids).to(device)

        with torch.no_grad():
            outputs = roberta(input_ids)
            last_hidden_state = outputs.last_hidden_state
            last_hidden_state = last_hidden_state[:, 0, :].cpu().numpy()

        return last_hidden_state

    print("TRAIN")
    X_train_embeddings = []
    for text in tqdm(X_train):
        embedding = _get_roberta_embedding(text)
        X_train_embeddings.append(embedding)
    X_train_embeddings = np.array(X_train_embeddings)
    X_train_embeddings_roberta = np.squeeze(X_train_embeddings, axis=1)

    print("TEST")
    X_test_embeddings = []
    for text in tqdm(X_test):
        embedding = _get_roberta_embedding(text)
        X_test_embeddings.append(embedding)
    X_test_embeddings = np.array(X_test_embeddings)
    X_test_embeddings_roberta = np.squeeze(X_test_embeddings, axis=1)

    if fast:
        pd.DataFrame(X_train_embeddings_roberta).to_csv("../../data/small/embeddings/X_train_embeddings_roberta_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_roberta).to_csv("../../data/small/embeddings/X_test_embeddings_roberta_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    else:
        pd.DataFrame(X_train_embeddings_roberta).to_csv("../../data/embeddings/X_train_embeddings_roberta_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_roberta).to_csv("../../data/embeddings/X_test_embeddings_roberta_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)

elif fast:
    X_train_embeddings_roberta = pd.read_csv('../../data/small/embeddings/X_train_embeddings_roberta.csv', sep=',', header=None).values
    X_test_embeddings_roberta = pd.read_csv('../../data/small/embeddings/X_test_embeddings_roberta.csv', sep=',', header=None).values
else:
    X_train_embeddings_roberta = pd.read_csv('../../data/embeddings/X_train_embeddings_roberta.csv', sep=',', header=None).values
    X_test_embeddings_roberta = pd.read_csv('../../data/embeddings/X_test_embeddings_roberta.csv', sep=',', header=None).values

# Classification

## Wrappers

### KNN

In [41]:
class KNNClassifier:
    def __init__(self, n_neighbors=2, weights='uniform', metric='minkowski'):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.model = None

    def fit(self, X_train, y_train):
        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights=self.weights, metric=self.metric)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)
        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = KNeighborsClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.n_neighbors = random_search.best_params_['n_neighbors']
        self.weights = random_search.best_params_['weights']
        self.metric = random_search.best_params_['metric']

        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights=self.weights, metric=self.metric)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### XGBoost

In [42]:
class XGBoostClassifier:
    def __init__(self, learning_rate=0.1, max_depth=5, min_child_weight=1, subsample=0.5, colsample_bytree=0.5, n_estimators=100, objective='req:squarederror'):
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_estimators = n_estimators
        self.objective = objective
        self.model = None

    def fit(self, X_train, y_train):
        self.model = xgb.XGBClassifier(learning_rate=self.learning_rate, max_depth=self.max_depth, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, n_estimators=self.n_estimators, objective=self.objective)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = xgb.XGBClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.learning_rate = random_search.best_params_['learning_rate']
        self.max_depth = random_search.best_params_['max_depth']
        self.min_child_weight = random_search.best_params_['min_child_weight']
        self.subsample = random_search.best_params_['subsample']
        self.colsample_bytree = random_search.best_params_['colsample_bytree']
        self.n_estimators = random_search.best_params_['n_estimators']
        self.objective = random_search.best_params_['objective']

        self.model = xgb.XGBClassifier(learning_rate=self.learning_rate, max_depth=self.max_depth, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, n_estimators=self.n_estimators, objective=self.objective)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### Random Forest

In [43]:
class RFClassifier:
    def __init__(self, n_estimators=100, max_features='sqrt', max_depth='none', bootstrap=True):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.bootstrap = bootstrap
        self.model = None

    def fit(self, X_train, y_train):
        self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_features=self.max_features, max_depth=self.max_depth, bootstrap=self.bootstrap, verbose=True)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = RandomForestClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.n_estimators = random_search.best_params_['n_estimators']
        self.max_features = random_search.best_params_['max_features']
        self.max_depth = random_search.best_params_['max_depth']
        self.bootstrap = random_search.best_params_['bootstrap']

        self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_features=self.max_features, max_depth=self.max_depth, bootstrap=self.bootstrap)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### SVC

In [44]:
class SVClassifier:
    def __init__(self, C = 1, kernel='linear', gamma = 0.2):
        self.C = C
        self.kernel = kernel
        self.gamma = gamma
        self.model = None

    def fit(self, X_train, y_train):
        self.model = svm.SVC(C=self.C, kernel=self.kernel, gamma=self.gamma, verbose=True)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = svm.SVC()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.C = random_search.best_params_['C']
        self.kernel = random_search.best_params_['kernel']
        self.gamma = random_search.best_params_['gamma']

        self.model = svm.SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### Logistic Regression

In [45]:
class LRClassifier:
    def __init__(self, penalty = 'l2', solver = 'libinear', C = 0.5):
        self.penalty = penalty
        self.solver = solver
        self.C = C
        self.model = None

    def fit(self, X_train, y_train):
        self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = LogisticRegression()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.penalty = random_search.best_params_['penalty']
        self.solver = random_search.best_params_['solver']
        self.C = random_search.best_params_['C']

        self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### Neural Network

In [46]:
class FakeNewsClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(FakeNewsClassifier, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        output = self.fc2(hidden)
        return output

class NeuralNetworkClassifier:
    def __init__(self, input_dim, hidden_dim, output_dim=2):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.model = None

    def fit(self, X_train, y_train, num_epochs=10, lr=0.001):
        self.model = FakeNewsClassifier(self.input_dim, self.hidden_dim, self.output_dim)
        self.model = self.model.double()

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=lr)

        for _ in tqdm(range(num_epochs), desc="Epoch"):
            for embedding, label in zip(X_train, y_train):
                embedding_tensor = torch.from_numpy(embedding).double().unsqueeze(0)
                label_tensor = torch.tensor([label])
                optimizer.zero_grad()
                outputs = self.model(embedding_tensor)
                loss = criterion(outputs, label_tensor)
                loss.backward()
                optimizer.step()

    def predict(self, X_test):
        if self.model is None:
            raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        test_inputs = torch.from_numpy(X_test).double()
        predictions = self.model(test_inputs)
        predicted_classes = torch.argmax(predictions, dim=1)
        predicted_classes = predicted_classes.numpy()
        return predicted_classes

    def evaluate(self, X_test, y_test):
        if self.model is None:
            raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100, 1)
        f1 = round(f1_score(y_test, y_pred)*100, 1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

## BERT embeddings

### BERT + KNN

In [47]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_knn_bert_train, accuracy_knn_bert_train, f1_knn_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_knn_bert_test, accuracy_knn_bert_test, f1_knn_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=minkowski, n_neighbors=7, weights=uniform;, score=0.931 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=7, weights=uniform;, score=0.975 total time=   0.0s
[CV 3/5] END metric=minkowski, n_neighbors=7, weights=uniform;, score=0.950 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=7, weights=uniform;, score=0.981 total time=   0.0s
[CV 5/5] END metric=minkowski, n_neighbors=7, weights=uniform;, score=0.950 total time=   0.0s
[CV 1/5] END metric=euclidean, n_neighbors=6, weights=uniform;, score=0.938 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=6, weights=uniform;, score=0.963 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=6, weights=uniform;, score=0.938 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=6, weights=uniform;, score=0.981 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=6, weights=uniform;, score=0.969 total t

### BERT + XGBoost

In [48]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_xgb_bert_train, accuracy_xgb_bert_train, f1_xgb_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_xgb_bert_test, accuracy_xgb_bert_test, f1_xgb_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=7, min_child_weight=5, n_estimators=100, objective=reg:squarederror, subsample=0.7;, score=0.912 total time=   0.6s
[CV 2/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=7, min_child_weight=5, n_estimators=100, objective=reg:squarederror, subsample=0.7;, score=0.975 total time=   0.6s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=7, min_child_weight=5, n_estimators=100, objective=reg:squarederror, subsample=0.7;, score=0.919 total time=   0.6s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=7, min_child_weight=5, n_estimators=100, objective=reg:squarederror, subsample=0.7;, score=0.963 total time=   0.6s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=7, min_child_weight=5, n_estimators=100, objective=reg:squarederror, subsample=0.7;, score=0.938 total time=   0.6s
[CV 1/5] END colsample_bytree=0.5, lea

### BERT + Random Forest

In [49]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25], 
    'max_features': [5, 10],
    'max_depth': [10, 50, None], 
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_rf_bert_train, accuracy_rf_bert_train, f1_rf_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_rf_bert_test, accuracy_rf_bert_test, f1_rf_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=10;, score=0.919 total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=10;, score=0.938 total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=10;, score=0.938 total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=10;, score=0.956 total time=   0.1s
[CV 5/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=10;, score=0.938 total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=25;, score=0.938 total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=25;, score=0.969 total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=10, max_features=10, n_estimators=25;, score=0.944 total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=10, max_features=1

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished


### BERT + SVC

In [50]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_svc_bert_train, accuracy_svc_bert_train, f1_svc_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_svc_bert_test, accuracy_svc_bert_test, f1_svc_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.963 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.969 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.969 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.988 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.956 total time=   0.0s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.950 total time=   0.0s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.981 total time=   0.0s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.969 total time=   0.0s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.994 total time=   0.0s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.963 total time=   0.0s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.956 total time=   0.1s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### BERT + Logistic Regression

In [51]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_lr_bert_train, accuracy_lr_bert_train, f1_lr_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_lr_bert_test, accuracy_lr_bert_test, f1_lr_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV 1/5] END .C=0.42, penalty=none, solver=saga;, score=0.963 total time=   0.4s




[CV 2/5] END .C=0.42, penalty=none, solver=saga;, score=0.963 total time=   0.4s




[CV 3/5] END .C=0.42, penalty=none, solver=saga;, score=0.963 total time=   0.5s




[CV 4/5] END .C=0.42, penalty=none, solver=saga;, score=0.994 total time=   0.4s




[CV 5/5] END .C=0.42, penalty=none, solver=saga;, score=0.963 total time=   0.4s
[CV 1/5] END C=0.62, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.62, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END C=0.62, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END C=0.62, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END C=0.62, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/5] END C=0.68, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END C=0.68, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END C=0.68, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END C=0.68, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END C=0.68, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s




[CV 1/5] END ...C=0.32, penalty=l2, solver=saga;, score=0.950 total time=   0.4s




[CV 2/5] END ...C=0.32, penalty=l2, solver=saga;, score=0.975 total time=   0.4s




[CV 3/5] END ...C=0.32, penalty=l2, solver=saga;, score=0.963 total time=   0.4s




[CV 4/5] END ...C=0.32, penalty=l2, solver=saga;, score=0.994 total time=   0.4s




[CV 5/5] END ...C=0.32, penalty=l2, solver=saga;, score=0.950 total time=   0.4s
[CV 1/5] END C=0.53, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.53, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.53, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.53, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.53, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s




[CV 1/5] END C=0.5700000000000001, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 2/5] END C=0.5700000000000001, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 3/5] END C=0.5700000000000001, penalty=none, solver=sag;, score=0.963 total time=   0.4s




[CV 4/5] END C=0.5700000000000001, penalty=none, solver=sag;, score=0.994 total time=   0.4s




[CV 5/5] END C=0.5700000000000001, penalty=none, solver=sag;, score=0.956 total time=   0.3s




[CV 1/5] END .C=0.27, penalty=none, solver=saga;, score=0.969 total time=   0.5s




[CV 2/5] END .C=0.27, penalty=none, solver=saga;, score=0.963 total time=   0.4s




[CV 3/5] END .C=0.27, penalty=none, solver=saga;, score=0.963 total time=   0.4s




[CV 4/5] END .C=0.27, penalty=none, solver=saga;, score=0.994 total time=   0.4s




[CV 5/5] END .C=0.27, penalty=none, solver=saga;, score=0.956 total time=   0.4s
[CV 1/5] END C=0.88, penalty=l1, solver=liblinear;, score=0.944 total time=   0.1s
[CV 2/5] END C=0.88, penalty=l1, solver=liblinear;, score=0.975 total time=   0.1s
[CV 3/5] END C=0.88, penalty=l1, solver=liblinear;, score=0.950 total time=   0.1s
[CV 4/5] END C=0.88, penalty=l1, solver=liblinear;, score=0.981 total time=   0.1s
[CV 5/5] END C=0.88, penalty=l1, solver=liblinear;, score=0.938 total time=   0.1s
[CV 1/5] END C=0.36, penalty=none, solver=lbfgs;, score=0.950 total time=   0.0s
[CV 2/5] END C=0.36, penalty=none, solver=lbfgs;, score=0.963 total time=   0.0s




[CV 3/5] END C=0.36, penalty=none, solver=lbfgs;, score=0.963 total time=   0.0s
[CV 4/5] END C=0.36, penalty=none, solver=lbfgs;, score=0.988 total time=   0.0s
[CV 5/5] END C=0.36, penalty=none, solver=lbfgs;, score=0.963 total time=   0.0s




[CV 1/5] END ...C=0.91, penalty=l1, solver=saga;, score=0.950 total time=   0.7s




[CV 2/5] END ...C=0.91, penalty=l1, solver=saga;, score=0.969 total time=   0.7s




[CV 3/5] END ...C=0.91, penalty=l1, solver=saga;, score=0.956 total time=   0.9s




[CV 4/5] END ...C=0.91, penalty=l1, solver=saga;, score=0.994 total time=   1.0s




[CV 5/5] END ...C=0.91, penalty=l1, solver=saga;, score=0.956 total time=   1.0s
[CV 1/5] END C=0.6900000000000001, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 2/5] END C=0.6900000000000001, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 3/5] END C=0.6900000000000001, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 4/5] END C=0.6900000000000001, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 5/5] END C=0.6900000000000001, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 1/5] END C=0.89, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.89, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.89, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.89, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.89, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s




[CV 1/5] END ...C=0.84, penalty=l2, solver=saga;, score=0.969 total time=   0.5s




[CV 2/5] END ...C=0.84, penalty=l2, solver=saga;, score=0.963 total time=   0.6s




[CV 3/5] END ...C=0.84, penalty=l2, solver=saga;, score=0.963 total time=   0.6s




[CV 4/5] END ...C=0.84, penalty=l2, solver=saga;, score=0.994 total time=   0.6s




[CV 5/5] END ...C=0.84, penalty=l2, solver=saga;, score=0.950 total time=   0.7s
[CV 1/5] END C=0.5700000000000001, penalty=l1, solver=liblinear;, score=0.938 total time=   0.1s
[CV 2/5] END C=0.5700000000000001, penalty=l1, solver=liblinear;, score=0.975 total time=   0.1s
[CV 3/5] END C=0.5700000000000001, penalty=l1, solver=liblinear;, score=0.950 total time=   0.1s
[CV 4/5] END C=0.5700000000000001, penalty=l1, solver=liblinear;, score=0.981 total time=   0.1s
[CV 5/5] END C=0.5700000000000001, penalty=l1, solver=liblinear;, score=0.938 total time=   0.0s
[CV 1/5] END C=0.8200000000000001, penalty=none, solver=newton-cg;, score=0.975 total time=   0.1s




[CV 2/5] END C=0.8200000000000001, penalty=none, solver=newton-cg;, score=0.969 total time=   0.1s
[CV 3/5] END C=0.8200000000000001, penalty=none, solver=newton-cg;, score=0.963 total time=   0.1s
[CV 4/5] END C=0.8200000000000001, penalty=none, solver=newton-cg;, score=0.994 total time=   0.1s




[CV 5/5] END C=0.8200000000000001, penalty=none, solver=newton-cg;, score=0.956 total time=   0.1s
[CV 1/5] END ..C=0.32, penalty=l2, solver=lbfgs;, score=0.950 total time=   0.1s
[CV 2/5] END ..C=0.32, penalty=l2, solver=lbfgs;, score=0.975 total time=   0.1s
[CV 3/5] END ..C=0.32, penalty=l2, solver=lbfgs;, score=0.963 total time=   0.0s
[CV 4/5] END ..C=0.32, penalty=l2, solver=lbfgs;, score=0.994 total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END ..C=0.32, penalty=l2, solver=lbfgs;, score=0.950 total time=   0.1s
[CV 1/5] END C=0.36, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 2/5] END C=0.36, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 3/5] END C=0.36, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 4/5] END C=0.36, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 5/5] END C=0.36, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s




[CV 1/5] END ..C=0.39, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 2/5] END ..C=0.39, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 3/5] END ..C=0.39, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 4/5] END ..C=0.39, penalty=none, solver=sag;, score=0.994 total time=   0.4s


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 457, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

--

[CV 5/5] END ..C=0.39, penalty=none, solver=sag;, score=0.956 total time=   0.4s
[CV 1/5] END C=0.18, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.18, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END C=0.18, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END C=0.18, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END C=0.18, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/5] END C=0.87, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.87, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END C=0.87, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END C=0.87, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END C=0.87, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s



__________________________

### BERT + Neural Network

In [52]:
input_dim = X_train_embeddings_bert.shape[1]  # Dimensionality of word embeddings
hidden_dim = X_train_embeddings_bert.shape[1]*2  # Number of units in the hidden layer

classifier = NeuralNetworkClassifier(input_dim, hidden_dim)

# Fit the model
classifier.fit(X_train_embeddings_bert, y_train, num_epochs=10, lr=0.001)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_nn_bert_train, accuracy_nn_bert_train, f1_nn_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_nn_bert_test, accuracy_nn_bert_test, f1_nn_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Epoch: 100%|██████████| 10/10 [01:48<00:00, 10.83s/it]




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[419   7]
 [  0 374]] 

Accuracy: 99.1 

F1 Score: 99.1 


----- TEST -----

Confusion matrix
 [[99  8]
 [ 1 92]] 

Accuracy: 95.5 

F1 Score: 95.3 

_______________________________________________________________________





## GloVe embeddings

### GloVe + KNN

In [53]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_knn_glove_train, accuracy_knn_glove_train, f1_knn_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_knn_glove_test, accuracy_knn_glove_test, f1_knn_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.844 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.881 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.812 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.912 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.906 total time=   0.0s
[CV 1/5] END metric=minkowski, n_neighbors=8, weights=uniform;, score=0.869 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=8, weights=uniform;, score=0.906 total time=   0.0s
[CV 3/5] END metric=minkowski, n_neighbors=8, weights=uniform;, score=0.869 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=8, weights=uniform;, score=0.938 total time=   0.0s
[CV 5/5] END metric=minkowski, n_neighbors=8, weights=uniform;, score=0.906 total t

### GloVe + XGBoost

In [54]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_xgb_glove_train, accuracy_xgb_glove_train, f1_xgb_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_xgb_glove_test, accuracy_xgb_glove_test, f1_xgb_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, objective=reg:squarederror, subsample=0.7;, score=0.881 total time=   0.1s
[CV 2/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, objective=reg:squarederror, subsample=0.7;, score=0.875 total time=   0.1s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, objective=reg:squarederror, subsample=0.7;, score=0.900 total time=   0.1s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, objective=reg:squarederror, subsample=0.7;, score=0.938 total time=   0.1s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, objective=reg:squarederror, subsample=0.7;, score=0.863 total time=   0.1s
[CV 1/5] END colsample_bytree=0.5, lea

### GloVe + Random Forest

In [55]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25], 
    'max_features': [5, 10],
    'max_depth': [10, 50, None], 
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_rf_glove_train, accuracy_rf_glove_train, f1_rf_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_rf_glove_test, accuracy_rf_glove_test, f1_rf_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.906 total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.894 total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.875 total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.912 total time=   0.1s
[CV 5/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.906 total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.887 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.906 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.875 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=None, max_feat

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished


### GloVe + SVC

In [56]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_svc_glove_train, accuracy_svc_glove_train, f1_svc_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_svc_glove_test, accuracy_svc_glove_test, f1_svc_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.944 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.963 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.944 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.956 total time=   0.0s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.912 total time=   0.0s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.963 total time=   0.0s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.925 total time=   0.0s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.969 total time=   0.0s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.956 total time=   0.0s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.925 total time=   0.0s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### GloVe + Logistic Regression

In [57]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_lr_glove_train, accuracy_lr_glove_train, f1_lr_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_lr_glove_test, accuracy_lr_glove_test, f1_lr_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END C=0.32, penalty=l2, solver=newton-cg;, score=0.894 total time=   0.0s
[CV 2/5] END C=0.32, penalty=l2, solver=newton-cg;, score=0.938 total time=   0.0s
[CV 3/5] END C=0.32, penalty=l2, solver=newton-cg;, score=0.931 total time=   0.0s
[CV 4/5] END C=0.32, penalty=l2, solver=newton-cg;, score=0.969 total time=   0.0s
[CV 5/5] END C=0.32, penalty=l2, solver=newton-cg;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=0.21, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ......C=0.21, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END ......C=0.21, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END ......C=0.21, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END ......C=0.21, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.51, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.51, penalty=none, 



[CV 1/5] END ....C=0.5, penalty=l1, solver=saga;, score=0.875 total time=   0.3s




[CV 2/5] END ....C=0.5, penalty=l1, solver=saga;, score=0.875 total time=   0.3s




[CV 3/5] END ....C=0.5, penalty=l1, solver=saga;, score=0.931 total time=   0.2s




[CV 4/5] END ....C=0.5, penalty=l1, solver=saga;, score=0.944 total time=   0.3s




[CV 5/5] END ....C=0.5, penalty=l1, solver=saga;, score=0.919 total time=   0.2s
[CV 1/5] END C=0.4, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 2/5] END C=0.4, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 3/5] END C=0.4, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 4/5] END C=0.4, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 5/5] END C=0.4, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 1/5] END C=0.38, penalty=l1, solver=liblinear;, score=0.850 total time=   0.0s
[CV 2/5] END C=0.38, penalty=l1, solver=liblinear;, score=0.850 total time=   0.0s
[CV 3/5] END C=0.38, penalty=l1, solver=liblinear;, score=0.919 total time=   0.0s
[CV 4/5] END C=0.38, penalty=l1, solver=liblinear;, score=0.931 total time=   0.0s
[CV 5/5] END C=0.38, penalty=l1, solver=liblinear;, score=0.906 total time=   0.0s
[CV 1/5] END ...C=0.12, penalty=l2, solver=saga;, score=0.869 total time=   0.1s
[CV 2/5]



[CV 2/5] END ..C=0.51, penalty=none, solver=sag;, score=0.963 total time=   0.1s
[CV 3/5] END ..C=0.51, penalty=none, solver=sag;, score=0.950 total time=   0.1s




[CV 4/5] END ..C=0.51, penalty=none, solver=sag;, score=0.994 total time=   0.1s
[CV 5/5] END ..C=0.51, penalty=none, solver=sag;, score=0.963 total time=   0.1s
[CV 1/5] END ....C=0.89, penalty=l2, solver=sag;, score=0.906 total time=   0.1s
[CV 2/5] END ....C=0.89, penalty=l2, solver=sag;, score=0.950 total time=   0.1s
[CV 3/5] END ....C=0.89, penalty=l2, solver=sag;, score=0.950 total time=   0.1s
[CV 4/5] END ....C=0.89, penalty=l2, solver=sag;, score=0.981 total time=   0.1s
[CV 5/5] END ....C=0.89, penalty=l2, solver=sag;, score=0.956 total time=   0.1s
[CV 1/5] END C=0.56, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.56, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.56, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.56, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.56, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.6



[CV 1/5] END ...C=0.5, penalty=none, solver=sag;, score=0.944 total time=   0.1s
[CV 2/5] END ...C=0.5, penalty=none, solver=sag;, score=0.963 total time=   0.1s




[CV 3/5] END ...C=0.5, penalty=none, solver=sag;, score=0.950 total time=   0.2s
[CV 4/5] END ...C=0.5, penalty=none, solver=sag;, score=0.994 total time=   0.2s




[CV 5/5] END ...C=0.5, penalty=none, solver=sag;, score=0.956 total time=   0.1s
[CV 1/5] END ..C=0.85, penalty=l2, solver=lbfgs;, score=0.906 total time=   0.0s
[CV 2/5] END ..C=0.85, penalty=l2, solver=lbfgs;, score=0.950 total time=   0.0s
[CV 3/5] END ..C=0.85, penalty=l2, solver=lbfgs;, score=0.950 total time=   0.0s
[CV 4/5] END ..C=0.85, penalty=l2, solver=lbfgs;, score=0.981 total time=   0.0s
[CV 5/5] END ..C=0.85, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 1/5] END ..C=0.59, penalty=l2, solver=lbfgs;, score=0.900 total time=   0.0s
[CV 2/5] END ..C=0.59, penalty=l2, solver=lbfgs;, score=0.944 total time=   0.0s
[CV 3/5] END ..C=0.59, penalty=l2, solver=lbfgs;, score=0.938 total time=   0.0s
[CV 4/5] END ..C=0.59, penalty=l2, solver=lbfgs;, score=0.975 total time=   0.0s
[CV 5/5] END ..C=0.59, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 1/5] END ......C=0.86, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ......C=0.86, p

35 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

-----




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[424   2]
 [  0 374]] 

Accuracy: 99.8 

F1 Score: 99.7 


----- TEST -----

Confusion matrix
 [[105   2]
 [  1  92]] 

Accuracy: 98.5 

F1 Score: 98.4 

_______________________________________________________________________




### GloVe + Neural Network

In [58]:
input_dim = X_train_embeddings_glove.shape[1]  # Dimensionality of word embeddings
hidden_dim = X_train_embeddings_glove.shape[1]*2  # Number of units in the hidden layer

classifier = NeuralNetworkClassifier(input_dim, hidden_dim)

# Fit the model
classifier.fit(X_train_embeddings_glove, y_train, num_epochs=10, lr=0.001)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_nn_glove_train, accuracy_nn_glove_train, f1_nn_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_nn_glove_test, accuracy_nn_glove_test, f1_nn_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Epoch: 100%|██████████| 10/10 [00:14<00:00,  1.43s/it]




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[426   0]
 [ 35 339]] 

Accuracy: 95.6 

F1 Score: 95.1 


----- TEST -----

Confusion matrix
 [[105   2]
 [ 13  80]] 

Accuracy: 92.5 

F1 Score: 91.4 

_______________________________________________________________________





## Word2Vec

### Word2Vec + KNN

In [59]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings_word2vec, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_word2vec, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_knn_w2v_train, accuracy_knn_w2v_train, f1_knn_w2v_train = classifier.evaluate(X_train_embeddings_word2vec, y_train)

print('\n----- TEST -----')
cm_knn_w2v_test, accuracy_knn_w2v_test, f1_knn_w2v_test = classifier.evaluate(X_test_embeddings_word2vec, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=euclidean, n_neighbors=7, weights=uniform;, score=0.850 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=7, weights=uniform;, score=0.887 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=7, weights=uniform;, score=0.894 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=7, weights=uniform;, score=0.919 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=7, weights=uniform;, score=0.919 total time=   0.0s
[CV 1/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.887 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.894 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.894 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.938 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.912 to

### Word2Vec + XGBoost

In [60]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings_word2vec, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_word2vec, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_xgb_w2v_train, accuracy_xgb_w2v_train, f1_xgb_w2v_train = classifier.evaluate(X_train_embeddings_word2vec, y_train)

print('\n----- TEST -----')
cm_xgb_w2v_test, accuracy_xgb_w2v_test, f1_xgb_w2v_test = classifier.evaluate(X_test_embeddings_word2vec, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=7, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.844 total time=   0.6s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=7, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.875 total time=   0.4s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=7, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.825 total time=   0.4s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=7, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.881 total time=   0.5s
[CV 5/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=7, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.894 total time=   0.4s
[CV 1/5] END colsample_bytree=0.7, lea

### Word2Vec + Random Forest

In [61]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25],
    'max_features': [5, 10],
    'max_depth': [10, 50, None],
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings_word2vec, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_word2vec, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_rf_w2v_train, accuracy_rf_w2v_train, f1_rf_w2v_train = classifier.evaluate(X_train_embeddings_word2vec, y_train)

print('\n----- TEST -----')
cm_rf_w2v_test, accuracy_rf_w2v_test, f1_rf_w2v_test = classifier.evaluate(X_test_embeddings_word2vec, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=True, max_depth=50, max_features=5, n_estimators=10;, score=0.838 total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=50, max_features=5, n_estimators=10;, score=0.931 total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=50, max_features=5, n_estimators=10;, score=0.869 total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=50, max_features=5, n_estimators=10;, score=0.850 total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=50, max_features=5, n_estimators=10;, score=0.887 total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=10, max_features=5, n_estimators=25;, score=0.900 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=5, n_estimators=25;, score=0.919 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=5, n_estimators=25;, score=0.887 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=5, n_estimators=25

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished


### Word2Vec + SVC

In [62]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings_word2vec, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_word2vec, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_svc_w2v_train, accuracy_svc_w2v_train, f1_svc_w2v_train = classifier.evaluate(X_train_embeddings_word2vec, y_train)

print('\n----- TEST -----')
cm_svc_w2v_test, accuracy_svc_w2v_test, f1_svc_w2v_test = classifier.evaluate(X_test_embeddings_word2vec, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.925 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.956 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.950 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.981 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.956 total time=   0.0s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.950 total time=   0.0s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.944 total time=   0.0s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.969 total time=   0.0s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.994 total time=   0.0s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.956 total time=   0.0s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.944 total time=   0.0s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### Word2Vec + Logistic Regression

In [63]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings_word2vec, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_word2vec, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_lr_w2v_train, accuracy_lr_w2v_train, f1_lr_w2v_train = classifier.evaluate(X_train_embeddings_word2vec, y_train)

print('\n----- TEST -----')
cm_lr_w2v_test, accuracy_lr_w2v_test, f1_lr_w2v_test = classifier.evaluate(X_test_embeddings_word2vec, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ....C=0.29, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ....C=0.29, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ....C=0.29, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ....C=0.29, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ....C=0.29, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END C=0.26, penalty=none, solver=lbfgs;, score=0.944 total time=   0.0s
[CV 2/5] END C=0.26, penalty=none, solver=lbfgs;, score=0.975 total time=   0.0s
[CV 3/5] END C=0.26, penalty=none, solver=lbfgs;, score=0.963 total time=   0.0s
[CV 4/5] END C=0.26, penalty=none, solver=lbfgs;, score=0.988 total time=   0.0s
[CV 5/5] END C=0.26, penalty=none, solver=lbfgs;, score=0.969 total time=   0.0s
[CV 1/5] END C=0.08, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.08, penalty=elasticnet



[CV 3/5] END C=0.41000000000000003, penalty=l2, solver=newton-cg;, score=0.931 total time=   0.0s
[CV 4/5] END C=0.41000000000000003, penalty=l2, solver=newton-cg;, score=0.944 total time=   0.0s
[CV 5/5] END C=0.41000000000000003, penalty=l2, solver=newton-cg;, score=0.944 total time=   0.0s
[CV 1/5] END C=0.6900000000000001, penalty=l2, solver=liblinear;, score=0.912 total time=   0.0s
[CV 2/5] END C=0.6900000000000001, penalty=l2, solver=liblinear;, score=0.944 total time=   0.0s
[CV 3/5] END C=0.6900000000000001, penalty=l2, solver=liblinear;, score=0.931 total time=   0.0s
[CV 4/5] END C=0.6900000000000001, penalty=l2, solver=liblinear;, score=0.950 total time=   0.0s
[CV 5/5] END C=0.6900000000000001, penalty=l2, solver=liblinear;, score=0.938 total time=   0.0s
[CV 1/5] END C=0.59, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END C=0.59, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END C=0.59, penalty=elasticnet, solver=lbf



[CV 5/5] END C=0.89, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ..C=0.97, penalty=none, solver=sag;, score=0.963 total time=   0.1s




[CV 2/5] END ..C=0.97, penalty=none, solver=sag;, score=0.969 total time=   0.1s
[CV 3/5] END ..C=0.97, penalty=none, solver=sag;, score=0.963 total time=   0.1s




[CV 4/5] END ..C=0.97, penalty=none, solver=sag;, score=1.000 total time=   0.1s
[CV 5/5] END ..C=0.97, penalty=none, solver=sag;, score=0.963 total time=   0.1s
[CV 1/5] END C=0.9500000000000001, penalty=l2, solver=sag;, score=0.912 total time=   0.0s




[CV 2/5] END C=0.9500000000000001, penalty=l2, solver=sag;, score=0.944 total time=   0.0s
[CV 3/5] END C=0.9500000000000001, penalty=l2, solver=sag;, score=0.931 total time=   0.0s
[CV 4/5] END C=0.9500000000000001, penalty=l2, solver=sag;, score=0.956 total time=   0.0s
[CV 5/5] END C=0.9500000000000001, penalty=l2, solver=sag;, score=0.950 total time=   0.0s
[CV 1/5] END C=0.8300000000000001, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.8300000000000001, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END C=0.8300000000000001, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END C=0.8300000000000001, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END C=0.8300000000000001, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 1/5] END C=0.48, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.48, penalty=elasticnet, solver=liblinear;, sc



[CV 1/5] END ....C=0.2, penalty=l1, solver=saga;, score=0.825 total time=   0.2s




[CV 2/5] END ....C=0.2, penalty=l1, solver=saga;, score=0.881 total time=   0.3s




[CV 3/5] END ....C=0.2, penalty=l1, solver=saga;, score=0.838 total time=   0.2s
[CV 4/5] END ....C=0.2, penalty=l1, solver=saga;, score=0.887 total time=   0.2s


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

----

[CV 5/5] END ....C=0.2, penalty=l1, solver=saga;, score=0.856 total time=   0.2s
[CV 1/5] END ......C=0.34, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ......C=0.34, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END ......C=0.34, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END ......C=0.34, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END ......C=0.34, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.81, penalty=none, solver=newton-cg;, score=0.956 total time=   0.0s
[CV 2/5] END C=0.81, penalty=none, solver=newton-cg;, score=0.969 total time=   0.0s
[CV 3/5] END C=0.81, penalty=none, solver=newton-cg;, score=0.944 total time=   0.0s
[CV 4/5] END C=0.81, penalty=none, solver=newton-cg;, score=1.000 total time=   0.0s
[CV 5/5] END C=0.81, penalty=none, solver=newton-cg;, score=0.969 total time=   0.0s



_______________________________________________________________________
EVALUATION

--



### Word2Vec + Neural Network

In [64]:
input_dim = X_train_embeddings_word2vec.shape[1]  # Dimensionality of word embeddings
hidden_dim = X_train_embeddings_word2vec.shape[1]*2  # Number of units in the hidden layer

classifier = NeuralNetworkClassifier(input_dim, hidden_dim)

# Fit the model
classifier.fit(X_train_embeddings_word2vec, y_train, num_epochs=10, lr=0.001)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_nn_w2v_train, accuracy_nn_w2v_train, f1_nn_w2v_train = classifier.evaluate(X_train_embeddings_word2vec, y_train)

print('\n----- TEST -----')
cm_nn_w2v_test, accuracy_nn_w2v_test, f1_nn_w2v_test = classifier.evaluate(X_test_embeddings_word2vec, y_test)
print('_______________________________________________________________________')

Epoch: 100%|██████████| 10/10 [00:15<00:00,  1.59s/it]




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[426   0]
 [  2 372]] 

Accuracy: 99.8 

F1 Score: 99.7 


----- TEST -----

Confusion matrix
 [[106   1]
 [  4  89]] 

Accuracy: 97.5 

F1 Score: 97.3 

_______________________________________________________________________





## GPT2

### GPT2 + KNN

In [65]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings_gpt2, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_gpt2, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_knn_gpt2_train, accuracy_knn_gpt2_train, f1_knn_gpt2_train = classifier.evaluate(X_train_embeddings_gpt2, y_train)

print('\n----- TEST -----')
cm_knn_gpt2_test, accuracy_knn_gpt2_test, f1_knn_gpt2_test = classifier.evaluate(X_test_embeddings_gpt2, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=euclidean, n_neighbors=8, weights=uniform;, score=0.869 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=8, weights=uniform;, score=0.931 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=8, weights=uniform;, score=0.944 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=8, weights=uniform;, score=0.975 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=8, weights=uniform;, score=0.944 total time=   0.0s
[CV 1/5] END metric=manhattan, n_neighbors=8, weights=distance;, score=0.881 total time=   0.0s
[CV 2/5] END metric=manhattan, n_neighbors=8, weights=distance;, score=0.925 total time=   0.0s
[CV 3/5] END metric=manhattan, n_neighbors=8, weights=distance;, score=0.950 total time=   0.0s
[CV 4/5] END metric=manhattan, n_neighbors=8, weights=distance;, score=0.969 total time=   0.0s
[CV 5/5] END metric=manhattan, n_neighbors=8, weights=distance;, score=0.944 to

### GPT2 + XGBoost

In [66]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings_gpt2, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_gpt2, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_xgb_gpt2_train, accuracy_xgb_gpt2_train, f1_xgb_gpt2_train = classifier.evaluate(X_train_embeddings_gpt2, y_train)

print('\n----- TEST -----')
cm_xgb_gpt2_test, accuracy_xgb_gpt2_test, f1_xgb_gpt2_test = classifier.evaluate(X_test_embeddings_gpt2, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.7;, score=0.863 total time=   2.7s
[CV 2/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.7;, score=0.900 total time=   2.8s
[CV 3/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.7;, score=0.881 total time=   0.7s
[CV 4/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.7;, score=0.931 total time=   0.7s
[CV 5/5] END colsample_bytree=0.7, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200, objective=reg:squarederror, subsample=0.7;, score=0.906 total time=   0.7s
[CV 1/5] END colsample_bytree=0.7, lea

### GPT2 + Random Forest

In [67]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25],
    'max_features': [5, 10],
    'max_depth': [10, 50, None],
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings_gpt2, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_gpt2, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_rf_gpt2_train, accuracy_rf_gpt2_train, f1_rf_gpt2_train = classifier.evaluate(X_train_embeddings_gpt2, y_train)

print('\n----- TEST -----')
cm_rf_gpt2_test, accuracy_rf_gpt2_test, f1_rf_gpt2_test = classifier.evaluate(X_test_embeddings_gpt2, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.912 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.931 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.950 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.956 total time=   0.1s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=5, n_estimators=25;, score=0.956 total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=10;, score=0.887 total time=   0.0s
[CV 2/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=10;, score=0.906 total time=   0.0s
[CV 3/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=10;, score=0.919 total time=   0.0s
[CV 4/5] END bootstrap=False, max_depth=None, max_featu

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished


### GPT2 + SVC

In [68]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings_gpt2, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_gpt2, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_svc_gpt2_train, accuracy_svc_gpt2_train, f1_svc_gpt2_train = classifier.evaluate(X_train_embeddings_gpt2, y_train)

print('\n----- TEST -----')
cm_svc_gpt2_test, accuracy_svc_gpt2_test, f1_svc_gpt2_test = classifier.evaluate(X_test_embeddings_gpt2, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.944 total time=   0.1s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.994 total time=   0.1s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.981 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.994 total time=   0.1s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.956 total time=   0.1s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.944 total time=   0.1s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.994 total time=   0.1s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.981 total time=   0.1s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.994 total time=   0.1s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.956 total time=   0.1s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.931 total time=   0.1s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### GPT2 + Logistic Regression

In [69]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings_gpt2, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_gpt2, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_lr_gpt2_train, accuracy_lr_gpt2_train, f1_lr_gpt2_train = classifier.evaluate(X_train_embeddings_gpt2, y_train)

print('\n----- TEST -----')
cm_lr_gpt2_test, accuracy_lr_gpt2_test, f1_lr_gpt2_test = classifier.evaluate(X_test_embeddings_gpt2, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END C=0.75, penalty=none, solver=newton-cg;, score=0.969 total time=   0.1s
[CV 2/5] END C=0.75, penalty=none, solver=newton-cg;, score=1.000 total time=   0.1s
[CV 3/5] END C=0.75, penalty=none, solver=newton-cg;, score=0.975 total time=   0.1s




[CV 4/5] END C=0.75, penalty=none, solver=newton-cg;, score=0.988 total time=   0.1s
[CV 5/5] END C=0.75, penalty=none, solver=newton-cg;, score=0.981 total time=   0.1s
[CV 1/5] END ....C=0.08, penalty=l2, solver=sag;, score=0.869 total time=   0.1s
[CV 2/5] END ....C=0.08, penalty=l2, solver=sag;, score=0.881 total time=   0.1s




[CV 3/5] END ....C=0.08, penalty=l2, solver=sag;, score=0.887 total time=   0.0s
[CV 4/5] END ....C=0.08, penalty=l2, solver=sag;, score=0.881 total time=   0.1s
[CV 5/5] END ....C=0.08, penalty=l2, solver=sag;, score=0.875 total time=   0.1s
[CV 1/5] END C=0.35000000000000003, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.35000000000000003, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.35000000000000003, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.35000000000000003, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.35000000000000003, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.01, penalty=l2, solver=newton-cg;, score=0.531 total time=   0.0s
[CV 2/5] END C=0.01, penalty=l2, solver=newton-cg;, score=0.531 total time=   0.0s
[CV 3/5] END C=0.01, penalty=l2, solver=newton-cg;, score=0.531 total time=   0.0s
[CV 4/5] END C=0.01, penalty=l2, solver=newton-cg;, score=



[CV 1/5] END ..C=0.53, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 2/5] END ..C=0.53, penalty=none, solver=sag;, score=1.000 total time=   0.4s




[CV 3/5] END ..C=0.53, penalty=none, solver=sag;, score=0.975 total time=   0.4s




[CV 4/5] END ..C=0.53, penalty=none, solver=sag;, score=0.994 total time=   0.4s




[CV 5/5] END ..C=0.53, penalty=none, solver=sag;, score=0.975 total time=   0.4s
[CV 1/5] END C=0.09, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.09, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END C=0.09, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END C=0.09, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END C=0.09, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s




[CV 1/5] END .C=0.52, penalty=none, solver=saga;, score=0.975 total time=   0.5s




[CV 2/5] END .C=0.52, penalty=none, solver=saga;, score=1.000 total time=   0.5s




[CV 3/5] END .C=0.52, penalty=none, solver=saga;, score=0.981 total time=   0.5s




[CV 4/5] END .C=0.52, penalty=none, solver=saga;, score=0.994 total time=   0.5s




[CV 5/5] END .C=0.52, penalty=none, solver=saga;, score=0.975 total time=   0.5s




[CV 1/5] END .C=0.71, penalty=none, solver=saga;, score=0.975 total time=   0.5s




[CV 2/5] END .C=0.71, penalty=none, solver=saga;, score=1.000 total time=   0.5s




[CV 3/5] END .C=0.71, penalty=none, solver=saga;, score=0.981 total time=   0.5s




[CV 4/5] END .C=0.71, penalty=none, solver=saga;, score=0.994 total time=   0.5s




[CV 5/5] END .C=0.71, penalty=none, solver=saga;, score=0.975 total time=   0.5s
[CV 1/5] END C=0.61, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.61, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.61, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.61, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.61, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END ...C=0.14, penalty=l1, solver=saga;, score=0.575 total time=   0.4s
[CV 2/5] END ...C=0.14, penalty=l1, solver=saga;, score=0.656 total time=   0.3s
[CV 3/5] END ...C=0.14, penalty=l1, solver=saga;, score=0.537 total time=   0.4s
[CV 4/5] END ...C=0.14, penalty=l1, solver=saga;, score=0.550 total time=   0.4s
[CV 5/5] END ...C=0.14, penalty=l1, solver=saga;, score=0.594 total time=   0.4s
[CV 1/5] END C=0.46, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.4



[CV 1/5] END ...C=0.33, penalty=l1, solver=saga;, score=0.756 total time=   0.6s




[CV 2/5] END ...C=0.33, penalty=l1, solver=saga;, score=0.731 total time=   0.6s




[CV 3/5] END ...C=0.33, penalty=l1, solver=saga;, score=0.844 total time=   0.7s




[CV 4/5] END ...C=0.33, penalty=l1, solver=saga;, score=0.787 total time=   0.7s




[CV 5/5] END ...C=0.33, penalty=l1, solver=saga;, score=0.744 total time=   0.7s
[CV 1/5] END C=0.16, penalty=none, solver=newton-cg;, score=0.969 total time=   0.1s
[CV 2/5] END C=0.16, penalty=none, solver=newton-cg;, score=1.000 total time=   0.1s
[CV 3/5] END C=0.16, penalty=none, solver=newton-cg;, score=0.975 total time=   0.1s
[CV 4/5] END C=0.16, penalty=none, solver=newton-cg;, score=0.988 total time=   0.1s




[CV 5/5] END C=0.16, penalty=none, solver=newton-cg;, score=0.981 total time=   0.1s
[CV 1/5] END C=0.73, penalty=none, solver=lbfgs;, score=0.969 total time=   0.0s
[CV 2/5] END C=0.73, penalty=none, solver=lbfgs;, score=1.000 total time=   0.0s
[CV 3/5] END C=0.73, penalty=none, solver=lbfgs;, score=0.975 total time=   0.0s
[CV 4/5] END C=0.73, penalty=none, solver=lbfgs;, score=0.988 total time=   0.0s
[CV 5/5] END C=0.73, penalty=none, solver=lbfgs;, score=0.969 total time=   0.0s
[CV 1/5] END ......C=0.71, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ......C=0.71, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END ......C=0.71, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END ......C=0.71, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END ......C=0.71, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.66, penalty=l1, solver=liblinear;, score=0.856 total time=   0.1s




[CV 2/5] END C=0.66, penalty=l1, solver=liblinear;, score=0.812 total time=   0.1s
[CV 3/5] END C=0.66, penalty=l1, solver=liblinear;, score=0.919 total time=   0.1s
[CV 4/5] END C=0.66, penalty=l1, solver=liblinear;, score=0.919 total time=   0.0s
[CV 5/5] END C=0.66, penalty=l1, solver=liblinear;, score=0.925 total time=   0.1s
[CV 1/5] END ...C=0.04, penalty=l2, solver=saga;, score=0.662 total time=   0.1s
[CV 2/5] END ...C=0.04, penalty=l2, solver=saga;, score=0.725 total time=   0.1s
[CV 3/5] END ...C=0.04, penalty=l2, solver=saga;, score=0.662 total time=   0.1s
[CV 4/5] END ...C=0.04, penalty=l2, solver=saga;, score=0.637 total time=   0.1s
[CV 5/5] END ...C=0.04, penalty=l2, solver=saga;, score=0.656 total time=   0.1s


35 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

-----




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[422   4]
 [  2 372]] 

Accuracy: 99.2 

F1 Score: 99.2 


----- TEST -----

Confusion matrix
 [[106   1]
 [  1  92]] 

Accuracy: 99.0 

F1 Score: 98.9 

_______________________________________________________________________




### GPT2 + Neural Network

In [70]:
input_dim = X_train_embeddings_gpt2.shape[1]  # Dimensionality of word embeddings
hidden_dim = X_train_embeddings_gpt2.shape[1]*2  # Number of units in the hidden layer

classifier = NeuralNetworkClassifier(input_dim, hidden_dim)

# Fit the model
classifier.fit(X_train_embeddings_gpt2, y_train, num_epochs=10, lr=0.001)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_nn_gpt2_train, accuracy_nn_gpt2_train, f1_nn_gpt2_train = classifier.evaluate(X_train_embeddings_gpt2, y_train)

print('\n----- TEST -----')
cm_nn_gpt2_test, accuracy_nn_gpt2_test, f1_nn_gpt2_test = classifier.evaluate(X_test_embeddings_gpt2, y_test)
print('_______________________________________________________________________')

Epoch: 100%|██████████| 10/10 [01:49<00:00, 10.94s/it]




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[426   0]
 [  6 368]] 

Accuracy: 99.2 

F1 Score: 99.2 


----- TEST -----

Confusion matrix
 [[107   0]
 [  3  90]] 

Accuracy: 98.5 

F1 Score: 98.4 

_______________________________________________________________________





## RoBERTa

### RoBERTa + KNN

In [71]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings_roberta, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_roberta, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_knn_roberta_train, accuracy_knn_roberta_train, f1_knn_roberta_train = classifier.evaluate(X_train_embeddings_roberta, y_train)

print('\n----- TEST -----')
cm_knn_roberta_test, accuracy_knn_roberta_test, f1_knn_roberta_test = classifier.evaluate(X_test_embeddings_roberta, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=manhattan, n_neighbors=3, weights=uniform;, score=0.963 total time=   0.0s
[CV 2/5] END metric=manhattan, n_neighbors=3, weights=uniform;, score=0.994 total time=   0.0s
[CV 3/5] END metric=manhattan, n_neighbors=3, weights=uniform;, score=0.969 total time=   0.0s
[CV 4/5] END metric=manhattan, n_neighbors=3, weights=uniform;, score=0.994 total time=   0.0s
[CV 5/5] END metric=manhattan, n_neighbors=3, weights=uniform;, score=0.981 total time=   0.0s
[CV 1/5] END metric=euclidean, n_neighbors=8, weights=distance;, score=0.969 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=8, weights=distance;, score=0.988 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=8, weights=distance;, score=0.981 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=8, weights=distance;, score=1.000 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=8, weights=distance;, score=1.000 to

### RoBERTa + XGBoost

In [72]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings_roberta, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_roberta, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_xgb_roberta_train, accuracy_xgb_roberta_train, f1_xgb_roberta_train = classifier.evaluate(X_train_embeddings_roberta, y_train)

print('\n----- TEST -----')
cm_xgb_roberta_test, accuracy_xgb_roberta_test, f1_xgb_roberta_test = classifier.evaluate(X_test_embeddings_roberta, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.938 total time=   0.7s
[CV 2/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.963 total time=   0.7s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.988 total time=   0.7s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.981 total time=   0.7s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.969 total time=   0.7s
[CV 1/5] END colsample_bytree=0.5, lea

### RoBERTa + Random Forest

In [73]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25],
    'max_features': [5, 10],
    'max_depth': [10, 50, None],
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings_roberta, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_roberta, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_rf_roberta_train, accuracy_rf_roberta_train, f1_rf_roberta_train = classifier.evaluate(X_train_embeddings_roberta, y_train)

print('\n----- TEST -----')
cm_rf_roberta_test, accuracy_rf_roberta_test, f1_rf_roberta_test = classifier.evaluate(X_test_embeddings_roberta, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=10;, score=0.969 total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=10;, score=0.938 total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=10;, score=0.944 total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=10;, score=0.988 total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=10;, score=0.950 total time=   0.0s
[CV 1/5] END bootstrap=False, max_depth=50, max_features=10, n_estimators=25;, score=1.000 total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=50, max_features=10, n_estimators=25;, score=1.000 total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=50, max_features=10, n_estimators=25;, score=0.969 total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=50, max_features=10, n_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished


### RoBERTa + SVC

In [74]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings_roberta, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_roberta, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_svc_roberta_train, accuracy_svc_roberta_train, f1_svc_roberta_train = classifier.evaluate(X_train_embeddings_roberta, y_train)

print('\n----- TEST -----')
cm_svc_roberta_test, accuracy_svc_roberta_test, f1_svc_roberta_test = classifier.evaluate(X_test_embeddings_roberta, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=1.000 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=1.000 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.988 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=1.000 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=1.000 total time=   0.0s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.956 total time=   0.1s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.994 total time=   0.1s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.988 total time=   0.1s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.994 total time=   0.1s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.981 total time=   0.1s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.925 total time=   0.1s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### RoBERTa + Logistic Regression

In [75]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings_roberta, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_roberta, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_lr_roberta_train, accuracy_lr_roberta_train, f1_lr_roberta_train = classifier.evaluate(X_train_embeddings_roberta, y_train)

print('\n----- TEST -----')
cm_lr_roberta_test, accuracy_lr_roberta_test, f1_lr_roberta_test = classifier.evaluate(X_test_embeddings_roberta, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV 1/5] END ..C=0.28, penalty=none, solver=sag;, score=1.000 total time=   0.4s




[CV 2/5] END ..C=0.28, penalty=none, solver=sag;, score=1.000 total time=   0.3s




[CV 3/5] END ..C=0.28, penalty=none, solver=sag;, score=0.988 total time=   0.4s




[CV 4/5] END ..C=0.28, penalty=none, solver=sag;, score=1.000 total time=   0.3s




[CV 5/5] END ..C=0.28, penalty=none, solver=sag;, score=0.994 total time=   0.4s
[CV 1/5] END C=0.47000000000000003, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.47000000000000003, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END C=0.47000000000000003, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END C=0.47000000000000003, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END C=0.47000000000000003, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/5] END C=0.9500000000000001, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.9500000000000001, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.9500000000000001, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.9500000000000001, penalty=elasticnet, solver=newton-cg;, score=nan total 



[CV 1/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.994 total time=   0.4s




[CV 2/5] END .....C=0.6, penalty=l2, solver=sag;, score=1.000 total time=   0.4s




[CV 3/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.988 total time=   0.3s




[CV 4/5] END .....C=0.6, penalty=l2, solver=sag;, score=1.000 total time=   0.3s




[CV 5/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.994 total time=   0.3s
[CV 1/5] END C=0.59, penalty=l2, solver=liblinear;, score=0.994 total time=   0.0s
[CV 2/5] END C=0.59, penalty=l2, solver=liblinear;, score=1.000 total time=   0.0s
[CV 3/5] END C=0.59, penalty=l2, solver=liblinear;, score=0.988 total time=   0.0s
[CV 4/5] END C=0.59, penalty=l2, solver=liblinear;, score=1.000 total time=   0.0s
[CV 5/5] END C=0.59, penalty=l2, solver=liblinear;, score=0.994 total time=   0.0s
[CV 1/5] END C=0.59, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 2/5] END C=0.59, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 3/5] END C=0.59, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 4/5] END C=0.59, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 5/5] END C=0.59, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 1/5] END C=0.24, penalty=none, solver=lbfgs;, score=0.994 total time=   0.0s
[CV



[CV 2/5] END C=0.38, penalty=l2, solver=liblinear;, score=0.994 total time=   0.0s
[CV 3/5] END C=0.38, penalty=l2, solver=liblinear;, score=0.988 total time=   0.0s
[CV 4/5] END C=0.38, penalty=l2, solver=liblinear;, score=1.000 total time=   0.0s
[CV 5/5] END C=0.38, penalty=l2, solver=liblinear;, score=0.988 total time=   0.0s




[CV 1/5] END C=0.47000000000000003, penalty=l2, solver=saga;, score=0.994 total time=   0.4s




[CV 2/5] END C=0.47000000000000003, penalty=l2, solver=saga;, score=1.000 total time=   0.4s




[CV 3/5] END C=0.47000000000000003, penalty=l2, solver=saga;, score=0.988 total time=   0.4s




[CV 4/5] END C=0.47000000000000003, penalty=l2, solver=saga;, score=1.000 total time=   0.4s




[CV 5/5] END C=0.47000000000000003, penalty=l2, solver=saga;, score=0.988 total time=   0.4s
[CV 1/5] END C=0.39, penalty=l2, solver=newton-cg;, score=0.988 total time=   0.1s
[CV 2/5] END C=0.39, penalty=l2, solver=newton-cg;, score=1.000 total time=   0.0s
[CV 3/5] END C=0.39, penalty=l2, solver=newton-cg;, score=0.988 total time=   0.0s
[CV 4/5] END C=0.39, penalty=l2, solver=newton-cg;, score=1.000 total time=   0.0s
[CV 5/5] END C=0.39, penalty=l2, solver=newton-cg;, score=0.994 total time=   0.1s




[CV 1/5] END ...C=0.56, penalty=l2, solver=saga;, score=0.994 total time=   0.4s




[CV 2/5] END ...C=0.56, penalty=l2, solver=saga;, score=1.000 total time=   0.4s




[CV 3/5] END ...C=0.56, penalty=l2, solver=saga;, score=0.988 total time=   0.4s




[CV 4/5] END ...C=0.56, penalty=l2, solver=saga;, score=1.000 total time=   0.4s




[CV 5/5] END ...C=0.56, penalty=l2, solver=saga;, score=0.988 total time=   0.5s
[CV 1/5] END C=0.43, penalty=l2, solver=liblinear;, score=0.994 total time=   0.0s
[CV 2/5] END C=0.43, penalty=l2, solver=liblinear;, score=0.994 total time=   0.0s
[CV 3/5] END C=0.43, penalty=l2, solver=liblinear;, score=0.988 total time=   0.0s
[CV 4/5] END C=0.43, penalty=l2, solver=liblinear;, score=1.000 total time=   0.0s
[CV 5/5] END C=0.43, penalty=l2, solver=liblinear;, score=0.988 total time=   0.0s
[CV 1/5] END C=0.39, penalty=none, solver=lbfgs;, score=0.994 total time=   0.0s
[CV 2/5] END C=0.39, penalty=none, solver=lbfgs;, score=1.000 total time=   0.0s
[CV 3/5] END C=0.39, penalty=none, solver=lbfgs;, score=0.988 total time=   0.0s
[CV 4/5] END C=0.39, penalty=none, solver=lbfgs;, score=1.000 total time=   0.0s
[CV 5/5] END C=0.39, penalty=none, solver=lbfgs;, score=1.000 total time=   0.0s




[CV 1/5] END ..C=0.02, penalty=none, solver=sag;, score=1.000 total time=   0.4s




[CV 2/5] END ..C=0.02, penalty=none, solver=sag;, score=1.000 total time=   0.4s




[CV 3/5] END ..C=0.02, penalty=none, solver=sag;, score=0.988 total time=   0.4s




[CV 4/5] END ..C=0.02, penalty=none, solver=sag;, score=1.000 total time=   0.3s




[CV 5/5] END ..C=0.02, penalty=none, solver=sag;, score=0.994 total time=   0.4s
[CV 1/5] END C=0.81, penalty=l2, solver=liblinear;, score=0.994 total time=   0.0s
[CV 2/5] END C=0.81, penalty=l2, solver=liblinear;, score=1.000 total time=   0.0s
[CV 3/5] END C=0.81, penalty=l2, solver=liblinear;, score=0.988 total time=   0.0s
[CV 4/5] END C=0.81, penalty=l2, solver=liblinear;, score=1.000 total time=   0.0s
[CV 5/5] END C=0.81, penalty=l2, solver=liblinear;, score=0.994 total time=   0.0s
[CV 1/5] END C=0.27, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.27, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.27, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.27, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.27, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s




[CV 1/5] END .C=0.08, penalty=none, solver=saga;, score=0.994 total time=   0.4s




[CV 2/5] END .C=0.08, penalty=none, solver=saga;, score=1.000 total time=   0.4s




[CV 3/5] END .C=0.08, penalty=none, solver=saga;, score=0.988 total time=   0.4s




[CV 4/5] END .C=0.08, penalty=none, solver=saga;, score=1.000 total time=   0.4s




[CV 5/5] END .C=0.08, penalty=none, solver=saga;, score=0.994 total time=   0.4s
[CV 1/5] END C=0.8, penalty=none, solver=newton-cg;, score=1.000 total time=   0.1s
[CV 2/5] END C=0.8, penalty=none, solver=newton-cg;, score=1.000 total time=   0.1s
[CV 3/5] END C=0.8, penalty=none, solver=newton-cg;, score=0.988 total time=   0.1s
[CV 4/5] END C=0.8, penalty=none, solver=newton-cg;, score=1.000 total time=   0.1s
[CV 5/5] END C=0.8, penalty=none, solver=newton-cg;, score=1.000 total time=   0.1s


30 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 457, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

---




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[426   0]
 [  0 374]] 

Accuracy: 100.0 

F1 Score: 100.0 


----- TEST -----

Confusion matrix
 [[107   0]
 [  0  93]] 

Accuracy: 100.0 

F1 Score: 100.0 

_______________________________________________________________________


### RoBERTa + Neural Network

In [76]:
input_dim = X_train_embeddings_roberta.shape[1]  # Dimensionality of word embeddings
hidden_dim = X_train_embeddings_roberta.shape[1]*2  # Number of units in the hidden layer

classifier = NeuralNetworkClassifier(input_dim, hidden_dim)

# Fit the model
classifier.fit(X_train_embeddings_roberta, y_train, num_epochs=10, lr=0.001)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_nn_roberta_train, accuracy_nn_roberta_train, f1_nn_roberta_train = classifier.evaluate(X_train_embeddings_roberta, y_train)

print('\n----- TEST -----')
cm_nn_roberta_test, accuracy_nn_roberta_test, f1_nn_roberta_test = classifier.evaluate(X_test_embeddings_roberta, y_test)
print('_______________________________________________________________________')

Epoch: 100%|██████████| 10/10 [01:49<00:00, 10.92s/it]




_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[426   0]
 [  0 374]] 

Accuracy: 100.0 

F1 Score: 100.0 


----- TEST -----

Confusion matrix
 [[107   0]
 [  0  93]] 

Accuracy: 100.0 

F1 Score: 100.0 

_______________________________________________________________________





# Results

In [85]:
#create data
data = [["KNN+BERT", accuracy_knn_bert_test, f1_knn_bert_test],
        ["XGBoost+BERT", accuracy_xgb_bert_test, f1_xgb_bert_test],
        ["RandomForest+BERT", accuracy_rf_bert_test, f1_rf_bert_test],
        ["SVC+BERT", accuracy_svc_bert_test, f1_svc_bert_test],
        ["LR+BERT", accuracy_lr_bert_test, f1_lr_bert_test],
        ["NeuralNetwork+BERT", accuracy_nn_bert_test, f1_nn_bert_test],
        ["KNN+GloVe", accuracy_knn_glove_test, f1_knn_glove_test],
        ["XGBoost+GloVe", accuracy_xgb_glove_test, f1_xgb_glove_test],
        ["RandomForest+GloVe", accuracy_rf_glove_test, f1_rf_glove_test],
        ["SVC+GloVe", accuracy_svc_glove_test, f1_svc_glove_test],
        ["LR+GloVe", accuracy_lr_glove_test, f1_lr_glove_test],
        ["NeuralNetwork+GloVe", accuracy_nn_glove_test, f1_nn_glove_test],
        ["KNN+Word2Vec", accuracy_knn_w2v_test, f1_knn_w2v_test],
        ["XGBoost+Word2Vec", accuracy_xgb_w2v_test, f1_xgb_w2v_test],
        ["RandomForest + Word2Vec", accuracy_rf_w2v_test, f1_rf_w2v_test],
        ["SVC+Word2Vec", accuracy_svc_w2v_test, f1_svc_w2v_test],
        ["LR+Word2Vec", accuracy_lr_w2v_test, f1_lr_w2v_test],
        ["NeuralNetwork+Word2Vec", accuracy_nn_w2v_test, f1_nn_w2v_test],
        ["KNN+GPT2", accuracy_knn_gpt2_test, f1_knn_gpt2_test],
        ["XGBoost+GPT2", accuracy_xgb_gpt2_test, f1_xgb_gpt2_test],
        ["RandomForest+GPT2", accuracy_rf_gpt2_test, f1_rf_gpt2_test],
        ["SVC+GPT2", accuracy_svc_gpt2_test, f1_svc_gpt2_test],
        ["LR+GPT2", accuracy_lr_gpt2_test, f1_lr_gpt2_test],
        ["NeuralNetwork+GPT2", accuracy_nn_gpt2_test, f1_nn_gpt2_test],
        ["KNN+RoBERTa", accuracy_knn_roberta_test, f1_knn_roberta_test],
        ["XGBoost+RoBERTa", accuracy_xgb_roberta_test, f1_xgb_roberta_test],
        ["RandomForest+RoBERTa", accuracy_rf_roberta_test, f1_rf_roberta_test],
        ["SVC+RoBERTa", accuracy_svc_roberta_test, f1_svc_roberta_test],
        ["LR+RoBERTa", accuracy_lr_roberta_test, f1_lr_roberta_test],
        ["NeuralNetwork+RoBERTa", accuracy_nn_roberta_test, f1_nn_roberta_test]]
  
#define header names
col_names = ["Model", "Accuracy", "F1-Score"]

#save results to csv
if fast:
    with open("results_fast_"+time.strftime("%Y%m%d-%H%M%S")+".csv", mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(col_names)
        for row in data:
            writer.writerow(row)
else:
    with open("results_"+time.strftime("%Y%m%d-%H%M%S")+".csv", mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(col_names)
        for row in data:
            writer.writerow(row)

#display table
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))

╒═════════════════════════╤════════════╤════════════╕
│ Model                   │   Accuracy │   F1-Score │
╞═════════════════════════╪════════════╪════════════╡
│ KNN+BERT                │       95.5 │       95.2 │
├─────────────────────────┼────────────┼────────────┤
│ XGBoost+BERT            │       95.5 │       95.3 │
├─────────────────────────┼────────────┼────────────┤
│ RandomForest+BERT       │       94   │       93.8 │
├─────────────────────────┼────────────┼────────────┤
│ SVC+BERT                │       97.5 │       97.4 │
├─────────────────────────┼────────────┼────────────┤
│ LR+BERT                 │       96.5 │       96.2 │
├─────────────────────────┼────────────┼────────────┤
│ NeuralNetwork+BERT      │       95.5 │       95.3 │
├─────────────────────────┼────────────┼────────────┤
│ KNN+GloVe               │       92.5 │       92   │
├─────────────────────────┼────────────┼────────────┤
│ XGBoost+GloVe           │       96   │       95.7 │
├─────────────────────────┼─

In [86]:
#print(tabulate(data, headers=col_names, tablefmt="latex"))

\begin{tabular}{lrr}
\hline
 Model                   &   Accuracy &   F1-Score \\
\hline
 KNN+BERT                &       95.5 &       95.2 \\
 XGBoost+BERT            &       95.5 &       95.3 \\
 RandomForest+BERT       &       94   &       93.8 \\
 SVC+BERT                &       97.5 &       97.4 \\
 LR+BERT                 &       96.5 &       96.2 \\
 NeuralNetwork+BERT      &       95.5 &       95.3 \\
 KNN+GloVe               &       92.5 &       92   \\
 XGBoost+GloVe           &       96   &       95.7 \\
 RandomForest+GloVe      &       93.5 &       93.2 \\
 SVC+GloVe               &       98.5 &       98.4 \\
 LR+GloVe                &       98.5 &       98.4 \\
 NeuralNetwork+GloVe     &       92.5 &       91.4 \\
 KNN+Word2Vec            &       89   &       87.8 \\
 XGBoost+Word2Vec        &       96   &       95.6 \\
 RandomForest + Word2Vec &       94.5 &       94.1 \\
 SVC+Word2Vec            &       98.5 &       98.4 \\
 LR+Word2Vec             &       98   &       9

In [79]:
#create data
knn_results = [["BERT", accuracy_knn_bert_test, f1_knn_bert_test],
        ["GloVe", accuracy_knn_glove_test, f1_knn_glove_test],
        ["Word2Vec", accuracy_knn_w2v_test, f1_knn_w2v_test],
        ["GPT2", accuracy_knn_gpt2_test, f1_knn_gpt2_test],
        ["RoBERTa", accuracy_knn_roberta_test, f1_knn_roberta_test]]

#define header names
col_names = ["Model", "Accuracy", "F1-Score"]

#display table
print("KNN")
print(tabulate(knn_results, headers=col_names, tablefmt="fancy_grid"))

KNN
╒══════════╤════════════╤════════════╕
│ Model    │   Accuracy │   F1-Score │
╞══════════╪════════════╪════════════╡
│ BERT     │       95.5 │       95.2 │
├──────────┼────────────┼────────────┤
│ GloVe    │       92.5 │       92   │
├──────────┼────────────┼────────────┤
│ Word2Vec │       89   │       87.8 │
├──────────┼────────────┼────────────┤
│ GPT2     │       96.5 │       96.2 │
├──────────┼────────────┼────────────┤
│ RoBERTa  │       99   │       98.9 │
╘══════════╧════════════╧════════════╛


In [80]:
#create data
xgb_results = [["BERT", accuracy_xgb_bert_test, f1_xgb_bert_test],
        ["GloVe", accuracy_xgb_glove_test, f1_xgb_glove_test],
        ["Word2Vec", accuracy_xgb_w2v_test, f1_xgb_w2v_test],
        ["GPT2", accuracy_xgb_gpt2_test, f1_xgb_gpt2_test],
        ["RoBERTa", accuracy_xgb_roberta_test, f1_xgb_roberta_test]]

#define header names
col_names = ["Model", "Accuracy", "F1-Score"]

#display table
print("XGBoost")
print(tabulate(xgb_results, headers=col_names, tablefmt="fancy_grid"))

XGBoost
╒══════════╤════════════╤════════════╕
│ Model    │   Accuracy │   F1-Score │
╞══════════╪════════════╪════════════╡
│ BERT     │       95.5 │       95.3 │
├──────────┼────────────┼────────────┤
│ GloVe    │       96   │       95.7 │
├──────────┼────────────┼────────────┤
│ Word2Vec │       96   │       95.6 │
├──────────┼────────────┼────────────┤
│ GPT2     │       98   │       97.8 │
├──────────┼────────────┼────────────┤
│ RoBERTa  │      100   │      100   │
╘══════════╧════════════╧════════════╛


In [81]:
#create data
rf_results = [["BERT", accuracy_rf_bert_test, f1_rf_bert_test],
        ["GloVe", accuracy_rf_glove_test, f1_rf_glove_test],
        ["Word2Vec", accuracy_rf_w2v_test, f1_rf_w2v_test],
        ["GPT2", accuracy_rf_gpt2_test, f1_rf_gpt2_test],
        ["RoBERTa", accuracy_rf_roberta_test, f1_rf_roberta_test]]

#define header names
col_names = ["Model", "Accuracy", "F1-Score"]

#display table
print("Random Forest")
print(tabulate(rf_results, headers=col_names, tablefmt="fancy_grid"))

Random Forest
╒══════════╤════════════╤════════════╕
│ Model    │   Accuracy │   F1-Score │
╞══════════╪════════════╪════════════╡
│ BERT     │       94   │       93.8 │
├──────────┼────────────┼────────────┤
│ GloVe    │       93.5 │       93.2 │
├──────────┼────────────┼────────────┤
│ Word2Vec │       94.5 │       94.1 │
├──────────┼────────────┼────────────┤
│ GPT2     │       96   │       95.7 │
├──────────┼────────────┼────────────┤
│ RoBERTa  │       98   │       97.8 │
╘══════════╧════════════╧════════════╛


In [82]:
#create data
svc_results = [["BERT", accuracy_svc_bert_test, f1_svc_bert_test],
        ["GloVe", accuracy_svc_glove_test, f1_svc_glove_test],
        ["Word2Vec", accuracy_svc_w2v_test, f1_svc_w2v_test],
        ["GPT2", accuracy_svc_gpt2_test, f1_svc_gpt2_test],
        ["RoBERTa", accuracy_svc_roberta_test, f1_svc_roberta_test]]

#define header names
col_names = ["Model", "Accuracy", "F1-Score"]

#display table
print("SVM Classifier")
print(tabulate(svc_results, headers=col_names, tablefmt="fancy_grid"))

SVM Classifier
╒══════════╤════════════╤════════════╕
│ Model    │   Accuracy │   F1-Score │
╞══════════╪════════════╪════════════╡
│ BERT     │       97.5 │       97.4 │
├──────────┼────────────┼────────────┤
│ GloVe    │       98.5 │       98.4 │
├──────────┼────────────┼────────────┤
│ Word2Vec │       98.5 │       98.4 │
├──────────┼────────────┼────────────┤
│ GPT2     │       99.5 │       99.5 │
├──────────┼────────────┼────────────┤
│ RoBERTa  │      100   │      100   │
╘══════════╧════════════╧════════════╛


In [83]:
#create data
lr_results = [["BERT", accuracy_lr_bert_test, f1_lr_bert_test],
        ["GloVe", accuracy_lr_glove_test, f1_lr_glove_test],
        ["Word2Vec", accuracy_lr_w2v_test, f1_lr_w2v_test],
        ["GPT2", accuracy_lr_gpt2_test, f1_lr_gpt2_test],
        ["RoBERTa", accuracy_lr_roberta_test, f1_lr_roberta_test]]

#define header names
col_names = ["Model", "Accuracy", "F1-Score"]

#display table
print("Logistic Regression")
print(tabulate(lr_results, headers=col_names, tablefmt="fancy_grid"))

Logistic Regression
╒══════════╤════════════╤════════════╕
│ Model    │   Accuracy │   F1-Score │
╞══════════╪════════════╪════════════╡
│ BERT     │       96.5 │       96.2 │
├──────────┼────────────┼────────────┤
│ GloVe    │       98.5 │       98.4 │
├──────────┼────────────┼────────────┤
│ Word2Vec │       98   │       97.9 │
├──────────┼────────────┼────────────┤
│ GPT2     │       99   │       98.9 │
├──────────┼────────────┼────────────┤
│ RoBERTa  │      100   │      100   │
╘══════════╧════════════╧════════════╛


In [84]:
#create data
nn_results = [["BERT", accuracy_nn_bert_test, f1_nn_bert_test],
        ["GloVe", accuracy_nn_glove_test, f1_nn_glove_test],
        ["Word2Vec", accuracy_nn_w2v_test, f1_nn_w2v_test],
        ["GPT2", accuracy_nn_gpt2_test, f1_nn_gpt2_test],
        ["RoBERTa", accuracy_nn_roberta_test, f1_nn_roberta_test]]

#define header names
col_names = ["Model", "Accuracy", "F1-Score"]

#display table
print("NeuralNetwork")
print(tabulate(nn_results, headers=col_names, tablefmt="fancy_grid"))

NeuralNetwork
╒══════════╤════════════╤════════════╕
│ Model    │   Accuracy │   F1-Score │
╞══════════╪════════════╪════════════╡
│ BERT     │       95.5 │       95.3 │
├──────────┼────────────┼────────────┤
│ GloVe    │       92.5 │       91.4 │
├──────────┼────────────┼────────────┤
│ Word2Vec │       97.5 │       97.3 │
├──────────┼────────────┼────────────┤
│ GPT2     │       98.5 │       98.4 │
├──────────┼────────────┼────────────┤
│ RoBERTa  │      100   │      100   │
╘══════════╧════════════╧════════════╛


---