# BERT word embeddings + various classification algorithms

In [17]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

import time
from tqdm import tqdm
from bs4 import BeautifulSoup
import re, string
import nltk
from nltk.corpus import stopwords
import pickle
from tabulate import tabulate

import xgboost as xgb

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
fake = pd.read_csv('../../data/Fake.csv')
true = pd.read_csv('../../data/True.csv')

fake["label"] = 0
true["label"] = 1

df = pd.concat([fake, true], ignore_index = True)

df['text'] = df['title'] + " " + df['text']
df.drop(columns=['title', 'date', 'subject'], inplace = True)

In [3]:
%%time
nltk.download('stopwords')
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)


def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    text = re.sub('\[[^]]*\]', '', text)
    return re.sub(r'http\S+', '', text)

def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)
    
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_stopwords(text)
    return text

df['text']=df['text'].apply(denoise_text)

[nltk_data] Downloading package stopwords to /home/szymon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 8.45 s, sys: 58.9 ms, total: 8.51 s
Wall time: 8.73 s


---

Reduce dataset for testing purposes

In [4]:
# df_original = df.copy()
# df = df.sample(frac=1).reset_index(drop=True)[:1000]

---

# Embedding

In [5]:
redo_embedding = False # recalculate embeddings
fast = True # True if use reduced dataset (1000 obs) vs. False if full dataset (40000 obs)

In [6]:
%%time
# Load data
if redo_embedding:
    X = df['text'].tolist()
    y = df['label'].tolist()

    with open("X", "wb") as fp:
      pickle.dump(X, fp)
    with open("y", "wb") as fp:
      pickle.dump(y, fp)
else:
    with open("../../data/X", "rb") as fp:
      X = pickle.load(fp)
    with open("../../data/y", "rb") as fp:
      y = pickle.load(fp)
    
if redo_embedding:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    with open("X_train", "wb") as fp:
      pickle.dump(X_train, fp)
    with open("X_test", "wb") as fp:
      pickle.dump(X_test, fp)
    with open("y_train", "wb") as fp:
      pickle.dump(y_train, fp)
    with open("y_test", "wb") as fp:
      pickle.dump(y_test, fp)
elif fast:
    with open("../../data/small/X_train", "rb") as fp:
      X_train = pickle.load(fp)
    with open("../../data/small/X_test", "rb") as fp:
      X_test = pickle.load(fp)   
    with open("../../data/small/y_train", "rb") as fp:
      y_train = pickle.load(fp)
    with open("../../data/small/y_test", "rb") as fp:
      y_test = pickle.load(fp)
else:
    with open("../../data/X_train", "rb") as fp:
      X_train = pickle.load(fp)
    with open("../../data/X_test", "rb") as fp:
      X_test = pickle.load(fp)   
    with open("../../data/y_train", "rb") as fp:
      y_train = pickle.load(fp)
    with open("../../data/y_test", "rb") as fp:
      y_test = pickle.load(fp)

CPU times: user 55.9 ms, sys: 44.5 ms, total: 100 ms
Wall time: 102 ms


## BERT Embedding

In [7]:
if redo_embedding:
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert.to(device)

    def _get_bert_embedding(text):
        input_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)
        input_ids = np.array(input_ids)
        input_ids = np.expand_dims(input_ids, axis=0)
        input_ids = torch.tensor(input_ids).to(device)

        with torch.no_grad():
            outputs = bert(input_ids)
            last_hidden_state = outputs.last_hidden_state
            last_hidden_state = last_hidden_state[:, 0, :].cpu().numpy()

        return last_hidden_state

    print("TRAIN")
    X_train_embeddings = []
    for text in tqdm(X_train):
        embedding = _get_bert_embedding(text)
        X_train_embeddings.append(embedding)
    X_train_embeddings = np.array(X_train_embeddings)
    X_train_embeddings_bert = np.squeeze(X_train_embeddings, axis=1)

    print("TEST")
    X_test_embeddings = []
    for text in tqdm(X_test):
        embedding = _get_bert_embedding(text)
        X_test_embeddings.append(embedding)
    X_test_embeddings = np.array(X_test_embeddings)
    X_test_embeddings_bert = np.squeeze(X_test_embeddings, axis=1)
    
    if fast:
        pd.DataFrame(X_train_embeddings_bert).to_csv("../../data/small/embeddings/X_train_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_bert).to_csv("../../data/small/embeddings/X_test_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    else:
        pd.DataFrame(X_train_embeddings_bert).to_csv("../../data/embeddings/X_train_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_bert).to_csv("../../data/embeddings/X_test_embeddings_bert_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    
elif fast:
    X_train_embeddings_bert = pd.read_csv('../../data/small/embeddings/X_train_embeddings_bert.csv', sep=',', header=None).values
    X_test_embeddings_bert = pd.read_csv('../../data/small/embeddings/X_test_embeddings_bert.csv', sep=',', header=None).values
else:
    X_train_embeddings_bert = pd.read_csv('../../data/bert/X_train_embeddings_bert.csv', sep=',', header=None).values
    X_test_embeddings_bert = pd.read_csv('../../data/bert/X_test_embeddings_bert.csv', sep=',', header=None).values

## GloVe Embedding

In [8]:
def load_glove_embeddings(filename):
    embeddings_index = {}
    with open(filename) as f:
        for line in tqdm(f):
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

if redo_embedding:
    glove_embeddings = load_glove_embeddings('../../glove/glove.840B.300d.txt')

In [9]:
def text_to_glove_embeddings(text, embeddings_index, embedding_dim):
    embeddings = []
    for sentence in text:
        sentence_embeddings = []
        for word in sentence.split():
            if word in embeddings_index:
                sentence_embeddings.append(embeddings_index[word])
        if len(sentence_embeddings) > 0:
            embeddings.append(np.mean(sentence_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(embedding_dim))
    return np.array(embeddings)

if redo_embedding:
    # noinspection PyUnboundLocalVariable
    X_train_embeddings_glove = text_to_glove_embeddings(X_train, glove_embeddings, embedding_dim=300)
    X_test_embeddings_glove = text_to_glove_embeddings(X_test, glove_embeddings, embedding_dim=300)
    
    if fast:
        pd.DataFrame(X_train_embeddings_glove).to_csv("../../data/small/embeddings/X_train_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_glove).to_csv("../../data/small/embeddings/X_test_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
    else:
        pd.DataFrame(X_train_embeddings_glove).to_csv("../../data/embeddings/X_train_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)
        pd.DataFrame(X_test_embeddings_glove).to_csv("../../data/embeddings/X_test_embeddings_glove_"+time.strftime("%Y%m%d-%H%M%S")+".csv", index=False, header=False)

elif fast:
    X_train_embeddings_glove = pd.read_csv('../../data/small/embeddings/X_train_embeddings_glove.csv', sep=',', header=None).values
    X_test_embeddings_glove = pd.read_csv('../../data/small/embeddings/X_test_embeddings_glove.csv', sep=',', header=None).values

else:
    X_train_embeddings_glove = pd.read_csv('../../data/embeddings/X_train_embeddings_glove.csv', sep=',', header=None).values
    X_test_embeddings_glove = pd.read_csv('../../data/embeddings/X_test_embeddings_glove.csv', sep=',', header=None).values

## Word2Vec

In [10]:
# WIP

# Classification

## Wrappers

### KNN

In [11]:
class KNNClassifier:
    def __init__(self, n_neighbors=2, weights='uniform', metric='minkowski'):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.model = None

    def fit(self, X_train, y_train):
        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights=self.weights, metric=self.metric)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)
        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = KNeighborsClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.n_neighbors = random_search.best_params_['n_neighbors']
        self.weights = random_search.best_params_['weights']
        self.metric = random_search.best_params_['metric']

        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights=self.weights, metric=self.metric)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### XGBoost

In [12]:
class XGBoostClassifier:
    def __init__(self, learning_rate=0.1, max_depth=5, min_child_weight=1, subsample=0.5, colsample_bytree=0.5, n_estimators=100, objective='req:squarederror'):
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_estimators = n_estimators
        self.objective = objective
        self.model = None

    def fit(self, X_train, y_train):
        self.model = xgb.XGBClassifier(learning_rate=self.learning_rate, max_depth=self.max_depth, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, n_estimators=self.n_estimators, objective=self.objective)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = xgb.XGBClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.learning_rate = random_search.best_params_['learning_rate']
        self.max_depth = random_search.best_params_['max_depth']
        self.min_child_weight = random_search.best_params_['min_child_weight']
        self.subsample = random_search.best_params_['subsample']
        self.colsample_bytree = random_search.best_params_['colsample_bytree']
        self.n_estimators = random_search.best_params_['n_estimators']
        self.objective = random_search.best_params_['objective']

        self.model = xgb.XGBClassifier(learning_rate=self.learning_rate, max_depth=self.max_depth, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, n_estimators=self.n_estimators, objective=self.objective)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### Random Forest

In [13]:
class RFClassifier:
    def __init__(self, n_estimators=100, max_features='sqrt', max_depth='none', bootstrap=True):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.bootstrap = bootstrap
        self.model = None

    def fit(self, X_train, y_train):
        self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_features=self.max_features, max_depth=self.max_depth, bootstrap=self.bootstrap, verbose=True)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = RandomForestClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.n_estimators = random_search.best_params_['n_estimators']
        self.max_features = random_search.best_params_['max_features']
        self.max_depth = random_search.best_params_['max_depth']
        self.bootstrap = random_search.best_params_['bootstrap']

        self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_features=self.max_features, max_depth=self.max_depth, bootstrap=self.bootstrap)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### SVC

In [14]:
class SVClassifier:
    def __init__(self, C = 1, kernel='linear', gamma = 0.2):
        self.C = C
        self.kernel = kernel
        self.gamma = gamma
        self.model = None

    def fit(self, X_train, y_train):
        self.model = svm.SVC(C=self.C, kernel=self.kernel, gamma=self.gamma, verbose=True)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = svm.SVC()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.C = random_search.best_params_['C']
        self.kernel = random_search.best_params_['kernel']
        self.gamma = random_search.best_params_['gamma']

        self.model = svm.SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

### Logistic Regression

In [15]:
class LRClassifier:
    def __init__(self, penalty = 'l2', solver = 'libinear', C = 0.5):
        self.penalty = penalty
        self.solver = solver
        self.C = C
        self.model = None

    def fit(self, X_train, y_train):
        self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = LogisticRegression()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.penalty = random_search.best_params_['penalty']
        self.solver = random_search.best_params_['solver']
        self.C = random_search.best_params_['C']

        self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred)*100,1)
        f1 = round(f1_score(y_test, y_pred)*100,1)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

## BERT embeddings

### BERT + KNN

In [16]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_knn_bert_train, accuracy_knn_bert_train, f1_knn_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_knn_bert_test, accuracy_knn_bert_test, f1_knn_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=manhattan, n_neighbors=3, weights=distance;, score=0.950 total time=   0.0s
[CV 2/5] END metric=manhattan, n_neighbors=3, weights=distance;, score=0.975 total time=   0.0s
[CV 3/5] END metric=manhattan, n_neighbors=3, weights=distance;, score=0.938 total time=   0.0s
[CV 4/5] END metric=manhattan, n_neighbors=3, weights=distance;, score=0.944 total time=   0.0s
[CV 5/5] END metric=manhattan, n_neighbors=3, weights=distance;, score=0.956 total time=   0.0s
[CV 1/5] END metric=minkowski, n_neighbors=2, weights=uniform;, score=0.950 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=2, weights=uniform;, score=0.944 total time=   0.0s
[CV 3/5] END metric=minkowski, n_neighbors=2, weights=uniform;, score=0.938 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=2, weights=uniform;, score=0.938 total time=   0.0s
[CV 5/5] END metric=minkowski, n_neighbors=2, weights=uniform;, score=0.950 to

### BERT + XGBoost

In [50]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_xgb_bert_train, accuracy_xgb_bert_train, f1_xgb_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_xgb_bert_test, accuracy_xgb_bert_test, f1_xgb_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.969 total time=   0.8s
[CV 2/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.956 total time=   0.8s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.950 total time=   0.9s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.963 total time=   0.8s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=200, objective=reg:squarederror, subsample=0.5;, score=0.963 total time=   0.8s
[CV 1/5] END colsample_bytree=0.5, lea

### BERT + Random Forest

In [51]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25], 
    'max_features': [5, 10],
    'max_depth': [10, 50, None], 
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_rf_bert_train, accuracy_rf_bert_train, f1_rf_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_rf_bert_test, accuracy_rf_bert_test, f1_rf_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=25;, score=0.963 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=25;, score=0.944 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=25;, score=0.938 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=25;, score=0.938 total time=   0.1s
[CV 5/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=25;, score=0.956 total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=25;, score=0.944 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=25;, score=0.950 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=10, n_estimators=25;, score=0.919 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=10, n_esti

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished


### BERT + SVC

In [52]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_svc_bert_train, accuracy_svc_bert_train, f1_svc_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_svc_bert_test, accuracy_svc_bert_test, f1_svc_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.988 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.994 total time=   0.0s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.981 total time=   0.0s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.988 total time=   0.0s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.956 total time=   0.0s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.975 total time=   0.0s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.969 total time=   0.0s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.994 total time=   0.1s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### BERT + Logistic Regression

In [53]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings_bert, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_bert, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_lr_bert_train, accuracy_lr_bert_train, f1_lr_bert_train = classifier.evaluate(X_train_embeddings_bert, y_train)

print('\n----- TEST -----')
cm_lr_bert_test, accuracy_lr_bert_test, f1_lr_bert_test = classifier.evaluate(X_test_embeddings_bert, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END C=0.78, penalty=none, solver=newton-cg;, score=0.988 total time=   0.1s
[CV 2/5] END C=0.78, penalty=none, solver=newton-cg;, score=0.969 total time=   0.1s




[CV 3/5] END C=0.78, penalty=none, solver=newton-cg;, score=0.975 total time=   0.1s
[CV 4/5] END C=0.78, penalty=none, solver=newton-cg;, score=0.981 total time=   0.1s
[CV 5/5] END C=0.78, penalty=none, solver=newton-cg;, score=0.994 total time=   0.1s
[CV 1/5] END C=0.49, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END C=0.49, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END C=0.49, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END C=0.49, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END C=0.49, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s




[CV 1/5] END ....C=0.54, penalty=l2, solver=sag;, score=0.981 total time=   0.4s




[CV 2/5] END ....C=0.54, penalty=l2, solver=sag;, score=0.969 total time=   0.4s




[CV 3/5] END ....C=0.54, penalty=l2, solver=sag;, score=0.963 total time=   0.4s




[CV 4/5] END ....C=0.54, penalty=l2, solver=sag;, score=0.981 total time=   0.4s




[CV 5/5] END ....C=0.54, penalty=l2, solver=sag;, score=0.988 total time=   0.4s
[CV 1/5] END C=0.41000000000000003, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END C=0.41000000000000003, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END C=0.41000000000000003, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END C=0.41000000000000003, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END C=0.41000000000000003, penalty=none, solver=liblinear;, score=nan total time=   0.0s
[CV 1/5] END C=0.32, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.32, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.32, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.32, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.32, penalty=elasticnet, solver=newton-cg;, score=nan tot



[CV 1/5] END .C=0.89, penalty=none, solver=saga;, score=0.981 total time=   0.4s




[CV 2/5] END .C=0.89, penalty=none, solver=saga;, score=0.969 total time=   0.4s




[CV 3/5] END .C=0.89, penalty=none, solver=saga;, score=0.969 total time=   0.4s




[CV 4/5] END .C=0.89, penalty=none, solver=saga;, score=0.981 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .C=0.89, penalty=none, solver=saga;, score=0.994 total time=   0.4s
[CV 1/5] END ..C=0.19, penalty=l2, solver=lbfgs;, score=0.975 total time=   0.0s
[CV 2/5] END ..C=0.19, penalty=l2, solver=lbfgs;, score=0.969 total time=   0.0s
[CV 3/5] END ..C=0.19, penalty=l2, solver=lbfgs;, score=0.950 total time=   0.0s
[CV 4/5] END ..C=0.19, penalty=l2, solver=lbfgs;, score=0.975 total time=   0.0s
[CV 5/5] END ..C=0.19, penalty=l2, solver=lbfgs;, score=0.981 total time=   0.0s




[CV 1/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.981 total time=   0.3s




[CV 2/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.969 total time=   0.4s




[CV 3/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.969 total time=   0.4s




[CV 4/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.981 total time=   0.4s




[CV 5/5] END .....C=0.6, penalty=l2, solver=sag;, score=0.988 total time=   0.4s
[CV 1/5] END C=0.08, penalty=none, solver=lbfgs;, score=0.988 total time=   0.0s
[CV 2/5] END C=0.08, penalty=none, solver=lbfgs;, score=0.963 total time=   0.0s
[CV 3/5] END C=0.08, penalty=none, solver=lbfgs;, score=0.956 total time=   0.0s
[CV 4/5] END C=0.08, penalty=none, solver=lbfgs;, score=0.969 total time=   0.0s
[CV 5/5] END C=0.08, penalty=none, solver=lbfgs;, score=0.994 total time=   0.0s




[CV 1/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.981 total time=   0.4s




[CV 2/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 3/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.969 total time=   0.4s




[CV 4/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.981 total time=   0.4s




[CV 5/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.994 total time=   0.4s




[CV 1/5] END ....C=0.48, penalty=l2, solver=sag;, score=0.981 total time=   0.3s




[CV 2/5] END ....C=0.48, penalty=l2, solver=sag;, score=0.969 total time=   0.3s




[CV 3/5] END ....C=0.48, penalty=l2, solver=sag;, score=0.963 total time=   0.3s




[CV 4/5] END ....C=0.48, penalty=l2, solver=sag;, score=0.981 total time=   0.3s




[CV 5/5] END ....C=0.48, penalty=l2, solver=sag;, score=0.988 total time=   0.4s
[CV 1/5] END ......C=0.33, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ......C=0.33, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END ......C=0.33, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END ......C=0.33, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END ......C=0.33, penalty=l1, solver=sag;, score=nan total time=   0.0s




[CV 1/5] END ....C=0.01, penalty=l2, solver=sag;, score=0.956 total time=   0.3s




[CV 2/5] END ....C=0.01, penalty=l2, solver=sag;, score=0.938 total time=   0.3s




[CV 3/5] END ....C=0.01, penalty=l2, solver=sag;, score=0.900 total time=   0.4s




[CV 4/5] END ....C=0.01, penalty=l2, solver=sag;, score=0.963 total time=   0.4s




[CV 5/5] END ....C=0.01, penalty=l2, solver=sag;, score=0.956 total time=   0.4s
[CV 1/5] END C=0.26, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.26, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.26, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.26, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.26, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.72, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 2/5] END C=0.72, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 3/5] END C=0.72, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 4/5] END C=0.72, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 5/5] END C=0.72, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s




[CV 1/5] END ....C=0.68, penalty=l2, solver=sag;, score=0.981 total time=   0.4s




[CV 2/5] END ....C=0.68, penalty=l2, solver=sag;, score=0.969 total time=   0.4s




[CV 3/5] END ....C=0.68, penalty=l2, solver=sag;, score=0.969 total time=   0.4s




[CV 4/5] END ....C=0.68, penalty=l2, solver=sag;, score=0.981 total time=   0.4s




[CV 5/5] END ....C=0.68, penalty=l2, solver=sag;, score=0.994 total time=   0.3s




[CV 1/5] END ...C=0.21, penalty=l1, solver=saga;, score=0.975 total time=   0.6s




[CV 2/5] END ...C=0.21, penalty=l1, solver=saga;, score=0.944 total time=   0.5s




[CV 3/5] END ...C=0.21, penalty=l1, solver=saga;, score=0.931 total time=   0.5s




[CV 4/5] END ...C=0.21, penalty=l1, solver=saga;, score=0.950 total time=   0.5s
[CV 5/5] END ...C=0.21, penalty=l1, solver=saga;, score=0.950 total time=   0.6s



_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[410   0]
 [  0 390]] 

Accuracy: 100.0 

F1 Score: 100.0 


----- TEST -----

Confusion matrix
 [[101   2]
 [  1  96]] 

Accuracy: 98.5 

F1 Score: 98.5 

_______________________________________________________________________


45 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalt

## GloVe embeddings

### GloVe + KNN

In [54]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_knn_glove_train, accuracy_knn_glove_train, f1_knn_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_knn_glove_test, accuracy_knn_glove_test, f1_knn_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=euclidean, n_neighbors=9, weights=uniform;, score=0.900 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=9, weights=uniform;, score=0.944 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=9, weights=uniform;, score=0.894 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=9, weights=uniform;, score=0.912 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=9, weights=uniform;, score=0.906 total time=   0.0s
[CV 1/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.900 total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.944 total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.894 total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.912 total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=9, weights=distance;, score=0.912 to

### GloVe + XGBoost

In [55]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_xgb_glove_train, accuracy_xgb_glove_train, f1_xgb_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_xgb_glove_test, accuracy_xgb_glove_test, f1_xgb_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=500, objective=reg:squarederror, subsample=0.5;, score=0.925 total time=   0.5s
[CV 2/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=500, objective=reg:squarederror, subsample=0.5;, score=0.919 total time=   0.5s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=500, objective=reg:squarederror, subsample=0.5;, score=0.925 total time=   0.5s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=500, objective=reg:squarederror, subsample=0.5;, score=0.869 total time=   0.5s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=500, objective=reg:squarederror, subsample=0.5;, score=0.950 total time=   0.5s
[CV 1/5] END colsample_bytree=0.7, lea

### GloVe + Random Forest

In [56]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25], 
    'max_features': [5, 10],
    'max_depth': [10, 50, None], 
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_rf_glove_train, accuracy_rf_glove_train, f1_rf_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_rf_glove_test, accuracy_rf_glove_test, f1_rf_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=10;, score=0.894 total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=10;, score=0.900 total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=10;, score=0.869 total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=10;, score=0.894 total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=50, max_features=10, n_estimators=10;, score=0.838 total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=25;, score=0.900 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=25;, score=0.875 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=25;, score=0.894 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=10

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished


### GloVe + SVC

In [57]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_svc_glove_train, accuracy_svc_glove_train, f1_svc_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_svc_glove_test, accuracy_svc_glove_test, f1_svc_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.969 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.912 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.969 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.969 total time=   0.0s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.969 total time=   0.0s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.919 total time=   0.0s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.950 total time=   0.0s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.944 total time=   0.0s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.963 total time=   0.0s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### GloVe + Logistic Regression

In [58]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings_glove, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings_glove, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm_lr_glove_train, accuracy_lr_glove_train, f1_lr_glove_train = classifier.evaluate(X_train_embeddings_glove, y_train)

print('\n----- TEST -----')
cm_lr_glove_test, accuracy_lr_glove_test, f1_lr_glove_test = classifier.evaluate(X_test_embeddings_glove, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END C=0.35000000000000003, penalty=l1, solver=liblinear;, score=0.912 total time=   0.0s
[CV 2/5] END C=0.35000000000000003, penalty=l1, solver=liblinear;, score=0.887 total time=   0.0s
[CV 3/5] END C=0.35000000000000003, penalty=l1, solver=liblinear;, score=0.819 total time=   0.0s
[CV 4/5] END C=0.35000000000000003, penalty=l1, solver=liblinear;, score=0.900 total time=   0.0s
[CV 5/5] END C=0.35000000000000003, penalty=l1, solver=liblinear;, score=0.900 total time=   0.0s
[CV 1/5] END C=0.72, penalty=l2, solver=newton-cg;, score=0.956 total time=   0.0s
[CV 2/5] END C=0.72, penalty=l2, solver=newton-cg;, score=0.950 total time=   0.0s
[CV 3/5] END C=0.72, penalty=l2, solver=newton-cg;, score=0.900 total time=   0.0s
[CV 4/5] END C=0.72, penalty=l2, solver=newton-cg;, score=0.944 total time=   0.0s
[CV 5/5] END C=0.72, penalty=l2, solver=newton-cg;, score=0.938 total time=   0.0s




[CV 1/5] END ...C=0.49, penalty=l2, solver=saga;, score=0.956 total time=   0.2s
[CV 2/5] END ...C=0.49, penalty=l2, solver=saga;, score=0.950 total time=   0.2s




[CV 3/5] END ...C=0.49, penalty=l2, solver=saga;, score=0.887 total time=   0.2s
[CV 4/5] END ...C=0.49, penalty=l2, solver=saga;, score=0.950 total time=   0.2s




[CV 5/5] END ...C=0.49, penalty=l2, solver=saga;, score=0.931 total time=   0.2s
[CV 1/5] END C=0.37, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.37, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.37, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.37, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.37, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.17, penalty=none, solver=newton-cg;, score=0.981 total time=   0.0s
[CV 2/5] END C=0.17, penalty=none, solver=newton-cg;, score=0.950 total time=   0.0s
[CV 3/5] END C=0.17, penalty=none, solver=newton-cg;, score=0.950 total time=   0.0s
[CV 4/5] END C=0.17, penalty=none, solver=newton-cg;, score=0.969 total time=   0.0s
[CV 5/5] END C=0.17, penalty=none, solver=newton-cg;, score=0.963 total time=   0.0s




[CV 1/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.975 total time=   0.1s
[CV 2/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.969 total time=   0.2s




[CV 3/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.938 total time=   0.1s
[CV 4/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.969 total time=   0.1s




[CV 5/5] END C=0.41000000000000003, penalty=none, solver=sag;, score=0.981 total time=   0.1s
[CV 1/5] END ....C=0.79, penalty=l2, solver=sag;, score=0.956 total time=   0.1s
[CV 2/5] END ....C=0.79, penalty=l2, solver=sag;, score=0.963 total time=   0.1s
[CV 3/5] END ....C=0.79, penalty=l2, solver=sag;, score=0.900 total time=   0.1s
[CV 4/5] END ....C=0.79, penalty=l2, solver=sag;, score=0.944 total time=   0.1s
[CV 5/5] END ....C=0.79, penalty=l2, solver=sag;, score=0.944 total time=   0.1s
[CV 1/5] END C=0.48, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END C=0.48, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END C=0.48, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END C=0.48, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END C=0.48, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END C=0.36, penalty=elasticnet, solver=lbfgs;, score=nan total time



[CV 2/5] END C=0.6900000000000001, penalty=none, solver=sag;, score=0.969 total time=   0.1s
[CV 3/5] END C=0.6900000000000001, penalty=none, solver=sag;, score=0.931 total time=   0.1s




[CV 4/5] END C=0.6900000000000001, penalty=none, solver=sag;, score=0.969 total time=   0.1s
[CV 5/5] END C=0.6900000000000001, penalty=none, solver=sag;, score=0.988 total time=   0.2s
[CV 1/5] END C=0.02, penalty=l2, solver=liblinear;, score=0.919 total time=   0.0s
[CV 2/5] END C=0.02, penalty=l2, solver=liblinear;, score=0.919 total time=   0.0s
[CV 3/5] END C=0.02, penalty=l2, solver=liblinear;, score=0.819 total time=   0.0s
[CV 4/5] END C=0.02, penalty=l2, solver=liblinear;, score=0.900 total time=   0.0s
[CV 5/5] END C=0.02, penalty=l2, solver=liblinear;, score=0.919 total time=   0.0s
[CV 1/5] END C=0.55, penalty=l2, solver=liblinear;, score=0.956 total time=   0.0s
[CV 2/5] END C=0.55, penalty=l2, solver=liblinear;, score=0.944 total time=   0.0s
[CV 3/5] END C=0.55, penalty=l2, solver=liblinear;, score=0.894 total time=   0.0s
[CV 4/5] END C=0.55, penalty=l2, solver=liblinear;, score=0.944 total time=   0.0s
[CV 5/5] END C=0.55, penalty=l2, solver=liblinear;, score=0.944 tot



[CV 4/5] END C=0.02, penalty=none, solver=newton-cg;, score=0.969 total time=   0.0s
[CV 5/5] END C=0.02, penalty=none, solver=newton-cg;, score=0.963 total time=   0.0s
[CV 1/5] END .C=0.54, penalty=none, solver=saga;, score=0.975 total time=   0.2s




[CV 2/5] END .C=0.54, penalty=none, solver=saga;, score=0.975 total time=   0.2s
[CV 3/5] END .C=0.54, penalty=none, solver=saga;, score=0.925 total time=   0.2s




[CV 4/5] END .C=0.54, penalty=none, solver=saga;, score=0.963 total time=   0.2s
[CV 5/5] END .C=0.54, penalty=none, solver=saga;, score=0.975 total time=   0.2s




[CV 1/5] END ....C=0.16, penalty=l2, solver=sag;, score=0.956 total time=   0.0s
[CV 2/5] END ....C=0.16, penalty=l2, solver=sag;, score=0.925 total time=   0.0s
[CV 3/5] END ....C=0.16, penalty=l2, solver=sag;, score=0.875 total time=   0.0s
[CV 4/5] END ....C=0.16, penalty=l2, solver=sag;, score=0.931 total time=   0.0s
[CV 5/5] END ....C=0.16, penalty=l2, solver=sag;, score=0.919 total time=   0.0s




[CV 1/5] END C=0.35000000000000003, penalty=l1, solver=saga;, score=0.906 total time=   0.3s




[CV 2/5] END C=0.35000000000000003, penalty=l1, solver=saga;, score=0.887 total time=   0.2s




[CV 3/5] END C=0.35000000000000003, penalty=l1, solver=saga;, score=0.825 total time=   0.2s
[CV 4/5] END C=0.35000000000000003, penalty=l1, solver=saga;, score=0.912 total time=   0.2s




[CV 5/5] END C=0.35000000000000003, penalty=l1, solver=saga;, score=0.900 total time=   0.2s
[CV 1/5] END ..C=0.27, penalty=none, solver=sag;, score=0.975 total time=   0.1s




[CV 2/5] END ..C=0.27, penalty=none, solver=sag;, score=0.963 total time=   0.1s
[CV 3/5] END ..C=0.27, penalty=none, solver=sag;, score=0.925 total time=   0.1s




[CV 4/5] END ..C=0.27, penalty=none, solver=sag;, score=0.969 total time=   0.1s
[CV 5/5] END ..C=0.27, penalty=none, solver=sag;, score=0.975 total time=   0.1s




[CV 1/5] END ..C=0.61, penalty=none, solver=sag;, score=0.975 total time=   0.1s
[CV 2/5] END ..C=0.61, penalty=none, solver=sag;, score=0.975 total time=   0.1s




[CV 3/5] END ..C=0.61, penalty=none, solver=sag;, score=0.938 total time=   0.1s
[CV 4/5] END ..C=0.61, penalty=none, solver=sag;, score=0.969 total time=   0.1s


25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/szymon/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'none' penalties, got elasticnet penalty.

[CV 5/5] END ..C=0.61, penalty=none, solver=sag;, score=0.975 total time=   0.1s



_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[409   1]
 [  0 390]] 

Accuracy: 99.9 

F1 Score: 99.9 


----- TEST -----

Confusion matrix
 [[100   3]
 [  4  93]] 

Accuracy: 96.5 

F1 Score: 96.4 

_______________________________________________________________________




## Word2Vec

### KNN

In [11]:
# WIP

### XGBoost

In [12]:
# WIP

### Random Forest

In [13]:
# WIP

### SVC

In [14]:
# WIP

### Logistic Regression

In [15]:
# WIP

# Results

In [59]:
#create data
data = [["KNN + BERT", accuracy_knn_bert_test, f1_knn_bert_test], 
        ["XGBoost + BERT", accuracy_xgb_bert_test, f1_xgb_bert_test], 
        ["Random Forest + BERT", accuracy_rf_bert_test, f1_rf_bert_test],
        ["SVC + BERT", accuracy_svc_bert_test, f1_svc_bert_test], 
        ["LR + BERT", accuracy_lr_bert_test, f1_lr_bert_test],
        ["KNN + GloVe", accuracy_knn_glove_test, f1_knn_glove_test],         
        ["XGBoost + GloVe", accuracy_xgb_glove_test, f1_xgb_glove_test],
        ["Random Forest + GloVe", accuracy_rf_glove_test, f1_rf_glove_test],
        ["SVC + GloVe", accuracy_svc_glove_test, f1_svc_glove_test],
        ["LR + GloVe", accuracy_lr_glove_test, f1_lr_glove_test]]
  
#define header names
col_names = ["Model", "Accuracy", "F1-Score"]
  
#display table
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))

╒═══════════════════════╤════════════╤════════════╕
│ Model                 │   Accuracy │   F1-Score │
╞═══════════════════════╪════════════╪════════════╡
│ KNN + BERT            │       96.5 │       96.4 │
├───────────────────────┼────────────┼────────────┤
│ XGBoost + BERT        │       97   │       96.9 │
├───────────────────────┼────────────┼────────────┤
│ Random Forest + BERT  │       95.5 │       95.4 │
├───────────────────────┼────────────┼────────────┤
│ SVC + BERT            │       98.5 │       98.5 │
├───────────────────────┼────────────┼────────────┤
│ LR + BERT             │       98.5 │       98.5 │
├───────────────────────┼────────────┼────────────┤
│ KNN + GloVe           │       91   │       91   │
├───────────────────────┼────────────┼────────────┤
│ XGBoost + GloVe       │       91.5 │       91.4 │
├───────────────────────┼────────────┼────────────┤
│ Random Forest + GloVe │       89   │       89   │
├───────────────────────┼────────────┼────────────┤
│ SVC + GloV

---