# BERT word embeddings + various classification algorithms

In [38]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as functional
import matplotlib.pyplot as plt
import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertModel, AutoTokenizer, AutoModel
import gc

import time
import datetime
from tqdm import tqdm
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.preprocessing import LabelBinarizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import pickle

import tensorflow as tf
import xgboost as xgb

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report, accuracy_score, confusion_matrix

In [40]:
from google.colab import drive
drive.mount('/content/drive')

fake = pd.read_csv('/content/drive/MyDrive/master-thesis/thesis-data/Fake.csv')
true = pd.read_csv('/content/drive/MyDrive/master-thesis/thesis-data/True.csv')

fake["label"] = 0
true["label"] = 1

df = pd.concat([fake, true], ignore_index = True)

df['text'] = df['title'] + " " + df['text']
df.drop(columns=['title', 'date', 'subject'], inplace = True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
nltk.download('stopwords')

stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)


def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
# Removing URL's
def remove_between_square_brackets(text):
    return re.sub(r'http\S+', '', text)
#Removing the stopwords from text
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)
    
#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_stopwords(text)
    return text

#Apply function on review column
df['text']=df['text'].apply(denoise_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


In [None]:
#TEST IF WORKS!!! --------

nltk.download('stopwords')
stop = set(nltk.corpus.stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_noise(text):
    text = strip_html(text)
    text = re.sub(r'\[[^]]*\]|\bhttp\S+', '', text)
    text = ' '.join(word.lower() for word in text.split() if word.lower() not in stop)
    return text

df['text'] = df['text'].apply(remove_noise)
# ----------

---

Reduce dataset for testing purposes

In [42]:
df_original = df.copy()
df = df.sample(frac=1).reset_index(drop=True)[:1000]

---

# Embedding

## BERT Embedding

In [45]:
# Load data
X = df['text'].tolist()
y = df['label'].tolist()

# Split data into training and test sets

# # Old
# train_size = int(0.8 * len(X))
# X_train = X[:train_size]
# y_train = y[:train_size]
# X_test = X[train_size:]
# y_test = y[train_size:]

# New
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

def _get_bert_embedding(text):
    input_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=256)
    input_ids = np.array(input_ids)
    input_ids = np.expand_dims(input_ids, axis=0)
    input_ids = torch.tensor(input_ids)

    with torch.no_grad():
        outputs = bert(input_ids)
        last_hidden_state = outputs.last_hidden_state
        last_hidden_state = last_hidden_state[:, 0, :].numpy()

    return last_hidden_state

print("TRAIN")
X_train_embeddings = []
for text in tqdm(X_train):
    embedding = _get_bert_embedding(text)
    X_train_embeddings.append(embedding)
X_train_embeddings = np.array(X_train_embeddings)
X_train_embeddings = np.squeeze(X_train_embeddings, axis=1)

print("TEST")
X_test_embeddings = []
for text in tqdm(X_test):
    embedding = _get_bert_embedding(text)
    X_test_embeddings.append(embedding)
X_test_embeddings = np.array(X_test_embeddings)
X_test_embeddings = np.squeeze(X_test_embeddings, axis=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TRAIN


 10%|█         | 80/800 [01:11<10:40,  1.12it/s]


KeyboardInterrupt: ignored

In [None]:
X_train_embeddings_bert = X_train_embeddings.copy()
X_test_embeddings_bert = X_test_embeddings.copy()

X_train_split = X_train.copy()
X_test_split = X_test.copy()
y_train_split = y_train.copy()
y_test_split = y_test.copy()

# # Save current state -----------------------------------------------------------
# with open("/content/drive/MyDrive/master-thesis/embeddings/X_train", "wb") as fp:
#   pickle.dump(X_train, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/X_test", "wb") as fp:
#   pickle.dump(X_test, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/y_train", "wb") as fp:
#   pickle.dump(y_train, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/y_test", "wb") as fp:
#   pickle.dump(y_test, fp)

# pd.DataFrame(X_train_embeddings).to_csv("/content/drive/MyDrive/master-thesis/embeddings/X_train_embeddings.csv", index=False, header=False)
# pd.DataFrame(X_test_embeddings).to_csv("/content/drive/MyDrive/master-thesis/embeddings/X_test_embeddings.csv", index=False, header=False)

# with open("/content/drive/MyDrive/master-thesis/embeddings/X", "wb") as fp:
#   pickle.dump(X, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/y", "wb") as fp:
#   pickle.dump(y, fp)
# # ------------------------------------------------------------------------------

AttributeError: ignored

In [None]:
# Retrieve data ----------------------------------------------------------------
with open("/content/drive/MyDrive/master-thesis/embeddings/X_train", "rb") as fp:
  X_train = pickle.load(fp)
with open("/content/drive/MyDrive/master-thesis/embeddings/X_test", "rb") as fp:
  X_test = pickle.load(fp)
with open("/content/drive/MyDrive/master-thesis/embeddings/y_train", "rb") as fp:
  y_train = pickle.load(fp)
with open("/content/drive/MyDrive/master-thesis/embeddings/y_test", "rb") as fp:
  y_test = pickle.load(fp)

X_train_embeddings = pd.read_csv('/content/drive/MyDrive/master-thesis/embeddings/X_train_embeddings.csv', sep=',', header=None).values
X_test_embeddings = pd.read_csv('/content/drive/MyDrive/master-thesis/embeddings/X_test_embeddings.csv', sep=',', header=None).values

with open("/content/drive/MyDrive/master-thesis/embeddings/X", "rb") as fp:
  X = pickle.load(fp)
with open("/content/drive/MyDrive/master-thesis/embeddings/y", "rb") as fp:
  y = pickle.load(fp)
# ------------------------------------------------------------------------------

## ... Embedding

# Classification

## BERT embeddings

### KNN

In [None]:
class KNNClassifier:
    def __init__(self, n_neighbors=2, weights='uniform', metric='minkowski'):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.model = None

    def fit(self, X_train, y_train):
        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights=self.weights, metric=self.metric)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)
        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = KNeighborsClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.n_neighbors = random_search.best_params_['n_neighbors']
        self.weights = random_search.best_params_['weights']
        self.metric = random_search.best_params_['metric']

        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights=self.weights, metric=self.metric)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

In [None]:
# Instantiate classifier
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
    'weights' : ['uniform','distance'],
    'metric' : ['minkowski','euclidean','manhattan']
}
classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm, accuracy, f1 = classifier.evaluate(X_train_embeddings, y_train)

print('\n----- TEST -----')
cm, accuracy, f1 = classifier.evaluate(X_test_embeddings, y_test)
print('_______________________________________________________________________')

ValueError: ignored

### XGBoost

In [None]:
class XGBoostClassifier:
    def __init__(self, learning_rate=0.1, max_depth=5, min_child_weight=1, subsample=0.5, colsample_bytree=0.5, n_estimators=100, objective='req:squarederror'):
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_estimators = n_estimators
        self.objective = objective
        self.model = None

    def fit(self, X_train, y_train):
        self.model = xgb.XGBClassifier(learning_rate=self.learning_rate, max_depth=self.max_depth, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, n_estimators=self.n_estimators, objective=self.objective)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = xgb.XGBClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.learning_rate = random_search.best_params_['learning_rate']
        self.max_depth = random_search.best_params_['max_depth']
        self.min_child_weight = random_search.best_params_['min_child_weight']
        self.subsample = random_search.best_params_['subsample']
        self.colsample_bytree = random_search.best_params_['colsample_bytree']
        self.n_estimators = random_search.best_params_['n_estimators']
        self.objective = random_search.best_params_['objective']

        self.model = xgb.XGBClassifier(learning_rate=self.learning_rate, max_depth=self.max_depth, min_child_weight=self.min_child_weight, subsample=self.subsample, colsample_bytree=self.colsample_bytree, n_estimators=self.n_estimators, objective=self.objective)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

In [None]:
# Instantiate classifier
classifier = XGBoostClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [50, 100, 200, 500],
    'objective': ['reg:squarederror']
}
classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_train_embeddings, y_train)

print('\n----- TEST -----')
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_test_embeddings, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50, objective=reg:squarederror, subsample=0.5;, score=0.975 total time=   2.3s
[CV 2/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50, objective=reg:squarederror, subsample=0.5;, score=0.950 total time=   2.5s
[CV 3/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50, objective=reg:squarederror, subsample=0.5;, score=0.925 total time=   3.3s
[CV 4/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50, objective=reg:squarederror, subsample=0.5;, score=0.925 total time=   2.0s
[CV 5/5] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=50, objective=reg:squarederror, subsample=0.5;, score=0.950 total time=   2.0s
[CV 1/5] END colsample_bytree=0.7, lea

### Random Forest

In [None]:
class RFClassifier:
    def __init__(self, n_estimators=100, max_features='sqrt', max_depth='none', bootstrap=True):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.bootstrap = bootstrap
        self.model = None

    def fit(self, X_train, y_train):
        self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_features=self.max_features, max_depth=self.max_depth, bootstrap=self.bootstrap, verbose=True)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = RandomForestClassifier()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.n_estimators = random_search.best_params_['n_estimators']
        self.max_features = random_search.best_params_['max_features']
        self.max_depth = random_search.best_params_['max_depth']
        self.bootstrap = random_search.best_params_['bootstrap']

        self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_features=self.max_features, max_depth=self.max_depth, bootstrap=self.bootstrap)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

In [None]:
# Instantiate classifier
classifier = RFClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'n_estimators': [10, 25], 
    'max_features': [5, 10],
    'max_depth': [10, 50, None], 
    'bootstrap': [True, False]
}
classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_train_embeddings, y_train)

print('\n----- TEST -----')
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_test_embeddings, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=10;, score=0.950 total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=10;, score=0.912 total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=10;, score=0.919 total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=10;, score=0.938 total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=10, n_estimators=10;, score=0.931 total time=   0.0s
[CV 1/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.950 total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.969 total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=None, max_features=5, n_estimators=25;, score=0.925 total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=None, max_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.0s finished





_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[410   0]
 [  0 390]] 

Accuracy: 1.0 

F1 Score: 1.0 


----- TEST -----

Confusion matrix
 [[99  4]
 [ 4 93]] 

Accuracy: 0.96 

F1 Score: 0.9587628865979382 

_______________________________________________________________________


### SVC

In [None]:
class SVClassifier:
    def __init__(self, C = 1, kernel='linear', gamma = 0.2):
        self.C = C
        self.kernel = kernel
        self.gamma = gamma
        self.model = None

    def fit(self, X_train, y_train):
        self.model = svm.SVC(C=self.C, kernel=self.kernel, gamma=self.gamma, verbose=True)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = svm.SVC()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.C = random_search.best_params_['C']
        self.kernel = random_search.best_params_['kernel']
        self.gamma = random_search.best_params_['gamma']

        self.model = svm.SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

In [None]:
# Instantiate classifier
classifier = SVClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'C': [1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale']
    #'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}
classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_train_embeddings, y_train)

print('\n----- TEST -----')
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_test_embeddings, y_test)
print('_______________________________________________________________________')



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.988 total time=   0.0s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 3/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 4/5] END ...C=1, gamma=scale, kernel=linear;, score=0.975 total time=   0.0s
[CV 5/5] END ...C=1, gamma=scale, kernel=linear;, score=0.994 total time=   0.0s
[CV 1/5] END .....C=1, gamma=scale, kernel=poly;, score=0.981 total time=   0.1s
[CV 2/5] END .....C=1, gamma=scale, kernel=poly;, score=0.988 total time=   0.0s
[CV 3/5] END .....C=1, gamma=scale, kernel=poly;, score=0.956 total time=   0.0s
[CV 4/5] END .....C=1, gamma=scale, kernel=poly;, score=0.975 total time=   0.1s
[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.969 total time=   0.1s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.994 total time=   0.1s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;

### Logistic Regression

In [None]:
class LRClassifier:
    def __init__(self, penalty = 'l2', solver = 'libinear', C = 0.5):
        self.penalty = penalty
        self.solver = solver
        self.C = C
        self.model = None

    def fit(self, X_train, y_train):
        self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        if self.model is None:
          raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.model.predict(X_test)

        return y_pred

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
        self.model = LogisticRegression()
        random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
        random_search.fit(X_train, y_train)

        self.penalty = random_search.best_params_['penalty']
        self.solver = random_search.best_params_['solver']
        self.C = random_search.best_params_['C']

        self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)

    def evaluate(self, X_test, y_test):
        if self.model is None:
              raise ValueError("The model has not been trained yet. Please call 'fit' first.")
        y_pred = self.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print('\nConfusion matrix\n', cm, '\n')
        print('Accuracy:', accuracy, '\n')
        print('F1 Score:', f1, '\n')

        return cm, accuracy, f1

In [None]:
# Instantiate classifier
classifier = LRClassifier()

# Perform randomized search over hyperparameters
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C' : np.arange(0, 1, 0.01)
}
classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# Train classifier on training data
classifier.fit(X_train_embeddings, y_train)

# Evaluate classifier
print('\n')
print('\n_______________________________________________________________________')
print('EVALUATION')

print('\n----- TRAIN -----')
cm, accuracy, f1 = classifier.evaluate(X_train_embeddings, y_train)

print('\n----- TEST -----')
cm, accuracy, f1 = classifier.evaluate(X_test_embeddings, y_test)
print('_______________________________________________________________________')

Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV 1/5] END ..C=0.18, penalty=none, solver=sag;, score=0.981 total time=   0.7s




[CV 2/5] END ..C=0.18, penalty=none, solver=sag;, score=0.963 total time=   0.7s




[CV 3/5] END ..C=0.18, penalty=none, solver=sag;, score=0.975 total time=   0.8s




[CV 4/5] END ..C=0.18, penalty=none, solver=sag;, score=0.975 total time=   0.8s




[CV 5/5] END ..C=0.18, penalty=none, solver=sag;, score=0.994 total time=   0.8s
[CV 1/5] END C=0.39, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 2/5] END C=0.39, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 3/5] END C=0.39, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 4/5] END C=0.39, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 5/5] END C=0.39, penalty=elasticnet, solver=saga;, score=nan total time=   0.0s




[CV 1/5] END ..C=0.9, penalty=none, solver=saga;, score=0.981 total time=   0.8s




[CV 2/5] END ..C=0.9, penalty=none, solver=saga;, score=0.969 total time=   0.6s




[CV 3/5] END ..C=0.9, penalty=none, solver=saga;, score=0.969 total time=   0.6s




[CV 4/5] END ..C=0.9, penalty=none, solver=saga;, score=0.981 total time=   0.6s




[CV 5/5] END ..C=0.9, penalty=none, solver=saga;, score=0.994 total time=   0.6s




[CV 1/5] END C=0.8300000000000001, penalty=l2, solver=sag;, score=0.981 total time=   0.5s




[CV 2/5] END C=0.8300000000000001, penalty=l2, solver=sag;, score=0.969 total time=   0.5s




[CV 3/5] END C=0.8300000000000001, penalty=l2, solver=sag;, score=0.969 total time=   0.5s




[CV 4/5] END C=0.8300000000000001, penalty=l2, solver=sag;, score=0.975 total time=   0.7s




[CV 5/5] END C=0.8300000000000001, penalty=l2, solver=sag;, score=0.994 total time=   0.7s
[CV 1/5] END C=0.26, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.26, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.26, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.26, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.26, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END C=0.8, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.8, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END C=0.8, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END C=0.8, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END C=0.8, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END C=0.78, penalty=elasticnet, solver=saga;,

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END ..C=0.36, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.1s
[CV 4/5] END ..C=0.36, penalty=l2, solver=lbfgs;, score=0.975 total time=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END ..C=0.36, penalty=l2, solver=lbfgs;, score=0.988 total time=   0.1s
[CV 1/5] END C=0.75, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.75, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.75, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.75, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.75, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ..C=0.05, penalty=l2, solver=lbfgs;, score=0.975 total time=   0.1s
[CV 2/5] END ..C=0.05, penalty=l2, solver=lbfgs;, score=0.963 total time=   0.1s
[CV 3/5] END ..C=0.05, penalty=l2, solver=lbfgs;, score=0.931 total time=   0.1s
[CV 4/5] END ..C=0.05, penalty=l2, solver=lbfgs;, score=0.969 total time=   0.1s
[CV 5/5] END ..C=0.05, penalty=l2, solver=lbfgs;, score=0.956 total time=   0.1s
[CV 1/5] END C=0.39, penalty=elasticnet, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END C=0.39, penal



[CV 1/5] END .C=0.56, penalty=none, solver=saga;, score=0.981 total time=   0.8s




[CV 2/5] END .C=0.56, penalty=none, solver=saga;, score=0.969 total time=   0.6s




[CV 3/5] END .C=0.56, penalty=none, solver=saga;, score=0.969 total time=   0.6s




[CV 4/5] END .C=0.56, penalty=none, solver=saga;, score=0.981 total time=   0.6s




[CV 5/5] END .C=0.56, penalty=none, solver=saga;, score=0.994 total time=   0.6s
[CV 1/5] END ....C=0.97, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ....C=0.97, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ....C=0.97, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ....C=0.97, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ....C=0.97, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s




[CV 1/5] END C=0.41000000000000003, penalty=l1, solver=saga;, score=0.969 total time=   0.7s




[CV 2/5] END C=0.41000000000000003, penalty=l1, solver=saga;, score=0.956 total time=   0.7s




[CV 3/5] END C=0.41000000000000003, penalty=l1, solver=saga;, score=0.944 total time=   0.7s




[CV 4/5] END C=0.41000000000000003, penalty=l1, solver=saga;, score=0.969 total time=   0.7s


55 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 1291, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
  File "/usr/local/lib/python3.9/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 1048, in __cal

[CV 5/5] END C=0.41000000000000003, penalty=l1, solver=saga;, score=0.963 total time=   0.7s







_______________________________________________________________________
EVALUATION

----- TRAIN -----

Confusion matrix
 [[410   0]
 [  0 390]] 

Accuracy: 1.0 

F1 Score: 1.0 


----- TEST -----

Confusion matrix
 [[102   1]
 [  1  96]] 

Accuracy: 0.99 

F1 Score: 0.9896907216494846 

_______________________________________________________________________




In [None]:
# # PLOTTING ROC EXAMPLE

# class LRClassifier:
#     def __init__(self, penalty = 'l2', solver = 'libinear', C = 0.5):
#         self.penalty = penalty
#         self.solver = solver
#         self.C = C

#     def fit(self, X_train, y_train):
#         self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)
#         self.model.fit(X_train, y_train)

#     def predict(self, X_test, proba=False):
#         if(proba):
#           y_pred = self.model.predict_proba(X_test)
#           return y_pred

#         else:
#           y_pred = self.model.predict(X_test)
#           return y_pred


#     def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20):
#         self.model = LogisticRegression()
#         random_search = RandomizedSearchCV(self.model, param_distributions=param_distributions, cv=cv, n_iter=n_iter, verbose=3)
#         random_search.fit(X_train, y_train)

#         self.penalty = random_search.best_params_['penalty']
#         self.solver = random_search.best_params_['solver']
#         self.C = random_search.best_params_['C']

#         self.model = LogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C)

#     def evaluate(self, X_test, y_test, plot_roc):
#         y_pred = self.predict(X_test)

#         print('Confusion matrix\n', confusion_matrix(y_test, y_pred))
#         print('\nAccuracy:', accuracy_score(y_test, y_pred))
#         print('\nF1 Score:', f1_score(y_test, y_pred))

#         if(plot_roc):
#           y_pred_proba = classifier.predict(X_test, proba=True)[::,1]
#           fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
#           auc = metrics.roc_auc_score(y_test, y_pred_proba)

#           #create ROC curve
#           plt.plot(fpr,tpr,label="AUC="+str(auc))
#           plt.ylabel('True Positive Rate')
#           plt.xlabel('False Positive Rate')
#           plt.legend(loc=4)
#           plt.show()


#         return confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)


# # Instantiate classifier
# classifier = LRClassifier()

# # Perform randomized search over hyperparameters
# param_distributions = {
#     'penalty': ['l1', 'l2', 'elasticnet', 'none'],
#     'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'C' : np.arange(0, 1, 0.01)
# }
# classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# # Train classifier on training data
# classifier.fit(X_train_embeddings, y_train)

# # Evaluate classifier
# print('\n')
# print('\n_______________________________________________________________________')
# print('EVALUATION')
# print('\n----- TRAIN -----')
# conf_matrix, accuracy, f1_sc = classifier.evaluate(X_train_embeddings, y_train, plot_roc = True)
# print('\n')
# print('\n----- TEST -----')
# conf_matrix, accuracy, f1_sc = classifier.evaluate(X_test_embeddings, y_test, plot_roc = True)
# print('_______________________________________________________________________')

## ... embeddings

### KNN

In [None]:
# WIP

### XGBoost

In [None]:
# WIP

### Random Forest

In [None]:
# WIP

### SVC

In [None]:
# WIP

### Logistic Regression

In [None]:
# WIP

---