# BERT word embeddings + various classification algorithms

In [85]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as functional
import matplotlib.pyplot as plt
import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertModel, AutoTokenizer, AutoModel
import gc

import time
import datetime
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.preprocessing import LabelBinarizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import pickle

import tensorflow as tf
import xgboost as xgb

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report, accuracy_score, confusion_matrix

In [135]:
from google.colab import drive
drive.mount('/content/drive')

# Read both CSV files and tag every row with its class:
# 0 for rows coming from Fake.csv, 1 for rows coming from True.csv.
fake = pd.read_csv('/content/drive/MyDrive/master-thesis/thesis-data/Fake.csv').assign(label=0)
true = pd.read_csv('/content/drive/MyDrive/master-thesis/thesis-data/True.csv').assign(label=1)

# Stack the two frames, prepend the title to the body text, then drop the
# title, date and subject columns.
df = pd.concat([fake, true], ignore_index=True)
df = df.assign(text=df['title'] + " " + df['text']).drop(columns=['title', 'date', 'subject'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [137]:
nltk.download('stopwords')

# Tokens to discard during cleaning: NLTK's English stop words plus every
# character in string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)


def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()
#Removing the square brackets
def remove_between_square_brackets(text):
    """Return `text` with every `[...]` span (brackets included) deleted."""
    # Raw string so that `\[` is a regex escape, not an invalid string escape.
    return re.sub(r'\[[^]]*\]', '', text)


# Removing URL's
def remove_urls(text):
    """Return `text` with every run starting at `http` up to the next whitespace deleted.

    This used to be a second definition of `remove_between_square_brackets`,
    which silently replaced the bracket-removal function above.
    """
    return re.sub(r'http\S+', '', text)


#Removing the stopwords from text
def remove_stopwords(text):
    """Return `text` without the tokens whose lower-cased form is in `stop`.

    Tokens are the whitespace-separated pieces of `text`; the kept tokens are
    re-joined with single spaces.
    """
    # str.split() with no arguments never yields tokens with surrounding
    # whitespace, so no extra strip() is needed.
    return " ".join(token for token in text.split() if token.lower() not in stop)


#Removing the noisy text
def denoise_text(text):
    """Run the full cleaning pipeline on one document and return the result.

    Steps, in order: strip HTML markup, delete `[...]` spans, delete URLs,
    delete stop words / stand-alone punctuation tokens.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

#Run the cleaning pipeline on every entry of the text column
df['text'] = df['text'].map(denoise_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


---

Reduce dataset for testing purposes

In [138]:
# Keep a copy of the full cleaned frame before it is replaced by the subset.
df_original = df.copy()
# Shuffle with a fixed seed and keep the first 1000 rows, so that every run
# works on the same subset (an unseeded sample() picks different rows each time).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)[:1000]

---

# BERT Embedding

In [139]:
# Load data
X, y = df['text'].tolist(), df['label'].tolist()

# Split data into training and test sets.
# Stratified 80/20 split with a fixed seed (this replaced an earlier plain
# positional 80/20 slice of X and y).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Pretrained tokenizer and encoder used by _get_bert_embedding below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

def _get_bert_embedding(text):
    """Return the last-layer hidden state at sequence position 0 for `text`.

    The text is encoded with the module-level `tokenizer` (special tokens
    added, truncated to at most 512 token ids) and passed through the
    module-level `bert` model as a batch of one with gradients disabled.
    The returned NumPy array keeps the leading batch axis of size 1.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)
    # Wrapping the id list in another list adds the batch axis.
    batch = torch.tensor([token_ids])

    with torch.no_grad():
        first_position_state = bert(batch).last_hidden_state[:, 0, :].numpy()

    return first_position_state

def _embed_texts(texts):
    """Embed every string in `texts` and stack the results, one row per text.

    Each `_get_bert_embedding` call returns an array whose leading axis has
    size 1, so concatenating along axis 0 yields an array with
    `len(texts)` rows. Replaces two copy-pasted append/array/squeeze loops.
    """
    return np.concatenate([_get_bert_embedding(text) for text in texts], axis=0)


X_train_embeddings = _embed_texts(X_train)
X_test_embeddings = _embed_texts(X_test)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [143]:
# Keep copies of the freshly computed embedding matrices under separate names,
# so they stay available if X_train_embeddings / X_test_embeddings are rebound later.
X_train_embeddings_bert = X_train_embeddings.copy()
X_test_embeddings_bert = X_test_embeddings.copy()

# Same for the train/test split produced above.
X_train_split = X_train.copy()
X_test_split = X_test.copy()
y_train_split = y_train.copy()
y_test_split = y_test.copy()

# # Save current state -----------------------------------------------------------
# with open("/content/drive/MyDrive/master-thesis/embeddings/X_train", "wb") as fp:
#   pickle.dump(X_train, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/X_test", "wb") as fp:
#   pickle.dump(X_test, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/y_train", "wb") as fp:
#   pickle.dump(y_train, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/y_test", "wb") as fp:
#   pickle.dump(y_test, fp)

# pd.DataFrame(X_train_embeddings).to_csv("/content/drive/MyDrive/master-thesis/embeddings/X_train_embeddings.csv", index=False, header=False)
# pd.DataFrame(X_test_embeddings).to_csv("/content/drive/MyDrive/master-thesis/embeddings/X_test_embeddings.csv", index=False, header=False)

# with open("/content/drive/MyDrive/master-thesis/embeddings/X", "wb") as fp:
#   pickle.dump(X, fp)
# with open("/content/drive/MyDrive/master-thesis/embeddings/y", "wb") as fp:
#   pickle.dump(y, fp)
# # ------------------------------------------------------------------------------

In [168]:
# Retrieve data ----------------------------------------------------------------
# NOTE: this rebinds X, y, the train/test split and both embedding matrices to
# whatever is stored on Drive, replacing any values computed earlier in the session.
EMBEDDINGS_DIR = "/content/drive/MyDrive/master-thesis/embeddings"


def _load_pickle(name):
    """Unpickle and return the object stored in the file `name` under EMBEDDINGS_DIR."""
    # pickle.load can execute arbitrary code contained in the file; only load
    # files that were written by this notebook.
    with open(EMBEDDINGS_DIR + "/" + name, "rb") as fp:
        return pickle.load(fp)


def _load_embeddings(name):
    """Read the headerless CSV `name` under EMBEDDINGS_DIR and return its values as a NumPy array."""
    return pd.read_csv(EMBEDDINGS_DIR + "/" + name, sep=',', header=None).values


X_train = _load_pickle("X_train")
X_test = _load_pickle("X_test")
y_train = _load_pickle("y_train")
y_test = _load_pickle("y_test")

X_train_embeddings = _load_embeddings("X_train_embeddings.csv")
X_test_embeddings = _load_embeddings("X_test_embeddings.csv")

X = _load_pickle("X")
y = _load_pickle("y")
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------

# Classification

## KNN

In [140]:
class KNNClassifier:
    """Small wrapper around scikit-learn's `KNeighborsClassifier`.

    Adds a randomized hyperparameter search and an `evaluate` helper that
    returns the confusion matrix, accuracy and F1 score in one call.
    """

    def __init__(self, n_neighbors=2):
        """Store the number of neighbours used by the next `fit` call."""
        self.n_neighbors = n_neighbors
        # Set by fit() or randomized_search(); None until one of them has run.
        self.model = None

    def fit(self, X_train_embeddings, y_train):
        """Fit a fresh KNN model with the current `n_neighbors` on the given data."""
        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        self.model.fit(X_train_embeddings, y_train)

    def predict(self, X_test_embeddings):
        """Return the current model's predictions for `X_test_embeddings`."""
        return self.model.predict(X_test_embeddings)

    def randomized_search(self, X_train_embeddings, y_train, param_distributions, cv=5, n_iter=20, random_state=None):
        """Run `RandomizedSearchCV` and adopt the best configuration found.

        Args:
            X_train_embeddings, y_train: data the search is cross-validated on.
            param_distributions: passed through to `RandomizedSearchCV`.
            cv: number of cross-validation folds.
            n_iter: number of parameter settings to sample.
            random_state: optional seed for the sampling, for reproducible searches.

        After the call `self.n_neighbors` holds the selected value (unchanged
        if `n_neighbors` was not part of the search) and `self.model` is the
        search's refit best estimator, so `predict` works without a further
        `fit`. Calling `fit` afterwards is still allowed.
        """
        random_search = RandomizedSearchCV(
            # Start from the current setting so parameters that are not
            # searched keep the value this wrapper reports.
            KNeighborsClassifier(n_neighbors=self.n_neighbors),
            param_distributions=param_distributions,
            cv=cv,
            n_iter=n_iter,
            verbose=3,
            random_state=random_state,
        )
        random_search.fit(X_train_embeddings, y_train)

        # .get(): do not fail with KeyError when n_neighbors was not searched.
        self.n_neighbors = random_search.best_params_.get('n_neighbors', self.n_neighbors)
        # Keep the estimator the search already refit on the full training data
        # instead of replacing it with an unfitted one.
        self.model = random_search.best_estimator_

    def evaluate(self, X_test_embeddings, y_test):
        """Return `(confusion_matrix, accuracy, f1)` of the predictions against `y_test`."""
        y_pred = self.predict(X_test_embeddings)
        return confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

In [176]:
# accuracy_score and f1_score are already imported in the notebook's import cell:
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import f1_score

# Instantiate classifier (the notebook's KNNClassifier wrapper, default n_neighbors=2)
classifier = KNNClassifier()

# Perform randomized search over hyperparameters
# Only 8 candidate values are listed, fewer than the search's default n_iter=20.
param_distributions = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]
}
classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# Train classifier on training data (uses the n_neighbors selected by the search)
classifier.fit(X_train_embeddings, y_train)

# Evaluate classifier on test data: confusion matrix, accuracy and F1 score
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_test_embeddings, y_test)
print('Confusion matrix\n', conf_matrix, '\nAccuracy:', accuracy, '\nF1 Score:', f1_sc)



Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END .....................n_neighbors=2;, score=0.950 total time=   0.0s
[CV 2/5] END .....................n_neighbors=2;, score=0.944 total time=   0.0s
[CV 3/5] END .....................n_neighbors=2;, score=0.938 total time=   0.0s
[CV 4/5] END .....................n_neighbors=2;, score=0.938 total time=   0.0s
[CV 5/5] END .....................n_neighbors=2;, score=0.950 total time=   0.0s
[CV 1/5] END .....................n_neighbors=3;, score=0.944 total time=   0.0s
[CV 2/5] END .....................n_neighbors=3;, score=0.975 total time=   0.0s
[CV 3/5] END .....................n_neighbors=3;, score=0.938 total time=   0.0s
[CV 4/5] END .....................n_neighbors=3;, score=0.938 total time=   0.0s
[CV 5/5] END .....................n_neighbors=3;, score=0.938 total time=   0.0s
[CV 1/5] END .....................n_neighbors=4;, score=0.956 total time=   0.0s
[CV 2/5] END .....................n_neighbors=4;,

## XGBoost

In [160]:
class XGBClassifier:
    """Small wrapper around `xgb.XGBClassifier`.

    Adds a randomized hyperparameter search and an `evaluate` helper that
    returns the confusion matrix, accuracy and F1 score in one call.
    """

    def __init__(self, max_depth=5, n_estimators=100, learning_rate=0.1):
        """Store the hyperparameters used by the next `fit` call."""
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        # Set by fit() or randomized_search(); None until one of them has run.
        self.model = None

    def _new_model(self):
        """Build an unfitted `xgb.XGBClassifier` from the stored hyperparameters."""
        return xgb.XGBClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
        )

    def fit(self, X_train, y_train):
        """Fit a fresh model with the stored hyperparameters on the data passed in."""
        self.model = self._new_model()
        # Use the argument, not the notebook-level X_train_embeddings global.
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the current model's predictions for `X_test`."""
        # Use the argument, not the notebook-level X_test_embeddings global.
        return self.model.predict(X_test)

    def randomized_search(self, X_train, y_train, param_distributions, cv=5, n_iter=20, random_state=None):
        """Run `RandomizedSearchCV` and adopt the best configuration found.

        Args:
            X_train, y_train: data the search is cross-validated on.
            param_distributions: passed through to `RandomizedSearchCV`.
            cv: number of cross-validation folds.
            n_iter: number of parameter settings to sample.
            random_state: optional seed for the sampling, for reproducible searches.

        After the call `max_depth`, `n_estimators` and `learning_rate` hold
        the selected values (each unchanged if it was not part of the search)
        and `self.model` is the search's refit best estimator, so `predict`
        works without a further `fit`. Calling `fit` afterwards is still allowed.
        """
        random_search = RandomizedSearchCV(
            # Start from the current settings so parameters that are not
            # searched keep the values this wrapper reports.
            self._new_model(),
            param_distributions=param_distributions,
            cv=cv,
            n_iter=n_iter,
            verbose=3,
            random_state=random_state,
        )
        random_search.fit(X_train, y_train)

        # .get(): do not fail with KeyError when a parameter was not searched.
        best = random_search.best_params_
        self.max_depth = best.get('max_depth', self.max_depth)
        self.n_estimators = best.get('n_estimators', self.n_estimators)
        self.learning_rate = best.get('learning_rate', self.learning_rate)

        # Keep the estimator the search already refit on the full training data
        # instead of replacing it with an unfitted one.
        self.model = random_search.best_estimator_

    def evaluate(self, X_test_embeddings, y_test):
        """Return `(confusion_matrix, accuracy, f1)` of the predictions against `y_test`."""
        y_pred = self.predict(X_test_embeddings)

        return confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

In [161]:
# Instantiate classifier (the notebook's own XGBClassifier wrapper; this rebinds
# the `classifier` name that held the KNN model)
classifier = XGBClassifier()

# Perform randomized search over hyperparameters
# 4 x 4 x 4 = 64 possible combinations; the search samples n_iter (default 20) of them.
param_distributions = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.3, 0.5]
}
classifier.randomized_search(X_train_embeddings, y_train, param_distributions)

# Train classifier on training data (uses the hyperparameters selected by the search)
classifier.fit(X_train_embeddings, y_train)

# Evaluate classifier on test data: confusion matrix, accuracy and F1 score
conf_matrix, accuracy, f1_sc = classifier.evaluate(X_test_embeddings, y_test)
print('Confusion matrix\n', conf_matrix, '\nAccuracy:', accuracy, '\nF1 Score:', f1_sc)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END learning_rate=0.1, max_depth=5, n_estimators=50;, score=0.969 total time=   5.3s
[CV 2/5] END learning_rate=0.1, max_depth=5, n_estimators=50;, score=0.956 total time=   3.9s
[CV 3/5] END learning_rate=0.1, max_depth=5, n_estimators=50;, score=0.944 total time=   4.0s
[CV 4/5] END learning_rate=0.1, max_depth=5, n_estimators=50;, score=0.938 total time=   6.5s
[CV 5/5] END learning_rate=0.1, max_depth=5, n_estimators=50;, score=0.956 total time=   4.1s
[CV 1/5] END learning_rate=0.01, max_depth=7, n_estimators=200;, score=0.950 total time=  32.2s
[CV 2/5] END learning_rate=0.01, max_depth=7, n_estimators=200;, score=0.944 total time=  50.7s
[CV 3/5] END learning_rate=0.01, max_depth=7, n_estimators=200;, score=0.912 total time=  36.5s
[CV 4/5] END learning_rate=0.01, max_depth=7, n_estimators=200;, score=0.919 total time=  25.4s
[CV 5/5] END learning_rate=0.01, max_depth=7, n_estimators=200;, score=0.944 total t

KeyboardInterrupt: ignored

## SVC

In [None]:
# WIP

## Logistic Regression

In [None]:
# WIP

## Random Forest

In [None]:
# WIP