# BERT word embedding + XGBoost WRAPPER
## First working wrapper

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as functional
import matplotlib.pyplot as plt
import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig
import gc
from transformers import BertModel
from sklearn.metrics import roc_auc_score,f1_score
import time
import datetime
from bs4 import BeautifulSoup
import re,string,unicodedata
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

import tensorflow as tf
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Mount Google Drive so the CSV files below can be read from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Read the two corpora.
fake = pd.read_csv('/content/drive/MyDrive/thesis-data/Fake.csv')
true = pd.read_csv('/content/drive/MyDrive/thesis-data/True.csv')

# Label convention used throughout: 0 = fake article, 1 = true article.
fake["label"], true["label"] = 0, 1

# Stack both frames into one and renumber the rows from zero.
df = pd.concat([fake, true], ignore_index=True)

# Prepend the title to the article body, then drop the columns that are
# not used any further.
df['text'] = df['title'] + " " + df['text']
df = df.drop(columns=['title', 'date', 'subject'])

In [None]:
# Download the NLTK stopword corpus.
nltk.download('stopwords')

# Tokens to discard during cleaning: English stopwords plus every single
# punctuation character from string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)


def strip_html(text):
    """Parse *text* with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()
# Removing text enclosed in square brackets
def remove_between_square_brackets(text):
    """Return *text* with every ``[...]`` span (brackets included) removed."""
    # Raw string: "\[" inside a normal string literal is an invalid escape.
    return re.sub(r'\[[^]]*\]', '', text)


# Removing URLs
def remove_urls(text):
    """Return *text* with URLs removed.

    A URL is matched from ``http`` up to (not including) the next
    whitespace character, so both http and https links are covered.

    This function used to be defined under the name
    ``remove_between_square_brackets``, which silently replaced the
    bracket remover above; it now has its own name so both steps run.
    """
    return re.sub(r'http\S+', '', text)


# Removing the stopwords from text
def remove_stopwords(text):
    """Return *text* without the tokens contained in the module-level ``stop`` set.

    The text is split on whitespace; a token is dropped when its
    lower-cased form is in ``stop``. The surviving tokens keep their
    original casing and are re-joined with single spaces.
    """
    return " ".join(token for token in text.split() if token.lower() not in stop)


# Removing the noisy text
def denoise_text(text):
    """Run the full cleaning pipeline on one document and return the result.

    Steps, in order: strip HTML markup, remove ``[...]`` spans, remove
    URLs, remove stopwords.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

# Apply the denoise_text cleaning pipeline to every entry of the 'text' column
df['text']=df['text'].apply(denoise_text)

In [None]:
# Keep a copy of the full cleaned frame before subsampling.
df_original = df.copy()
# Shuffle every row (no fixed seed, so the order differs between runs),
# renumber the index, and keep the first 1000 rows.
df = df.sample(frac=1).reset_index(drop=True).iloc[:1000]

Wrapper

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel


class XGBClassifier:
    """Text classifier that feeds BERT sentence vectors into an XGBoost model.

    Each input text is encoded with ``bert-base-uncased``; the hidden state
    of the first token ([CLS] for BERT) from the last layer is used as the
    feature vector, and an ``xgb.XGBClassifier`` is trained on those vectors.
    BERT is only run under ``torch.no_grad()``; it is never fine-tuned here.

    Args:
        max_depth: ``max_depth`` passed to ``xgb.XGBClassifier``.
        n_estimators: ``n_estimators`` passed to ``xgb.XGBClassifier``.
        learning_rate: ``learning_rate`` passed to ``xgb.XGBClassifier``.
        max_length: Maximum number of tokens (special tokens included) kept
            per text; longer texts are truncated by the tokenizer.
    """

    def __init__(self, max_depth=5, n_estimators=100, learning_rate=0.1, max_length=64):
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.bert = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
        # Set by fit() / randomized_search(); None means "nothing built yet".
        self.model = None

    def _get_bert_embedding(self, text):
        """Return the first-token embedding of *text* as a ``(1, hidden)`` array."""
        token_ids = self.tokenizer.encode(
            text, add_special_tokens=True, truncation=True, max_length=self.max_length
        )
        # The model expects a batch dimension, hence the extra list level.
        input_ids = torch.tensor([token_ids])

        with torch.no_grad():
            outputs = self.bert(input_ids)
            # Position 0 of the last layer is the sentence-level vector.
            first_token_state = outputs.last_hidden_state[:, 0, :].numpy()

        return first_token_state

    def _embed_texts(self, texts):
        """Return an ``(n_texts, hidden)`` feature matrix for an iterable of texts.

        An empty iterable yields an array of shape ``(0, hidden)`` rather
        than raising.
        """
        rows = [self._get_bert_embedding(text) for text in texts]
        if not rows:
            return np.empty((0, self.bert.config.hidden_size), dtype=np.float32)
        # Every row is (1, hidden); stacking along axis 0 gives (n, hidden).
        return np.concatenate(rows, axis=0)

    def _build_model(self):
        """Return a new, unfitted XGBoost model using the current hyperparameters."""
        return xgb.XGBClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
        )

    def fit(self, X_train, y_train):
        """Embed *X_train* and train a fresh XGBoost model on it.

        Args:
            X_train: Iterable of raw text strings.
            y_train: Labels aligned with *X_train*.

        Returns:
            This instance, so calls can be chained.
        """
        features = self._embed_texts(X_train)
        self.model = self._build_model()
        self.model.fit(features, y_train)
        return self

    def predict(self, X_test):
        """Return the underlying model's predictions for the texts in *X_test*.

        Raises:
            RuntimeError: If no model has been built yet (neither ``fit``
                nor ``randomized_search`` was called).
        """
        if getattr(self, "model", None) is None:
            raise RuntimeError("XGBClassifier.predict() called before fit()")
        features = self._embed_texts(X_test)
        return self.model.predict(features)

    def randomized_search(self, X_train, y_train, param_distributions, cv=2, n_iter=1):
        """Tune hyperparameters with ``RandomizedSearchCV`` on BERT features.

        The best values found for ``max_depth``, ``n_estimators`` and
        ``learning_rate`` are stored on the instance. A hyperparameter that
        is absent from *param_distributions* keeps its current value instead
        of raising ``KeyError``.

        ``self.model`` is replaced by a new *unfitted* model configured with
        the selected values; call ``fit`` afterwards to train it.

        Args:
            X_train: Iterable of raw text strings.
            y_train: Labels aligned with *X_train*.
            param_distributions: Mapping passed to ``RandomizedSearchCV``.
            cv: Number of cross-validation folds.
            n_iter: Number of parameter settings sampled.

        Returns:
            A dict copy of the search's ``best_params_``.
        """
        features = self._embed_texts(X_train)

        # Start from the current hyperparameters so that any value that is
        # not being searched matches what the final model will use.
        search = RandomizedSearchCV(
            self._build_model(),
            param_distributions=param_distributions,
            cv=cv,
            n_iter=n_iter,
        )
        search.fit(features, y_train)

        best = search.best_params_
        self.max_depth = best.get('max_depth', self.max_depth)
        self.n_estimators = best.get('n_estimators', self.n_estimators)
        self.learning_rate = best.get('learning_rate', self.learning_rate)

        self.model = self._build_model()
        return dict(best)

    def evaluate(self, X_test, y_test):
        """Return the accuracy of ``predict(X_test)`` against *y_test*."""
        y_pred = self.predict(X_test)
        return accuracy_score(y_test, y_pred)

In [None]:
# Pull texts and labels out of the sampled frame as plain lists
# (alternative source: df = pd.read_csv('data.csv')).
X, y = df['text'].tolist(), df['label'].tolist()

# First 80% of the rows are used for training, the remainder for testing.
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Build the BERT + XGBoost wrapper with its default hyperparameters.
classifier = XGBClassifier()

# Candidate values sampled by the randomized hyperparameter search.
param_distributions = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
}
classifier.randomized_search(X_train, y_train, param_distributions)

# Train on the training split using the hyperparameters chosen above.
classifier.fit(X_train, y_train)

# Report accuracy on the held-out split.
accuracy = classifier.evaluate(X_test, y_test)
print('Accuracy:', accuracy)