# Transformers model

## Setup

In [None]:
# !pip install gensim

In [None]:
# import nltk
# nltk.download("stopwords")

In [None]:
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Masking, Dense, Dropout, Flatten, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam

from transformers import AutoTokenizer
from transformers import TFAutoModel

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from polclassifier.params import *

import gensim.downloader as api
from nltk.corpus import stopwords

In [None]:
# Load the pre-processed features and their matching targets from CSV.
features_path = "~/code/szaboildi/uk-pol-speech-classifier/processed_data/features_1000sample_400min_600cutoff_for_embed.csv"
target_path = "~/code/szaboildi/uk-pol-speech-classifier/processed_data/target_1000sample_400min_600cutoff_for_embed.csv"

X = pd.read_csv(features_path)
y = pd.read_csv(target_path)

In [None]:
# Display (rows, columns) of the features and of the targets side by side.
(X.shape, y.shape)

## Preprocessing

### Embed the training and test sentences

Feed in the raw data. Note: for text classification you need `TFAutoModelForSequenceClassification`.

In [None]:
# Tokenise the "text" column: every entry is truncated / right-padded to
# exactly 512 tokens and returned as TensorFlow tensors.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, padding_side="right")

tokenized_tensors = tokenizer(
    X["text"].tolist(),
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="tf",
)

# TFAutoModel loads the bare encoder without a classification head, so the
# previous `num_labels=7` argument did not influence the embeddings and has
# been dropped. `from_pt=True` converts the PyTorch checkpoint weights.
model = TFAutoModel.from_pretrained(HF_MODEL, from_pt=True)

# Pass the complete tokenizer output (input ids AND attention mask).
# Because all sequences are padded to max_length, feeding only `input_ids`
# would let the encoder attend to the padding tokens and distort the
# embeddings of shorter texts.
embeddings = model.predict(dict(tokenized_tensors))

# Keep the hidden state at token position 0 of each sequence as the
# fixed-size text embedding (presumably the [CLS] token for BERT-style
# checkpoints -- confirm for the configured HF_MODEL).
X_embed = embeddings.last_hidden_state[:, 0, :]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the embedded data for the final evaluation
# (train_test_split's default proportions, shuffled, no fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_embed, y)

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping

# Derive the input width from the embeddings instead of hard-coding 128:
# the hidden size depends on the checkpoint selected through HF_MODEL, and a
# mismatch would make `fit` fail with a shape error.
input_shape = (X_train.shape[1],)

# Fully connected classifier on top of the frozen sentence embeddings.
dense_model = Sequential([
    Dense(256, activation='relu', input_shape=input_shape),
    BatchNormalization(),
    Dropout(0.5),  # Add dropout for regularization
    Dense(175, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),  # Add dropout for regularization
    Dense(120, activation='relu'),
    # 7-way softmax output.
    # NOTE(review): the model is compiled with 'categorical_crossentropy',
    # which expects one-hot targets with 7 columns -- confirm that `y` is
    # already encoded that way in the target CSV.
    Dense(7, activation='softmax')])

# Stop once the monitored validation metric has not improved for 10 epochs
# and roll back to the best weights seen.
es = EarlyStopping(patience=10, restore_best_weights=True)

def lr_schedule(epoch):
    """Return the learning rate for *epoch*: 0.001 decayed by exp(-epoch / 10).

    Epoch 0 yields 0.001; the rate then shrinks by a factor of e every
    10 epochs.
    """
    decay_factor = np.exp(-epoch / 10)
    return 0.001 * decay_factor

# Configure training: Adam optimiser, categorical cross-entropy loss,
# accuracy reported alongside the loss.
dense_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Early stopping plus the per-epoch learning-rate schedule.
training_callbacks = [es, LearningRateScheduler(lr_schedule)]

# Train for up to 100 epochs; 20% of the training data is set aside for
# validation.
history = dense_model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=training_callbacks,
)

# Last expression of the cell: loss and accuracy on the held-out test set.
dense_model.evaluate(X_test, y_test)

In [None]:
import matplotlib.pyplot as plt

def plot_learning_curve(history):
    """Plot training and validation accuracy per epoch.

    Reads the 'accuracy' and 'val_accuracy' series from ``history.history``
    (the object returned by ``fit``) and shows them on one figure.
    """
    recorded = history.history
    for metric_name in ('accuracy', 'val_accuracy'):
        plt.plot(recorded[metric_name], label=metric_name)

    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Model Learning Curve')
    plt.legend(loc='lower right')
    plt.show()


plot_learning_curve(history)