# Experiment Notebook

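This notebook contains the steps to run the experiments for this project: it defines wrappers for a BERT-based transformer classifier and for the classical models, runs each of them on the dataset, and collects timing and classification metrics.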

In [1]:
import logging
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import time
import random 

from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset, random_split
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from modeling import rf, svm, lstm, gaussian
from feature_engineering import extract_features
from dataset_handling import book_train_test_split, load_dataset

2024-11-21 18:36:06.173061: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732214166.343111   27413 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732214166.389210   27413 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-21 18:36:06.814820: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:


LOGGER_NAME = "proj2_logger"

class Model:
    '''
    Common interface for every model family run by the experiment.

    Subclasses are expected to override `create_features`, `fit` and
    `predict`; the base implementations only raise NotImplementedError.
    '''

    def __init__(self, df: "pd.DataFrame | None" = None):
        '''
        Attach the shared project logger to the instance.

        df: accepted for backward compatibility but not used here; the data is
            handed to `create_features` instead. It is optional so that
            subclasses can be instantiated without arguments.
        '''
        self.logger = logging.getLogger(LOGGER_NAME)

    def create_features(self, df: pd.DataFrame):
        '''
        Build whatever inputs the model needs from `df`.
        '''
        raise NotImplementedError("Function was not implemented in subclass")

    def fit(self) -> None:
        '''
        Train the model on the features prepared by `create_features`.
        '''
        raise NotImplementedError("Function was not implemented in subclass")

    def predict(self) -> list:
        '''
        Run the model against the test partition of the dataset.

        Returns metrics: Time, Accuracy, F1-Score, Precision, Recall
        '''
        raise NotImplementedError("Function was not implemented in subclass")

class CustomDataset(Dataset):
    '''
    Torch dataset that tokenizes one text per item and pairs it with its label.

    texts:     sequence of strings (list, pandas Series, ...)
    labels:    sequence of integer class labels, same length as `texts`
    tokenizer: callable invoked as
               tokenizer(text, max_length=..., padding='max_length',
                         truncation=True, return_tensors="pt")
               and expected to return a mapping with 'input_ids' and
               'attention_mask' tensors that carry a leading batch dimension
    max_len:   value forwarded to the tokenizer as `max_length`
    '''

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise as plain lists so __getitem__ is strictly positional.
        # Indexing a pandas Series with an int is label-based, which raises
        # KeyError (or returns the wrong row) when the index is not 0..n-1,
        # while samplers and random_split hand out positions.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        '''
        Return a dict with 'input_ids', 'attention_mask' and 'label' tensors
        for the item at position `index`.
        '''
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }


class TransformerModel(Model):
    '''
    Fine-tunes a pretrained BERT sequence classifier to predict `author_id`
    from `text`, then reports metrics on a held-out split.

    Usage order is create_features(df) -> fit() -> predict().
    '''

    # Tunables kept in one place instead of being scattered through the methods.
    PRETRAINED_NAME = 'bert-base-uncased'
    MAX_LEN = 128
    BATCH_SIZE = 16
    EPOCHS = 3
    LEARNING_RATE = 5e-5
    # scikit-learn's default average='binary' raises on multiclass targets.
    # NOTE(review): 'macro' is an assumption; align with whatever averaging the
    # classical models use so the rows are comparable.
    METRIC_AVERAGE = 'macro'

    def create_features(self, df: pd.DataFrame):
        '''
        Tokenize `df.text`, encode `df.author_id` as class indices and build
        the train/test data loaders.

        NOTE(review): the split below is a random 80/20 split of the rows; it
        does not reuse any train/test assignment made upstream - confirm that
        this is intended.
        '''
        # The classification head expects labels in 0..num_labels-1, so map
        # the raw author ids onto contiguous codes (sorted, so ids that are
        # already 0..n-1 keep their value).
        label_codes, unique_authors = pd.factorize(df.author_id, sort=True)
        self.num_labels = len(unique_authors)

        tokenizer = BertTokenizer.from_pretrained(self.PRETRAINED_NAME)
        # Plain lists make item access positional regardless of df's index.
        dataset = CustomDataset(df.text.tolist(), label_codes.tolist(), tokenizer, self.MAX_LEN)
        train_dataset, test_dataset = random_split(dataset, [0.8, 0.2])
        self.train_loader = DataLoader(train_dataset, batch_size=self.BATCH_SIZE, shuffle=True)
        self.test_loader = DataLoader(test_dataset, batch_size=self.BATCH_SIZE)

    def fit(self):
        '''
        Fine-tune the pretrained model on the training loader.

        Stores the trained model and the device on the instance so that
        `predict` can use them, and records the start time used for the
        reported duration.
        '''
        self.start_time = time.time()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = BertForSequenceClassification.from_pretrained(
            self.PRETRAINED_NAME, num_labels=self.num_labels
        ).to(self.device)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.LEARNING_RATE)

        # Training
        for epoch in range(self.EPOCHS):
            self.model.train()
            total_loss = 0.0
            for batch in self.train_loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                # Passing labels makes the model compute the loss itself.
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

            # max(..., 1) avoids a ZeroDivisionError on an empty training loader
            avg_loss = total_loss / max(len(self.train_loader), 1)
            print(f"Epoch {epoch + 1}/{self.EPOCHS} - Training Loss: {avg_loss:.4f}")

        return None

    def predict(self) -> list:
        '''
        Evaluate the fine-tuned model on the test loader.

        Returns a single-row list:
        ['transformer', 'embeddings', 'test', duration, accuracy, f1, precision, recall]
        where duration is measured from the start of `fit`.

        Raises RuntimeError if `fit` has not been called.
        '''
        if not hasattr(self, 'model'):
            raise RuntimeError("fit() must be called before predict()")

        # Evaluation
        self.model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in self.test_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().tolist())
                # Labels are only needed on the host for scoring.
                all_labels.extend(batch['label'].tolist())

        duration = time.time() - self.start_time

        # Metric
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, average=self.METRIC_AVERAGE, zero_division=0)
        recall = recall_score(all_labels, all_preds, average=self.METRIC_AVERAGE, zero_division=0)
        f1 = f1_score(all_labels, all_preds, average=self.METRIC_AVERAGE, zero_division=0)

        return [['transformer', 'embeddings', 'test', duration, accuracy, f1, precision, recall]]

class ClassicalModels(Model):
    '''
    Runs every classical model function (rf, gaussian, svm, lstm) on both
    feature sets returned by `extract_features` and collects their metrics.
    '''

    def create_features(self, df: pd.DataFrame):
        '''
        Compute both feature sets from `df` and keep `df['author_id']` as labels.
        '''
        self.tfidf, self.embeddings = extract_features(df)
        self.labels = df['author_id']

    def fit(self):
        '''
        To avoid reusing code, this function does nothing, as
        the per-model functions already train and then test
        '''
        return None

    def predict(self):
        '''
        Run every model function on every feature set.

        Returns a list of rows shaped
        [function name, feature type, 'test', *metrics].
        A model that raises is logged with its traceback and skipped, so one
        failure does not abort the remaining runs.
        '''
        # run all models and return metrics
        functions = [rf, gaussian, svm, lstm]
        # 'glove' rows use the second value returned by extract_features,
        # 'tfidf' rows the first; the order here is the order of execution.
        features_by_type = {'glove': self.embeddings, 'tfidf': self.tfidf}
        metrics_arr = []
        for feature_type, features in features_by_type.items():
            self.logger.debug(f"Processing {feature_type} features")
            for function in functions:
                try:
                    start_time = time.time()
                    self.logger.debug(f"Beginning testing of {function.__name__} with {feature_type} features")
                    # The third returned value is not used here.
                    metrics, classification_report, _ = function(features, self.labels)
                    self.logger.debug(classification_report)
                    self.logger.debug(f"Finished testing of {function.__name__} with {feature_type} features (took {time.time() - start_time} seconds)")
                    metrics_arr.append([function.__name__, feature_type, 'test', *metrics])
                except Exception:
                    # Best effort: keep going, but record which run failed and why.
                    self.logger.exception(f"{function.__name__} failed with {feature_type} features; skipping")
            self.logger.debug(f"Finished processing {feature_type} features")
        return metrics_arr

def experiment(datafile_path='data/dataset.parquet'):
    '''
    Run every model family on the dataset and collect their metrics.

    datafile_path: path handed to `load_dataset`.

    Returns a DataFrame with one row per (model, feature type) run and the
    columns model_name, data_type, phase, time, accuracy, f1, precision, recall.
    '''
    # load dataset
    # run train-test-split on df (will produce label column)
    df = book_train_test_split(load_dataset(datafile_path))

    # Model.__init__ takes the frame as its argument, so pass it explicitly.
    models = [TransformerModel(df), ClassicalModels(df)]
    metrics = []
    for model in models:
        model.create_features(df)
        model.fit()
        metrics.extend(model.predict())

    # Each row carries three identifying fields followed by five metrics.
    # NOTE(review): assumes the classical model functions report their metrics
    # in the same order (time, accuracy, f1, precision, recall) - confirm
    # against the modeling module.
    columns = ['model_name', 'data_type', 'phase', 'time', 'accuracy', 'f1', 'precision', 'recall']
    metrics_df = pd.DataFrame(metrics, columns=columns)
    return metrics_df
        

2024-11-18 21:18:46.779217: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731964726.796679   79142 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731964726.802108   79142 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-18 21:18:46.823079: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the dataset using load_dataset's default arguments and pass it through
# book_train_test_split.
# NOTE(review): `df` is reassigned by a later cell that loads a different
# file, so on a top-to-bottom run this result is discarded.
df = book_train_test_split(load_dataset())


In [8]:
# Load the primary-authors parquet file and pass it through
# book_train_test_split; this replaces any `df` bound by an earlier cell.
df = book_train_test_split(load_dataset("data/primary_authors_dataset.parquet"))


In [9]:
# extract_features returns a pair, unpacked here as (tfidf, vecs).
# NOTE(review): `vecs` is presumably the document-embedding features - confirm
# against feature_engineering.extract_features.
tfidf, vecs = extract_features(df)

GloVe embeddings already exist.
Loaded 2196008 word vectors.
Average Number of Words not in Embedding Vocab: 3.4205376400652323
Embeddings saved to document_embeddings.npy
Computing TF-IDF scores...
Average Number of Words not in Embedding Vocab: 955.8599084644115
Embeddings saved to document_embeddings_tfidf.npy
Extracting TF-IDF features...


In [11]:
# Persist the TF-IDF features to CSV, leaving the index out of the file.
tfidf.to_csv("data/primary_authors_tfidf.csv", index=False)

In [None]:
# NOTE(review): this repeats the extract_features call made in an earlier cell
# and rebinds `tfidf` and `vecs`. The cell has no execution count and its
# captured output stops partway, so this run appears not to have finished -
# consider removing the cell.
tfidf, vecs = extract_features(df)

GloVe embeddings already exist.
Loaded 2196008 word vectors.
Average Number of Words not in Embedding Vocab: 5.076604691522193
Embeddings saved to document_embeddings.npy
Computing TF-IDF scores...


In [1]:
import numpy as np

In [10]:
# Sanity check of np.mean over a list of equal-length vectors: nine constant
# vectors (all 1s, all 2s, ..., all 9s) averaged along axis 0.
arr = [np.full(9, float(value)) for value in range(1, 10)]
print(arr)
np.mean(arr, axis=0)

[array([1., 1., 1., 1., 1., 1., 1., 1., 1.]), array([2., 2., 2., 2., 2., 2., 2., 2., 2.]), array([3., 3., 3., 3., 3., 3., 3., 3., 3.]), array([4., 4., 4., 4., 4., 4., 4., 4., 4.]), array([5., 5., 5., 5., 5., 5., 5., 5., 5.]), array([6., 6., 6., 6., 6., 6., 6., 6., 6.]), array([7., 7., 7., 7., 7., 7., 7., 7., 7.]), array([8., 8., 8., 8., 8., 8., 8., 8., 8.]), array([9., 9., 9., 9., 9., 9., 9., 9., 9.])]


array([5., 5., 5., 5., 5., 5., 5., 5., 5.])

In [13]:
# Scalar mean of the integers 1..9, for comparison with the array result.
sum(range(1, 10)) / 9

5.0