# Experiment Notebook

This notebook contains the steps to run the experiments for this project: loading the dataset, extracting features, and training and evaluating the models.

In [6]:
import logging
import random
import time
from typing import Optional

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from dataset_handling import book_train_test_split, load_dataset
from feature_engineering import extract_features
from modeling import rf, svm, lstm, gaussian

# Name of the shared logger that Model instances look up via logging.getLogger.
LOGGER_NAME = "proj2_logger"

class Model:
    """Base interface shared by the experiment's models.

    Subclasses are expected to override ``create_features``, ``fit`` and
    ``predict``; the versions defined here only raise ``NotImplementedError``.
    """

    def __init__(self, df: Optional[pd.DataFrame] = None):
        """Attach the shared project logger to the instance.

        Args:
            df: Not used by the base class. It is optional so a model can be
                constructed with or without the dataframe; the frame that is
                actually processed is the one given to ``create_features``.
        """
        self.logger = logging.getLogger(LOGGER_NAME)

    def create_features(self, df: pd.DataFrame):
        """Prepare whatever the model needs from ``df``.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError("Function was not implemented in subclass")

    def fit(self) -> None:
        """Train the model.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError("Function was not implemented in subclass")

    def predict(self) -> list:
        """Evaluate the model and return its metrics as a list.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError("Function was not implemented in subclass")

class TransformerModel(Model):
    """Transformer-based model working on the pre-split train/test frames.

    Only the train/test partitioning is implemented; fitting and prediction
    are placeholders.
    """

    def create_features(self, df: pd.DataFrame):
        """Partition ``df`` into ``self.train_df`` and ``self.test_df``.

        Args:
            df: Frame with an ``is_train`` column that is used as a row mask
                (and negated with ``~``), so it is expected to be boolean.
        """
        # split df into pre-created train-test groups
        self.train_df = df[df['is_train']]
        self.test_df = df[~df['is_train']]

    def fit(self):
        """Placeholder: no transformer is fitted yet."""
        # fit transformer
        return None

    def predict(self):
        """Placeholder: return an empty list of metric rows.

        A list (rather than ``None``) is returned so that callers can
        concatenate the result with the metrics of other models.
        """
        # TODO: evaluate the fitted transformer and return its metric rows.
        return []


class ClassicalModels(Model):
    """Runs the ``rf``, ``gaussian``, ``svm`` and ``lstm`` routines on both feature sets."""

    def create_features(self, df: pd.DataFrame):
        """Extract the two feature sets from ``df`` and keep the author labels.

        Args:
            df: Frame passed to ``extract_features``; must contain an
                ``author_id`` column, which is stored as ``self.labels``.
        """
        self.tfidf, self.embeddings = extract_features(df)
        self.labels = df['author_id']

    def fit(self):
        '''
        To avoid reusing code, this function does nothing, as
        the per-model functions already train and then test
        '''
        return None

    def predict(self):
        """Run every model function on every feature set and collect metrics.

        Returns:
            A list with one row per successful (function, feature set) pair:
            ``[function name, feature type, *metrics]``. A pair whose
            function raises is logged and skipped, so one failure does not
            abort the remaining runs.
        """
        # run all models and return metrics
        functions = [rf, gaussian, svm, lstm]
        # Insertion order fixes the processing order: glove first, then tfidf.
        feature_sets = {'glove': self.embeddings, 'tfidf': self.tfidf}
        metrics_arr = []
        for feature_type, features in feature_sets.items():
            self.logger.debug(f"Processing {feature_type} features")
            for function in functions:
                try:
                    start_time = time.time()
                    self.logger.debug(f"Beginning testing of {function.__name__} with {feature_type} features")
                    # The third returned value is not used here.
                    metrics, report, _unused = function(features, self.labels)
                    self.logger.debug(report)
                    self.logger.debug(f"Finished testing of {function.__name__} with {feature_type} features (took {time.time() - start_time} seconds)")
                    metrics_arr.append([function.__name__, feature_type, *metrics])
                except Exception:
                    # Best effort: record the failure with its traceback and
                    # move on to the next function.
                    self.logger.exception(f"{function.__name__} with {feature_type} features failed; skipping")
            self.logger.debug(f"Finished processing {feature_type} features")
        return metrics_arr

def experiment(datafile_path='data/dataset.parquet'):
    """Run every model on the dataset and collect the resulting metrics.

    Args:
        datafile_path: Path handed to ``load_dataset``.

    Returns:
        pd.DataFrame: one row per metrics entry returned by the models'
        ``predict`` methods, with columns ``model_name``, ``data_type``,
        ``time`` and ``accuracy``.
    """
    # load dataset
    # run train-test-split on df (will produce label column)
    df = book_train_test_split(load_dataset(datafile_path))
    # Model.__init__ accepts the dataframe, so pass it when constructing each model.
    models = [TransformerModel(df), ClassicalModels(df)]
    metrics = []
    for model in models:
        model.create_features(df)
        model.fit()
        metrics.extend(model.predict())

    # Return the table so the caller (or the notebook cell) can display it.
    return pd.DataFrame(metrics, columns=['model_name', 'data_type', 'time', 'accuracy'])

2024-11-18 21:18:46.779217: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731964726.796679   79142 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731964726.802108   79142 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-18 21:18:46.823079: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the dataset using load_dataset()'s default arguments and apply the train/test split.
# NOTE(review): `df` is reassigned in the next cell from a different parquet file,
# so this result is discarded when the notebook is run top to bottom.
df = book_train_test_split(load_dataset())


In [8]:
# Rebuild `df` from the primary-authors parquet file and apply the train/test split.
df = book_train_test_split(load_dataset("data/primary_authors_dataset.parquet"))


In [9]:
# Compute the features for `df`; extract_features returns two values, unpacked as (tfidf, vecs).
tfidf, vecs = extract_features(df)

GloVe embeddings already exist.
Loaded 2196008 word vectors.
Average Number of Words not in Embedding Vocab: 3.4205376400652323
Embeddings saved to document_embeddings.npy
Computing TF-IDF scores...
Average Number of Words not in Embedding Vocab: 955.8599084644115
Embeddings saved to document_embeddings_tfidf.npy
Extracting TF-IDF features...


In [11]:
# Persist the TF-IDF features to CSV (index=False: presumably a pandas object, so the row index is not written).
tfidf.to_csv("data/primary_authors_tfidf.csv", index=False)

In [None]:
# NOTE(review): repeats the extract_features call from an earlier cell and overwrites
# tfidf and vecs. No execution count is recorded and the captured output stops partway,
# so this run may not have completed; confirm whether this cell is still needed.
tfidf, vecs = extract_features(df)

GloVe embeddings already exist.
Loaded 2196008 word vectors.
Average Number of Words not in Embedding Vocab: 5.076604691522193
Embeddings saved to document_embeddings.npy
Computing TF-IDF scores...
