In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
from wordcloud import WordCloud

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from transformers import BertTokenizer, BertModel
import torch

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

In [5]:
# Read the train and test CSVs in one pass; the generator variable does not
# leak into the notebook namespace.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../t5_embedding_extracted_dataset/train.csv",
        "../t5_embedding_extracted_dataset/test.csv",
    )
)
# Only the final expression of a cell is rendered, so this preview is
# evaluated but produces no visible output here.
df_train.head()
# Rendered result: the shape of the raw (string) embedding column of the test frame.
df_test['t5_embeddings'].shape

(587,)

In [6]:
import ast
import re

def str_to_numpy(embedding_str):
    """Parse the text form of a numeric array into a flat NumPy array.

    The input is a bracketed list of numbers whose elements are separated
    by whitespace (possibly spread over several lines) and/or commas, for
    example ``"[0.1 -0.2 3.]"`` or a nested ``"[[1. 2.] [3. 4.]]"``.

    Parameters
    ----------
    embedding_str : str
        Bracketed numeric text to parse.

    Returns
    -------
    numpy.ndarray
        The parsed values. Input with more than one dimension is flattened
        to a 1-D array.

    Raises
    ------
    ValueError
        If the text contains ``...``, i.e. it is an abbreviated array repr
        whose elided values cannot be recovered.
    SyntaxError
        Propagated from ``ast.literal_eval`` when the cleaned text is still
        not a valid Python literal.
    """
    # Collapse line breaks so the whole array sits on a single line.
    text = embedding_str.replace('\n', ' ').strip()

    # Fail loudly on a truncated repr: once commas are inserted, '...' would
    # otherwise parse as the Ellipsis object and silently corrupt the array.
    if '...' in text:
        raise ValueError(
            "embedding string contains '...' (truncated array repr); "
            "the elided values cannot be recovered"
        )

    # Insert the missing commas between whitespace-separated tokens.
    #   look-behind [\d.\]] : a token may end in a digit, a bare '.' (whole
    #                         floats are printed as '1.'), or a closing ']'
    #   look-ahead  [-+.\d\[]: the next token may start with a sign, a digit,
    #                         a '.', or an opening '[' (next row)
    # Whitespace after '[' / ',' or before ']' is left alone.
    text = re.sub(r'(?<=[\d.\]])\s+(?=[-+.\d\[])', ', ', text)

    # literal_eval only accepts Python literals, so arbitrary code in the
    # string cannot be executed.
    values = np.array(ast.literal_eval(text))

    # Nested (multi-row) input is collapsed into a single vector.
    if values.ndim > 1:
        values = values.flatten()

    return values

# Add a parsed-array column next to the raw string column in both frames.
for frame in (df_train, df_test):
    frame['t5_embeddings_new'] = frame['t5_embeddings'].apply(str_to_numpy)


In [7]:
# Feature matrix: one row per sample, built from the per-row parsed arrays.
# Target: the CLASS column of the training frame.
X, y = np.array(df_train['t5_embeddings_new'].tolist()), df_train['CLASS']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=95,
)

In [8]:
# Each classifier is built and fitted on the training split in turn.
# MultinomialNB (imported above) is left disabled in this notebook.
# NOTE(review): presumably because it rejects negative feature values — confirm.

# Logistic regression with the newton-cg solver (default penalty).
lr_model = LogisticRegression(solver='newton-cg')
lr_model.fit(X_train, y_train)

# Logistic regression with an L1 penalty, using the liblinear solver.
lr_l1_model = LogisticRegression(penalty='l1', solver='liblinear')
lr_l1_model.fit(X_train, y_train)

# Bernoulli naive Bayes with default settings.
bnb_model = BernoulliNB()
bnb_model.fit(X_train, y_train)

In [9]:
# Score every fitted model on the held-out validation split: predict first,
# then compare the predictions with the true labels.

# Logistic regression
lr_validation_predictions = lr_model.predict(X_validation)
lr_validation_accuracy = accuracy_score(y_validation, lr_validation_predictions)

# Logistic regression (L1)
lr_l1_validation_predictions = lr_l1_model.predict(X_validation)
lr_l1_validation_accuracy = accuracy_score(y_validation, lr_l1_validation_predictions)

# Bernoulli naive Bayes
bnb_validation_predictions = bnb_model.predict(X_validation)
bnb_validation_accuracy = accuracy_score(y_validation, bnb_validation_predictions)

# Report the three accuracies, four decimal places each.
print(f'Logistic Regression Accuracy: {lr_validation_accuracy:.4f}')
print(f'Logistic Regression (L1 regularization) Accuracy: {lr_l1_validation_accuracy:.4f}')
print(f'BernoulliNB Accuracy: {bnb_validation_accuracy:.4f}')

Logistic Regression Accuracy: 0.9380
Logistic Regression (L1 regularization) Accuracy: 0.9307
BernoulliNB Accuracy: 0.8358


In [10]:
# 10-fold cross-validated accuracy for each model. Note that this runs on
# the full (X, y) data, not only on the X_train split used above.
lr_cv_scores = cross_val_score(
    lr_model, X, y, cv=10, scoring='accuracy'
)
lr_l1_cv_scores = cross_val_score(
    lr_l1_model, X, y, cv=10, scoring='accuracy'
)
bnb_cv_scores = cross_val_score(
    bnb_model, X, y, cv=10, scoring='accuracy'
)

# Report the mean accuracy across the folds for each model.
print(f'Logistic Regression CV Accuracy: {lr_cv_scores.mean():.4f}')
print(f'Logistic Regression (L1 regularization) CV Accuracy: {lr_l1_cv_scores.mean():.4f}')
print(f'BernoulliNB CV Accuracy: {bnb_cv_scores.mean():.4f}')

Logistic Regression CV Accuracy: 0.9145
Logistic Regression (L1 regularization) CV Accuracy: 0.9160
BernoulliNB CV Accuracy: 0.8423
