In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
from wordcloud import WordCloud

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from transformers import BertTokenizer, BertModel
import torch

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

In [3]:
# Load the pre-extracted train/test tables. The `bert_embeddings` column holds
# each embedding serialized as text (see the string echoed below this cell),
# so it has to be parsed back into numbers before modelling.
df_train = pd.read_csv("../bert_embedding_extracted_dataset/train.csv")
df_test = pd.read_csv("../bert_embedding_extracted_dataset/test.csv")
# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook never renders it. Wrap it in display(...) or move it to its own
# cell if the preview is actually wanted.
df_train.head()
# Peek at one raw cell to see the text format the embeddings were saved in.
df_test['bert_embeddings'][0]

'[[-2.62934387e-01  2.45193094e-01 -2.03275174e-01 -4.45416421e-02\n  -2.58341223e-01 -1.92095473e-01  1.02615431e-01  1.84837937e-01\n   7.11444989e-02 -3.48308891e-01 -7.13892132e-02  2.28780419e-01\n   9.06834286e-03  4.85278279e-01 -1.75721467e-01  5.55636957e-02\n  -2.31708318e-01  1.70241445e-01  1.41787067e-01 -1.48691818e-01\n  -3.93671840e-02 -1.41792074e-01  1.39422387e-01 -1.54494703e-01\n  -1.20371342e-01 -9.89510566e-02  1.28145397e-01 -2.93929964e-01\n  -4.93803956e-02  2.71961272e-01 -8.07365477e-02  2.61631399e-01\n  -3.73546965e-03 -6.66509196e-02 -4.88350764e-02 -5.99693023e-02\n   1.32757276e-01  1.17276892e-01 -1.05054397e-02  1.70126483e-02\n   1.32606655e-01  6.62934855e-02  2.03400329e-01  1.00775860e-01\n  -2.39297748e-02 -1.09988332e-01 -1.82015753e+00  1.40479431e-01\n  -1.55787051e-01 -3.59359950e-01  2.61586398e-01 -2.30629072e-02\n   2.15151444e-01  9.76368189e-02 -3.12021524e-02  2.36023515e-01\n  -1.57683536e-01  5.57746828e-01  1.41587362e-01  8.61630738

In [4]:
import ast
import re

def str_to_numpy(embedding_str):
    """Parse a NumPy-printed embedding string back into a flat float array.

    The CSV stores each embedding as the text NumPy prints for an array:
    whitespace-separated numbers wrapped in square brackets, with line breaks
    inserted by the printer (for example ``'[[-2.6e-01  2.4e-01 ...]]'``).

    The brackets are dropped and every remaining whitespace-separated token is
    converted with ``float``. Unlike patching commas into the text and handing
    it to ``ast.literal_eval``, this also copes with:

    * values printed with a trailing dot (``1.``, ``0.``),
    * ``nan`` / ``inf`` entries,
    * multi-row strings such as ``'[[1.5 2.5] [3.5 4.5]]'`` (flattened in
      reading order).

    Parameters
    ----------
    embedding_str : str
        Text form of the embedding, with or without surrounding brackets.

    Returns
    -------
    numpy.ndarray
        1-D ``float64`` array of the values in order of appearance. An input
        with no numbers (e.g. ``'[]'``) gives an empty array. Integer-only
        input is returned as floats as well.

    Raises
    ------
    ValueError
        If a token is not a valid float literal, e.g. the ``...`` placeholder
        of a truncated array printout. Failing loudly here keeps a truncated
        embedding from slipping into the feature matrix.
    """
    # Brackets only mark nesting, which is flattened anyway; turning them into
    # spaces (rather than deleting them) keeps "][" from gluing two numbers.
    tokens = embedding_str.replace('[', ' ').replace(']', ' ').split()

    # str.split() with no argument already treats newlines and runs of spaces
    # as a single separator, so no further clean-up is needed.
    return np.array([float(token) for token in tokens], dtype=np.float64)

# Parse the serialized embeddings of both tables into NumPy arrays, stored in
# a new column so the raw text column stays available.
for _frame in (df_train, df_test):
    _frame['bert_embeddings_new'] = _frame['bert_embeddings'].apply(str_to_numpy)


In [5]:
# Sanity check: every cell should now hold a parsed array of numbers rather
# than the raw text.
df_train['bert_embeddings_new']

0       [0.269839019, 0.0551486239, 0.241333127, -0.32...
1       [0.132323921, 0.499198824, -0.256525457, -0.16...
2       [0.252725661, 0.280099034, 0.280774415, -0.042...
3       [-0.135386035, 0.0742997676, 0.00502018258, 0....
4       [-0.535640419, 0.502437592, -0.577307105, -0.3...
                              ...                        
1364    [-0.0378334373, 0.120986767, 0.0127903856, 0.1...
1365    [-0.242620692, 0.115115844, 0.227484077, -0.17...
1366    [0.19805181, -0.0518193357, -0.282636762, -0.0...
1367    [0.0362141542, 0.0803106651, 0.0854992419, 0.1...
1368    [-0.12382482, -0.151536196, 0.115754187, -0.24...
Name: bert_embeddings_new, Length: 1369, dtype: object

In [6]:
# Stack the per-row 1-D embedding arrays into a single 2-D feature matrix
# (one row per sample) and take the target labels from the CLASS column.
X = np.array(df_train['bert_embeddings_new'].tolist())  # List of 1D arrays to 2D array
y = df_train['CLASS']
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical across runs.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=100)




In [7]:
# Confirm the feature matrix is (n_samples, embedding_dim).
X.shape

(1369, 768)

In [8]:
# Baseline classifiers trained directly on the BERT embedding vectors.

# Default logistic regression.
lr_model = LogisticRegression()

# L1-penalised variant. The liblinear solver uses a random number generator
# internally, so the seed is pinned to make this model (and the clones that
# cross_val_score makes of it) reproducible from run to run.
lr_l1_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=100)

# BernoulliNB binarizes every feature at its default threshold of 0.0, so on
# these real-valued embeddings it effectively sees only the sign of each
# dimension.
bnb_model = BernoulliNB()

# MultinomialNB is intentionally not fitted: it requires non-negative feature
# values, and the embeddings contain negative entries.

lr_model.fit(X_train, y_train)
lr_l1_model.fit(X_train, y_train)
bnb_model.fit(X_train, y_train)

In [9]:
# Predict the held-out validation split with each fitted baseline.
(
    lr_validation_predictions,
    lr_l1_validation_predictions,
    bnb_validation_predictions,
) = (
    fitted.predict(X_validation)
    for fitted in (lr_model, lr_l1_model, bnb_model)
)

# Fraction of validation rows each model labelled correctly.
(
    lr_validation_accuracy,
    lr_l1_validation_accuracy,
    bnb_validation_accuracy,
) = (
    accuracy_score(y_validation, predicted)
    for predicted in (
        lr_validation_predictions,
        lr_l1_validation_predictions,
        bnb_validation_predictions,
    )
)

# Report the scores side by side.
print(f'Logistic Regression Accuracy: {lr_validation_accuracy:.4f}')
print(f'Logistic Regression (L1 regularization) Accuracy: {lr_l1_validation_accuracy:.4f}')
print(f'BernoulliNB Accuracy: {bnb_validation_accuracy:.4f}')

Logistic Regression Accuracy: 0.9599
Logistic Regression (L1 regularization) Accuracy: 0.9343
BernoulliNB Accuracy: 0.8759


In [10]:
# 10-fold cross-validated accuracy on the training split for each baseline;
# cross_val_score returns one score per fold.
lr_cv_scores, lr_l1_cv_scores, bnb_cv_scores = (
    cross_val_score(estimator, X_train, y_train, cv=10, scoring='accuracy')
    for estimator in (lr_model, lr_l1_model, bnb_model)
)

# Summarise each model by its mean fold accuracy.
print(f'Logistic Regression CV Accuracy: {lr_cv_scores.mean():.4f}')
print(f'Logistic Regression (L1 regularization) CV Accuracy: {lr_l1_cv_scores.mean():.4f}')
print(f'BernoulliNB CV Accuracy: {bnb_cv_scores.mean():.4f}')

Logistic Regression CV Accuracy: 0.9351
Logistic Regression (L1 regularization) CV Accuracy: 0.9324
BernoulliNB CV Accuracy: 0.8666


In [11]:
# Refit plain logistic regression on ALL labelled rows (training + validation)
# to produce the submission predictions.
#
# The full-data arrays get their own names instead of reusing X_train /
# y_train: rebinding those would silently replace the 80% split, so re-running
# the earlier fit / validation / cross-validation cells afterwards would score
# the models on rows they had been trained on.
X_full = np.array(df_train['bert_embeddings_new'].tolist())  # list of 1-D arrays -> 2-D matrix
y_full = df_train['CLASS']

X_test = np.array(df_test['bert_embeddings_new'].tolist())  # list of 1-D arrays -> 2-D matrix

lr_prediction_model = LogisticRegression()
lr_prediction_model.fit(X_full, y_full)
lr_predictions_submission = lr_prediction_model.predict(X_test)

In [12]:
# Fill the sample-submission template with the predicted labels and save it.
# NOTE(review): this assumes the template rows are in the same order as
# test.csv -- confirm before submitting.
sol = pd.read_csv("../sample_submission.csv")
sol['CLASS'] = lr_predictions_submission
sol.to_csv("lr_prediction.csv", index=False)

# Preview what was written. This has to be the last expression of the cell,
# otherwise the notebook discards it without rendering anything.
sol.head()