Set up PyTorch to use the best available hardware option

In [1]:
import torch

# Resolve the compute backend once, in priority order: MPS first, then CUDA,
# and plain CPU when neither accelerator reports itself as available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

cuda


Get model and tokenizer from Hugging Face

In [4]:
from transformers import AutoTokenizer, AutoModel

# Hub identifier of the checkpoint; both the tokenizer and the model weights
# are resolved from this single name.
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    clean_up_tokenization_spaces=False,
)

# Load the weights, then place the module on the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

Load the dataset

In [5]:
import pandas as pd
import swifter

In [9]:
# Relative path prefix of the artifacts directory; the pickled train/test
# DataFrames are read by appending a file name to this string, so the
# trailing slash is required.
ARTIFACTS_BASE = '../../../artifacts/step-1-classic-ml/distilbert-base-uncased-finetuned-sst-2-english/'

In [7]:
# Convert the string representation of a vector to a NumPy array of floats
import numpy as np

def convert_to_float_list(s):
    """Parse the string form of a numeric vector into a float NumPy array.

    Accepts both whitespace-separated values (NumPy's ``str(array)`` style,
    e.g. ``"[0.1 0.2 0.3]"``) and comma-separated values (``str(list)`` style,
    e.g. ``"[0.1, 0.2, 0.3]"``). Whitespace or newlines around the brackets
    are ignored.

    Parameters
    ----------
    s : str
        Text such as ``"[1.0 2.0]"`` or ``"[1.0, 2.0]"``.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``float``; empty for ``"[]"``.

    Raises
    ------
    ValueError
        If any remaining token cannot be converted to a float.
    """
    # Trim outer whitespace first; otherwise strip('[]') finds a space/newline
    # at the ends and leaves the brackets in place.
    inner = s.strip().strip('[]')
    # Treat commas as plain separators so "1.0, 2.0" tokenizes like "1.0 2.0".
    tokens = inner.replace(',', ' ').split()
    return np.array(tokens, dtype=float)

In [10]:
# Load the pre-computed test split (prompt, label, embedding).
# The frame is used exactly as stored: the former
# `test_df['embedding'] = test_df['embedding']` self-assignment changed
# nothing and has been dropped.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading —
# only point ARTIFACTS_BASE at trusted artifacts.
test_df = pd.read_pickle(ARTIFACTS_BASE + 'test_df.pkl')
test_df.head()

Unnamed: 0,prompt,label,embedding
0,"Task: Given a list of words, identify which wo...",0,"[-0.45421112, 0.6041618, -0.19468212, -0.22714..."
1,So I was watching episode 11 of Dark Angel sea...,0,"[-0.0038527825, 0.48712388, 0.1536091, -0.1433..."
2,"Given the sentence ""Lots of people standing on...",0,"[-0.4880085, 0.48155108, -0.31909367, -0.34127..."
3,"In this task, you are given the name of an Ind...",0,"[-0.28547662, 0.66922814, -0.32103178, -0.4978..."
4,"Task: Replace all the sentences that use ""i"" w...",0,"[-0.123003766, 0.833054, -0.6437457, -0.428326..."


In [11]:
# Load the pre-computed training split (prompt, label, embedding).
# The frame is used exactly as stored: the former
# `train_df['embedding'] = train_df['embedding']` self-assignment changed
# nothing and has been dropped.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading —
# only point ARTIFACTS_BASE at trusted artifacts.
train_df = pd.read_pickle(ARTIFACTS_BASE + 'train_df.pkl')
train_df.head()

Unnamed: 0,prompt,label,embedding
0,My question is: Alani earned $45 for 3 hours o...,0,"[-0.47111386, 0.7092997, -0.26865613, -0.34150..."
1,"Q: Yes / no, is the following a verifiable fac...",0,"[0.46102738, 0.25961348, 0.30364105, -0.277400..."
2,Read a code snippet and detect if there are an...,0,"[-0.4469538, 0.44082227, -0.46384552, -0.48993..."
3,"As a highly advanced chatbot, it is your duty ...",1,"[0.05731096, 0.44351056, 0.41152436, -0.321668..."
4,Tell me the main idea of this paragraph.,0,"[0.030675814, 0.5953852, 0.12635949, -0.506669..."


Prepare the loaded dataset for computing metrics

In [12]:
# Targets are the stored labels; features are the per-row embedding vectors
# gathered into one array and wrapped in a DataFrame
# (assumes every embedding has the same length — TODO confirm).
y_test = test_df["label"]
X_test = pd.DataFrame(np.asarray(test_df['embedding'].tolist()))

In [13]:
# Targets are the stored labels; features are the per-row embedding vectors
# gathered into one array and wrapped in a DataFrame
# (assumes every embedding has the same length — TODO confirm).
y_train = train_df["label"]
X_train = pd.DataFrame(np.asarray(train_df['embedding'].tolist()))

In [14]:
# Report how many rows ended up in each split (one line per split)
print(
    f"#Training Samples: {len(X_train)}",
    f"#Testing Samples: {len(X_test)}",
    sep="\n",
)

#Training Samples: 261738
#Testing Samples: 65416


In [15]:
# Import classification models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

# Single seed shared by every estimator that accepts one, so that a fresh
# "Restart Kernel -> Run All" reproduces the same fitted models and metrics.
RANDOM_STATE = 42

# Candidate classifiers as (display name, estimator) pairs.
# Naive Bayes and LightGBM keep their library defaults (apart from the seed);
# Random Forest and the MLP are configured explicitly below.
estimators = [
    ("Naive Bayes", GaussianNB()),
    ("LGB Classifier", lgb.LGBMClassifier(random_state=RANDOM_STATE)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=RANDOM_STATE)),
    ("MLPClassifier", MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=RANDOM_STATE)),
]

In [21]:
# Import performance metrics libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Results table: one row per estimator (indexed by its display name),
# one column per metric. Rows are added as each model finishes, so partial
# results remain available if a later fit is interrupted.
results = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1 score"])

# Fit and score every (name, estimator) pair.
# The previous `hasattr(est_obj, 'to')` / `est_obj.to(device)` step was
# removed: none of the estimators in the list expose a `.to` method, so the
# branch never ran and only suggested a GPU placement that does not happen.
for est_name, est_obj in estimators:

    # Fit the model on the training embeddings
    est_obj.fit(X_train, y_train)

    # Use the model to predict unseen prompts
    y_predict = est_obj.predict(X_test)

    # Calculate performance metrics on the held-out split
    accuracy = accuracy_score(y_test, y_predict)
    precision = precision_score(y_test, y_predict)
    recall = recall_score(y_test, y_predict)
    f1 = f1_score(y_test, y_predict)

    # Store performance metrics (order must match the columns of `results`)
    results.loc[est_name] = [accuracy, precision, recall, f1]

[LightGBM] [Info] Number of positive: 129232, number of negative: 132506
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.507995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 261738, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493746 -> initscore=-0.025019
[LightGBM] [Info] Start training from score -0.025019




In [22]:
# Display the per-model metrics table (rows: estimators, columns: metrics).
results

Unnamed: 0,accuracy,precision,recall,f1 score
Naive Bayes,0.696114,0.674756,0.742181,0.706864
LGB Classifier,0.931638,0.934041,0.926983,0.930499
Random Forest,0.889996,0.878119,0.902428,0.890107
MLPClassifier,0.977544,0.975445,0.97916,0.977299
