Set up PyTorch to use the best available hardware

In [1]:
import torch

# Pick the compute backend in priority order: Apple MPS first, then CUDA,
# and plain CPU when neither accelerator is reported as available.
# The CUDA probe is only evaluated when the MPS probe returns False.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

mps


Get the model and tokenizer from Hugging Face

In [2]:
from transformers import DebertaV2Tokenizer, DebertaV2Model

# Checkpoint identifier shared by the tokenizer and the encoder below.
model_name = "microsoft/deberta-v3-base"

# NOTE(review): `use_fast` is normally an AutoTokenizer switch; with the concrete
# DebertaV2Tokenizer class it presumably has no effect — confirm before relying on it.
tokenizer = DebertaV2Tokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
    use_fast=True,
)

# Load the pretrained encoder, then move it onto the device selected earlier.
model = DebertaV2Model.from_pretrained(model_name)
model = model.to(device)

Load the dataset

In [4]:
import pandas as pd
import swifter

In [5]:
# Directory (relative to this notebook) that holds the CSV files read below.
# The trailing slash matters: file paths are built by plain string
# concatenation, e.g. ARTIFACTS_BASE + 'test_df.csv'.
ARTIFACTS_BASE = '../../artifacts/step-1/classic-ml-ms-deberta-v3-base/'

In [6]:
# Function to parse the string representation of a vector into a NumPy array of floats
import numpy as np

def convert_to_float_list(s):
    """Parse the text form of a numeric vector into a 1-D float array.

    Both the NumPy-style form (``"[0.1 0.2 0.3]"``, possibly wrapped over
    several lines) and the Python-list form (``"[0.1, 0.2, 0.3]"``) are
    accepted. Whitespace around the brackets is ignored.

    Parameters
    ----------
    s : str
        Serialized vector, optionally enclosed in square brackets.

    Returns
    -------
    numpy.ndarray
        1-D array with ``dtype=float``; empty when ``s`` holds no numbers.

    Raises
    ------
    ValueError
        If any token cannot be converted to a float.
    """
    # Drop surrounding whitespace first so the bracket strip below still
    # works for padded input such as " [1 2]\n".
    inner = s.strip().strip('[]')
    # Treat commas as separators too; str.split() with no argument then
    # splits on any run of spaces, tabs, or newlines.
    tokens = inner.replace(',', ' ').split()
    return np.array(tokens, dtype=float)

In [7]:
# Read the test split and, in the same chain, replace the serialized
# 'embedding' strings with parsed float arrays (swifter-accelerated apply).
test_df = pd.read_csv(ARTIFACTS_BASE + 'test_df.csv').assign(
    embedding=lambda frame: frame['embedding'].swifter.apply(convert_to_float_list)
)
test_df.head()

Pandas Apply:   0%|          | 0/65416 [00:00<?, ?it/s]

Unnamed: 0,prompt,label,embedding
0,"Task: Given a list of words, identify which wo...",0,"[0.0470528901, 0.128666952, 0.120663926, 0.011..."
1,So I was watching episode 11 of Dark Angel sea...,0,"[-0.077564396, 0.109083176, 0.141840488, 0.140..."
2,"Given the sentence ""Lots of people standing on...",0,"[0.0886795521, 0.168291405, -0.0653214753, 0.0..."
3,"In this task, you are given the name of an Ind...",0,"[-0.00887174066, 0.0103212046, -0.156578302, 0..."
4,"Task: Replace all the sentences that use ""i"" w...",0,"[0.0129042789, 0.0639195219, -0.259266078, 0.2..."


In [8]:
# Read the training split and, in the same chain, replace the serialized
# 'embedding' strings with parsed float arrays (swifter-accelerated apply).
train_df = pd.read_csv(ARTIFACTS_BASE + 'train_df.csv').assign(
    embedding=lambda frame: frame['embedding'].swifter.apply(convert_to_float_list)
)
train_df.head()

Pandas Apply:   0%|          | 0/261738 [00:00<?, ?it/s]

Unnamed: 0,prompt,label,embedding
0,My question is: Alani earned $45 for 3 hours o...,0,"[-0.0976323485, 0.15560329, 0.0484190285, -0.0..."
1,"Q: Yes / no, is the following a verifiable fac...",0,"[0.0271683801, 0.0977859125, -0.032584101, -0...."
2,Read a code snippet and detect if there are an...,0,"[-0.00371995103, 0.226242572, 0.0596871153, 0...."
3,"As a highly advanced chatbot, it is your duty ...",1,"[0.276234031, 0.0609874725, -0.398053914, 0.04..."
4,Tell me the main idea of this paragraph.,0,"[-0.0443628579, 0.187087953, -0.0733531415, -0..."


Prepare the loaded dataset for model training and evaluation

In [9]:
# Targets for the test split.
y_test = test_df["label"]
# Stack the per-row embedding arrays into one matrix and wrap it in a
# DataFrame (assumes every embedding has the same length — TODO confirm).
X_test = pd.DataFrame(np.asarray(test_df['embedding'].tolist()))

In [10]:
# Targets for the training split.
y_train = train_df["label"]
# Stack the per-row embedding arrays into one matrix and wrap it in a
# DataFrame (assumes every embedding has the same length — TODO confirm).
X_train = pd.DataFrame(np.asarray(train_df['embedding'].tolist()))

In [11]:
# Sanity check: report the row count of each feature matrix, one per line.
print(
    f"#Training Samples: {len(X_train)}",
    f"#Testing Samples: {len(X_test)}",
    sep="\n",
)

#Training Samples: 261738
#Testing Samples: 65416


In [22]:
# Import classification models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

# Single seed shared by the explicitly seeded estimators, so that re-running
# the notebook yields the same fitted models and the same metrics.
RANDOM_STATE = 42

# (display name, unfitted estimator) pairs; the evaluation loop fits and
# scores them in this order.
# Naive Bayes and LightGBM use their library defaults; Random Forest and the
# MLP are configured explicitly.
# NOTE(review): LightGBM is left unseeded (random_state=None); its docs state
# that built-in default seeds are used in that case — confirm if exact
# reproducibility of that row matters.
estimators = [
    ("Naive Bayes", GaussianNB()),
    ("LGB Classifier", lgb.LGBMClassifier()),
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=RANDOM_STATE)),
    ("MLPClassifier", MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=RANDOM_STATE)),
]

In [23]:
# Import performance metrics libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Metrics table: one row per estimator (indexed by its display name),
# one column per metric.
results = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1 score"])

for est_name, est_obj in estimators:

    # Train on the training split, then predict the held-out test split.
    # fit() hands back the fitted estimator, so the two calls can be chained.
    y_predict = est_obj.fit(X_train, y_train).predict(X_test)

    # Score the predictions; the scorer order matches the `results` columns.
    accuracy, precision, recall, f1 = (
        scorer(y_test, y_predict)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Add this estimator's row to the metrics table.
    results.loc[est_name] = [accuracy, precision, recall, f1]

[LightGBM] [Info] Number of positive: 129232, number of negative: 132506
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.405069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 261738, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493746 -> initscore=-0.025019
[LightGBM] [Info] Start training from score -0.025019


In [24]:
# Display the per-estimator metrics table filled in by the evaluation loop.
results

Unnamed: 0,accuracy,precision,recall,f1 score
Naive Bayes,0.74066,0.722462,0.770762,0.745831
LGB Classifier,0.939816,0.942208,0.935468,0.938826
Random Forest,0.901706,0.901316,0.899362,0.900338
MLPClassifier,0.985218,0.982325,0.987831,0.98507
