Set up PyTorch to use the best available hardware option

In [1]:
import torch

# Choose the compute backend in priority order: Apple MPS first, then CUDA,
# and finally the CPU when neither accelerator reports as available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

cuda


Get the model and tokenizer from Hugging Face

In [2]:
from transformers import DebertaV2Tokenizer, DebertaV2Model

# Checkpoint identifier passed to both `from_pretrained` calls below.
model_name = "protectai/deberta-v3-base-prompt-injection"

# NOTE(review): `use_fast=True` is passed to the DebertaV2Tokenizer class directly;
# presumably this class ignores it — confirm against the transformers docs.
tokenizer = DebertaV2Tokenizer.from_pretrained(
    model_name,
    use_fast=True,
    clean_up_tokenization_spaces=False,
)

# Load the model weights, then move the model onto the selected device.
model = DebertaV2Model.from_pretrained(model_name)
model = model.to(device)

Load the dataset

In [3]:
import pandas as pd
import swifter

In [4]:
# Base directory (relative path, with trailing slash) that is prefixed to the
# pickle file names when the train/test DataFrames are loaded below.
ARTIFACTS_BASE = '../../../artifacts/step-1-classic-ml/deberta-v3-base-prompt-injection/'

In [5]:
# Function to convert string representation of vector to list of floats
import numpy as np

def convert_to_float_list(s):
    """Parse the string form of a numeric vector into a 1-D float array.

    Accepts both whitespace-separated (``"[0.1 0.2 0.3]"``) and
    comma-separated (``"[0.1, 0.2, 0.3]"``) representations, with or
    without the surrounding square brackets.

    Parameters
    ----------
    s : str
        Textual representation of the vector.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype float; empty when ``s`` contains no numbers.

    Raises
    ------
    ValueError
        If any token cannot be converted to a float.
    """
    # Trim outer whitespace first so the brackets are really at the string ends,
    # then treat commas as plain separators before splitting on whitespace.
    tokens = s.strip().strip('[]').replace(',', ' ').split()
    return np.array(tokens, dtype=float)

In [6]:
# Load the pickled test split. The 'embedding' column is used exactly as stored
# in the pickle; no string-to-vector conversion is applied in this cell.
# NOTE: unpickling can execute arbitrary code — only load artifacts you trust.
test_df = pd.read_pickle(ARTIFACTS_BASE + 'test_df.pkl')
test_df.head()

Unnamed: 0,prompt,label,embedding
0,"Task: Given a list of words, identify which wo...",0,"[-0.00074208557, 0.21072565, 0.111954555, -0.3..."
1,So I was watching episode 11 of Dark Angel sea...,0,"[-0.06304654, 0.30057973, 0.14764543, -0.24338..."
2,"Given the sentence ""Lots of people standing on...",0,"[-0.06304699, 0.30057883, 0.147644, -0.2433862..."
3,"In this task, you are given the name of an Ind...",0,"[-0.04275283, 0.13269465, 0.02573734, -0.21450..."
4,"Task: Replace all the sentences that use ""i"" w...",0,"[-0.06304706, 0.3005792, 0.14764135, -0.243386..."


In [7]:
# Load the pickled training split. The 'embedding' column is used exactly as stored
# in the pickle; no string-to-vector conversion is applied in this cell.
# NOTE: unpickling can execute arbitrary code — only load artifacts you trust.
train_df = pd.read_pickle(ARTIFACTS_BASE + 'train_df.pkl')
train_df.head()

Unnamed: 0,prompt,label,embedding
0,My question is: Alani earned $45 for 3 hours o...,0,"[-0.063046746, 0.30057985, 0.14764531, -0.2433..."
1,"Q: Yes / no, is the following a verifiable fac...",0,"[-0.06304664, 0.30057964, 0.14764358, -0.24338..."
2,Read a code snippet and detect if there are an...,0,"[-0.063046254, 0.300581, 0.14764485, -0.243384..."
3,"As a highly advanced chatbot, it is your duty ...",1,"[-0.06304675, 0.3005797, 0.14764543, -0.243385..."
4,Tell me the main idea of this paragraph.,0,"[-0.063047566, 0.3005763, 0.14764251, -0.24338..."


Prepare the loaded dataset for computing metrics

In [8]:
# Targets: the 'label' column of the test split.
y_test = test_df["label"]
# Features: the per-row embedding values gathered into one array and wrapped
# in a DataFrame (one row per row of test_df).
X_test = pd.DataFrame(np.asarray(test_df['embedding'].tolist()))

In [9]:
# Targets: the 'label' column of the training split.
y_train = train_df["label"]
# Features: the per-row embedding values gathered into one array and wrapped
# in a DataFrame (one row per row of train_df).
X_train = pd.DataFrame(np.asarray(train_df['embedding'].tolist()))

In [10]:
# Report how many rows ended up in the training and testing feature tables
print(
    f"#Training Samples: {len(X_train)}",
    f"#Testing Samples: {len(X_test)}",
    sep="\n",
)

#Training Samples: 261738
#Testing Samples: 65416


In [11]:
# Import classification models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

# Estimators to compare, as (display name, estimator object) pairs.
# Hyperparameters are library defaults except where written out explicitly
# (e.g. the Random Forest's max_depth=10). The Random Forest and the MLP are
# both given random_state=42 so that repeated runs train the same models.
estimators = [
    ("Naive Bayes", GaussianNB()),
    ("LGB Classifier", lgb.LGBMClassifier()),
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)),
    ("MLPClassifier", MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=42)),
]

In [12]:
# Import performance metrics libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm  # Import tqdm for progress bar

# Metric column name -> scoring function. The key order defines the column
# order of `results`; each function is called as fn(y_true, y_pred).
metric_fns = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1 score": f1_score,
}

# Prepare a DataFrame to keep track of the models' performance
# (one row per estimator name, one column per metric).
results = pd.DataFrame(columns=list(metric_fns))

# Fit and score each estimator in turn. Rows are written to `results` as soon
# as each model finishes, so partial results survive an interrupted run.
for est_name, est_obj in tqdm(estimators, desc="Training models"):
    # The scikit-learn-style estimators in `estimators` have no torch-like
    # `.to()` method, so there is no device-placement step here.

    # Fit the model on the training embeddings
    est_obj.fit(X_train, y_train)

    # Use the model to predict unseen prompts
    y_predict = est_obj.predict(X_test)

    # Compute and store every metric for this estimator
    results.loc[est_name] = [fn(y_test, y_predict) for fn in metric_fns.values()]

Training models:  25%|██▌       | 1/4 [00:01<00:03,  1.24s/it]

[LightGBM] [Info] Number of positive: 129232, number of negative: 132506
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.223312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195775
[LightGBM] [Info] Number of data points in the train set: 261738, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493746 -> initscore=-0.025019
[LightGBM] [Info] Start training from score -0.025019


Training models: 100%|██████████| 4/4 [10:31<00:00, 157.90s/it]


In [13]:
# Display the metrics table: one row per estimator name, with the columns
# accuracy, precision, recall and f1 score computed on the test split.
results

Unnamed: 0,accuracy,precision,recall,f1 score
Naive Bayes,0.563624,0.771594,0.16486,0.271674
LGB Classifier,0.779152,0.775919,0.777048,0.776483
Random Forest,0.657607,0.70526,0.526445,0.602872
MLPClassifier,0.569906,0.813319,0.167152,0.277311
