In [1]:
!pip install transformers datasets evaluate accelerate peft

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [2]:
import os
import time
import math

import numpy as np
import pandas as pd

import tqdm

import warnings
warnings.filterwarnings("ignore")

import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)

import datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Define a function that can print the trainable parameters
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [4]:
import pandas as pd
import numpy as np
df=pd.read_csv("/content/ocd_dataset.csv")
df=df.sample(frac=0.2,random_state=42)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
def process(text):
    text=re.sub(text_cleaning_re,' ',str(text).lower()).strip()
    tokens=[]
    for i in text.split():
        if i not in stop_words:
            tokens.append(stemmer.stem(i))
    return ' '.join(tokens)
df['Text']=df['Text'].map(lambda x:process(x))
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Text,OCD
1801,friendship took smart imagin brain pour poison...,1
1190,gasp hasnt done whore,0
1817,want burden peopl constant need reassur keep l...,1
251,happi turn upsid feel doubt anxieti depress id...,1
2505,obsess hardest shake time liter like hell suff...,1
...,...,...
104,agre im catch fuck cold freezind tx burn ga,0
2087,band conduct remind us play song certain way t...,1
599,nice cancel ask man suppos get drunk photocopi...,0
1756,start count thing like went categor start ever...,1


In [5]:
df.rename(columns={'Text': 'clean_text','OCD': 'label'}, inplace=True)
# df['label'] = df['label'].replace({'suicide': 1, 'non-suicide': 0})
# df=df.drop(columns=["Unnamed: 0"],axis=0)
df.head()

Unnamed: 0,clean_text,label
1801,friendship took smart imagin brain pour poison...,1
1190,gasp hasnt done whore,0
1817,want burden peopl constant need reassur keep l...,1
251,happi turn upsid feel doubt anxieti depress id...,1
2505,obsess hardest shake time liter like hell suff...,1


In [None]:
from sklearn.model_selection import train_test_split
X_train = list()
X_test = list()
for label in [0,1]:
    train, test  = train_test_split(df[df.label==label],
                                    train_size=3000,
                                    test_size=300,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_eval = pd.concat(X_test).sample(frac=1, random_state=10)
X_train

Unnamed: 0,clean_text,label
87676,outfit daili also let critiqu outfit choic bor...,0
155246,wish one knew care could kill myselfi would ha...,1
173147,one time like year ago compet one first danc c...,0
210789,think aliv 2019sinc 2014 life go downhil everi...,1
107605,hey beauti peopl go see ya go day weep go peac...,1
...,...,...
72824,love girl silli chick fil sauc els could think...,0
206631,anymor parent support 0 motiv almost 1am want ...,1
24800,guy realiz someth u megathiccc call son one po...,0
78521,wish tonight night think stab death hard seria...,1


In [None]:
eval_idx = [idx for idx in df.index if idx not in list(X_train.index) + list(X_eval.index)]
x_eval = df[df.index.isin(eval_idx)]
X_test = (x_eval
          .groupby('label', group_keys=False)
          .apply(lambda x: x.sample(n=150,random_state=10, replace=True)))
X_train.reset_index(drop=True, inplace=True)
X_eval.reset_index(drop=True, inplace=True)

In [None]:
from datasets import Dataset
train_data = Dataset.from_pandas(X_train)

val_data = Dataset.from_pandas(X_eval)

In [None]:
MODEL_PATH='microsoft/deberta-v2-xlarge'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, return_dict=True, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Number of trainable parameters
print(print_number_of_trainable_model_parameters(model))

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.78G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

trainable model parameters: 886957058
all model parameters: 886957058
percentage of trainable model parameters: 100.00%


In [None]:
model

In [None]:
lora_config = LoraConfig(
    r=8, # Rank Number
    lora_alpha=32, # Alpha (Scaling Factor)
    lora_dropout=0.05, # Dropout Prob for Lora
    target_modules=["query_proj", "key_proj","value_proj"], # Which layer to apply LoRA, usually only apply on MultiHead Attention Layer
    bias='none',
    task_type=TaskType.SEQ_CLS # Seqence to Classification Task
)

In [None]:
peft_model = get_peft_model(model,
                            lora_config)

# Reduced trainble parameters
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1772546
all model parameters: 888729604
percentage of trainable model parameters: 0.20%


In [None]:
def tokenize_func(data):
    return tokenizer(
            data['clean_text'],
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

In [None]:
train_dataset = datasets.Dataset.from_pandas(X_train)
train_dataset = train_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["clean_text"]
)
train_dataset

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6000
})

In [None]:
val_dataset = datasets.Dataset.from_pandas(X_eval)
val_dataset = val_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["clean_text"]
)

val_dataset

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 600
})

In [None]:
def metrics(eval_prediction):
    logits, labels = eval_prediction
    pred = np.argmax(logits, axis=1)
    auc_score = roc_auc_score(labels, pred)
    return {"Val-AUC": auc_score}

train_batch_size = 4
eval_batch_size = 4

# Define training Args
peft_training_args = TrainingArguments(
    output_dir='./result-deberta-lora',
    logging_dir='./logs-deberta-lora',
#     auto_find_batch_size=True,
    learning_rate=1e-4,
    per_device_train_batch_size=train_batch_size, # You can adjust this value base on your available GPU, You may encounter "out of memory" error if this value is too lartge
    per_device_eval_batch_size=eval_batch_size, # You can adjust this value base on your available GPU, You may encounter "out of memory" error if this value is too lartge
    num_train_epochs=2,
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    weight_decay=0.01,
    seed=42,
    fp16=True, # Only use with GPU
    report_to='none'
)

# Define Optimzer
optimizer = AdamW(peft_model.parameters(),
                  lr=1e-4,
                  no_deprecation_warning=True)

# Define Scheduler
n_epochs = peft_training_args.num_train_epochs
total_steps = n_epochs * math.ceil(len(train_dataset) / train_batch_size / 2)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)

collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest"
)


# Define Trainer
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_dataset, # Training Data
    eval_dataset=val_dataset, # Evaluation Data
    tokenizer=tokenizer,
    compute_metrics=metrics,
    optimizers=(optimizer,lr_scheduler),
    data_collator=collator
)

print(f"Total Steps: {total_steps}")

peft_model_path="./peft-roberta-lora-local"

Total Steps: 1500


In [None]:
peft_trainer.train()

Step,Training Loss,Validation Loss,Val-auc
100,0.6377,0.505729,0.788333
200,0.5236,0.411409,0.833333
300,0.5112,0.326551,0.871667
400,0.4953,0.244785,0.905
500,0.3708,0.254011,0.911667
600,0.3398,0.220059,0.926667
700,0.3303,0.255974,0.926667
800,0.3545,0.279767,0.915
900,0.3086,0.242428,0.923333
1000,0.3419,0.248397,0.93


KeyboardInterrupt: 

In [None]:
device='cuda' if torch.cuda.is_available() else 'cpu'
def classify(text):
  inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
  output = model(**inputs)

  prediction = output.logits.argmax(dim=-1).item()
  return prediction
  #print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
  # return id2label[prediction]

In [None]:
texts=X_test['clean_text'].tolist()
y_true=X_test.label.tolist()
y_pred=[]
for text in texts:
  pred=classify(text)
  y_pred.append(pred)

def calculate_accuracy(y_true, y_pred):
    """
    Calculate accuracy given true labels and predicted labels.

    Args:
    - y_true (list): List of true labels
    - y_pred (list): List of predicted labels

    Returns:
    - accuracy (float): Accuracy score
    """
    # Check if the lengths of y_true and y_pred are the same
    if len(y_true) != len(y_pred):
        raise ValueError("Lengths of y_true and y_pred must be the same.")

    # Calculate accuracy
    correct_predictions = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    total_predictions = len(y_true)
    accuracy = correct_predictions / total_predictions

    return accuracy

accuracy = calculate_accuracy(y_true[:50], y_pred[:50])
print("Accuracy:", accuracy)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Accuracy: 0.98


In [None]:
inputs = tokenizer(texts[1], truncation=True, padding=True, return_tensors="pt").to(device)
output = model(**inputs)
print(output)
prediction = output.logits.argmax(dim=-1).item()

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.4044, -2.9923]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
texts[1],prediction, X_test.label.tolist()[1]

('movi suggest bore want someth watch', 0, 0)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
print(confusion_matrix(y_true, y_pred))
print(f1_score(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

[[138  12]
 [  6 144]]
0.9411764705882353
0.94
