In [1]:
!pip install transformers datasets evaluate accelerate peft

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [2]:
import os
import time
import math

import numpy as np
import pandas as pd

import tqdm

import warnings
warnings.filterwarnings("ignore")

import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)

import datasets

In [3]:
# Define a function that can print the trainable parameters
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [4]:
import pandas as pd
import numpy as np
df=pd.read_csv("/content/ocd_dataset.csv")
df=df.sample(frac=1,random_state=42)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
def process(text):
    text=re.sub(text_cleaning_re,' ',str(text).lower()).strip()
    tokens=[]
    for i in text.split():
        if i not in stop_words:
            tokens.append(stemmer.stem(i))
    return ' '.join(tokens)
df['Text']=df['Text'].map(lambda x:process(x))
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Text,OCD
1801,friendship took smart imagin brain pour poison...,1
1190,gasp hasnt done whore,0
1817,want burden peopl constant need reassur keep l...,1
251,happi turn upsid feel doubt anxieti depress id...,1
2505,obsess hardest shake time liter like hell suff...,1
...,...,...
1638,musician,0
1095,naah,0
1130,biolog mother right,0
1294,social physic littl less matur peer felt signi...,1


In [5]:
df.rename(columns={'Text': 'clean_text','OCD': 'label'}, inplace=True)
# df['label'] = df['label'].replace({'suicide': 1, 'non-suicide': 0})
# df=df.drop(columns=["Unnamed: 0"],axis=0)
df.head()

Unnamed: 0,clean_text,label
1801,friendship took smart imagin brain pour poison...,1
1190,gasp hasnt done whore,0
1817,want burden peopl constant need reassur keep l...,1
251,happi turn upsid feel doubt anxieti depress id...,1
2505,obsess hardest shake time liter like hell suff...,1


In [6]:
from sklearn.model_selection import train_test_split
X_train = list()
X_test = list()
for label in [0,1]:
    train, test  = train_test_split(df[df.label==label],
                                    train_size=1400,
                                    test_size=50,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_eval = pd.concat(X_test).sample(frac=1, random_state=10)
X_train

Unnamed: 0,clean_text,label
2894,suck bean give zest life bite beauti fruit eve...,0
2730,fallopian tube,0
1784,believ peopl work slowli communic bad good thi...,1
2073,anxious would spiral depress one lowest point ...,1
21,suck van mechan know owe favor also thanke sai...,0
...,...,...
1060,see doctor fear would lock tri reason rational...,1
1914,adangta kan jadi bitch ane enough taa aku mara...,0
71,mall bih,0
456,hahahaha yeahhhhhhhhhhhhhhhhh member makin cup...,0


In [7]:
eval_idx = [idx for idx in df.index if idx not in list(X_train.index) + list(X_eval.index)]
x_eval = df[df.index.isin(eval_idx)]
X_test = (x_eval
          .groupby('label', group_keys=False)
          .apply(lambda x: x.sample(n=50,random_state=10, replace=True)))
X_train.reset_index(drop=True, inplace=True)
X_eval.reset_index(drop=True, inplace=True)
X_test = X_test.sample(frac=1,random_state=42)
X_test

Unnamed: 0,clean_text,label
1447,one solut would come final decontamin mind sat...,1
393,rather panic attack check check window close l...,1
347,thought say get back within thirti second whol...,1
219,damn suck miss take gone alreadi,0
989,kid love phantom kind music freak though,0
...,...,...
189,plan next time level head think irrate take 5 ...,1
295,would also wash hand germ afraid would get fat...,1
1664,nah read spook sat door im pretti sure someon ...,0
454,context look ocd goggl ration goggl apart remi...,1


In [8]:
from datasets import Dataset
train_data = Dataset.from_pandas(X_train)

val_data = Dataset.from_pandas(X_eval)

In [9]:
MODEL_PATH='FacebookAI/roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, return_dict=True, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Number of trainable parameters
print(print_number_of_trainable_model_parameters(model))

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

trainable model parameters: 124647170
all model parameters: 124647170
percentage of trainable model parameters: 100.00%


In [10]:
lora_config = LoraConfig(
    r=8, # Rank Number
    lora_alpha=32, # Alpha (Scaling Factor)
    lora_dropout=0.05, # Dropout Prob for Lora
    target_modules=["query", "key","value"], # Which layer to apply LoRA, usually only apply on MultiHead Attention Layer
    bias='none',
    task_type=TaskType.SEQ_CLS # Seqence to Classification Task
)

In [11]:
peft_model = get_peft_model(model,
                            lora_config)

# Reduced trainble parameters
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1034498
all model parameters: 125681668
percentage of trainable model parameters: 0.82%


In [12]:
def tokenize_func(data):
    return tokenizer(
            data['clean_text'],
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

In [13]:
train_dataset = datasets.Dataset.from_pandas(X_train)
train_dataset = train_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["clean_text"]
)
train_dataset

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 2800
})

In [14]:
val_dataset = datasets.Dataset.from_pandas(X_eval)
val_dataset = val_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["clean_text"]
)

val_dataset

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [15]:
def metrics(eval_prediction):
    logits, labels = eval_prediction
    pred = np.argmax(logits, axis=1)
    auc_score = roc_auc_score(labels, pred)
    return {"Val-AUC": auc_score}

train_batch_size = 8
eval_batch_size = 8

# Define training Args
peft_training_args = TrainingArguments(
    output_dir='./result-roberta-lora',
    logging_dir='./logs-roberta-lora',
#     auto_find_batch_size=True,
    learning_rate=1e-4,
    per_device_train_batch_size=train_batch_size, # You can adjust this value base on your available GPU, You may encounter "out of memory" error if this value is too lartge
    per_device_eval_batch_size=eval_batch_size, # You can adjust this value base on your available GPU, You may encounter "out of memory" error if this value is too lartge
    num_train_epochs=2,
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    weight_decay=0.01,
    seed=42,
    fp16=True, # Only use with GPU
    report_to='none'
)

# Define Optimzer
optimizer = AdamW(peft_model.parameters(),
                  lr=1e-4,
                  no_deprecation_warning=True)

# Define Scheduler
n_epochs = peft_training_args.num_train_epochs
total_steps = n_epochs * math.ceil(len(train_dataset) / train_batch_size / 2)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)

collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest"
)


# Define Trainer
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_dataset, # Training Data
    eval_dataset=val_dataset, # Evaluation Data
    tokenizer=tokenizer,
    compute_metrics=metrics,
    optimizers=(optimizer,lr_scheduler),
    data_collator=collator
)

print(f"Total Steps: {total_steps}")

peft_model_path="./peft-roberta-lora-local"

Total Steps: 350


In [16]:
peft_trainer.train()

Step,Training Loss,Validation Loss,Val-auc
50,0.6793,0.601294,0.94
100,0.3761,0.131149,0.98
150,0.1427,0.128112,0.97
200,0.1366,0.108067,0.97
250,0.1476,0.120356,0.97
300,0.1421,0.114135,0.97
350,0.1391,0.082598,0.97
400,0.1261,0.08254,0.97
450,0.1045,0.08254,0.97
500,0.1533,0.08254,0.97


TrainOutput(global_step=700, training_loss=0.17933423212596347, metrics={'train_runtime': 220.7108, 'train_samples_per_second': 25.373, 'train_steps_per_second': 3.172, 'total_flos': 1491218586009600.0, 'train_loss': 0.17933423212596347, 'epoch': 2.0})

In [20]:
device='cuda' if torch.cuda.is_available() else 'cpu'
def classify(text):
  inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
  output = model(**inputs)

  prediction = output.logits.argmax(dim=-1).item()
  return prediction
  #print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
  # return id2label[prediction]

In [22]:
texts=X_test['clean_text'].tolist()
y_true=X_test.label.tolist()
y_pred=[]
for text in texts:
  pred=classify(text)
  y_pred.append(pred)

def calculate_accuracy(y_true, y_pred):
    """
    Calculate accuracy given true labels and predicted labels.

    Args:
    - y_true (list): List of true labels
    - y_pred (list): List of predicted labels

    Returns:
    - accuracy (float): Accuracy score
    """
    # Check if the lengths of y_true and y_pred are the same
    if len(y_true) != len(y_pred):
        raise ValueError("Lengths of y_true and y_pred must be the same.")

    # Calculate accuracy
    correct_predictions = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    total_predictions = len(y_true)
    accuracy = correct_predictions / total_predictions

    return accuracy

accuracy = calculate_accuracy(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [None]:
inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
output = model(**inputs)
prediction = output.logits.argmax(dim=-1).item()

In [None]:
prediction, X_test.label.tolist()[0]

(0, 0)

In [23]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
print(confusion_matrix(y_true, y_pred))
print(f1_score(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

[[50  0]
 [ 0 50]]
1.0
1.0


In [24]:
texts,y_true

(['one solut would come final decontamin mind satisfact spoiler alert never would first step toward treatment came desper mother call',
  'rather panic attack check check window close lock door lock alarm set kind thing normal idea done specif way',
  'thought say get back within thirti second whole famili die illog irrat help mother want know keep run',
  'damn suck miss take gone alreadi',
  'kid love phantom kind music freak though',
  'lol person readi finger ring cock ring would though that good time',
  'awwwww suck sorri pretti good tri chang stuff mi profil iz workin good',
  'never seen similar exampl ocd literatur exampl real life event face valu may regard non ocd issu realli kiss',
  'h e hate hate hate two game lose streak',
  'haha santyna talk bout u damn still got game haha',
  'final took damn long',
  'damn suck miss take gone alreadi',
  'time even number also keep readjust thing 90 degre angl align someth els also keep redo thing know',
  'best friend bitch hell pro