In [None]:
!pip install transformers datasets evaluate accelerate peft

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)


In [None]:
import os
import time
import math

import numpy as np
import pandas as pd

import tqdm

import warnings
warnings.filterwarnings("ignore")

import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)

import datasets

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there
# (the CSV read further down lives under /content/drive/MyDrive) are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Summarise how many of a model's parameters will receive gradient updates.
def print_number_of_trainable_model_parameters(model):
    """Build a three-line report of trainable vs. total parameter counts.

    Despite the name, nothing is printed here; the report is returned as a
    string so that the caller decides how to display it.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()`` and
            a ``requires_grad`` flag.

    Returns:
        str: trainable count, total count and the trainable percentage
        (two decimals), separated by newlines.
    """
    # Collect (element count, trainable?) once, then aggregate.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, trainable in sizes if trainable)
    percentage = 100 * trainable_model_params / all_model_params
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

In [None]:
# Load the raw posts from Drive and keep a reproducible 50% sample.
df=pd.read_csv("/content/drive/MyDrive/Suicide_Detection.csv")
df=df.sample(frac=0.5,random_state=42)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
# A set gives constant-time membership tests; process() checks every token
# of every row against this collection.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Raw string: "\S" is not a valid escape in a normal string literal and
# triggers an invalid-escape warning. The pattern itself is unchanged: it
# matches @mentions, URLs and any run of non-alphanumeric characters.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
def process(text):
    """Normalise one post into a space-joined string of stemmed tokens.

    The input is converted with ``str()`` and lower-cased, every match of
    ``text_cleaning_re`` is replaced by a single space, and the result is
    stripped and split on whitespace. Tokens found in ``stop_words`` are
    dropped; the remaining ones are stemmed with ``stemmer``.

    Args:
        text: the raw value of one row (any object; it is passed to ``str``).

    Returns:
        str: the surviving stems joined by single spaces.
    """
    cleaned = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    stems = [stemmer.stem(word) for word in cleaned.split() if word not in stop_words]
    return ' '.join(stems)
# Clean every post in place of the raw text, then display the frame.
df['text'] = df['text'].map(process)
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0.1,Unnamed: 0,text,class
74414,111734,know 7 month self harm free urg get stronger s...,suicide
149516,224358,start becom rich start compani becom 16 afford...,non-suicide
12484,18790,poem haiku u game dev hi hello hello stop fuck...,non-suicide
14043,21196,honest got idea anymor feel everyon fake feel ...,suicide
30673,46089,ever cri like think unfair life cri cri ever s...,non-suicide
...,...,...,...
1930,2885,got gcse result beyond reliv excit ho collag i...,non-suicide
73581,110460,one life understand hard keep live everydayeve...,suicide
114216,171646,told mom vaccuum room need play new cs team de...,non-suicide
54676,81994,methodsso tri rope wood ranger found tri inhal...,suicide


In [None]:
# Column names used by the rest of the notebook: clean_text (input) and label (target).
df = df.rename(columns={'text': 'clean_text', 'class': 'label'})

In [None]:
df.shape

(116037, 3)

In [None]:
# Encode the string classes as integers. An explicit map + cast avoids the
# implicit dtype downcasting that Series.replace relies on for str -> int
# replacement, and surfaces unexpected class names instead of letting them
# pass through untouched.
label_to_id = {'suicide': 1, 'non-suicide': 0}
encoded_labels = df['label'].map(label_to_id)
if encoded_labels.isna().any():
    unknown_labels = sorted(map(str, df.loc[encoded_labels.isna(), 'label'].unique()))
    raise ValueError(f"Unexpected label values: {unknown_labels}")
df['label'] = encoded_labels.astype('int64')

# Drop the leftover CSV index column (columns= already implies the axis).
df = df.drop(columns=["Unnamed: 0"])

In [None]:
from sklearn.model_selection import train_test_split
# Draw a fixed-size, class-balanced sample: 3000 training and 300 evaluation
# rows for each label, using the same seed for both classes.
per_class_splits = [
    train_test_split(df[df.label == label],
                     train_size=3000,
                     test_size=300,
                     random_state=42)
    for label in [0, 1]
]

# X_test temporarily holds the per-class evaluation frames; a later cell
# rebinds it to the actual test sample.
X_test = [held_out for _, held_out in per_class_splits]

# Concatenate the two classes and shuffle so labels are interleaved.
X_train = pd.concat([kept for kept, _ in per_class_splits]).sample(frac=1, random_state=10)
X_eval = pd.concat(X_test).sample(frac=1, random_state=10)
X_train

Unnamed: 0,clean_text,label
93404,gender pant must mom good day school tomorrow ...,0
212702,give uppeopl fuck suck one legitim one honest ...,1
109098,love get 5 ever reason love go court,0
82213,got offer great job suicidaldespit effort self...,1
68599,updat plan suicid ideat need advic pleas hi fo...,1
...,...,...
103802,said ask cashier short 5 cent matter,0
119000,today almost break pointi feel like hang threa...,1
184854,guy make first move l make first move got crus...,0
1474,hmmjump death gunshot drug pain,1


In [None]:
# Rows not drawn into X_train / X_eval form the pool for the test sample.
# The used indices are collected into a set once, so each membership test is
# O(1) instead of rebuilding and scanning a concatenated list for every row.
used_idx = set(X_train.index) | set(X_eval.index)
eval_idx = [idx for idx in df.index if idx not in used_idx]
x_eval = df[df.index.isin(eval_idx)]

# 150 rows per label. NOTE: replace=True means the same row can be drawn
# more than once.
X_test = (x_eval
          .groupby('label', group_keys=False)
          .apply(lambda x: x.sample(n=150,random_state=10, replace=True)))

# The original df indices are discarded here, so this cell must not be re-run
# on its own: the exclusion above needs the pre-reset indices.
X_train.reset_index(drop=True, inplace=True)
X_eval.reset_index(drop=True, inplace=True)

In [None]:
X_test

Unnamed: 0,clean_text,label
115486,biggest fear biggest fear may pressur drag f e...,0
38323,anyon wanna pm bit mega bore still sleep feel ...,0
200189,someth friend teenag would appreci much littl ...,0
51405,ayo like month time check lt 33 go everyth alr...,0
81925,award bad said pleas give lifetim suppli reddi...,0
...,...,...
108395,empti insidetoday remind fat anyth right life ...,1
149851,never felt wors life want give upif ask felt j...,1
140174,want kill myselfi tire live like anymor,1
222514,know call suicid prevent center tell help spec...,1


In [None]:
from datasets import Dataset

# Wrap the pandas splits as Hugging Face datasets (untokenized).
train_data, val_data = (Dataset.from_pandas(frame) for frame in (X_train, X_eval))

In [None]:
val_data.features['label']

Value(dtype='int64', id=None)

In [None]:
# One checkpoint name drives both the classifier and its tokenizer.
MODEL_PATH = 'facebook/bart-large'
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=2,
    return_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Parameter counts of the plain model, before any adapter is attached.
base_param_report = print_number_of_trainable_model_parameters(model)
print(base_param_report)

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

trainable model parameters: 407343106
all model parameters: 407343106
percentage of trainable model parameters: 100.00%


In [None]:
model

BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): L

In [None]:
# LoRA adapter settings for the sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # Adapters are attached to the attention query / value / key projections.
    target_modules=["q_proj", "v_proj", "k_proj"],
    r=8,                # rank of the low-rank update
    lora_alpha=32,      # scaling factor
    lora_dropout=0.05,  # dropout probability on the LoRA path
    bias='none',
)

In [None]:
# Wrap the classifier with the LoRA configuration defined above.
peft_model = get_peft_model(model, lora_config)

# Parameter counts after wrapping, to show the reduced trainable share.
peft_param_report = print_number_of_trainable_model_parameters(peft_model)
print(peft_param_report)

trainable model parameters: 1769472
all model parameters: 409112578
percentage of trainable model parameters: 0.43%


In [None]:
def tokenize_func(examples):
    """Tokenize a batch of rows for ``Dataset.map(..., batched=True)``.

    Only truncation is applied here. Padding is deliberately left to the
    ``DataCollatorWithPadding(padding="longest")`` used by the Trainer, so
    each batch is padded to its own longest sequence instead of every row
    being stored and processed at the tokenizer's maximum length.

    Args:
        examples: a batch mapping that contains a ``"clean_text"`` column.

    Returns:
        The tokenizer output for the batch (``input_ids`` and
        ``attention_mask``), with variable-length sequences.
    """
    return tokenizer(examples["clean_text"], truncation=True)

In [None]:
# Tokenized training set; the raw text column is dropped once it is encoded.
train_dataset = (
    datasets.Dataset
    .from_pandas(X_train)
    .map(tokenize_func, batched=True, remove_columns=["clean_text"])
)
train_dataset

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 6000
})

In [None]:
# Tokenized evaluation set, built the same way as the training set.
val_dataset = (
    datasets.Dataset
    .from_pandas(X_eval)
    .map(tokenize_func, batched=True, remove_columns=["clean_text"])
)

val_dataset

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 600
})

In [None]:
def compute_metrics(pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Accuracy is computed directly with numpy. The previous implementation
    imported ``datasets.load_metric``, which is not available in datasets 3.x,
    and re-loaded the metric on every evaluation call.

    Args:
        pred: an object with ``label_ids`` (reference labels) and
            ``predictions`` (the logits array, or a tuple whose first element
            is the logits array).

    Returns:
        dict: ``{"accuracy": <float>}``, the fraction of rows whose argmax
        over the last logits axis equals the reference label.
    """
    labels = np.asarray(pred.label_ids)

    # Some models return extra outputs alongside the logits; the logits are
    # taken to be the first element in that case.
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions

    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(preds == labels))}

# Hyper-parameters shared by TrainingArguments and the explicit optimizer, so
# the two cannot drift apart.
train_batch_size = 8
eval_batch_size = 8
learning_rate = 1e-4
weight_decay = 0.01

# Define training args
peft_training_args = TrainingArguments(
    output_dir='./result-bart-lora',
    logging_dir='./logs-bart-lora',
    learning_rate=learning_rate,
    # Lower the batch sizes if you hit an out-of-memory error on your GPU.
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=1,
    logging_steps=200,
    evaluation_strategy='steps',
    eval_steps=200,
    weight_decay=weight_decay,
    seed=42,
    fp16=True, # Only use with GPU
    report_to='none'
)

# Define optimizer. Because an optimizer is passed to the Trainer explicitly,
# the learning rate and weight decay from TrainingArguments are not applied
# automatically; both are forwarded here.
optimizer = AdamW(peft_model.parameters(),
                  lr=learning_rate,
                  weight_decay=weight_decay,
                  no_deprecation_warning=True)

# Define scheduler. The schedule length must equal the number of optimizer
# updates the Trainer will perform, otherwise the learning rate reaches zero
# before training ends. The earlier version divided the step count by 2
# (presumably written for a two-GPU run); the effective batch size reported
# by TrainingArguments is used instead.
n_epochs = peft_training_args.num_train_epochs
batches_per_epoch = math.ceil(len(train_dataset) / peft_training_args.train_batch_size)
updates_per_epoch = max(batches_per_epoch // peft_training_args.gradient_accumulation_steps, 1)
total_steps = math.ceil(n_epochs * updates_per_epoch)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)

# Pad each batch only up to its own longest sequence.
collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest"
)


# Define Trainer
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_dataset, # Training Data
    eval_dataset=val_dataset, # Evaluation Data
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer,lr_scheduler),
    data_collator=collator
)

print(f"Total Steps: {total_steps}")

peft_model_path="./peft-bart-lora-local"

Total Steps: 375


In [None]:
# Run fine-tuning with the Trainer built in the previous cell; it evaluates on
# val_dataset every `eval_steps` steps and reports loss and accuracy.
peft_trainer.train()

Step,Training Loss,Validation Loss,Accuracy
200,0.4228,0.368529,0.841667
400,0.4144,0.368529,0.841667
600,0.41,0.368529,0.841667


KeyboardInterrupt: 

In [None]:
# Plain Python lists of the test texts and their reference labels, index-aligned.
texts = X_test['clean_text'].tolist()
y_true = X_test['label'].tolist()

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def classify(text):
    """Predict the class id for a single text.

    Inference uses ``peft_model`` (the object that was handed to the Trainer)
    in evaluation mode and without gradient tracking, so dropout is disabled
    and no autograd graph is built for each call.

    Args:
        text (str): one cleaned input text.

    Returns:
        int: index of the highest logit (0 or 1 for this two-label model).
    """
    # Training leaves the model in train mode; switch dropout off explicitly.
    peft_model.eval()
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        output = peft_model(**inputs)
    return output.logits.argmax(dim=-1).item()

In [None]:
texts[100],classify(texts[100]),y_true[100]

tensor([1], device='cuda:0')


('problem parent make set age allow kid date make kid feel like start date age need relationship let kid figur thing grow good bad experi relationship learn patient less desper oblig find someon',
 1,
 0)

In [None]:

# Predict one example at a time. The loop variable `text` stays bound to the
# last example after the loop, and a later cell reads it.
y_pred = []
for text in texts:
    y_pred.append(classify(text))

def calculate_accuracy(y_true, y_pred):
    """
    Calculate accuracy given true labels and predicted labels.

    Args:
    - y_true (list): List of true labels
    - y_pred (list): List of predicted labels

    Returns:
    - accuracy (float): Fraction of positions where the two lists agree

    Raises:
    - ValueError: If the lists differ in length, or if they are empty
      (accuracy is undefined without any prediction).
    """
    # Check if the lengths of y_true and y_pred are the same
    if len(y_true) != len(y_pred):
        raise ValueError("Lengths of y_true and y_pred must be the same.")

    # Fail with a clear message instead of a bare ZeroDivisionError.
    total_predictions = len(y_true)
    if total_predictions == 0:
        raise ValueError("Cannot compute accuracy on empty inputs.")

    correct_predictions = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    return correct_predictions / total_predictions

# Score the per-example predictions collected in y_pred against y_true.
accuracy = calculate_accuracy(y_true, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Spot-check a single prediction. The example is chosen explicitly as the
# first test text, so it lines up with the X_test.label.tolist()[0] comparison
# in the next cell (previously this relied on whatever `text` an earlier loop
# had left behind, i.e. the last example).
sample_text = texts[0]
peft_model.eval()  # disable dropout for inference
inputs = tokenizer(sample_text, truncation=True, padding=True, return_tensors="pt").to(device)
with torch.no_grad():
    output = peft_model(**inputs)
prediction = output.logits.argmax(dim=-1).item()

In [None]:
prediction, X_test.label.tolist()[0]

(0, 0)

In [None]:
from sklearn.metrics import f1_score

# F1 on the test predictions, shown next to the accuracy computed earlier.
f1 = f1_score(y_true=y_true, y_pred=y_pred)
(f1, accuracy)

(0.8413793103448276, 0.8466666666666667)