In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/depression-reddit-cleaned/depression_dataset_reddit_cleaned.csv


In [2]:
df=pd.read_csv('/kaggle/input/depression-reddit-cleaned/depression_dataset_reddit_cleaned.csv')

In [None]:
!pip install -q -U torch=='1.4.0'
!pip install -q -U accelerate=='0.25.0' peft=='0.7.1' bitsandbytes=='0.41.3.post2' trl=='0.7.4'
!pip install -q -U transformers einops

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install transformers==4.34.0

In [4]:
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.model_selection import train_test_split
X_train = list()
X_test = list()
for label in [0,1]:
    train, test  = train_test_split(df[df.is_depression==label], 
                                    train_size=750,
                                    test_size=500, 
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)
X_train

Unnamed: 0,clean_text,is_depression
4057,tuesday ll start with reflection n then a lect...,0
6620,keongzai assign someone else to do it or eff i...,0
3214,giving them depression and anxiety cutting the...,1
3742,perruchee peteblacklab her dog had a phantom p...,1
883,so tired i struggle to wake up in the morning ...,1
...,...,...
2148,so i started having suicidal thought in april ...,1
3705,boningwigald so sieht meine momentane depressi...,1
6786,back to bed for me,0
2463,therapy meditation working out changed my diet...,1


In [6]:
eval_idx = [idx for idx in df.index if idx not in list(X_train.index) + list(X_test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('is_depression', group_keys=False)
          .apply(lambda x: x.sample(n=150,random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

In [7]:
def generate_test_prompt(data_point):
    return f"""
            analyse the following sentence in square brackets and determine if it has depression or not, return the answer as 0 for no deression and 1 for depression.

            [{data_point["clean_text"]}] = 
            """.strip()
y_true=X_test.is_depression
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["clean_text"])

def generate_prompt(data_point):
    return f"""
            analyse the following sentence in square brackets and determine if it has depression or not, return the answer as 0 for no deression and 1 for depression.

            [{data_point["clean_text"]}] = {data_point["is_depression"]}
            """.strip()

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["clean_text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["clean_text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)


In [57]:
X_test

Unnamed: 0,clean_text
4670,analyse the following sentence in square brack...
5548,analyse the following sentence in square brack...
4152,analyse the following sentence in square brack...
7018,analyse the following sentence in square brack...
6100,analyse the following sentence in square brack...
...,...
551,analyse the following sentence in square brack...
3337,analyse the following sentence in square brack...
964,analyse the following sentence in square brack...
2807,analyse the following sentence in square brack...


In [9]:
model_name = "microsoft/phi-2"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
!pip install transformers==4.37.0

In [65]:
import tqdm 
def predict(test, model, tokenizer):
    y_pred = []
    for i in range(test.shape[0]):
        prompt = test.iloc[i]["clean_text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 2 
                       )
        result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
        answer = result[0]['generated_text'].split("=")[-1].strip()
        if answer=='1':
            y_pred.append(1)
        elif answer=='0':
            y_pred.append(0)
        else: y_pred.append(-1)
    return y_pred

In [11]:
import re

def get_num_layers(model):
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []
    
    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and not "encoder" in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names

In [24]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['q_proj','k_proj','v_proj','dense'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="clean_text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=512,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [26]:
trainer

<trl.trainer.sft_trainer.SFTTrainer at 0x7ef9eae23f10>

In [25]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("trained-model")

Epoch,Training Loss,Validation Loss
0,2.0067,2.133541
1,2.382,2.094978


In [27]:
def evaluate(y_true, y_pred):
    
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [32]:
prompt = X_test.iloc[0]["clean_text"]
pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer,
                        max_new_tokens = 3, 
                        temperature = 0.0,
                )
result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
answer = result[0]['generated_text'].split("=")[-1].strip()
answer

'0'

In [66]:
y_pred=predict(X_test,model,tokenizer)

In [67]:
y_true=list(y_true)
evaluate(y_true,y_pred)

Accuracy: 0.568
Accuracy for label 0: 0.978
Accuracy for label 1: 0.158

Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.93      0.98      0.95       500
           1       0.93      0.16      0.27       500

    accuracy                           0.57      1000
   macro avg       0.62      0.38      0.41      1000
weighted avg       0.93      0.57      0.61      1000


Confusion Matrix:
[[489   6]
 [ 36  79]]


In [62]:
X_test.shape[0]

1000