#### LLM Full Finetuning (flan-t5-small) -> Mental Health Status Classification 

In [1]:
!pip install pandas==2.2.2
!pip install datasets==2.20.0
!pip install transformers
!pip install  scikit-learn==1.5.1
!pip install  tqdm==4.66.4
!pip install torch==2.4.0
!pip install evaluate==0.4.2
!pip install nltk==3.8.1
!pip install tensorboardX==2.6.2.2
!pip install tqdm
!pip install openpyxl

Collecting pandas==2.2.2
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas==2.2.2)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas==2.2.2)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m394.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m508.0/508.0 kB[0m [31m242.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.6/346.6 kB[0m [31m297.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling 

In [3]:
!pip install tqdm

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
from typing import List, Tuple

import evaluate
import nltk
nltk.download('punkt')
import numpy as np
import pandas as pd
from datasets import Dataset, concatenate_datasets
from huggingface_hub import HfFolder
from nltk.tokenize import sent_tokenize
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoModelForSequenceClassification
)
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import torch
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score,f1_score,recall_score, precision_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rtyagi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Load Model

In [7]:
MODEL_ID = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
REPOSITORY_ID = "flan-t5-small-mental-health"

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [11]:
def preprocess_function(sample: Dataset, padding: str = "max_length") -> dict:
    """Preprocess the dataset."""

    # add prefix to the input for t5
    inputs = [item for item in sample["statement"]]

    # tokenize inputs
    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding=padding, truncation=True
    )

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(
        text_target=sample["status"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(la if la != tokenizer.pad_token_id else -100) for la in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def postprocess_text(
    preds: List[str], labels: List[str]
) -> Tuple[List[str], List[str]]:
    """helper function to postprocess text"""
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    accuracy=accuracy_score(decoded_preds,decoded_labels)
    f1score=f1_score(decoded_preds,decoded_labels, average="macro")
    recall=recall_score(decoded_preds,decoded_labels, average="macro")
    precison=precision_score(decoded_preds,decoded_labels, average="macro")
    return {"Accuracy":accuracy,"F1 Score": f1score,"recall":recall,"precison":precison}

In [30]:
label2id = {"Normal": 0, "Depression": 1, "Suicidal": 2,"Bipolar": 3,"Stress":4,"Personality disorder":4}
id2label = {id: label for label, id in label2id.items()}

def load_dataset(model_type: str = "") -> Dataset:
    """Load dataset."""
    dataset_intents_pandas = pd.read_csv(
        "mental_status_train.csv",
        usecols=["statement","status"]
    )
    dataset_intents_pandas = dataset_intents_pandas.sample(6000)
    dataset_intents_pandas["status"] = dataset_intents_pandas["status"].astype(str)
    if model_type == "AutoModelForSequenceClassification":
        # Convert labels to integers
        dataset_intents_pandas["status"] = dataset_intents_pandas["status"].map(
            label2idprecison
        )

    dataset_intents_pandas["statement"] = dataset_intents_pandas["statement"].astype(str)
    dataset = Dataset.from_pandas(dataset_intents_pandas)
    dataset = dataset.shuffle(seed=42)
    dataset = dataset.train_test_split(test_size=0.2)
    dataset["validation"] = dataset["train"].train_test_split(test_size=0.1)["test"]
    dataset["train"] = dataset["train"].train_test_split(test_size=0.1)["train"]

    return dataset

dataset = load_dataset()

In [13]:
# Combine Data to check calculate max token length for source(utterance) and target(intent)

In [33]:
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"], dataset["validation"]]).map(
    lambda x: tokenizer(x["statement"], truncation=True),
    batched=True,
    remove_columns=["statement", "status"],
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"], dataset["validation"]]).map(
    lambda x: tokenizer(x["status"], truncation=True),
    batched=True,
    remove_columns=["statement", "status"],
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=REPOSITORY_ID,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=20,
    save_total_limit = 2,
    save_strategy = "no",
    logging_dir=f"{REPOSITORY_ID}/logs",  # logging & evaluation strategies
    logging_strategy="epoch",
    load_best_model_at_end=False,
    eval_strategy="epoch",
    report_to="tensorboard"
)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Max target length: 5


In [34]:
# Tokenize Dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["statement", "status"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# load model from the hub
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8,
    )

# Create Trainer instance
trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics
)



Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels']


In [35]:
# Train Model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Accuracy,F1 score,Recall,Precison
1,0.4377,0.201576,0.789583,0.781293,0.814238,0.786676
2,0.2189,0.160857,0.835417,0.823694,0.853622,0.82887
3,0.1623,0.092118,0.914583,0.914258,0.916557,0.914125
4,0.1198,0.068945,0.947917,0.946315,0.948014,0.945807
5,0.0871,0.063138,0.95,0.949642,0.950825,0.949261
6,0.0668,0.065437,0.960417,0.96,0.961649,0.960523
7,0.0474,0.069778,0.972917,0.973578,0.97427,0.973683
8,0.0332,0.074411,0.96875,0.969345,0.969781,0.969657
9,0.0277,0.080361,0.972917,0.973501,0.974275,0.973522
10,0.0275,0.098052,0.970833,0.971422,0.972626,0.971588


TrainOutput(global_step=10800, training_loss=0.06894874210710879, metrics={'train_runtime': 1468.3405, 'train_samples_per_second': 58.842, 'train_steps_per_second': 7.355, 'total_flos': 1.60609396064256e+16, 'train_loss': 0.06894874210710879, 'epoch': 20.0})

### Save Model

In [36]:
tokenizer.save_pretrained(REPOSITORY_ID)
trainer.save_model(REPOSITORY_ID)

### Load Finetuned Model

In [15]:
REPOSITORY_ID = r"D:\projects\llm-finetuning\intent-classifier\flan-t5-finetuned-mental-health-classifier"
tokenizer_finetuned = AutoTokenizer.from_pretrained(REPOSITORY_ID)
model_finetuned = AutoModelForSeq2SeqLM.from_pretrained(REPOSITORY_ID)

In [None]:
# Test

In [46]:
def classify(texts_to_classify: str):
    """Classify a batch of texts using the model."""
    inputs = tokenizer_finetuned(texts_to_classify,padding=True,truncation=True,return_tensors="pt")
    inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = model_finetuned.generate(inputs["input_ids"],attention_mask=inputs["attention_mask"],max_length=150,num_beams=2,early_stopping=True)
    predictions = [
        tokenizer_finetuned.decode(output, skip_special_tokens=True) for output in outputs
    ]
    return predictions

In [47]:
validation = pd.read_csv("mental_health.csv",usecols=["statement","status"]).sample(1000)

In [48]:
validation=validation.dropna()

In [49]:
test_data = Dataset.from_pandas(validation)

### Evaluate

In [50]:
def evaluate():
    """Evaluate the model on the test dataset."""
    predictions_list, labels_list = [], []

    batch_size = 16  # Adjust batch size based GPU capacity
    num_batches = len(test_data) // batch_size + (
        0 if len(test_data) % batch_size == 0 else 1
    )
    progress_bar = tqdm(total=num_batches, desc="Evaluating")

    for i in range(0, len(test_data), batch_size):
        batch_texts = test_data["statement"][i : i + batch_size]
        batch_labels = test_data["status"][i : i + batch_size]

        batch_predictions = classify(batch_texts)

        predictions_list.extend(batch_predictions)
        labels_list.extend([str(label) for label in batch_labels])

        progress_bar.update(1)

    progress_bar.close()
    report = classification_report(labels_list, predictions_list,  labels=['Normal','Anxiety','Personality disorder','Depression','Stress','Suicidal','Bipolar'])

In [51]:
evaluate()

Evaluating:   0%|          | 0/63 [00:00<?, ?it/s]

                      precision    recall  f1-score   support

              Normal       0.96      0.89      0.92       330
             Anxiety       0.78      0.73      0.75        63
Personality disorder       0.44      0.88      0.59        17
          Depression       0.82      0.65      0.73       273
              Stress       0.64      0.84      0.73        49
            Suicidal       0.67      0.78      0.72       201
             Bipolar       0.79      0.82      0.80        60

           micro avg       0.80      0.78      0.79       993
           macro avg       0.73      0.80      0.75       993
        weighted avg       0.82      0.78      0.79       993



In [52]:
classify("Stomach pain Anybody else get horrible stomach/chest pain for days on end when immense stress is in your life?" )

['Stress']