In [1]:
from transformers import (
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling, GPT2Tokenizer, GPT2LMHeadModel,
)
from sklearn.model_selection import train_test_split
import pandas as pd




In [2]:
# Load the pretrained GPT-2 checkpoint: tokenizer first, then the LM-head model.
model_name = "gpt2"

gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token; preprocess_data
# tokenizes with padding=True further down.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)

In [3]:
# Read the dataset's train split (a single parquet file) from the hf:// path.
df = pd.read_parquet(
    "hf://datasets/heliosbrahma/mental_health_chatbot_dataset/"
    "data/train-00000-of-00001-01391a60ef5c00d9.parquet"
)

In [4]:
# Preview the raw data: one `text` column holding "<HUMAN>: ...\n<ASSISTANT>: ..." strings.
df.head()

Unnamed: 0,text
0,<HUMAN>: What is a panic attack?\n<ASSISTANT>:...
1,<HUMAN>: What are symptoms of panic attack vs....
2,<HUMAN>: What are the types of Mental Illness?...
3,<HUMAN>: What does mental-illness mean?\n<ASSI...
4,<HUMAN>: How can you treat mental illness?\n<A...


In [5]:
# Split each "<HUMAN>: ... <ASSISTANT>: ..." string into `question` / `answer`
# columns and drop the raw `text` column.
# The guard keeps the cell safe to re-run: once `text` has been replaced, a
# second execution is a no-op instead of raising KeyError.
if 'text' in df.columns:
    # Split once, on the first assistant marker only: column 0 is the human
    # turn, column 1 the assistant turn.
    turns = df['text'].str.split('<ASSISTANT>:', n=1, expand=True)
    df = df.assign(
        # regex=False: the marker is a literal string, not a pattern.
        question=turns[0].str.replace('<HUMAN>:', '', regex=False).str.strip(),
        answer=turns[1].str.strip(),
    ).drop(columns='text')

df.head()

Unnamed: 0,question,answer
0,What is a panic attack?,Panic attacks come on suddenly and involve int...
1,What are symptoms of panic attack vs. anxiety ...,Panic attacks and anxiety attacks can share so...
2,What are the types of Mental Illness?,There are many different conditions that are r...
3,What does mental-illness mean?,Mental illness is a group of biological brain ...
4,How can you treat mental illness?,The treatment of mental illness is a multi-fac...


In [6]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split (and therefore every downstream loss/perplexity figure) reproducible
# across kernel restarts.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
# Sanity check: tokenize one short sentence and display the resulting
# input_ids and attention_mask.
text = 'checking tokenizer if it works'
inputs = gpt2_tokenizer(text)
inputs

{'input_ids': [41004, 11241, 7509, 611, 340, 2499], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [8]:
# Map the ids back to token strings to see how the sentence was split.
# In the output, the 'Ġ' prefix appears on tokens that followed a space.
tokens = gpt2_tokenizer.convert_ids_to_tokens(inputs['input_ids'])
tokens

['checking', 'Ġtoken', 'izer', 'Ġif', 'Ġit', 'Ġworks']

In [9]:
# Report the (rows, columns) shape of each split.
print(
    f"Training Size: {train_df.shape}",
    f"Validation Size: {val_df.shape}",
    sep="\n",
)

Training Size: (137, 2)
Validation Size: (35, 2)


In [10]:
def preprocess_data(df, tokenizer):
    """Format each question/answer row as a dialogue prompt and tokenize the batch.

    Args:
        df: DataFrame with ``question`` and ``answer`` columns.
        tokenizer: Callable tokenizer; it is invoked once with the list of
            formatted strings plus the padding/truncation options below.

    Returns:
        Whatever ``tokenizer`` returns for the batch (requested as PyTorch
        tensors via ``return_tensors="pt"``).
    """
    # Iterate the two columns directly instead of DataFrame.iterrows(), which
    # builds a Series per row (slow, and it can upcast mixed dtypes).
    texts = [
        f"User: {question}\nAI: {answer}\n"
        for question, answer in zip(df["question"], df["answer"])
    ]
    # padding=True pads every example to the longest one in this call;
    # anything beyond max_length tokens is truncated.
    return tokenizer(
        texts, padding=True, truncation=True, max_length=1024, return_tensors="pt"
    )

In [11]:
# Tokenize each split separately (so each is padded to its own longest example).
train_encodings, val_encodings = (
    preprocess_data(split_df, gpt2_tokenizer) for split_df in (train_df, val_df)
)

In [12]:
# Round-trip the first training example back to text as a formatting check.
# skip_special_tokens=True is passed so special tokens are left out of the text.
first_example_ids = train_encodings["input_ids"][0]
decoded_example = gpt2_tokenizer.decode(first_example_ids, skip_special_tokens=True)
print(decoded_example)

User: What Is Post-Traumatic Stress Disorder?
AI: Post-Traumatic Stress Disorder (PTSD) is an anxiety disorder that can develop after exposure to a terrifying event or ordeal in which grave physical harm occurred or was threatened. After traumatic events, such as death, an earthquake, war, car accidents, floods or fires, it is not uncommon for people to experience feelings of heightened fear, worry, sadness or anger. If the emotions persist, however, or become severe, or the person gets triggered into reliving the event in their daily life, this can affect the person’s ability to function and may be a sign of PTSD.



In [13]:
from torch.utils.data import Dataset

class MentalHealthDataset(Dataset):
    """Map-style dataset over a batch of pre-tokenized encodings.

    ``encodings`` is a mapping from field name (it must include
    ``"input_ids"``) to an indexable tensor whose first dimension enumerates
    the examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is the number of rows under "input_ids".
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Hand out detached copies so callers cannot mutate the stored encodings.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        return sample

# Wrap both sets of encodings in the dataset class, train first, then validation.
train_dataset, val_dataset = map(
    MentalHealthDataset, (train_encodings, val_encodings)
)

In [14]:
# Batch collator for causal language modelling (mlm=False disables masked-LM).
# NOTE(review): preprocess_data already pads every example with padding=True, so
# this collator receives equal-length inputs; per the transformers docs its
# remaining job is to build `labels` from `input_ids` — confirm for the
# installed version.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=gpt2_tokenizer, mlm=False
)

In [15]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./results",
    overwrite_output_dir=True,
    logging_dir="./logs",
    report_to="none",
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # After training, reload the best checkpoint, judged by loss (lower is better)
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# Wire the model, datasets and collator into the Trainer.
trainer = Trainer(
    args=training_args,
    model=gpt2_model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [16]:
# Train the model.
# Long-running cell: the run recorded below took 54 optimisation steps over
# 3 epochs with a train_runtime of about 4160 s.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.325,2.294621
2,2.1054,2.212801
3,1.8556,2.198152


TrainOutput(global_step=54, training_loss=2.3346034509164317, metrics={'train_runtime': 4160.7327, 'train_samples_per_second': 0.099, 'train_steps_per_second': 0.013, 'total_flos': 109488506112000.0, 'train_loss': 2.3346034509164317, 'epoch': 3.0})

In [17]:
# Save the fine-tuned model.
trainer.save_model('./fine_tuned_model')
# The Trainer was constructed without the tokenizer, so save_model() is not
# expected to write tokenizer files. Save the tokenizer (including its
# pad-token setting) next to the weights so the directory can be reloaded on
# its own with from_pretrained('./fine_tuned_model').
gpt2_tokenizer.save_pretrained('./fine_tuned_model')

In [18]:
import math

# Score the model on the Trainer's eval dataset and report
# perplexity = exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])

print(
    f"Evaluation_Results: {eval_results}",
    f"Perplexity: {perplexity}",
    sep="\n",
)

Evaluation_Results: {'eval_loss': 2.1981518268585205, 'eval_runtime': 31.4019, 'eval_samples_per_second': 1.115, 'eval_steps_per_second': 0.159, 'epoch': 3.0}
Perplexity: 9.008349115959394
