In [2]:

# !pip install --upgrade transformers datasets accelerate

In [3]:
import transformers

print(f"Transformers version: {transformers.__version__}")

Transformers version: 4.57.0


In [4]:

import pandas as pd
from datasets import Dataset
from transformers import MT5Tokenizer, MT5Config, MT5ForConditionalGeneration, Trainer, TrainingArguments



In [5]:
df = pd.read_csv("health_data_cleaned.csv")


In [6]:
print(f"Original dataset size: {len(df)}")

Original dataset size: 112165


In [7]:
df.dropna(subset=['input', 'output'], inplace=True)

In [8]:
df = df[df['input'].str.strip() != '']
df = df[df['output'].str.strip() != '']

print(f"Cleaned dataset size: {len(df)}")

Cleaned dataset size: 112156


In [9]:

import torch
import math
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)


In [10]:
from sklearn.model_selection import train_test_split


prefix = "converse: "


train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the pandas DataFrames into Hugging Face Dataset objects.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)



In [11]:
# --- 3. Initialize Tokenizer and Model ---
# For English-to-English tasks, the T5 model is  chosen.
# We use 't5-small' for a balance of performance and speed.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [12]:
# --- Preprocessing and Tokenization ---
# This function prepares your text for the T5 model.
def preprocess_function(examples):
    # Add the prefix to the source texts (your 'input' column).
    inputs = [prefix + text for text in examples["input"]]

    # Tokenize the source texts.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenize the target texts (your 'output' column) to create the labels.
    labels = tokenizer(text_target=examples["output"], max_length=128, truncation=True, padding="max_length")

    # The labels are the input_ids from the tokenized target texts.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [13]:

# Apply the tokenization function to both the training and validation datasets.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Remove the original text columns as they are no longer needed for training.
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['input', 'output', '__index_level_0__'])
tokenized_val_dataset = tokenized_val_dataset.remove_columns(['input', 'output', '__index_level_0__'])




Map: 100%|██████████| 89724/89724 [01:28<00:00, 1014.10 examples/s]
Map: 100%|██████████| 22432/22432 [00:21<00:00, 1062.51 examples/s]


In [14]:
# --- 5. Fine-Tuning the Model ---
# The DataCollator creates batches and handles dynamic padding.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)



In [15]:
# Define the training arguments. These control the fine-tuning process.
training_args = TrainingArguments(
    output_dir="./results_doctor_patient", # Directory to save checkpoints

    learning_rate=2e-5,               # The learning rate for the optimizer
    per_device_train_batch_size=4,    # Batch size for training
    per_device_eval_batch_size=4,     # Batch size for evaluation
    weight_decay=0.01,                # Regularization to prevent overfitting
    save_total_limit=3,               # Only keep the best 3 model checkpoints
    num_train_epochs=10,              # Number of times to iterate over the training data

    fp16=torch.cuda.is_available(),   # Use mixed-precision training if a GPU is available for speed
)



In [None]:
# Initialize the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


print("Starting vanilla fine-tuning...")
trainer.train()
print("Fine-tuning complete! ✨")


  trainer = Trainer(


Starting vanilla fine-tuning...




Step,Training Loss


In [None]:

#  Perplexity After Fine-Tuning------
print("\nCalculating perplexity on the validation set...")
eval_results = trainer.evaluate()

# Returns the evaluation loss in the results.

eval_loss = eval_results['eval_loss']

# Perplexity is the exponential of the cross-entropy loss.
# PPL = e^(loss)

perplexity = math.exp(eval_loss)

print(f"\nPerplexity after fine-tuning: {perplexity:.4f}")


Calculating perplexity on the validation set...



Perplexity after fine-tuning: 18.7910


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer


model.save_pretrained("/content/mt5_finetuned")
tokenizer.save_pretrained("/content/mt5_finetuned")

print("✅ Model saved successfully at /content/mt5_finetuned")


✅ Model saved successfully at /content/mt5_finetuned


In [None]:
from google.colab import files
!zip -r mt5_finetuned.zip /content/mt5_finetuned
files.download("mt5_finetuned.zip")


  adding: content/mt5_finetuned/ (stored 0%)
  adding: content/mt5_finetuned/spiece.model (deflated 48%)
  adding: content/mt5_finetuned/tokenizer_config.json (deflated 94%)
  adding: content/mt5_finetuned/generation_config.json (deflated 27%)
  adding: content/mt5_finetuned/special_tokens_map.json (deflated 85%)
  adding: content/mt5_finetuned/added_tokens.json (deflated 83%)
  adding: content/mt5_finetuned/config.json (deflated 63%)
  adding: content/mt5_finetuned/model.safetensors (deflated 8%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install huggingface_hub

from huggingface_hub import login
login()

# Push model
model.push_to_hub("suryakantmani/mt5-finetuned-health")
tokenizer.push_to_hub("suryakantmani/mt5-finetuned-health")

print("🚀 Model uploaded to Hugging Face Hub successfully!")




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ffywjed/model.safetensors:   0%|          |  552kB /  242MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  .../tmph_zo57gb/spiece.model: 100%|##########|  792kB /  792kB            

🚀 Model uploaded to Hugging Face Hub successfully!
