In [5]:

!pip install --upgrade transformers datasets accelerate

Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Downloading datasets-4.2.0-py3-none-any.whl (506 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.13.0
    Uninstalling datasets-2.13.0:
      Successfully uninstalled datasets-2.13.0
Successfully installed datasets-4.2.0


In [6]:
import transformers

print(f"Transformers version: {transformers.__version__}")

Transformers version: 4.57.0


In [7]:

import pandas as pd
from datasets import Dataset
from transformers import MT5Tokenizer, MT5Config, MT5ForConditionalGeneration, Trainer, TrainingArguments



In [10]:
df = pd.read_csv("health_data_cleaned.csv")


In [11]:
print(f"Original dataset size: {len(df)}")

Original dataset size: 16350


In [12]:
df.dropna(subset=['input', 'output'], inplace=True)

In [13]:
df = df[df['input'].str.strip() != '']
df = df[df['output'].str.strip() != '']

print(f"Cleaned dataset size: {len(df)}")

Cleaned dataset size: 16348


In [14]:

import torch
import math
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)


In [15]:
from sklearn.model_selection import train_test_split


prefix = "converse: "


train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the pandas DataFrames into Hugging Face Dataset objects.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)



In [16]:
# --- 3. Initialize Tokenizer and Model ---
# For English-to-English tasks, the T5 model is  chosen.
# We use 't5-small' for a balance of performance and speed.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
# --- Preprocessing and Tokenization ---
# This function prepares your text for the T5 model.
def preprocess_function(examples):
    # Add the prefix to the source texts (your 'input' column).
    inputs = [prefix + text for text in examples["input"]]

    # Tokenize the source texts.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenize the target texts (your 'output' column) to create the labels.
    labels = tokenizer(text_target=examples["output"], max_length=128, truncation=True, padding="max_length")

    # The labels are the input_ids from the tokenized target texts.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [19]:

# Apply the tokenization function to both the training and validation datasets.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Remove the original text columns as they are no longer needed for training.
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['input', 'output', '__index_level_0__'])
tokenized_val_dataset = tokenized_val_dataset.remove_columns(['input', 'output', '__index_level_0__'])




Map:   0%|          | 0/13078 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [20]:
# --- 5. Fine-Tuning the Model ---
# The DataCollator creates batches and handles dynamic padding.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)



In [23]:
# Define the training arguments. These control the fine-tuning process.
training_args = TrainingArguments(
    output_dir="./results_doctor_patient", # Directory to save checkpoints

    learning_rate=2e-5,               # The learning rate for the optimizer
    per_device_train_batch_size=4,    # Batch size for training
    per_device_eval_batch_size=4,     # Batch size for evaluation
    weight_decay=0.01,                # Regularization to prevent overfitting
    save_total_limit=3,               # Only keep the best 3 model checkpoints
    num_train_epochs=10,              # Number of times to iterate over the training data

    fp16=torch.cuda.is_available(),   # Use mixed-precision training if a GPU is available for speed
)



In [24]:
# Initialize the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


print("Starting vanilla fine-tuning...")
trainer.train()
print("Fine-tuning complete! ✨")


  trainer = Trainer(


Starting vanilla fine-tuning...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msuryakantmani[0m ([33mnaman98[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,4.5601
1000,3.7792
1500,3.6526
2000,3.5652
2500,3.5019
3000,3.4739
3500,3.466
4000,3.4046
4500,3.3951
5000,3.3909


Fine-tuning complete! ✨


In [25]:

#  Perplexity After Fine-Tuning------
print("\nCalculating perplexity on the validation set...")
eval_results = trainer.evaluate()

# Returns the evaluation loss in the results.

eval_loss = eval_results['eval_loss']

# Perplexity is the exponential of the cross-entropy loss.
# PPL = e^(loss)

perplexity = math.exp(eval_loss)

print(f"\nPerplexity after fine-tuning: {perplexity:.4f}")


Calculating perplexity on the validation set...



Perplexity after fine-tuning: 18.7910


In [26]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer


model.save_pretrained("/content/mt5_finetuned")
tokenizer.save_pretrained("/content/mt5_finetuned")

print("✅ Model saved successfully at /content/mt5_finetuned")


✅ Model saved successfully at /content/mt5_finetuned


In [27]:
from google.colab import files
!zip -r mt5_finetuned.zip /content/mt5_finetuned
files.download("mt5_finetuned.zip")


  adding: content/mt5_finetuned/ (stored 0%)
  adding: content/mt5_finetuned/spiece.model (deflated 48%)
  adding: content/mt5_finetuned/tokenizer_config.json (deflated 94%)
  adding: content/mt5_finetuned/generation_config.json (deflated 27%)
  adding: content/mt5_finetuned/special_tokens_map.json (deflated 85%)
  adding: content/mt5_finetuned/added_tokens.json (deflated 83%)
  adding: content/mt5_finetuned/config.json (deflated 63%)
  adding: content/mt5_finetuned/model.safetensors (deflated 8%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
!pip install huggingface_hub

from huggingface_hub import login
login()

# Push model
model.push_to_hub("suryakantmani/mt5-finetuned-health")
tokenizer.push_to_hub("suryakantmani/mt5-finetuned-health")

print("🚀 Model uploaded to Hugging Face Hub successfully!")




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ffywjed/model.safetensors:   0%|          |  552kB /  242MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  .../tmph_zo57gb/spiece.model: 100%|##########|  792kB /  792kB            

🚀 Model uploaded to Hugging Face Hub successfully!
