In [1]:
! pip install datasets -q
! pip install evaluate -q
! pip install mlflow -q
! pip install pyngrok -q
! pip install peft -q
! pip install rouge_score -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m389.1/491.2 kB[0m [31m11.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_from_disk, load_dataset, Dataset, DatasetDict
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, Text2TextGenerationPipeline, pipeline, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback

import os
import torch
import time
import evaluate
from evaluate import load

import pandas as pd
import numpy as np

from tqdm import tqdm
import torch

import mlflow
import subprocess
from pyngrok import ngrok, conf
import getpass

from google.colab import userdata
from accelerate import Accelerator

from peft import get_peft_model, LoraConfig, TaskType

## Helpers

In [3]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


In [4]:
def tokenize_function(example):
    start_prompt = 'Solve the following math problem and only return the answer.\n\n'
    end_prompt = '\n\nYour answer: '
    prompt = [start_prompt + problem + end_prompt for problem in example["Problem"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["Answer"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

In [5]:
def train_models_helper(model, training_args, tokenized_datasets_filtered):
  return Trainer(
      model=peft_model,
      args=training_args,
      train_dataset=tokenized_datasets_filtered['train'],
      eval_dataset=tokenized_datasets_filtered['validation']
      #callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
  )

In [6]:
def train_in_mlflow(pipe, training_args, trainer, input_example):
  with mlflow.start_run() as run:
      mlflow.log_params({
          "learning_rate": training_args.learning_rate,
          "num_train_epochs": training_args.num_train_epochs,
          "weight_decay": training_args.weight_decay
      })

      trainer.train()
      metrics = trainer.evaluate()
      mlflow.log_metrics(metrics)

      mlflow.transformers.log_model(
          transformers_model=pipe,
          artifact_path="model-flan-t5-small-finetuned",
          input_example="input_example"
      )

      print("Logged to:", run.info.artifact_uri)
      return f"runs:/{run.info.run_id}/model-flan-t5-small-finetuned"


In [7]:
def inference_with_mlflow(test_set, model, model_artifact_path, include_rouge = False):
  # Extract run ID if using a model URI like "runs:/<run_id>/model"
  run_id = model_artifact_path.split("/")[1]

  # Inference
  with mlflow.start_run(run_id=run_id):
      results = []

      for example in tqdm(test_set, desc="Generating answers", total=len(test_set)):
          question = example['Problem']
          true_answer = example['Answer']

          prompt = f"""Answer the following question:

  {question}

  Answer:"""

          # Model is already a pipeline; no need for tokenizer or .to(device)
          output = model(prompt, max_new_tokens=700)
          generated_answer = output[0]['generated_text'].strip()

          results.append({
              "question": question,
              "answer": true_answer.strip(),
              "generated_answer": generated_answer
          })

      # Convert to DataFrame
      results_df = pd.DataFrame(results)

      # Save and log CSV
      csv_path = "inference_results.csv"
      results_df.to_csv(csv_path, index=False)
      mlflow.log_artifact(csv_path)

      # Accuracy (simple string match)
      correct = (results_df['answer'] == results_df['generated_answer']).sum()
      total = len(results_df)
      accuracy = correct / total
      mlflow.log_metric("inference_accuracy", accuracy)

      print(f"Inference complete. Accuracy: {round(accuracy * 100, 2)}%")

      if include_rouge:
        # Compute and log ROUGE scores
        rouge = load("rouge")
        rouge_result = rouge.compute(
            predictions=results_df["generated_answer"].tolist(),
            references=results_df["answer"].tolist()
        )

        for key, value in rouge_result.items():
            mlflow.log_metric(f"rouge_{key}", value)  # Keep as float (not string %)

        return results_df, accuracy, rouge_result


      return results_df, accuracy

## Datasets

In [8]:
from datasets import load_dataset, Dataset, DatasetDict

# Load the medical dataset
dataset_medical = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")

# Drop the 'Complex_CoT' column
dataset_medical = dataset_medical.remove_columns(['Complex_CoT'])

# Rename column for uniformity with Math Dataset
dataset_medical = dataset_medical.rename_column("Question", "Problem")
dataset_medical = dataset_medical.rename_column("Response", "Answer")

# Split the dataset into train, validation, and test
dataset_split = dataset_medical["train"].train_test_split(test_size=0.1, shuffle=True)
train_val_split = dataset_split["train"].train_test_split(test_size=0.1, shuffle=True)

# Update dataset with the new splits
dataset_medical = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
    "test": dataset_split["test"]
})

dataset_medical

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Problem', 'Answer'],
        num_rows: 15959
    })
    validation: Dataset({
        features: ['Problem', 'Answer'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['Problem', 'Answer'],
        num_rows: 1971
    })
})

In [9]:
from google.colab import drive
drive.mount('/content/drive')
dataset_math = load_from_disk('/content/drive/MyDrive/dataset')
dataset_math = dataset_math.remove_columns(['Rationale', 'Category'])
dataset_math

Mounted at /content/drive


DatasetDict({
    train: Dataset({
        features: ['Problem', 'Answer'],
        num_rows: 26257
    })
    validation: Dataset({
        features: ['Problem', 'Answer'],
        num_rows: 3282
    })
    test: Dataset({
        features: ['Problem', 'Answer'],
        num_rows: 3283
    })
})

## Define MLFlow and NGROK

In [10]:
# Define the persistent path in your Google Drive
mlflow_drive_path = "/content/drive/MyDrive/mlflow"
os.makedirs(mlflow_drive_path, exist_ok=True)

MLFLOW_TRACKING_URI = f"sqlite:///{mlflow_drive_path}/mlflow.db"

conf.get_default().auth_token = userdata.get('NGROK')

# Start the MLflow server using subprocess
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI, "--port", "5000"])

mlflow_url = ngrok.connect(addr=5000, domain="happily-flowing-pelican.ngrok-free.app")

In [11]:
# Set MLflow tracking URI
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Set or create an experiment
mlflow.set_experiment("Hugging Face - Flan-T5-Small")

<Experiment: artifact_location='/content/mlruns/1', creation_time=1745614188584, experiment_id='1', last_update_time=1745614188584, lifecycle_stage='active', name='Hugging Face - Flan-T5-Small', tags={}>

## Load Models

### Original

In [29]:
model_name='google/flan-t5-small'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


### Peft

In [30]:
# LoRA config
lora_config = LoraConfig(
    r=8,                         # Low-rank dimension
    lora_alpha=16,               # Scaling factor
    target_modules=["q", "v"],   # Adapt attention projections only
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Apply PEFT
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 344064
all model parameters: 77305216
percentage of trainable model parameters: 0.45%


## Tokenize

### Math

In [15]:
tokenized_datasets_math = dataset_math.map(tokenize_function, batched=True)

print(tokenized_datasets_math)

tokenized_datasets_math_filtered = tokenized_datasets_math.filter(lambda example, index: index % 5 == 0, with_indices=True)

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets_math_filtered['train'].shape}")
print(f"Validation: {tokenized_datasets_math_filtered['validation'].shape}")
print(f"Test: {tokenized_datasets_math_filtered['test'].shape}")

print(tokenized_datasets_math_filtered)

Map:   0%|          | 0/3283 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 26257
    })
    validation: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 3282
    })
    test: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 3283
    })
})


Filter:   0%|          | 0/3283 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: (5252, 4)
Validation: (657, 4)
Test: (657, 4)
DatasetDict({
    train: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 5252
    })
    validation: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 657
    })
    test: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 657
    })
})


### Medical

In [16]:
tokenized_datasets_medical = dataset_medical.map(tokenize_function, batched=True)

print(tokenized_datasets_medical)

tokenized_datasets_medical_filtered = tokenized_datasets_medical.filter(lambda example, index: index % 5 == 0, with_indices=True)

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets_medical_filtered['train'].shape}")
print(f"Validation: {tokenized_datasets_medical_filtered['validation'].shape}")
print(f"Test: {tokenized_datasets_medical_filtered['test'].shape}")

print(tokenized_datasets_medical_filtered)

Map:   0%|          | 0/15959 [00:00<?, ? examples/s]

Map:   0%|          | 0/1774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1971 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 15959
    })
    validation: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 1971
    })
})


Filter:   0%|          | 0/15959 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1774 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1971 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: (3192, 4)
Validation: (355, 4)
Test: (395, 4)
DatasetDict({
    train: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 3192
    })
    validation: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 355
    })
    test: Dataset({
        features: ['Problem', 'Answer', 'input_ids', 'labels'],
        num_rows: 395
    })
})


# Fine-tuning the model

In [31]:
os.environ["WANDB_DISABLED"] = "true"

training_args_medical = TrainingArguments(
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=1,
    report_to="mlflow",
    label_names=["labels"]
)

training_args_math =TrainingArguments(
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=1,
    report_to="mlflow",
    label_names=["labels"]
)

trainer_medical = train_models_helper(peft_model, training_args_medical, tokenized_datasets_medical_filtered)
trainer_math = train_models_helper(peft_model, training_args_math, tokenized_datasets_math_filtered)

pipe_medical = pipeline("text2text-generation", model=peft_model, tokenizer=tokenizer)
pipe_math = pipeline("text2text-generation", model=peft_model, tokenizer=tokenizer)

Device set to use cuda:0
The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGen

## Train and save in MLFlow

In [32]:
# Medical
medical_model_artifact_path = train_in_mlflow(pipe_medical, training_args_medical, trainer_medical, "In a patient presenting with hematuria, proteinuria, and hypertension, which condition is typically associated with normal serum complement levels?")

Step,Training Loss
1,34.75
2,42.5
3,41.25
4,44.75
5,47.25
6,37.5
7,39.75
8,42.75
9,41.25
10,43.5


2025/04/25 15:07:56 INFO mlflow.transformers: Overriding save_pretrained to False for PEFT models, following the Transformers behavior. The PEFT adaptor and config will be saved, but the base model weights will not and reference to the HuggingFace Hub repository will be logged instead.
2025/04/25 15:07:56 INFO mlflow.transformers: Skipping saving pretrained model weights to disk as the save_pretrained argumentis set to False. The reference to the HuggingFace Hub repository google/flan-t5-small will be logged instead.
2025/04/25 15:07:57 INFO mlflow.transformers.signature: Running model prediction to infer the model output signature with a timeout of 180 seconds. You can specify a different timeout by setting the environment variable MLFLOW_INPUT_EXAMPLE_INFERENCE_TIMEOUT.
  prediction = generate_signature_output(
2025/04/25 15:07:57 INFO mlflow.transformers: A local checkpoint path or PEFT model is given as the `transformers_model`. To avoid loading the full model into memory, we don't

Logged to: /content/mlruns/1/5ad6a5b04c4e4e38a20ae73f119750a9/artifacts


In [33]:
# Math
math_model_artifact_path = train_in_mlflow(pipe_math, training_args_math, trainer_math, "What is 1 + 2 ?")

Step,Training Loss
1,37.5
2,38.0
3,38.0
4,37.75
5,37.5
6,37.5
7,37.25
8,36.75
9,37.25
10,37.25


2025/04/25 15:17:18 INFO mlflow.transformers: Overriding save_pretrained to False for PEFT models, following the Transformers behavior. The PEFT adaptor and config will be saved, but the base model weights will not and reference to the HuggingFace Hub repository will be logged instead.
2025/04/25 15:17:18 INFO mlflow.transformers: Skipping saving pretrained model weights to disk as the save_pretrained argumentis set to False. The reference to the HuggingFace Hub repository google/flan-t5-small will be logged instead.
2025/04/25 15:17:19 INFO mlflow.transformers.signature: Running model prediction to infer the model output signature with a timeout of 180 seconds. You can specify a different timeout by setting the environment variable MLFLOW_INPUT_EXAMPLE_INFERENCE_TIMEOUT.
  prediction = generate_signature_output(
2025/04/25 15:17:19 INFO mlflow.transformers: A local checkpoint path or PEFT model is given as the `transformers_model`. To avoid loading the full model into memory, we don't

Logged to: /content/mlruns/1/f7d0b8b1ecdd40458b9be546a8a9d38e/artifacts


# Inference with MLFlow

In [34]:
# Reload models
model_medical = mlflow.transformers.load_model(medical_model_artifact_path)
model_math = mlflow.transformers.load_model(math_model_artifact_path)

2025/04/25 15:22:32 INFO mlflow.transformers: 'runs:/5ad6a5b04c4e4e38a20ae73f119750a9/model-flan-t5-small-finetuned' resolved as '/content/mlruns/1/5ad6a5b04c4e4e38a20ae73f119750a9/artifacts/model-flan-t5-small-finetuned'
Device set to use cuda:0
2025/04/25 15:22:33 INFO mlflow.transformers: 'runs:/f7d0b8b1ecdd40458b9be546a8a9d38e/model-flan-t5-small-finetuned' resolved as '/content/mlruns/1/f7d0b8b1ecdd40458b9be546a8a9d38e/artifacts/model-flan-t5-small-finetuned'
Device set to use cuda:0


In [35]:
accelerator = Accelerator()

model_medical = accelerator.prepare(model_medical)
model_math = accelerator.prepare(model_math)

In [36]:
test_set_math = tokenized_datasets_math_filtered['test']
test_set_medical = tokenized_datasets_medical_filtered['test']

In [37]:
#small_test_set = test_set_math.select(range(10))
math_results_df, math_accuracy = inference_with_mlflow(test_set_math, model_math, math_model_artifact_path)

Generating answers:   2%|▏         | 10/657 [00:00<00:44, 14.67it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating answers: 100%|██████████| 657/657 [01:52<00:00,  5.85it/s]

Inference complete. Accuracy: 1.37%





In [39]:
#small_test_set = test_set_medical.select(range(10))
medical_results_df, medical_accuracy, medical_rouge = inference_with_mlflow(test_set_medical, model_medical, medical_model_artifact_path, True)

Generating answers:  59%|█████▉    | 233/395 [01:48<00:36,  4.41it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors
Generating answers: 100%|██████████| 395/395 [02:50<00:00,  2.32it/s]


Inference complete. Accuracy: 0.0%


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [40]:
math_results_df.head(10)

Unnamed: 0,question,answer,generated_answer
0,"if two integers x , y ( x > y ) are selected f...",120,10
1,"a sum of money is to be divided among ann , bo...",22,$ 32
2,the sum of two numbers is 528 and their h . c ...,4,0
3,there are 18 stations between ernakulam and ch...,380,.
4,"a sun is divided among x , y and z in such a w...",195,6000
5,"in one hour , a boat goes 14 km / hr along the...",11 km / hr,speed
6,an equal number of desks and bookcases are to ...,1,15
7,a person starting with rs . 64 and making 6 be...,rs . 37,winning
8,"robert ate 13 chocolates , nickel ate 4 chocol...",c ) 9,1
9,the difference in compound interest earned on ...,360,40


In [41]:
medical_results_df.head(10)

Unnamed: 0,question,answer,generated_answer
0,A 30-year-old G3P2 woman delivered a term baby...,"In this scenario, considering the critical nat...",B
1,What is the appropriate management for a full-...,In the scenario of a full-term pregnancy with ...,Pregnancy
2,"A 7-month-old boy presents with fever, chills,...",In a 7-month-old boy experiencing his first en...,immune system
3,An 83-year-old man with a history of severe de...,In the case of an 83-year-old man with severe ...,a sedative
4,A 58-year-old woman with hypertension managed ...,The selective α2 adrenergic receptor agonist l...,selective 2 adrenergic receptor agonist
5,A 54-year old woman complains of hot flashes a...,In considering treatment options for a 54-year...,A
6,Which drug used for the treatment of endometri...,"Oral contraceptive pills, commonly used in the...",hepatic cloning
7,A 63-year-old man with a history of hypertensi...,The symptoms described are consistent with cla...,Rest
8,What are some true characteristics of Pertussi...,"Pertussis, commonly known as whooping cough, h...",a bacterial
9,In an outbreak of cholera in a village with a ...,"In the cholera outbreak you're evaluating, the...",69.8


In [114]:
math_results_df.to_csv('inference_trained_math_results.csv', index=False)
medical_results_df.to_csv('inference_trained_medical_results.csv', index=False)

In [115]:
medical_rouge

{'rouge1': np.float64(0.029034657348058653),
 'rouge2': np.float64(0.006293447713279688),
 'rougeL': np.float64(0.027105917042534077),
 'rougeLsum': np.float64(0.027746624380772147)}

In [None]:
# zero-shot
# rouge1: 3.18%
# rouge2: 0.77%
# rougeL: 2.99%
# rougeLsum: 3.04%


# 1 epoch
# rouge1: 3.27%
# rouge2: 0.8%
# rougeL: 3.04%
# rougeLsum: 3.14%


# 10 epochs
# rouge1: 3.21%
# rouge2: 0.73%
# rougeL: 3.03%
# rougeLsum: 3.08%

# Zero-shot testing

In [43]:
original_math = pd.read_csv('/content/inference_original_math_results.csv')
original_medical = pd.read_csv('/content/inference_original_medical_results.csv')

In [45]:
original_math.head(10)

Unnamed: 0,question,answer,generated_answer
0,"if two integers x , y ( x > y ) are selected f...",120,5
1,"a sum of money is to be divided among ann , bo...",22,32
2,the sum of two numbers is 528 and their h . c ...,4,0
3,there are 18 stations between ernakulam and ch...,380,0
4,"a sun is divided among x , y and z in such a w...",195,0
5,"in one hour , a boat goes 14 km / hr along the...",11 km / hr,.
6,an equal number of desks and bookcases are to ...,1,15
7,a person starting with rs . 64 and making 6 be...,rs . 37,a winning percentage
8,"robert ate 13 chocolates , nickel ate 4 chocol...",c ) 9,4
9,the difference in compound interest earned on ...,360,3 times its current value


In [46]:
original_medical.head(10)

Unnamed: 0,question,answer,generated_answer
0,Based on the case study of the 60-year-old Cau...,The best predictor of the 60-year-old man atte...,a).
1,A 72-year-old man with a history of benign pro...,Based on the clinical history and laboratory f...,hypertension
2,What test would you use to determine if a reti...,To determine if a retinoblastoma with a single...,a syringe
3,A 51-year-old male presents to his primary car...,The best medication for this patient would be ...,C
4,What is the best management approach for a 35-...,To best manage a 35-year-old male with chronic...,a sedative
5,A 23-year-old male experienced severe chest pa...,The symptoms experienced by the 23-year-old ma...,He was admitted to the emergency department
6,What drug would you prescribe to a highway tru...,In a situation where a highway truck driver is...,a sneezing medication
7,"In a newborn male with an imperforate anus, wh...","In a newborn male with an imperforate anus, wh...",a sedation
8,A 40 year old tobacco chewer was given a routi...,The most fitting diagnosis for the 40-year-old...,D
9,An 8-year-old boy presents with swelling in th...,"Based on the provided clinical details, the mo...",aplastic femoral swelling


In [49]:
correct = (original_math['answer'] == original_math['generated_answer']).sum()
total = len(original_math)
accuracy = correct / total
print(f"Accuracy of the zero-shot model on a math task: {round(accuracy * 100, 2)}%")

correct = (math_results_df['answer'] == math_results_df['generated_answer']).sum()
total = len(math_results_df)
accuracy = correct / total
print(f"Accuracy of the fine-tuned model on a math task: {round(accuracy * 100, 2)}%")

Accuracy of the zero-shot model on a math task: 0.91%
Accuracy of the fine-tuned model on a math task: 1.37%


In [52]:
rouge = load("rouge")
rouge_result = rouge.compute(
    predictions=original_medical["generated_answer"].tolist(),
    references=original_medical["answer"].tolist()
)


print("ROUGE scores of the zero-shot model on a medical reasoning task:")
for key, value in rouge_result.items():
    print(f"{key}: {round(value * 100, 2)}%")

rouge_result = rouge.compute(
    predictions=medical_results_df["generated_answer"].tolist(),
    references=medical_results_df["answer"].tolist()
)

print("")
print("-----------------------------------------------------------------")
print("")

print("ROUGE scores of the fine-tuned model on a medical reasoning task:")
for key, value in rouge_result.items():
    print(f"{key}: {round(value * 100, 2)}%")

ROUGE scores of the zero-shot model on a medical reasoning task:
rouge1: 2.88%
rouge2: 0.65%
rougeL: 2.7%
rougeLsum: 2.76%

-----------------------------------------------------------------

ROUGE scores of the fine-tuned model on a medical reasoning task:
rouge1: 2.54%
rouge2: 0.55%
rougeL: 2.4%
rougeLsum: 2.42%


In [104]:
test_set = tokenized_datasets_math_filtered['test']

# Inference
with mlflow.start_run():
    results = []

    for example in tqdm(test_set, desc="Generating answers", total=len(test_set)):
        question = example['Problem']
        true_answer = example['Answer']

        prompt = f"""Answer the following question:

{question}

Answer:"""

        # Model is already a pipeline; no need for tokenizer or .to(device)
        output = pipe_math(prompt, max_new_tokens=700)
        generated_answer = output[0]['generated_text'].strip()

        results.append({
            "question": question,
            "answer": true_answer.strip(),
            "generated_answer": generated_answer
        })

    # Convert to DataFrame
    results_math_df = pd.DataFrame(results)

    # Save and log CSV
    csv_path = "inference_original_math_results.csv"
    results_math_df.to_csv(csv_path, index=False)
    mlflow.log_artifact(csv_path)

    # Accuracy (simple string match)
    correct = (results_math_df['answer'] == results_math_df['generated_answer']).sum()
    total = len(results_math_df)
    accuracy = correct / total
    mlflow.log_metric("inference_accuracy", accuracy)

    print(f"Inference complete. Accuracy: {round(accuracy * 100, 2)}%")

Generating answers: 100%|██████████| 657/657 [05:33<00:00,  1.97it/s]

Inference complete. Accuracy: 0.91%





In [108]:
results_math_df.head()

Unnamed: 0,question,answer,generated_answer
0,"if two integers x , y ( x > y ) are selected f...",120,5
1,"a sum of money is to be divided among ann , bo...",22,32
2,the sum of two numbers is 528 and their h . c ...,4,0
3,there are 18 stations between ernakulam and ch...,380,0
4,"a sun is divided among x , y and z in such a w...",195,0


In [106]:
test_set = tokenized_datasets_medical_filtered['test']

# Inference
with mlflow.start_run():
    results = []

    for example in tqdm(test_set, desc="Generating answers", total=len(test_set)):
        question = example['Problem']
        true_answer = example['Answer']

        prompt = f"""Answer the following question:

{question}

Answer:"""

        # Model is already a pipeline; no need for tokenizer or .to(device)
        output = pipe_medical(prompt, max_new_tokens=700)
        generated_answer = output[0]['generated_text'].strip()

        results.append({
            "question": question,
            "answer": true_answer.strip(),
            "generated_answer": generated_answer
        })

    # Convert to DataFrame
    results_medical_df = pd.DataFrame(results)

    # Save and log CSV
    csv_path = "inference_original_medical_results.csv"
    results_medical_df.to_csv(csv_path, index=False)
    mlflow.log_artifact(csv_path)

    # Accuracy (simple string match)
    correct = (results_medical_df['answer'] == results_medical_df['generated_answer']).sum()
    total = len(results_medical_df)
    accuracy = correct / total
    mlflow.log_metric("inference_accuracy", accuracy)

    print(f"Inference complete. Accuracy: {round(accuracy * 100, 2)}%")

    # Compute and log ROUGE scores
    rouge = load("rouge")
    rouge_result = rouge.compute(
        predictions=results_medical_df["generated_answer"].tolist(),
        references=results_medical_df["answer"].tolist()
    )

    for key, value in rouge_result.items():
        mlflow.log_metric(f"rouge_{key}", value)  # Keep as float (not string %)

Generating answers: 100%|██████████| 395/395 [03:42<00:00,  1.77it/s]


Inference complete. Accuracy: 0.0%


In [109]:
results_medical_df.head()

Unnamed: 0,question,answer,generated_answer
0,Based on the case study of the 60-year-old Cau...,The best predictor of the 60-year-old man atte...,a).
1,A 72-year-old man with a history of benign pro...,Based on the clinical history and laboratory f...,hypertension
2,What test would you use to determine if a reti...,To determine if a retinoblastoma with a single...,a syringe
3,A 51-year-old male presents to his primary car...,The best medication for this patient would be ...,C
4,What is the best management approach for a 35-...,To best manage a 35-year-old male with chronic...,a sedative


In [47]:
from google.colab import files
import shutil
import os

def download_mlruns(mlruns_path="/content/mlruns", zip_name="mlruns.zip"):
    """
    Downloads the mlruns folder from Colab.

    Args:
        mlruns_path (str): Path to the mlruns folder in Colab.
        zip_name (str): Name of the zip archive to create.
    """
    if not os.path.exists(mlruns_path):
        print(f"Error: {mlruns_path} does not exist.")
        return

    try:
        # Create a ZIP archive of the mlruns folder.
        shutil.make_archive("mlruns", 'zip', mlruns_path)
        # Download the ZIP file.
        files.download(zip_name)
    except Exception as e:
        print(f"Error downloading mlruns: {e}")

download_mlruns()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>