<a href="https://colab.research.google.com/github/umarhashmi2002/Alzimer_disease/blob/main/Copy_of_Csv_Trained_Fyp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Install required libraries (if not already installed on Kaggle/Colab).
# Runs before the imports below so a fresh runtime has every package available.
!pip install transformers datasets evaluate --quiet

import math
import os

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split

# Hugging Face Transformers and Datasets
from datasets import Dataset
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# To evaluate sequence generation metrics (BLEU, ROUGE), we'll use 'evaluate'
import evaluate

print("All libraries imported successfully!")


All libraries imported successfully!


In [19]:
# Location of the ADNI screening metadata CSV.
# This is a Colab-style "/content" path; update 'csv_path' to wherever your copy of the file lives.
csv_path = "/content/ADNI1_Screening_1.5T_1_29_2024.csv"

# Load the CSV into a pandas DataFrame; `df` is the frame every later cell works from
df = pd.read_csv(csv_path)

# Print the (rows, columns) shape, then show the first rows as the cell's rendered output
print("DataFrame shape:", df.shape)
df.head()


DataFrame shape: (1075, 12)


Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I62666,013_S_1275,MCI,F,79,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,2/22/2007,NiFTI,1/29/2024
1,I119268,121_S_1322,MCI,F,72,sc,MRI,MPR; ; N3; Scaled_2,Processed,3/02/2007,NiFTI,1/29/2024
2,I59697,116_S_0649,MCI,M,87,sc,MRI,MPR; GradWarp; N3; Scaled,Processed,7/24/2006,NiFTI,1/29/2024
3,I68581,099_S_0880,MCI,M,84,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,10/05/2006,NiFTI,1/29/2024
4,I60760,029_S_1318,MCI,F,83,sc,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,2/17/2007,NiFTI,1/29/2024


In [20]:
# For this example, we assume the DataFrame has columns: 'Group', 'Age', 'Sex', and 'Description'.
# Create an input text prompt that combines group, age, and sex
def create_prompt(row):
    """Build the text prompt for one record.

    `row` is anything indexable by the keys 'Group', 'Age' and 'Sex'
    (for example a DataFrame row or a plain dict). Returns the prompt string.
    """
    group, age, sex = (row[key] for key in ('Group', 'Age', 'Sex'))
    return f"Group: {group}. Age: {age}. Sex: {sex}."

# Build a prompt for every record and store it in a new 'input_text' column
df['input_text'] = df.apply(create_prompt, axis='columns')

# Preview the generated prompts next to their target descriptions
print(df.head()[['input_text', 'Description']])


                     input_text                                 Description
0  Group: MCI. Age: 79. Sex: F.    MPR; GradWarp; B1 Correction; N3; Scaled
1  Group: MCI. Age: 72. Sex: F.                         MPR; ; N3; Scaled_2
2  Group: MCI. Age: 87. Sex: M.                   MPR; GradWarp; N3; Scaled
3  Group: MCI. Age: 84. Sex: M.    MPR; GradWarp; B1 Correction; N3; Scaled
4  Group: MCI. Age: 83. Sex: F.  MPR-R; GradWarp; B1 Correction; N3; Scaled


In [21]:
# Hold out 20% of the rows for validation (80-20 split).
# random_state is fixed so the same rows land in each split on every run.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Report the (rows, columns) shape of each split
print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)


Training set shape: (860, 13)
Validation set shape: (215, 13)


In [22]:
# Convert the pandas DataFrames to Hugging Face Datasets, keeping only the prompt and target columns.
# The printed features also list '__index_level_0__': the DataFrame index is carried over as a column.
train_dataset = Dataset.from_pandas(train_df[['input_text', 'Description']])
val_dataset = Dataset.from_pandas(val_df[['input_text', 'Description']])

# Show the feature names and row counts of both datasets
print("Training dataset:", train_dataset)
print("Validation dataset:", val_dataset)


Training dataset: Dataset({
    features: ['input_text', 'Description', '__index_level_0__'],
    num_rows: 860
})
Validation dataset: Dataset({
    features: ['input_text', 'Description', '__index_level_0__'],
    num_rows: 215
})


In [23]:
# Load the pre-trained T5 tokenizer.
# `model_name` is also the checkpoint name used when the model itself is loaded.
model_name = "t5-small"  # You can change this to any other T5 variant (e.g., "t5-base", "t5-large", etc.)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Maximum token lengths for inputs (prompts) and outputs (descriptions) - adjust if necessary.
# preprocess_function truncates to these lengths; generate_prediction reuses them at inference time.
max_input_length = 64
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of prompts and target descriptions for seq2seq training.

    Args:
        examples: batch mapping with an 'input_text' entry (prompts) and a
            'Description' entry (target texts).

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry
        holding the token ids of the targets.

    Relies on the module-level `tokenizer`, `max_input_length` and
    `max_target_length`.
    """
    # Tokenize the input texts (prompts)
    model_inputs = tokenizer(examples['input_text'], max_length=max_input_length, truncation=True)

    # Tokenize the target texts via `text_target`; this replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['Description'], max_length=max_target_length, truncation=True)

    # Labels are stored unpadded here. Padding (and the -100 ignore index) is
    # expected to come from the DataCollatorForSeq2Seq used for batching.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize each split in batches, then drop the raw text and index columns so
# only the tokenized features remain (for faster training)
train_dataset = train_dataset.map(preprocess_function, batched=True).remove_columns(
    ["input_text", "Description", "__index_level_0__"]
)
val_dataset = val_dataset.map(preprocess_function, batched=True).remove_columns(
    ["input_text", "Description", "__index_level_0__"]
)


Map:   0%|          | 0/860 [00:00<?, ? examples/s]



Map:   0%|          | 0/215 [00:00<?, ? examples/s]

In [24]:
# Load the T5 conditional-generation model for the checkpoint named by `model_name`
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Confirm which checkpoint was loaded
print(f"Model {model_name} loaded successfully!")


Model t5-small loaded successfully!


In [25]:
# DataCollatorForSeq2Seq is already imported in the first cell; this import repeats it.
from transformers import DataCollatorForSeq2Seq
# Collator that assembles batches for training; it is given both the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Record the installed transformers version alongside the results
import transformers
print("Transformers version:", transformers.__version__)


Transformers version: 4.48.3


In [26]:
# Print the installed transformers version (the same value the previous cell reports)
import transformers
print(transformers.__version__)


4.48.3


In [27]:
# Evaluate periodically and, once training ends, reload the checkpoint with the lowest 'eval_loss'
# (you could also monitor a custom metric like ROUGE or BLEU).
# NOTE: these arguments alone do not stop training early; that additionally needs an
# EarlyStoppingCallback on the trainer. 'load_best_model_at_end' only restores the best checkpoint.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_finetuned_adni",
    eval_strategy="steps",           # replaces the deprecated `evaluation_strategy`; use "epoch" if you prefer
    eval_steps=100,                  # Evaluate every 100 steps
    logging_steps=50,
    save_steps=200,                  # must stay a round multiple of eval_steps when loading the best model at end
    num_train_epochs=3,              # Increase if needed
    per_device_train_batch_size=8,   # Adjust based on GPU memory
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,      # evaluation metrics are computed on generated sequences
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available

    # Best-model selection
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,         # Because lower eval_loss is better
)




In [29]:
def compute_metrics(eval_pred):
    """Decode predictions and labels, then score them with BLEU and ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` may be a
            tuple (only its first element is used), a 2-D array of token ids,
            or a 3-D array, which is treated as per-position vocabulary scores.
            ``labels`` holds token ids with -100 marking ignored positions.

    Returns:
        dict with the keys "bleu", "rouge1", "rouge2", "rougeL" and "rougeLsum".

    Relies on the module-level `tokenizer`, `bleu_metric` and `rouge_metric`.
    """
    predictions, labels = eval_pred

    # Some models hand back a tuple of outputs; the predictions come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        # Assumed to be (batch, seq_len, vocab) scores: pick the best token at
        # every position. Slicing `[:, 0, :]` would instead keep the first
        # position's vocabulary scores, which are not token ids.
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id

    # Negative ids (-100 padding) cannot be decoded: map them to the pad token,
    # then clamp anything beyond the tokenizer's vocabulary into range.
    predictions = np.where(predictions < 0, pad_id, predictions)
    predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)

    # Replace -100 in labels with the pad token id so they can be decoded
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU takes a list of references per prediction; ROUGE takes one reference each.
    bleu_results = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )
    rouge_results = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    return {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "rougeLsum": rouge_results["rougeLsum"]
    }


In [30]:
# Load the metrics that compute_metrics looks up by name.
# NOTE(review): the ROUGE metric in `evaluate` typically needs the `rouge_score`
# package installed in the runtime - confirm before a fresh run.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# Build the trainer in this cell: no other cell in the notebook defines `trainer`,
# so a fresh "Restart & Run All" would otherwise fail here with a NameError.
# Seq2SeqTrainer is used so that `predict_with_generate=True` takes effect and
# compute_metrics receives generated token ids.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,  # newer name of the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics,
)

# Start training - this may take some time depending on dataset size and hardware
train_result = trainer.train()

# Trainer reloads the best checkpoint at the end if 'load_best_model_at_end=True'
# You can also explicitly save the model here if desired:
# trainer.save_model("./t5_finetuned_adni_best")

# Print training stats
print("Training completed. Metrics:")
print(train_result.metrics)


Step,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel,Rougelsum
100,0.2538,0.160404,0.793989,0.877606,0.732628,0.877413,0.877046
200,0.2013,0.141484,0.793989,0.877606,0.732628,0.877413,0.877046
300,0.1808,0.149668,0.793989,0.877606,0.732628,0.877413,0.877046


Step,Training Loss,Validation Loss


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed. Metrics:
{'train_runtime': 80.8189, 'train_samples_per_second': 31.923, 'train_steps_per_second': 4.009, 'total_flos': 10911932743680.0, 'train_loss': 0.23389334590346725, 'epoch': 3.0}


In [31]:
# Run evaluation on the validation set
eval_results = trainer.evaluate()

# Perplexity is exp(eval_loss); it stays None when no eval_loss was reported
perplexity = (
    math.exp(eval_results["eval_loss"])
    if eval_results.get("eval_loss") is not None
    else None
)
eval_results["perplexity"] = perplexity

# List every metric, one per line
print("Evaluation Results:")
for key, value in eval_results.items():
    print(f"  {key}: {value}")


Evaluation Results:
  eval_loss: 0.14148417115211487
  eval_bleu: 0.7939886729930983
  eval_rouge1: 0.8776061148154175
  eval_rouge2: 0.7326279405349172
  eval_rougeL: 0.8774126648545252
  eval_rougeLsum: 0.8770455900688459
  eval_runtime: 12.3539
  eval_samples_per_second: 17.403
  eval_steps_per_second: 2.186
  epoch: 3.0
  perplexity: 1.1519822695643804


In [32]:
def generate_prediction(example):
    """Generate the model's text output for a single example.

    Args:
        example: mapping with an 'input_text' entry holding the prompt.

    Returns:
        The decoded text of the first generated sequence, special tokens removed.

    Relies on the module-level `tokenizer`, `model`, `max_input_length`
    and `max_target_length`.
    """
    encoded = tokenizer(
        example['input_text'],
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    )
    # Move the prompt ids to whichever device the model lives on
    prompt_ids = encoded.input_ids.to(model.device)

    # Beam search with 4 beams, capped at the target length
    generated = model.generate(
        prompt_ids,
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Take the first five validation rows as a small demonstration set
samples = val_df[['input_text', 'Description']].head(5)

# Print each prompt with its reference description and the model's output
for prompt, target in zip(samples['input_text'], samples['Description']):
    pred = generate_prediction({"input_text": prompt})
    print(f"Input: {prompt}")
    print(f"Ground Truth: {target}")
    print(f"Prediction: {pred}\n")


Input: Group: MCI. Age: 81. Sex: M.
Ground Truth: MPR; GradWarp; N3; Scaled
Prediction: MPR; ; N3; Scaled_2

Input: Group: MCI. Age: 61. Sex: F.
Ground Truth: MPR; GradWarp; B1 Correction; N3; Scaled
Prediction: MPR; ; N3; Scaled_2

Input: Group: MCI. Age: 71. Sex: M.
Ground Truth: MPR; GradWarp; B1 Correction; N3; Scaled_2
Prediction: MPR; ; N3; Scaled_2

Input: Group: MCI. Age: 80. Sex: M.
Ground Truth: MPR; GradWarp; B1 Correction; N3; Scaled
Prediction: MPR; ; N3; Scaled_2

Input: Group: MCI. Age: 70. Sex: M.
Ground Truth: MPR-R; GradWarp; N3; Scaled
Prediction: MPR; ; N3; Scaled_2



In [33]:
# Save the model and tokenizer side by side for later use.
# The target is the same directory that the training arguments use as output_dir.
model.save_pretrained("./t5_finetuned_adni")
tokenizer.save_pretrained("./t5_finetuned_adni")
print("Model and tokenizer saved successfully!")


Model and tokenizer saved successfully!
