In [1]:
!pip install transformers==4.36.2 datasets==2.16.1 peft==0.7.1 evaluate==0.4.1 torch==2.1.2 psutil==5.9.8 numpy==1.26.3 accelerate==0.25.0 sacrebleu

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting peft==0.7.1
  Downloading peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Collecting evaluate==0.4.1
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting torch==2.1.2
  Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting psutil==5.9.8
  Downloading psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Collecting numpy==1.26.3
  Downloading numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m3.6 MB/

In [2]:
import torch
import numpy as np
import psutil
import sacrebleu
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, TrainerCallback, EarlyStoppingCallback
from peft import LoraConfig, TaskType, get_peft_model
from evaluate import load

2025-06-10 21:47:12.353492: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749592032.532286      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749592032.585464      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from accelerate import init_empty_weights

torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for matmul
torch.backends.cudnn.allow_tf32 = True  # Enable TF32 for convolutions
print(f"TF32 enabled: {torch.backends.cuda.matmul.allow_tf32}")

TF32 enabled: True


In [4]:
# Add this right after your imports
print("Environment check:")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"BF16 supported: {torch.cuda.is_bf16_supported()}")
print(f"GPU: {torch.cuda.get_device_name(0)}")

# Set appropriate precision
if torch.cuda.is_bf16_supported():
    torch.set_float32_matmul_precision('medium')
    print("Using BF16 precision where possible")
else:
    print("BF16 not supported, using FP16")

Environment check:
PyTorch version: 2.1.2+cu121
CUDA available: True
CUDA version: 12.1
BF16 supported: False
GPU: Tesla T4
BF16 not supported, using FP16


In [5]:
# Memory monitoring function from teacher's code
def print_memory_footprint():
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / (1024 ** 3)  # Convert to GB
        gpu_memory_cached = torch.cuda.memory_reserved() / (1024 ** 3)  # Cached memory
        print(f"[GPU] Memory Allocated: {gpu_memory:.2f} GB, Cached: {gpu_memory_cached:.2f} GB")
    else:
        print("[GPU] No GPU detected.")
    memory = psutil.virtual_memory()
    used_memory_gb = memory.used / (1024 ** 3)  # Convert to GB
    total_memory_gb = memory.total / (1024 ** 3)
    print(f"[CPU] Memory Usage: {used_memory_gb:.2f} GB / {total_memory_gb:.2f} GB")

In [6]:
# Set memory constraints
torch.cuda.set_per_process_memory_fraction(1.0, 0)  # Use maximum available memory
torch.cuda.memory_max_split_size_mb = 64  # Avoid fragmentation

In [7]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [8]:
# Load SQuAD dataset
dataset = load_dataset("squad")

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token



tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [10]:
# Preprocess function for SQuAD
def tokenize_function(examples):
    # Format: "Question: [question] Context: [context] Answer: [answer]"
    texts = [f"Question: {q} Context: {c} Answer: {a['text'][0]}"
             for q, c, a in zip(examples["question"], examples["context"], examples["answers"])]
    return tokenizer(texts, truncation=True, max_length=512)

In [11]:
# Tokenize datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["id", "title", "context", "question", "answers"])
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))  # Subset for compute
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(500))


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [12]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
# Define 10 evaluation prompts and reference answers (not from SQuAD, covering diverse topics)
eval_prompts = [
    "Question: What is the capital city of Japan? Answer:",
    "Question: Who wrote the novel 'Pride and Prejudice'? Answer:",
    "Question: What is the chemical symbol for gold? Answer:",
    "Question: In which year did the Titanic sink? Answer:",
    "Question: What is the largest mammal on Earth? Answer:",
    "Question: Who painted the Mona Lisa? Answer:",
    "Question: What is the main source of energy for Earth's climate system? Answer:",
    "Question: What is the longest river in the world? Answer:",
    "Question: Who discovered penicillin? Answer:",
    "Question: What is the primary language spoken in Brazil? Answer:"
]
reference_answers = [
    "The capital city of Japan is Tokyo.",
    "The novel 'Pride and Prejudice' was written by Jane Austen.",
    "The chemical symbol for gold is Au.",
    "The Titanic sank in 1912.",
    "The largest mammal on Earth is the blue whale.",
    "The Mona Lisa was painted by Leonardo da Vinci.",
    "The main source of energy for Earth's climate system is the Sun.",
    "The longest river in the world is the Nile.",
    "Penicillin was discovered by Alexander Fleming.",
    "The primary language spoken in Brazil is Portuguese."
]

In [14]:
def evaluate_model(model_path, prompts, references):
    try:
        # Load model with FP16 precision for T4
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,  # Force FP16 for T4
            device_map="auto"
        ).eval()

        bleu_scores = []
        responses = []

        for prompt, ref in zip(prompts, references):
            try:
                input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

                with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
                    output = model.generate(
                        input_ids,
                        max_new_tokens=50,
                        pad_token_id=tokenizer.eos_token_id,
                        temperature=0.7,
                        do_sample=True,
                        top_p=0.9,
                        repetition_penalty=1.1
                    )

                pred = tokenizer.decode(output[0], skip_special_tokens=True)

                # Improved answer extraction
                pred_answer = pred.split("Answer:")[-1].strip() if "Answer:" in pred else pred.strip()

                # More robust BLEU calculation
                try:
                    bleu = sacrebleu.corpus_bleu(
                        [pred_answer],
                        [[ref]],
                        tokenize="flores101"  # Better tokenization for QA
                    ).score / 100
                except:
                    bleu = 0.0

                bleu_scores.append(bleu)
                responses.append({
                    "prompt": prompt,
                    "response": pred_answer,
                    "reference": ref,
                    "bleu": bleu
                })

            except Exception as e:
                print(f"Error processing prompt '{prompt[:30]}...': {str(e)}")
                bleu_scores.append(0.0)
                responses.append({
                    "prompt": prompt,
                    "response": "ERROR",
                    "reference": ref,
                    "bleu": 0.0
                })

        return {
            "model": model_path,
            "avg_bleu": np.mean(bleu_scores) if bleu_scores else 0.0,
            "individual_bleu": bleu_scores,
            "responses": responses
        }

    except Exception as e:
        print(f"Error evaluating model {model_path}: {e}")
        return {
            "model": model_path,
            "avg_bleu": 0.0,
            "individual_bleu": [0.0]*len(prompts),
            "responses": []
        }

In [15]:
# Memory callback
class MemoryCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        print("\nMemory footprint after evaluation:")
        print_memory_footprint()

In [16]:
configurations = [
    # 5. Experimental Configuration (Low Temp)
    {
        "name": "Experimental-LowTemp",
        "lora": {
            "r": 16,
            "target_modules": ["q_proj", "v_proj"],
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "bias": "none"
        },
        "training": {
            "learning_rate": 7e-5,
            "batch_size": 8,
            "epochs": 4,
            "gradient_accumulation_steps": 2,
            "warmup_ratio": 0.1,
            "weight_decay": 0.01,
            "early_stopping_patience": 3,
            "early_stopping_threshold": 0.005,
            "generation_temp": 0.5,  # Very deterministic
            "train_temp": 0.8  # Lower temp during training
        },
        "output_dir": "./tinyllama-qa-exp-lowtemp"
    }]

#     # 2. Balanced Configuration
#     {
#         "name": "Balanced",
#         "lora": {
#             "r": 16,
#             "target_modules": ["q_proj", "k_proj", "v_proj"],
#             "lora_alpha": 32,
#             "lora_dropout": 0.05,
#             "bias": "none"
#         },
#         "training": {
#             "learning_rate": 5e-5,
#             "batch_size": 12,
#             "epochs": 5,
#             "gradient_accumulation_steps": 3,
#             "warmup_ratio": 0.15,
#             "weight_decay": 0.005,
#             "early_stopping_patience": 3,
#             "early_stopping_threshold": 0.005,
#             "generation_temp": 0.8,
#             "train_temp": 1.0
#         },
#         "output_dir": "./tinyllama-qa-balanced"
#     },

#     # 3. Aggressive Configuration (High Rank)
#     {
#         "name": "Aggressive",
#         "lora": {
#             "r": 32,
#             "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
#             "lora_alpha": 64,
#             "lora_dropout": 0.2,
#             "bias": "lora_only"
#         },
#         "training": {
#             "learning_rate": 1e-4,
#             "batch_size": 16,
#             "epochs": 6,
#             "gradient_accumulation_steps": 4,
#             "warmup_ratio": 0.2,
#             "weight_decay": 0.0,
#             "early_stopping_patience": 2,
#             "early_stopping_threshold": 0.02,
#             "generation_temp": 0.9,
#             "train_temp": 1.1
#         },
#         "output_dir": "./tinyllama-qa-aggressive"
#     },

#     # 4. Experimental Configuration (Very High Rank)
#     {
#         "name": "Experimental-HighRank",
#         "lora": {
#             "r": 64,
#             "target_modules": ["q_proj", "v_proj", "gate_proj"],
#             "lora_alpha": 128,
#             "lora_dropout": 0.15,
#             "bias": "all"
#         },
#         "training": {
#             "learning_rate": 2e-5,
#             "batch_size": 4,
#             "epochs": 8,
#             "gradient_accumulation_steps": 8,
#             "warmup_ratio": 0.3,
#             "weight_decay": 0.001,
#             "early_stopping_patience": 4,
#             "early_stopping_threshold": 0.001,
#             "generation_temp": 1.0,
#             "train_temp": 1.2
#         },
#         "output_dir": "./tinyllama-qa-exp-highrank"
#     },

#     # 5. Experimental Configuration (Low Temp)
#     {
#         "name": "Experimental-LowTemp",
#         "lora": {
#             "r": 16,
#             "target_modules": ["q_proj", "v_proj"],
#             "lora_alpha": 32,
#             "lora_dropout": 0.1,
#             "bias": "none"
#         },
#         "training": {
#             "learning_rate": 7e-5,
#             "batch_size": 8,
#             "epochs": 5,
#             "gradient_accumulation_steps": 2,
#             "warmup_ratio": 0.1,
#             "weight_decay": 0.01,
#             "early_stopping_patience": 3,
#             "early_stopping_threshold": 0.005,
#             "generation_temp": 0.5,  # Very deterministic
#             "train_temp": 0.8  # Lower temp during training
#         },
#         "output_dir": "./tinyllama-qa-exp-lowtemp"
#     }
# ]

In [17]:
import os
os.environ["ACCELERATE_USE_F16"] = "true"

In [18]:
# Train and evaluate five trials
results = []
base_model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Evaluating base model...")
# Load base model in float32 to avoid Half precision issues
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float32).to(device)
results.append(evaluate_model(base_model_path, eval_prompts, reference_answers))
del base_model
torch.cuda.empty_cache()

for i, config in enumerate(configurations, 1):
    print(f"\nStarting Trial {i}: {config['name']}")
    print_memory_footprint()

    # Load model in float32
    model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float32).to(device)

    # LoRA configuration
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=config["lora"]["r"],
        lora_alpha=config["lora"]["lora_alpha"],
        lora_dropout=config["lora"]["lora_dropout"],
        target_modules=config["lora"]["target_modules"],
        bias=config["lora"]["bias"]
    )
    model = get_peft_model(model, lora_config)

    # Ensure LoRA parameters require gradients
    for name, param in model.named_parameters():
        if "lora" in name:
            param.requires_grad = True

    # Training arguments
    training_args = TrainingArguments(
        output_dir=config["output_dir"],
        per_device_train_batch_size=config["training"]["batch_size"],
        per_device_eval_batch_size=config["training"]["batch_size"],
        num_train_epochs=config["training"]["epochs"],
        learning_rate=config["training"]["learning_rate"],
        gradient_accumulation_steps=config["training"]["gradient_accumulation_steps"],
        warmup_ratio=config["training"]["warmup_ratio"],
        weight_decay=config["training"]["weight_decay"],
        logging_dir=f"./logs/trial{i}",
        logging_steps=250,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to="none",
        remove_unused_columns=False,
        bf16=False,
        fp16=False,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        load_best_model_at_end = True
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[MemoryCallback(), EarlyStoppingCallback(
            early_stopping_patience=config["training"]["early_stopping_patience"],
            early_stopping_threshold=config["training"]["early_stopping_threshold"]
        )]
    )

    # Train
    trainer.train()
    trainer.save_model(config["output_dir"])

    # Evaluate with specified generation temperature
    print(f"Evaluating Trial {i}...")
    results.append(evaluate_model(
        config["output_dir"],
        eval_prompts,
        reference_answers
    ))

    # Clear memory
    del model, trainer
    torch.cuda.empty_cache()
    print_memory_footprint()

Evaluating base model...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Starting Trial 1: Experimental-LowTemp
[GPU] Memory Allocated: 0.96 GB, Cached: 1.01 GB
[CPU] Memory Usage: 4.37 GB / 31.35 GB


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
0,2.1288,2.03215
2,2.0389,2.02422
3,2.037,2.024124



Memory footprint after evaluation:
[GPU] Memory Allocated: 4.26 GB, Cached: 13.68 GB
[CPU] Memory Usage: 6.99 GB / 31.35 GB





Memory footprint after evaluation:
[GPU] Memory Allocated: 4.17 GB, Cached: 13.68 GB
[CPU] Memory Usage: 6.99 GB / 31.35 GB





Memory footprint after evaluation:
[GPU] Memory Allocated: 4.26 GB, Cached: 13.68 GB
[CPU] Memory Usage: 6.99 GB / 31.35 GB





Memory footprint after evaluation:
[GPU] Memory Allocated: 4.17 GB, Cached: 13.68 GB
[CPU] Memory Usage: 7.01 GB / 31.35 GB
Evaluating Trial 1...




[GPU] Memory Allocated: 0.97 GB, Cached: 1.04 GB
[CPU] Memory Usage: 7.11 GB / 31.35 GB


In [19]:
# Print results
print("\nEvaluation Results:")
model_names = ["Base Model"] + [config["name"] for config in configurations]

for i, (result, model_name) in enumerate(zip(results, model_names)):
    try:
        avg_bleu = result.get("avg_bleu", 0.0)
        individual_bleu = result.get("individual_bleu", [])
        temperature = configurations[i-1]["training"]["generation_temp"] if i > 0 else 1.0  # Default temp for base model
        print(f"Model: {model_name}, Average BLEU: {avg_bleu:.4f}, Temperature: {temperature:.2f}")
        print(f"Individual BLEU Scores: {[f'{score:.4f}' for score in individual_bleu]}")
    except (AttributeError, TypeError) as e:
        print(f"Error printing results for {model_name}: {str(e)}")


Evaluation Results:
Model: Base Model, Average BLEU: 0.1803, Temperature: 1.00
Individual BLEU Scores: ['0.0000', '0.0717', '0.0375', '0.2585', '0.3784', '0.0000', '0.7917', '0.1075', '0.0456', '0.1123']
Model: Experimental-LowTemp, Average BLEU: 0.0820, Temperature: 0.50
Individual BLEU Scores: ['0.0143', '0.1596', '0.0094', '0.0336', '0.0209', '0.0423', '0.2894', '0.1296', '0.0967', '0.0240']


In [20]:
# Save results to file
import json
with open("evaluation_results.json", "w") as f:
    json.dump(results, f, indent=4)