In [None]:
%pip install transformers datasets peft


In [None]:

pip install rake-nltk


In [None]:

%pip install tiktoken

In [None]:
%pip install transformers datasets peft

%pip install tiktoken


In [None]:

from datasets import Dataset, load_dataset

# Fetch the arXiv physics instruction-tuning corpus from the Hugging Face Hub.
dataset = load_dataset("ArtifactAI/arxiv-physics-instruct-tune-30k")

# Work on the 'train' split as a pandas DataFrame so rows can be processed with DataFrame.apply.
train_split = dataset['train']
df = train_split.to_pandas()

# Structure the data
def structure_data(row):
    """Turn one question/answer row into the record format used by the rest of the notebook.

    Args:
        row: Mapping-like object (for example a pandas row) exposing 'question'
            and 'answer' entries.

    Returns:
        dict: ``{"text": ..., "label": {"prompt": ..., "completion": ...}}`` where
        ``text`` is the full string (question template followed by the answer),
        ``prompt`` is the prefix of ``text`` that precedes the answer, and
        ``completion`` is the raw answer.
    """
    question = row['question']
    answer = row['answer']
    # The prompt ends right before the answer so it never leaks the target.
    prompt = f"Can you answer this question: {question}? The answer is"
    # The answer has to be interpolated; without the braces the text would end
    # with the literal word "answer" and carry no answer content at all.
    formatted_text = f"{prompt} {answer}."
    return {
        "text": formatted_text,  # Store as string
        "label": {
            "prompt": prompt,  # Store as string
            "completion": answer  # Store as string
        }
    }

# Build one structured record per DataFrame row and wrap them in a single-column
# ('data') Hugging Face Dataset.
structured_data = df.apply(structure_data, axis=1).tolist()
dataset_dict = Dataset.from_dict({"data": structured_data})

# A fixed seed keeps the train/validation/test membership identical across kernel
# restarts. Checkpoints are persisted to disk, so an unseeded split would let a
# reloaded model be evaluated on rows it was trained on in an earlier session.
SPLIT_SEED = 42

# Split the dataset: 80% train, 20% val+test
train_val_test_split = dataset_dict.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data = train_val_test_split['train']

# Split the 20% into 10% validation and 10% test
val_test_split = train_val_test_split['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
val_data = val_test_split['train']
test_data = val_test_split['test']

# Function to check data integrity
def check_data_integrity(data):
    """Print a line for every record field that is not a ``str``.

    Args:
        data: Iterable of records shaped like
            ``{"text": ..., "label": {"prompt": ..., "completion": ...}}``.

    Returns:
        None. Fields are checked in the order text, prompt, completion; nothing
        is printed for records whose three fields are all strings.
    """
    def report_if_not_str(field_name, value, position):
        # Only non-string values are reported.
        if isinstance(value, str):
            return
        print(f"Error in {field_name} at index {position}: {value}")

    for idx, item in enumerate(data):
        report_if_not_str("text", item['text'], idx)
        report_if_not_str("prompt", item['label']['prompt'], idx)
        report_if_not_str("completion", item['label']['completion'], idx)

# Run the string-type check over every structured record; any offending field is printed.
check_data_integrity(structured_data)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30231 [00:00<?, ? examples/s]

In [None]:

import tiktoken

# Initialize the tokenizer
# This is tiktoken's 'gpt2' encoding, not a Hugging Face tokenizer object.
# NOTE(review): the evaluation cell later rebinds the name `tokenizer` to
# AutoTokenizer.from_pretrained('gpt2-medium'); re-running the tokenization cell
# after that cell would therefore use a different tokenizer object.
tokenizer = tiktoken.get_encoding('gpt2')

# Define the tokenization function
def tokenize_function(examples):
    """Tokenize a batch of structured records into fixed-length token-id lists.

    Args:
        examples: Batch dict with a 'data' column. Each entry holds a 'text'
            string and a 'label' dict with 'prompt' and 'completion' strings.

    Returns:
        dict with four parallel lists, one entry per record, every entry exactly
        ``max_length`` long:
            'input_ids'      - ids of 'text', truncated/right-padded
            'prompt_ids'     - ids of the prompt, truncated/right-padded
            'completion_ids' - ids of the completion, truncated/right-padded
            'attention_mask' - 1 for real 'text' tokens, 0 for padding

    NOTE(review): 'completion_ids' are padded on their own, so position i of
    'completion_ids' does not correspond to position i of 'input_ids'.
    """
    max_length = 680
    # The GPT-2 vocabulary has no dedicated padding token, so the end-of-text id
    # is used for padding. Encoding the literal string '[PAD]' would instead
    # yield the id of its first BPE piece, which is an ordinary text token.
    pad_token_id = tokenizer.eot_token

    def encode(text):
        return tokenizer.encode(text)

    def pad_and_truncate(ids):
        # Cut to max_length first, then right-pad up to max_length.
        clipped = ids[:max_length]
        return clipped + [pad_token_id] * (max_length - len(clipped))

    def build_attention_mask(ids):
        # The mask must be derived from the unpadded length; computing it from
        # the padded list would always produce all ones.
        real_tokens = min(len(ids), max_length)
        return [1] * real_tokens + [0] * (max_length - real_tokens)

    records = examples['data']

    # Tokenize inputs, prompts, and completions
    input_ids = [encode(record['text']) for record in records]
    prompt_ids = [encode(record['label']['prompt']) for record in records]
    completion_ids = [encode(record['label']['completion']) for record in records]

    return {
        "input_ids": [pad_and_truncate(ids) for ids in input_ids],
        "prompt_ids": [pad_and_truncate(ids) for ids in prompt_ids],
        "completion_ids": [pad_and_truncate(ids) for ids in completion_ids],
        "attention_mask": [build_attention_mask(ids) for ids in input_ids],
    }

# Tokenize the three splits in train / validation / test order, batch by batch.
tokenized_train_data, tokenized_val_data, tokenized_test_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, val_data, test_data)
)

# Convert to PyTorch Dataset
import torch
from torch.utils.data import Dataset as TorchDataset, DataLoader

class CustomDataset(TorchDataset):
    """Map-style torch dataset over already tokenized, fixed-length examples.

    The constructor reads the 'input_ids', 'attention_mask' and 'completion_ids'
    columns of ``tokenized_data``; the last one is exposed under the key 'labels'.
    """

    def __init__(self, tokenized_data):
        """Keep references to the three columns that are served per example."""
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        # 'labels' are the completion token ids, not a copy of input_ids.
        self.labels = tokenized_data['completion_ids']

    def __len__(self):
        """Number of examples, taken from the 'input_ids' column."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of ``torch.long`` tensors."""
        columns = (
            ('input_ids', self.input_ids),
            ('attention_mask', self.attention_mask),
            ('labels', self.labels),
        )
        return {
            name: torch.tensor(column[idx], dtype=torch.long)
            for name, column in columns
        }

# Wrap each tokenized split in a CustomDataset (train / validation / test order).
train_dataset, val_dataset, test_dataset = (
    CustomDataset(tokenized_split)
    for tokenized_split in (tokenized_train_data, tokenized_val_data, tokenized_test_data)
)


# Define DataLoader for batching
def collate_fn(batch):
    """Stack per-example tensors into batch tensors.

    Args:
        batch: List of dicts, each holding 'input_ids', 'attention_mask' and
            'labels' tensors of identical shape across examples.

    Returns:
        dict with the same three keys, each value stacked along a new leading
        batch dimension.
    """
    return {
        key: torch.stack([example[key] for example in batch])
        for key in ('input_ids', 'attention_mask', 'labels')
    }

# All three loaders share the batch size and collate function; only the training
# loader shuffles its examples.
loader_kwargs = {'batch_size': 4, 'collate_fn': collate_fn}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

Map:   0%|          | 0/24184 [00:00<?, ? examples/s]

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

Map:   0%|          | 0/3024 [00:00<?, ? examples/s]

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# Load the pretrained GPT-2 (medium) causal language model.
model_name_or_path = 'gpt2-medium'
base_model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
# The Hugging Face tokenizer is not loaded in this cell:
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# LoRA hyper-parameters, collected in one place before building the config.
lora_settings = {
    'task_type': TaskType.CAUSAL_LM,  # causal language modeling
    'inference_mode': False,          # adapters stay trainable
    'r': 3,                           # LoRA rank
    'lora_alpha': 16,                 # LoRA scaling factor
    'lora_dropout': 0.1,              # dropout applied inside the LoRA layers
    # NOTE(review): the PEFT documentation describes fan_in_fan_out=True as the
    # setting for layers that store weights as (fan_in, fan_out), e.g. GPT-2's
    # Conv1D - confirm that False is intended here.
    'fan_in_fan_out': False,
}
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(base_model, peft_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 294,912 || all params: 355,118,080 || trainable%: 0.0830




In [None]:

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import torch

# Define training parameters
lr = 2e-4  # Peak learning rate reached at the end of warmup
num_epochs = 3
warmup_steps = 250  # Adjust based on dataset size and model complexity

# Optimizer and learning rate scheduler setup
optimizer = AdamW(model.parameters(), lr=lr)
# The schedule length is expressed in batches (one scheduler tick per batch).
total_steps = len(train_dataloader) * num_epochs
# With warmup the learning rate starts at 0 and only rises when
# lr_scheduler.step() is called during training.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Gradient-clipping threshold. clip_grad_norm_ only has an effect when it runs
# after backward() and before optimizer.step(), so it belongs inside the
# training loop; invoking it here, before any gradient exists, does nothing.
max_grad_norm = 1.0  # Adjust as needed

tensor(0.)

In [None]:

import os
# Define checkpointing functions
def save_checkpoint(model, optimizer, epoch, loss, checkpoint_dir):
    """Write a training checkpoint for ``epoch`` into ``checkpoint_dir``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number; stored as-is and used in the file name.
        loss: Loss value stored alongside the weights.
        checkpoint_dir: Existing directory that receives
            ``checkpoint_epoch_<epoch>.pt``.

    Returns:
        None. The path of the written file is printed.
    """
    file_name = f'checkpoint_epoch_{epoch}.pt'
    checkpoint_path = os.path.join(checkpoint_dir, file_name)
    payload = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    torch.save(payload, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

def load_checkpoint(checkpoint_path, model, optimizer, device):
    """Restore model and optimizer state from ``checkpoint_path`` if it exists.

    Args:
        checkpoint_path: File expected to hold 'model_state_dict',
            'optimizer_state_dict', 'epoch' and optionally 'loss'.
        model: Module that receives the stored weights.
        optimizer: Optimizer that receives the stored state.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        tuple: ``(epoch, loss)`` read from the checkpoint (``loss`` is None when
        the key is absent), or ``(0, None)`` when the file does not exist.
    """
    # Guard clause: nothing to restore, training starts at epoch 0.
    if not os.path.isfile(checkpoint_path):
        print(f"No checkpoint found at '{checkpoint_path}', starting from scratch")
        return 0, None

    print(f"Loading checkpoint '{checkpoint_path}'")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    resume_epoch = checkpoint['epoch']
    stored_loss = checkpoint.get('loss', None)
    print(f"Loaded checkpoint '{checkpoint_path}' (epoch {resume_epoch})")
    return resume_epoch, stored_loss

import glob
import re

# Device setup: use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Checkpoint directory.
# NOTE(review): the path points into Google Drive, but no cell shown here mounts
# Drive - confirm it is mounted before relying on persistence.
checkpoint_dir = '/content/drive/MyDrive/checkpointsgpt2'
os.makedirs(checkpoint_dir, exist_ok=True)


def _latest_epoch_index(path):
    """Return N for a file named checkpoint_latest_epoch_N.pt, or -1 if the name does not match."""
    match = re.search(r'checkpoint_latest_epoch_(\d+)\.pt$', os.path.basename(path))
    return int(match.group(1)) if match else -1


# The training loop writes one 'checkpoint_latest_epoch_<epoch>.pt' per finished
# epoch. Resume from the highest-numbered one that exists rather than from a
# fixed epoch index, so an interrupted run can continue from any completed epoch.
_candidates = [
    path
    for path in glob.glob(os.path.join(checkpoint_dir, 'checkpoint_latest_epoch_*.pt'))
    if _latest_epoch_index(path) >= 0
]
latest_checkpoint = max(
    _candidates,
    key=_latest_epoch_index,
    # No checkpoint yet: keep the previous default path so load_checkpoint
    # reports that training starts from scratch.
    default=os.path.join(checkpoint_dir, 'checkpoint_latest_epoch_2.pt'),
)

# Restores model/optimizer state when a checkpoint exists; start_epoch is 0 otherwise.
start_epoch, _ = load_checkpoint(latest_checkpoint, model, optimizer, device)

No checkpoint found at '/content/drive/MyDrive/checkpointsgpt2/checkpoint_latest_epoch_2.pt', starting from scratch


In [None]:
import os
import torch
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

# Define the path to the saved model
# NOTE(review): the save cell in this notebook writes 'finetuned_model2.pth';
# 'finetuned_model.pth' is not written by any cell shown here, so it presumably
# comes from an earlier run - confirm the file exists before running this cell.
model_save_path = '/content/drive/MyDrive/checkpointsgpt2'
model_file = 'finetuned_model.pth'

# Load the saved model
def load_saved_model(save_path, model_file):
    """Rebuild the LoRA-wrapped GPT-2 medium model and load saved weights into it.

    Args:
        save_path (str): Directory that contains the saved state dict.
        model_file (str): File name of the saved state dict inside ``save_path``.

    Returns:
        torch.nn.Module: The PEFT-wrapped model with the saved weights loaded,
        switched to evaluation mode.
    """
    weights_path = os.path.join(save_path, model_file)
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Same LoRA settings as the training configuration, so the parameter names
    # in the saved state dict match the freshly built model.
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=3,
        lora_alpha=16,
        lora_dropout=0.1,
        fan_in_fan_out=False,
    )
    base_model = AutoModelForCausalLM.from_pretrained('gpt2-medium')
    model = get_peft_model(base_model, lora_config)

    # Tensors are mapped onto the available device while the file is read.
    state_dict = torch.load(weights_path, map_location=target_device)
    model.load_state_dict(state_dict)

    # Evaluation mode turns off dropout.
    model.eval()

    return model
# Pick the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the model from the saved state dict and place it on the chosen device.
# NOTE(review): the training and evaluation cells shown in this notebook operate
# on `model`, not on `loaded_model` - confirm how this object is meant to be used.
loaded_model = load_saved_model(model_save_path, model_file)
loaded_model = loaded_model.to(device)

print("Model loaded successfully!")

Model loaded successfully!


In [None]:

from torch.cuda.amp import autocast, GradScaler
import torch
from torch.optim import AdamW
from transformers import AdamW as TransformersAdamW  # Import specifically for Transformers
from tqdm import tqdm  # Import tqdm for progress bar
import os

# Initialize the GradScaler used for mixed-precision training.
scaler = GradScaler()
# Number of batches whose gradients are summed before one optimizer update.
accumulation_steps = 4
# Training loop with validation and checkpointing
best_val_loss = float('inf')  # Lowest average validation loss seen so far

# Only parameters that receive gradients need to be clipped.
trainable_params = [param for param in model.parameters() if param.requires_grad]
num_train_batches = len(train_dataloader)

for epoch in range(start_epoch, num_epochs):
    total_loss = 0.0
    model.train()
    # Start every epoch with clean gradients. Gradients are cleared only after
    # an optimizer update; clearing them on every batch would discard the
    # accumulated gradients of all but the last batch in each group.
    optimizer.zero_grad()

    # Initialize tqdm with train_dataloader for progress bar
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit=' batches')

    # Training phase
    for step, batch in enumerate(progress_bar):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Causal-LM heads shift the labels internally, so labels must be
        # position-aligned with input_ids. batch["labels"] holds separately
        # padded completion ids and is not aligned, so the labels are derived
        # from input_ids here. Positions masked out by attention_mask are set to
        # -100, which the loss ignores.
        lm_labels = input_ids.masked_fill(attention_mask == 0, -100)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=lm_labels)
            loss = outputs.loss

        # Divide by the accumulation factor so the summed gradient is an average
        # over the group (a shorter final group is weighted slightly lower).
        scaler.scale(loss / accumulation_steps).backward()

        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            # Gradients must be unscaled before their norm is clipped.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            # Advance the warmup/decay schedule once per optimizer update. If it
            # is never stepped, the learning rate stays at its warmup start of 0.
            # NOTE(review): the scheduler's num_training_steps was sized in
            # batches (len(train_dataloader) * num_epochs), so with one tick per
            # update the decay will not reach 0 - consider sizing it in updates.
            lr_scheduler.step()
            optimizer.zero_grad()

        # Track the unscaled loss for reporting.
        total_loss += loss.item()

        # Update tqdm progress bar
        progress_bar.set_postfix({'training_loss': total_loss / (step + 1)})

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}: Average training loss: {avg_train_loss:.4f}")

    # Validation phase: same loss as training, without dropout or gradients.
    model.eval()
    val_loss_total = 0.0
    with torch.no_grad():
        for val_batch in tqdm(val_dataloader, desc=f'Validation {epoch + 1}/{num_epochs}', unit=' batches'):
            val_input_ids = val_batch["input_ids"].to(device)
            val_attention_mask = val_batch["attention_mask"].to(device)
            val_labels = val_input_ids.masked_fill(val_attention_mask == 0, -100)
            with autocast():
                val_outputs = model(
                    input_ids=val_input_ids,
                    attention_mask=val_attention_mask,
                    labels=val_labels,
                )
            val_loss_total += val_outputs.loss.item()
    avg_val_loss = val_loss_total / max(len(val_dataloader), 1)
    best_val_loss = min(best_val_loss, avg_val_loss)
    print(f"Epoch {epoch + 1}/{num_epochs}: Average validation loss: {avg_val_loss:.4f} (best: {best_val_loss:.4f})")

    # Save checkpoint at the end of each epoch
    save_checkpoint(model, optimizer, epoch, avg_train_loss, checkpoint_dir)

    # Save the latest checkpoint after each epoch. 'epoch' stores the index of
    # the NEXT epoch to run, which is what a resumed run uses as its start.
    latest_checkpoint_epoch = os.path.join(checkpoint_dir, f'checkpoint_latest_epoch_{epoch}.pt')
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_train_loss
    }, latest_checkpoint_epoch)

print("Training complete.")


Epoch 1/3: 100%|██████████| 6046/6046 [56:42<00:00,  1.78 batches/s, training_loss=3.57]


Epoch 1/3: Average training loss: 3.5668
Checkpoint saved at /content/drive/MyDrive/checkpointsgpt2/checkpoint_epoch_0.pt


Epoch 2/3: 100%|██████████| 6046/6046 [56:43<00:00,  1.78 batches/s, training_loss=3.57]


Epoch 2/3: Average training loss: 3.5673
Checkpoint saved at /content/drive/MyDrive/checkpointsgpt2/checkpoint_epoch_1.pt


Epoch 3/3: 100%|██████████| 6046/6046 [56:45<00:00,  1.78 batches/s, training_loss=3.57]


Epoch 3/3: Average training loss: 3.5664
Checkpoint saved at /content/drive/MyDrive/checkpointsgpt2/checkpoint_epoch_2.pt
Training complete.


In [None]:
import os
import torch

# Define the path to save the model
model_save_path = '/content/drive/MyDrive/checkpointsgpt2'

# Create the directory if it doesn't exist. exist_ok=True avoids the separate
# existence check and matches how the checkpoint directory is created earlier.
os.makedirs(model_save_path, exist_ok=True)

# Save the fine-tuned model's full state dict (base weights and LoRA weights).
torch.save(model.state_dict(), os.path.join(model_save_path, 'finetuned_model2.pth'))

print("Model saved to Google Drive!")

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

# Define the path to the saved model
# NOTE(review): the preceding save cell writes 'finetuned_model2.pth', while this
# cell loads 'finetuned_model.pth' - confirm which file is meant to be reloaded.
model_save_path = '/content/drive/MyDrive/checkpointsgpt2'
model_file = 'finetuned_model.pth'

# Load the saved model
def load_saved_model(save_path, model_file):
    """Rebuild the LoRA-wrapped GPT-2 medium model and load saved weights into it.

    This repeats the definition from the earlier loading cell.

    Args:
        save_path (str): Directory that contains the saved state dict.
        model_file (str): File name of the saved state dict inside ``save_path``.

    Returns:
        torch.nn.Module: The PEFT-wrapped model with the saved weights loaded,
        switched to evaluation mode.
    """
    weights_path = os.path.join(save_path, model_file)
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Same LoRA settings as the training configuration, so the parameter names
    # in the saved state dict match the freshly built model.
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=3,
        lora_alpha=16,
        lora_dropout=0.1,
        fan_in_fan_out=False,
    )
    base_model = AutoModelForCausalLM.from_pretrained('gpt2-medium')
    model = get_peft_model(base_model, lora_config)

    # Tensors are mapped onto the available device while the file is read.
    state_dict = torch.load(weights_path, map_location=target_device)
    model.load_state_dict(state_dict)

    # Evaluation mode turns off dropout.
    model.eval()

    return model
# Pick the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the model from the saved state dict and place it on the chosen device.
loaded_model = load_saved_model(model_save_path, model_file)
loaded_model = loaded_model.to(device)

print("Model loaded successfully!")

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_metric
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Initialize the ROUGE metric
# NOTE(review): `datasets.load_metric` is deprecated in favour of the separate
# `evaluate` package and is presumably absent from recent `datasets` releases -
# confirm against the installed version.
rouge = load_metric("rouge")
# Hugging Face tokenizer, used below to decode token ids back into text.
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')

# Evaluation
predictions = []
references = []
total_loss = 0.0
num_batches = len(test_dataloader)

# The training loop leaves the model in train mode; switch to eval mode so
# dropout does not perturb generation or the reported loss.
model.eval()

# NOTE(review): input_ids encode the full 'text' field and are right-padded to a
# fixed length; decoder-only generation normally expects left-padded prompts -
# confirm that this is the intended generation input.
with torch.no_grad():
    with tqdm(total=num_batches, desc="Evaluating") as pbar:
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Generate outputs
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=50,
                num_beams=4,
                # GPT-2 defines no pad token; state the padding id explicitly.
                pad_token_id=tokenizer.eos_token_id,
            )
            # generate() returns the input sequence followed by the new tokens;
            # keep only the newly generated part so predictions do not contain
            # the input itself.
            generated_only = outputs[:, input_ids.shape[1]:]
            decoded_preds = tokenizer.batch_decode(generated_only, skip_special_tokens=True)
            # batch['labels'] holds the completion token ids (reference answers).
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # Append to lists
            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

            # Compute loss. Causal-LM labels must be position-aligned with
            # input_ids (the model shifts them internally), so they are derived
            # from input_ids; positions masked out by attention_mask are set to
            # -100 and ignored by the loss.
            lm_labels = input_ids.masked_fill(attention_mask == 0, -100)
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=lm_labels).loss
            total_loss += loss.item()

            # Update tqdm progress bar with loss
            pbar.set_postfix({'Batch Loss': loss.item()})
            pbar.update(1)

# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

# Extract relevant scores. Each aggregate exposes low/mid/high bounds of a
# bootstrap confidence interval; `mid` is the point estimate, whereas `high`
# would report the optimistic upper bound.
rouge1_precision = results["rouge1"].mid.precision
rouge1_recall = results["rouge1"].mid.recall
rouge1_f1 = results["rouge1"].mid.fmeasure

rouge2_precision = results["rouge2"].mid.precision
rouge2_recall = results["rouge2"].mid.recall
rouge2_f1 = results["rouge2"].mid.fmeasure

rougel_precision = results["rougeL"].mid.precision
rougel_recall = results["rougeL"].mid.recall
rougel_f1 = results["rougeL"].mid.fmeasure

# Calculate accuracy as an additional metric: share of exact (stripped) matches.
correct = 0
total = 0
for pred, label in zip(predictions, references):
    if pred.strip() == label.strip():
        correct += 1
    total += 1
accuracy = correct / total if total else 0.0

# Calculate partial accuracy as an additional metric: distinct words shared by
# prediction and reference, relative to the number of reference words.
partial_correct = 0
partial_total = 0
for pred, label in zip(predictions, references):
    pred_words = pred.split()
    label_words = label.split()
    common_words = set(pred_words) & set(label_words)
    partial_correct += len(common_words)
    partial_total += len(label_words)
partial_accuracy = partial_correct / partial_total if partial_total else 0.0

# Average loss per batch; NaN when there were no batches at all.
avg_eval_loss = total_loss / num_batches if num_batches else float('nan')

# Print results
print(f"Total Evaluation Loss: {avg_eval_loss:.4f}")
print(f"ROUGE-1 Precision: {rouge1_precision:.4f}, Recall: {rouge1_recall:.4f}, F1: {rouge1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge2_precision:.4f}, Recall: {rouge2_recall:.4f}, F1: {rouge2_f1:.4f}")
print(f"ROUGE-L Precision: {rougel_precision:.4f}, Recall: {rougel_recall:.4f}, F1: {rougel_f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Partial Accuracy: {partial_accuracy:.4f}")

In [None]:
pip install evaluate

In [None]:

import evaluate
# Re-print the headline numbers computed by the evaluation cell.
mean_eval_loss = total_loss / num_batches
print(f"Total Evaluation Loss: {mean_eval_loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Partial Accuracy: {partial_accuracy:.4f}")

# BLEU over the decoded predictions and references gathered during evaluation.
bleu_metric = evaluate.load("bleu")
bleu_results = bleu_metric.compute(references=references, predictions=predictions)

# The corpus-level score is stored under the "bleu" key.
bleu_score = bleu_results["bleu"]

print(f"BLEU Score: {bleu_score:.4f}")

In [None]:

from evaluate import load

# METEOR over the decoded predictions and references gathered during evaluation.
meteor_metric = load("meteor")
meteor_results = meteor_metric.compute(references=references, predictions=predictions)

# The aggregate score is stored under the "meteor" key.
meteor_score = meteor_results["meteor"]

print(f"METEOR Score: {meteor_score:.4f}")