# Advanced LLM Training (GPT-2) on M1 Pro with MPS

## Device Configuration for Apple Silicon (MPS)

In [None]:
import torch

# Choose the compute backend in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    _backend_name, _backend_notice = "mps", "MPS device found. Using MPS."
elif torch.cuda.is_available():  # non-Apple environments that expose CUDA
    _backend_name, _backend_notice = "cuda", "CUDA device found. Using CUDA."
else:
    _backend_name, _backend_notice = "cpu", "MPS or CUDA not available. Using CPU."

# `device` is the notebook-wide handle that later cells move models/tensors to.
device = torch.device(_backend_name)
print(_backend_notice)

print(f"Selected device: {device}")

## Load and Prepare Data

In [None]:
import pandas as pd

# Location of the cleaned watch-history CSV this notebook trains on.
cleaned_data_path = '../data/cleaned_watch_history.csv'

try:
    df = pd.read_csv(cleaned_data_path, parse_dates=['timestamp_utc'])
except FileNotFoundError:
    print(f"Error: The file {cleaned_data_path} was not found. Please ensure 01_data_cleaning.ipynb ran.")
    # Fall back to an empty frame with the same columns so the following
    # cells still run (they will simply report that no titles are available).
    df = pd.DataFrame(
        columns=['title', 'video_url', 'channel_name', 'timestamp_utc']
    )
    df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'])
else:
    print(f"Successfully loaded {cleaned_data_path}")

# Build 'cleaned_title' only when the loaded data does not already provide a
# usable one (column missing, or present but entirely null).
if 'cleaned_title' not in df.columns or df['cleaned_title'].isnull().all():
    print("'cleaned_title' not found or is empty. Running preprocessing...")
    import nltk
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    # Probe each NLTK resource and download it only when the lookup fails.
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords', quiet=True)
    try:
        word_tokenize('test')
    except LookupError:
        # Newer NLTK releases load the tokenizer model from 'punkt_tab' rather
        # than 'punkt'; request both so the probe above succeeds on either.
        for _tokenizer_resource in ('punkt', 'punkt_tab'):
            nltk.download(_tokenizer_resource, quiet=True)

    stop_words_set = set(stopwords.words('english'))
    # A real set: membership is an exact-token test, not a substring test.
    punctuations_set = set(string.punctuation)

    def preprocess_text_advanced(text):
        """Normalise one raw title into space-separated lowercase keywords.

        Args:
            text: Raw title value. May be NaN/None or a non-string scalar
                (for example a title that ``read_csv`` parsed as a number).

        Returns:
            The lowercased alphabetic tokens of ``text`` with punctuation and
            English stop words removed, joined by single spaces. Returns ``""``
            for missing or blank input.
        """
        if pd.isna(text):
            return ""
        # Convert before calling string methods so non-string values do not
        # raise AttributeError.
        text = str(text).strip().lower()
        if not text:
            return ""
        tokens = word_tokenize(text)
        kept_tokens = [
            word
            for word in tokens
            if word.isalpha()
            and word not in punctuations_set
            and word not in stop_words_set
        ]
        return ' '.join(kept_tokens)

    if 'title' in df.columns:
        df['cleaned_title'] = df['title'].apply(preprocess_text_advanced)
        print("Finished preprocessing 'title' into 'cleaned_title'.")
    else:
        print("Error: 'title' column missing. 'cleaned_title' will be empty.")
        df['cleaned_title'] = ""
else:
    print("'cleaned_title' column found.")

# Discard rows whose cleaned title is missing or whitespace-only.
df.dropna(subset=['cleaned_title'], inplace=True)
_title_is_nonblank = df['cleaned_title'].str.strip() != ''
df = df[_title_is_nonblank]

# Plain Python list of strings; this is what the tokenizer cell consumes.
titles_for_training_gpt2 = df['cleaned_title'].to_list()
print(f"\nNumber of titles available for GPT-2 training: {len(titles_for_training_gpt2)}")

# Show a handful of examples as a quick sanity check.
print("Sample titles for training:")
_preview_titles = titles_for_training_gpt2[:5]
for title in _preview_titles:
    print(title)

## Tokenize Text Data for GPT-2

In [None]:
from transformers import AutoTokenizer

# Checkpoint name plus safe defaults, so later cells can test these names even
# when loading or tokenization fails.
model_name_gpt2 = 'gpt2'
tokenizer_gpt2 = None
encodings_gpt2 = {'input_ids': []}

try:
    tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name_gpt2)
except Exception as load_error:
    print(f"Error loading tokenizer for {model_name_gpt2}: {load_error}")
else:
    print(f"Tokenizer for '{model_name_gpt2}' loaded successfully.")

if not tokenizer_gpt2:
    print(f"Tokenizer for {model_name_gpt2} not loaded. Cannot proceed.")
else:
    # When the tokenizer has no pad token, reuse EOS so batches can be padded.
    if tokenizer_gpt2.pad_token is None:
        tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token
        print(f"Set tokenizer_gpt2.pad_token to tokenizer_gpt2.eos_token: {tokenizer_gpt2.eos_token}")

    if not titles_for_training_gpt2:
        print("No titles available for GPT-2 tokenization.")
    else:
        # Truncate to 128 tokens and pad every title to a common length.
        encodings_gpt2 = tokenizer_gpt2(
            titles_for_training_gpt2,
            truncation=True,
            padding=True,
            max_length=128,
        )
        _tokenized_rows = encodings_gpt2['input_ids']
        print(f"\nTokenized {len(_tokenized_rows)} titles for GPT-2.")
        if _tokenized_rows:
            print("Example of tokenized input_ids for the first title (GPT-2):")
            print(_tokenized_rows[0])

## Create PyTorch Dataset and DataLoader

In [None]:
from torch.utils.data import Dataset, DataLoader

# Placeholders so later cells can test these names even if no data is available.
dataset_gpt2 = None
dataloader_gpt2 = None


class YouTubeTitlesDatasetGPT2(Dataset):
    """Map-style dataset over pre-tokenized titles for causal-LM training.

    Each item is a dict with an ``input_ids`` tensor and a ``labels`` tensor
    holding the same token ids (an independent copy).
    """

    def __init__(self, tokenized_input_ids):
        """Store the tokenized sequences.

        Args:
            tokenized_input_ids: Indexable collection of token-id sequences,
                one per title.
        """
        self.input_ids = tokenized_input_ids

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th sequence as int64 tensors."""
        token_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
        # Labels start as a copy of the inputs and live in separate storage.
        label_tensor = token_tensor.clone()
        return {'input_ids': token_tensor, 'labels': label_tensor}

# Wrap the tokenized titles in a Dataset and a shuffling DataLoader, provided
# the tokenization cell actually produced some sequences.
if not (encodings_gpt2 and encodings_gpt2['input_ids']):
    print("No tokenized data available for GPT-2 to create Dataset/DataLoader.")
else:
    dataset_gpt2 = YouTubeTitlesDatasetGPT2(encodings_gpt2['input_ids'])
    print(f"\nCreated GPT-2 dataset with {len(dataset_gpt2)} samples.")
    if len(dataset_gpt2) > 0:
        # Peek at the first item to confirm the tensor shape.
        sample_item = dataset_gpt2[0]
        print(f"Sample item from GPT-2 dataset: input_ids shape: {sample_item['input_ids'].shape}")

    dataloader_gpt2 = DataLoader(dataset_gpt2, batch_size=2, shuffle=True)
    print("Created GPT-2 DataLoader with batch_size=2.")

## Load Pre-trained GPT-2 Model

In [None]:
from transformers import AutoModelForCausalLM

# Stays None unless the checkpoint loads, so later cells can test for it.
model_gpt2 = None

if not globals().get('model_name_gpt2'):
    print("model_name_gpt2 not specified or not found. Cannot load model.")
else:
    try:
        model_gpt2 = AutoModelForCausalLM.from_pretrained(model_name_gpt2)
        if 'device' not in globals():
            # The device cell was not run; leave the model where it was loaded.
            print(f"Pre-trained model '{model_name_gpt2}' loaded successfully, but device not defined. Model stays on CPU.")
        else:
            model_gpt2.to(device)
            print(f"Pre-trained model '{model_name_gpt2}' loaded successfully and moved to {device}.")
    except Exception as load_error:
        print(f"Error loading pre-trained model '{model_name_gpt2}': {load_error}")

## Fine-tuning GPT-2 on MPS

In [None]:
from torch.optim import AdamW
import os

def build_causal_lm_targets(input_ids, labels, pad_token_id, keep_first_pad=True, ignore_index=-100):
    """Mask padding out of the loss targets and derive an attention mask.

    The titles are padded to a common length, and the dataset's labels are a
    plain copy of the inputs. Without masking, every padding position would
    contribute to the loss and dominate it for short titles.

    Args:
        input_ids: 2-D integer tensor of token ids (one row per sequence).
        labels: Tensor of the same shape used as the starting point for the
            targets; it is not modified.
        pad_token_id: Id used for padding, or None to disable masking.
        keep_first_pad: When True, the first padding token of each row stays
            a target and stays attended. This is intended for the case where
            pad is the EOS token, so the model still learns where a title ends.
        ignore_index: Label value written at masked positions. The default
            (-100) is the value Hugging Face causal-LM heads are documented to
            skip in the loss.

    Returns:
        Tuple ``(targets, attention_mask)``: ``targets`` is a new tensor with
        masked positions set to ``ignore_index``; ``attention_mask`` is an
        integer tensor with 1 at kept positions and 0 at masked ones.
    """
    targets = labels.clone()
    if pad_token_id is None:
        return targets, torch.ones_like(input_ids)

    is_pad = input_ids == pad_token_id
    if keep_first_pad:
        # The running count of pads equals 1 exactly at the first pad of a row.
        first_pad = is_pad & (is_pad.cumsum(dim=1) == 1)
        ignored = is_pad & ~first_pad
    else:
        ignored = is_pad
    targets[ignored] = ignore_index
    return targets, (~ignored).long()


gpt2_fine_tuning_done = False
# Ensure device, model_gpt2, and dataloader_gpt2 are available
if 'device' in globals() and 'model_gpt2' in globals() and model_gpt2 is not None and 'dataloader_gpt2' in globals() and dataloader_gpt2 is not None:
    print(f"Starting fine-tuning of GPT-2 on {device}...")
    optimizer_gpt2 = AdamW(model_gpt2.parameters(), lr=5e-5)
    num_epochs_gpt2 = 1 # Start with 1 epoch for testing on M1/MPS

    # Padding information comes from the tokenizer cell when it was run;
    # without it the loop trains on the unmasked labels, as before.
    _tokenizer_for_padding = globals().get('tokenizer_gpt2')
    pad_token_id_gpt2 = getattr(_tokenizer_for_padding, 'pad_token_id', None)
    _pad_is_eos = (
        pad_token_id_gpt2 is not None
        and pad_token_id_gpt2 == getattr(_tokenizer_for_padding, 'eos_token_id', None)
    )
    num_batches_gpt2 = len(dataloader_gpt2)

    for epoch in range(num_epochs_gpt2):
        model_gpt2.train()
        total_loss_gpt2 = 0.0
        batches_with_loss = 0
        batch_counter = 0
        for batch in dataloader_gpt2:
            batch_counter += 1
            if batch_counter % 25 == 0: # Print progress every 25 batches
                print(f"  Epoch {epoch+1}, Batch {batch_counter}/{num_batches_gpt2}")

            # Build targets and mask from the batch as delivered by the
            # DataLoader, before anything is transferred to `device`.
            masked_labels, attention_mask = build_causal_lm_targets(
                batch['input_ids'],
                batch['labels'],
                pad_token_id_gpt2,
                keep_first_pad=_pad_is_eos,
            )
            input_ids = batch['input_ids'].to(device)
            labels = masked_labels.to(device)
            attention_mask = attention_mask.to(device)

            optimizer_gpt2.zero_grad()
            outputs_gpt2 = model_gpt2(input_ids, attention_mask=attention_mask, labels=labels)
            loss_gpt2 = outputs_gpt2.loss

            if loss_gpt2 is None:
                print(f"Warning: Loss is None for batch {batch_counter}. Skipping backward pass.")
                continue

            loss_gpt2.backward()
            optimizer_gpt2.step()
            total_loss_gpt2 += loss_gpt2.item()
            batches_with_loss += 1

        if batches_with_loss > 0:
            # Average over the batches that actually produced a loss, so
            # skipped batches do not drag the reported value down.
            avg_loss_gpt2 = total_loss_gpt2 / batches_with_loss
            print(f"Epoch {epoch+1}/{num_epochs_gpt2} - Average Loss: {avg_loss_gpt2:.4f}")
        elif num_batches_gpt2 == 0:
            print(f"Epoch {epoch+1}/{num_epochs_gpt2} - DataLoader is empty.")
        else:
            print(f"Epoch {epoch+1}/{num_epochs_gpt2} - No batch produced a loss.")

        # NOTE: an explicit torch.mps.synchronize() at the end of an epoch is
        # only a debugging aid; this loop does not need it.
    print("GPT-2 Fine-tuning completed.")
    gpt2_fine_tuning_done = True
else:
    print("Device, model_gpt2, or dataloader_gpt2 not available. Skipping GPT-2 fine-tuning.")

## Save Fine-tuned GPT-2 Model

In [None]:
# Directory that receives the fine-tuned weights and tokenizer files.
output_dir_gpt2 = './fine_tuned_gpt2_youtube_titles'

# Resolve the preconditions through globals() so this cell also works when
# earlier cells were skipped and the names were never defined.
_training_finished = bool(globals().get('gpt2_fine_tuning_done'))
_artifacts_loaded = (
    globals().get('model_gpt2') is not None
    and globals().get('tokenizer_gpt2') is not None
)

if not _training_finished:
    print("GPT-2 fine-tuning was not performed or completed. Skipping saving model.")
elif not _artifacts_loaded:
    print("GPT-2 Model or Tokenizer not available. Skipping saving.")
else:
    try:
        os.makedirs(output_dir_gpt2, exist_ok=True)
        model_gpt2.save_pretrained(output_dir_gpt2)
        tokenizer_gpt2.save_pretrained(output_dir_gpt2)
        print(f"GPT-2 Model and tokenizer saved to {output_dir_gpt2}")
    except Exception as save_error:
        print(f"Error saving GPT-2 model/tokenizer: {save_error}")

## Load Fine-tuned GPT-2 for Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer # Ensure these are imported
import torch # Ensure torch is imported
import os

# Reuse `device` from the first cell when it exists; otherwise pick it again
# with the same priority (MPS, then CUDA, then CPU).
if 'device' not in globals():
    if torch.backends.mps.is_available():
        _inference_backend = "mps"
    elif torch.cuda.is_available():
        _inference_backend = "cuda"
    else:
        _inference_backend = "cpu"
    device = torch.device(_inference_backend)
    print(f"Device for inference re-initialized to: {device}")

# Must point at the directory the saving cell wrote to.
fine_tuned_output_dir_gpt2 = './fine_tuned_gpt2_youtube_titles'
inference_model_gpt2 = None
inference_tokenizer_gpt2 = None

if not os.path.exists(fine_tuned_output_dir_gpt2):
    print(f"Fine-tuned GPT-2 model directory not found: {fine_tuned_output_dir_gpt2}. Please ensure the model was trained and saved.")
else:
    try:
        inference_model_gpt2 = AutoModelForCausalLM.from_pretrained(fine_tuned_output_dir_gpt2)
        inference_tokenizer_gpt2 = AutoTokenizer.from_pretrained(fine_tuned_output_dir_gpt2)
        print(f"Fine-tuned GPT-2 model and tokenizer loaded successfully from {fine_tuned_output_dir_gpt2}.")

        # Move the model to the selected device and switch it to eval mode.
        inference_model_gpt2.to(device)
        inference_model_gpt2.eval()
        print(f"GPT-2 inference model moved to {device} and set to evaluation mode.")
    except Exception as load_error:
        print(f"Error loading fine-tuned GPT-2 model or tokenizer: {load_error}")

## Text Generation with Fine-tuned GPT-2 (on MPS/CPU)

In [None]:
def generate_text_gpt2(prompt, model, tokenizer, device, max_length=50, num_return_sequences=1):
    """Generate continuations of ``prompt`` with a causal language model.

    Args:
        prompt: Text the generated sequences start from.
        model: Object exposing ``generate(**kwargs)`` (a loaded causal LM),
            or None.
        tokenizer: Tokenizer matching ``model``: callable on a string with
            ``return_tensors='pt'`` and providing ``decode`` and
            ``eos_token_id``; or None.
        device: Device the encoded prompt tensors are moved to.
        max_length: Value forwarded to ``generate`` as ``max_length``.
        num_return_sequences: Number of continuations to return. Values above
            1 switch decoding to beam search with that many beams, because
            plain greedy decoding can only produce one sequence.

    Returns:
        A list of decoded strings, one per returned sequence. An empty list
        when the model/tokenizer is missing or generation fails; the reason
        is printed.
    """
    if model is None or tokenizer is None:
        print("Inference model or tokenizer not available for GPT-2 generation.")
        return []
    try:
        # Call the tokenizer itself (not .encode) to also obtain the attention
        # mask, which generate() is given explicitly because pad == EOS here.
        encoded_prompt = tokenizer(prompt, return_tensors='pt')
        generation_kwargs = {
            'input_ids': encoded_prompt['input_ids'].to(device),
            'attention_mask': encoded_prompt['attention_mask'].to(device),
            'max_length': max_length,
            'num_return_sequences': num_return_sequences,
            'no_repeat_ngram_size': 2,
            'pad_token_id': tokenizer.eos_token_id,
        }
        if num_return_sequences > 1:
            # Several distinct outputs need beam search; early_stopping only
            # has an effect in beam-based decoding, so it is set only here.
            generation_kwargs['num_beams'] = num_return_sequences
            generation_kwargs['early_stopping'] = True

        output_sequences = model.generate(**generation_kwargs)

        return [tokenizer.decode(seq, skip_special_tokens=True) for seq in output_sequences]
    except Exception as e:
        # Demo-level boundary: report the failure and return no text.
        print(f"Error during GPT-2 text generation: {e}")
        return []

In [None]:
# Run the demo only when the inference model, its tokenizer and `device` all
# exist; globals() lookups keep this safe if earlier cells were skipped.
_demo_ready = (
    all(globals().get(_name) for _name in ('inference_model_gpt2', 'inference_tokenizer_gpt2'))
    and 'device' in globals()
)

if not _demo_ready:
    print("GPT-2 inference model, tokenizer, or device not loaded/defined. Cannot demonstrate text generation.")
else:
    prompts_gpt2 = [
        "How to build",
        "The future of",
        "Exploring the secrets of",
        "Advanced tutorial on",
        "Why is python",
    ]

    print("\n--- GPT-2 Text Generation Demo ---")
    for prompt_text in prompts_gpt2:
        generated_gpt2 = generate_text_gpt2(
            prompt_text,
            inference_model_gpt2,
            inference_tokenizer_gpt2,
            device,
            max_length=40,
        )
        print(f"\nPrompt: {prompt_text}...")
        if not generated_gpt2:
            print("  (No text generated or error occurred)")
        else:
            for rank, continuation in enumerate(generated_gpt2, start=1):
                print(f"Generated {rank}: {continuation}")
        print("-" * 30)