# Fine-tuning DistilGPT2 on YouTube Video Titles

In [None]:
import pandas as pd

# Define the path to the cleaned data file
cleaned_data_path = '../data/cleaned_watch_history.csv'

# Load the CSV file into a pandas DataFrame. If the file is missing, fall back
# to an empty frame with the expected columns so the later cells still run.
try:
    df = pd.read_csv(cleaned_data_path, parse_dates=['timestamp_utc'])
    print(f"Successfully loaded {cleaned_data_path}")
except FileNotFoundError:
    print(f"Error: The file {cleaned_data_path} was not found. Please ensure the 01_data_cleaning notebook ran successfully.")
    df = pd.DataFrame(columns=['title', 'video_url', 'channel_name', 'timestamp_utc'])
    df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'])

# Ensure 'cleaned_title' column exists or create it (logic adapted from notebook 03)
if 'cleaned_title' not in df.columns or df['cleaned_title'].isnull().all():
    print("'cleaned_title' not found or is empty in CSV. Running preprocessing to generate it from 'title'...")
    # NLTK is only needed on this fallback path, so it is imported lazily.
    import nltk
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    # Download the NLTK resources on demand if they are not installed yet.
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords', quiet=True)
    try:
        word_tokenize('test')
    except LookupError:
        nltk.download('punkt', quiet=True)
        # Newer NLTK releases load the tokenizer tables from 'punkt_tab'.
        nltk.download('punkt_tab', quiet=True)

    stop_words_set = set(stopwords.words('english'))
    # A real set, so membership is per-token rather than a substring test.
    punctuations_set = set(string.punctuation)

    def preprocess_text_for_llm(text):
        """Return a lower-cased title reduced to alphabetic, non-stopword tokens.

        Missing values and blank strings yield "". Non-string values (for
        example a title that pandas parsed as a number) are converted with
        ``str`` before any string method is called on them.
        """
        if pd.isna(text):
            return ""
        text = str(text).strip().lower()
        if not text:
            return ""
        tokens = word_tokenize(text)
        return ' '.join(
            word for word in tokens
            if word.isalpha()
            and word not in punctuations_set
            and word not in stop_words_set
        )

    if 'title' in df.columns:
        df['cleaned_title'] = df['title'].apply(preprocess_text_for_llm)
        print("Finished preprocessing 'title' into 'cleaned_title'.")
    else:
        print("Error: 'title' column missing, cannot generate 'cleaned_title'. Using empty strings.")
        df['cleaned_title'] = ""
else:
    print("'cleaned_title' column found in CSV.")

# Drop rows where cleaned_title is NaN or empty. The column is cast to str
# before using the .str accessor so a non-string column from the CSV does not
# raise.
df = df.dropna(subset=['cleaned_title'])
df = df[df['cleaned_title'].astype(str).str.strip() != '']

titles_for_training = df['cleaned_title'].astype(str).tolist()
print(f"\nNumber of titles available for training: {len(titles_for_training)}")
print("Sample titles for training:")
for title in titles_for_training[:5]:
    print(title)

## Prepare Text Data and Tokenize

In [None]:
from transformers import AutoTokenizer

model_name = 'distilgpt2'  # Shared with the model-loading cell; keep in sync
tokenizer = None  # Remains None if the tokenizer cannot be loaded
encodings = {'input_ids': []}  # Empty placeholder until tokenization succeeds

# Loading is best-effort: a failure is reported and the later steps are skipped.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer for '{model_name}' loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")

if not tokenizer:
    print("Tokenizer not loaded. Cannot proceed with tokenization.")
else:
    # Padding requires a pad token; reuse EOS when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set tokenizer.pad_token to tokenizer.eos_token: {tokenizer.eos_token}")

    if not titles_for_training:
        print("No titles available for tokenization.")
    else:
        encodings = tokenizer(
            titles_for_training,
            truncation=True,
            padding=True,
            max_length=128,
        )
        print(f"\nTokenized {len(encodings['input_ids'])} titles.")
        if encodings['input_ids']:
            print("Example of tokenized input_ids for the first title:")
            print(encodings['input_ids'][0])

## Create PyTorch Dataset and DataLoader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

dataset = None  # Initialize dataset
dataloader = None  # Initialize dataloader


class YouTubeTitlesDataset(Dataset):
    """Causal-LM dataset over pre-tokenized titles.

    Args:
        tokenized_input_ids: Sequence of token-id lists, one per title.
        attention_mask: Optional sequence of 0/1 lists parallel to
            ``tokenized_input_ids`` (1 = real token, 0 = padding). When it is
            omitted, labels are an unmodified copy of the input ids.

    Raises:
        ValueError: If ``attention_mask`` is given with a different number of
            rows than ``tokenized_input_ids``.

    Each item is a dict with ``input_ids`` and ``labels`` tensors, plus
    ``attention_mask`` when a mask was supplied.
    """

    # Label value that the loss skips (ignore index of cross-entropy).
    IGNORE_INDEX = -100

    def __init__(self, tokenized_input_ids, attention_mask=None):
        if attention_mask is not None and len(attention_mask) != len(tokenized_input_ids):
            raise ValueError("attention_mask must have one row per input_ids row")
        self.input_ids = tokenized_input_ids
        self.attention_mask = attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item_input_ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        labels = item_input_ids.clone()
        item = {'input_ids': item_input_ids, 'labels': labels}
        if self.attention_mask is not None:
            item_mask = torch.tensor(self.attention_mask[idx], dtype=torch.long)
            pad_positions = (item_mask == 0).nonzero(as_tuple=True)[0]
            # Keep the first padding token as a target and ignore the rest.
            # The tokenization cell sets pad_token to eos_token when no pad
            # token exists, so that first pad acts as an end-of-title marker
            # (assumes right-side padding; TODO confirm if padding_side changes).
            labels[pad_positions[1:]] = self.IGNORE_INDEX
            item['attention_mask'] = item_mask
        return item


# Look the encodings up defensively so running this cell before the
# tokenization cell reports a message instead of raising NameError.
_encodings = globals().get('encodings')
if _encodings and _encodings['input_ids']:
    _attention_mask = _encodings['attention_mask'] if 'attention_mask' in _encodings else None
    dataset = YouTubeTitlesDataset(_encodings['input_ids'], _attention_mask)
    print(f"\nCreated dataset with {len(dataset)} samples.")
    if len(dataset) > 0:
        sample_item = dataset[0]
        print(f"Sample item from dataset: input_ids shape: {sample_item['input_ids'].shape}, labels shape: {sample_item['labels'].shape}")
    batch_size = 4
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print(f"Created DataLoader with batch_size={batch_size}.")
else:
    print("No tokenized data available to create Dataset/DataLoader.")

## Load Pre-trained Model (DistilGPT2)

In [None]:
from transformers import AutoModelForCausalLM

model = None  # Stays None unless the checkpoint loads successfully
# model_name is defined in the tokenization cell; handle it being absent or empty.
if not globals().get('model_name'):
    print("Model name not specified or not found. Cannot load model.")
else:
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name)
    except Exception as e:
        print(f"Error loading pre-trained model: {e}")
    else:
        print(f"Pre-trained model '{model_name}' loaded successfully.")

## Fine-tuning the Model

In [None]:
import torch
from torch.optim import AdamW
import os # For output_dir creation

fine_tuning_done = False  # Flag to check if fine-tuning was attempted
if model and dataloader:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    num_epochs = 1  # Keep it short for this exercise

    print("Starting fine-tuning...")
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        stepped_batches = 0  # Batches that actually contributed a loss value
        for batch_count, batch in enumerate(dataloader, start=1):
            if batch_count % 50 == 0:  # Print progress every 50 batches
                print(f"  Epoch {epoch+1}, Batch {batch_count}/{len(dataloader)}")

            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            # Forward the padding mask when the dataset provides one; a batch
            # without it behaves exactly as before (attention_mask=None).
            attention_mask = batch.get('attention_mask')
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            if loss is not None:
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                stepped_batches += 1
            else:
                print("Warning: Loss is None for a batch. Skipping backward pass for this batch.")

        if len(dataloader) > 0:
            # Average over the batches that produced a loss so skipped batches
            # do not deflate the reported value.
            avg_loss = total_loss / max(stepped_batches, 1)
            print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")
        else:
            print(f"Epoch {epoch+1}/{num_epochs} - No data in dataloader.")
    print("Fine-tuning completed.")
    fine_tuning_done = True
else:
    print("Model or DataLoader not available. Skipping fine-tuning.")

## Save Fine-tuned Model

In [None]:
output_dir = './fine_tuned_distilgpt2_youtube_titles'  # Destination for model + tokenizer files

# Save only when training finished and both objects exist; otherwise report
# which precondition was not met.
if not fine_tuning_done:
    print("Fine-tuning was not performed or completed. Skipping saving model.")
elif not (model and tokenizer):
    print("Model or Tokenizer not available. Skipping saving.")
else:
    try:
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"Model and tokenizer saved to {output_dir}")
    except Exception as e:
        print(f"Error saving model/tokenizer: {e}")

## Load Fine-tuned Model for Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

fine_tuned_output_dir = './fine_tuned_distilgpt2_youtube_titles'
inference_model = None  # Remains None if loading fails
inference_tokenizer = None  # Remains None if loading fails

if not os.path.exists(fine_tuned_output_dir):
    print(f"Fine-tuned model directory not found: {fine_tuned_output_dir}. Please ensure the model was trained and saved.")
else:
    # Best-effort load: any failure is reported and leaves whatever was not
    # yet assigned as None.
    try:
        inference_model = AutoModelForCausalLM.from_pretrained(fine_tuned_output_dir)
        inference_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_output_dir)
        print(f"Fine-tuned model and tokenizer loaded successfully from {fine_tuned_output_dir}.")

        # Use the GPU when one is available and switch the model to eval mode.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        inference_model.to(device)
        inference_model.eval()
        print(f"Inference model moved to {device} and set to evaluation mode.")
    except Exception as e:
        print(f"Error loading fine-tuned model or tokenizer: {e}")

## Text Generation with Fine-tuned Model

In [None]:
def generate_text(prompt, model, tokenizer, device, max_length=50, num_return_sequences=1):
    """Generate continuations of ``prompt`` with a causal language model.

    Args:
        prompt: Text the generated sequence starts with.
        model: Model exposing ``generate``; a falsy value returns ``[]``.
        tokenizer: Tokenizer matching ``model``; a falsy value returns ``[]``.
        device: Device the encoded prompt is moved to before generation.
        max_length: Maximum total length (prompt plus generated tokens).
        num_return_sequences: Number of sequences to return. Values above 1
            switch to sampling, because greedy decoding can only produce a
            single sequence.

    Returns:
        A list of decoded strings, or ``[]`` if the model/tokenizer is missing
        or generation raised an error (the error is printed).
    """
    if not model or not tokenizer:
        print("Inference model or tokenizer not available for generation.")
        return []
    try:
        # Calling the tokenizer (rather than .encode) also yields the
        # attention mask, which generate() needs to tell padding from content
        # when the pad token is the same as EOS.
        encoded = tokenizer(prompt, return_tensors='pt')
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # early_stopping is deliberately not passed: it only affects beam
        # search, which is not used here.
        output_sequences = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            do_sample=num_return_sequences > 1,  # Sampling is required for several distinct outputs
            no_repeat_ngram_size=2,  # To prevent repetitive phrases
            pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id to eos_token_id for open-ended generation
        )

        # Decode each sequence, dropping special tokens such as EOS/padding.
        return [
            tokenizer.decode(generated_sequence, skip_special_tokens=True)
            for generated_sequence in output_sequences
        ]
    except Exception as e:
        print(f"Error during text generation: {e}")
        return []

In [None]:
if not (inference_model and inference_tokenizer):
    print("Inference model or tokenizer not loaded. Cannot demonstrate text generation.")
else:
    # Short title-like openers to seed the generator.
    prompts = [
        "How to make",
        "The best budget",
        "Understanding",
        "DIY project for",
        "Learn python",
    ]

    print("\n--- Text Generation Demo ---")
    # Resolve the device locally so this cell does not depend on `device`
    # having been set by an earlier cell.
    current_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for prompt_text in prompts:
        # Shorter max_length because titles are brief.
        results = generate_text(
            prompt_text,
            inference_model,
            inference_tokenizer,
            device=current_device,
            max_length=30,
        )
        print(f"\nPrompt: {prompt_text}...")
        if not results:
            print("  (No text generated or error occurred)")
        else:
            for number, text in enumerate(results, start=1):
                print(f"Generated {number}: {text}")
        print("-" * 30)