# LLM Fine-tuning on Extracted Knowledge (Summaries)

In [None]:
# Ensure transformers and torch are installed via pip if not already.
# These should be available if previous notebooks (03-06) were run with their setups.

In [None]:
import pandas as pd
import json
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer # For summarization
from transformers import AutoModelForCausalLM # For GPT-2 fine-tuning on summaries

## Device Configuration for Apple Silicon (MPS)

In [None]:
# Select the compute device once per kernel session, preferring Apple-Silicon
# MPS, then CUDA, then CPU. Re-running this cell keeps the earlier choice.
if 'device_summaries' not in globals(): # Define if not already defined by a previous run of this cell
    # torch.backends.mps is absent on PyTorch builds that do not ship the MPS
    # backend; probe for it first so such builds fall through to CUDA/CPU
    # instead of raising AttributeError.
    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        device_summaries = torch.device("mps")
        print("MPS device found for summaries notebook. Using MPS.")
    elif torch.cuda.is_available():
        device_summaries = torch.device("cuda")
        print("CUDA device found for summaries notebook. Using CUDA.")
    else:
        device_summaries = torch.device("cpu")
        print("MPS or CUDA not available for summaries notebook. Using CPU.")
    print(f"Selected device for summaries notebook: {device_summaries}")
else:
    print(f"Device for summaries notebook already defined: {device_summaries}")

## Load Processed Video Transcripts

In [None]:
input_transcripts_file = '../data/video_transcripts.jsonl'
# Columns of the placeholder DataFrame used whenever nothing usable is loaded.
TRANSCRIPT_COLUMNS = ['video_id', 'title', 'transcript', 'language_code']


def iter_jsonl_records(path):
    """Yield one decoded JSON object per non-blank line of a JSON Lines file.

    Blank or whitespace-only lines are skipped, so a stray empty line does not
    abort the whole load.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                yield json.loads(line)


loaded_transcripts_list = []
try:
    # extend() consumes the generator lazily, so records parsed before a
    # decoding error remain in loaded_transcripts_list for inspection.
    loaded_transcripts_list.extend(iter_jsonl_records(input_transcripts_file))
    df_transcripts = pd.DataFrame(loaded_transcripts_list)
    if not df_transcripts.empty:
        print(f"Successfully loaded {len(df_transcripts)} transcripts from {input_transcripts_file}.")
    else:
        print(f"Loaded transcript file {input_transcripts_file}, but it resulted in an empty DataFrame.")
        df_transcripts = pd.DataFrame(columns=TRANSCRIPT_COLUMNS)
except FileNotFoundError:
    print(f"Error: Transcript file {input_transcripts_file} not found. Please run notebook 06 first.")
    df_transcripts = pd.DataFrame(columns=TRANSCRIPT_COLUMNS)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON from {input_transcripts_file}: {e}")
    df_transcripts = pd.DataFrame(columns=TRANSCRIPT_COLUMNS)

if df_transcripts.empty:
    print("\nWarning: No transcripts loaded. This notebook relies on transcript data for summarization and subsequent LLM fine-tuning.")

## Transcript Summarization using Pre-trained Model

In [None]:
# Only the first few transcripts are summarized.
MAX_VIDEOS_TO_SUMMARIZE = 3
summarizer_model = None
summarizer_tokenizer = None

# Handle the "nothing to do" cases first, then load the summarizer.
if 'df_transcripts' not in globals() or df_transcripts.empty:
    print("df_transcripts is not available or empty. Skipping summarization model loading.")
else:
    summarization_subset_df = df_transcripts.head(MAX_VIDEOS_TO_SUMMARIZE).copy()
    print(f"Selected {len(summarization_subset_df)} videos for summarization.")

    if summarization_subset_df.empty:
        print("Subset for summarization is empty. Nothing to do.")
    else:
        summarizer_model_name = 'sshleifer/distilbart-cnn-12-6'
        try:
            summarizer_tokenizer = AutoTokenizer.from_pretrained(summarizer_model_name)
            summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_model_name)
            if 'device_summaries' not in globals():
                print(f"Summarizer model '{summarizer_model_name}' loaded, but device_summaries not defined. Model on CPU.")
            else:
                summarizer_model.to(device_summaries)
                print(f"Summarizer model '{summarizer_model_name}' loaded and moved to {device_summaries}.")
        except Exception as e:
            # Any failure while loading or moving the model disables summarization.
            print(f"Error loading summarization model/tokenizer: {e}")
            summarizer_model = None

In [None]:
video_summaries = []
# Token budget handed to the tokenizer; longer transcripts are truncated to it.
max_input_length_for_summarizer = 1024

# The condition is parenthesised rather than continued with backslashes: a
# backslash followed by trailing whitespace is a SyntaxError.
if (
    'summarizer_model' in globals() and summarizer_model is not None
    and 'summarizer_tokenizer' in globals() and summarizer_tokenizer is not None
    and 'summarization_subset_df' in globals() and not summarization_subset_df.empty
    and 'device_summaries' in globals()
):

    print(f"\nStarting summarization for {len(summarization_subset_df)} transcripts...")
    for index, row in summarization_subset_df.iterrows():
        video_id = row.get('video_id')
        title = row.get('title', 'N/A')
        transcript_text = row.get('transcript')

        # Skip anything that is not a non-blank string (None, NaN, numbers,
        # whitespace). Otherwise len() below would raise outside the try block
        # and abort the whole loop.
        if not isinstance(transcript_text, str) or not transcript_text.strip():
            print(f"Skipping video_id {video_id} due to missing transcript.")
            continue

        print(f"Summarizing video: {title} (ID: {video_id}) Transcript length: {len(transcript_text)}")
        try:
            inputs = summarizer_tokenizer.encode(
                transcript_text,
                return_tensors="pt",
                max_length=max_input_length_for_summarizer,
                truncation=True
            ).to(device_summaries)

            # Beam search, with summaries bounded to 40-150 tokens.
            summary_ids = summarizer_model.generate(
                inputs,
                max_length=150,
                min_length=40,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True
            )
            summary_text = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            video_summaries.append({
                'video_id': video_id,
                'title': title,
                'original_transcript_length': len(transcript_text),
                'summary': summary_text
            })
            print(f"  Successfully summarized {video_id}. Summary length: {len(summary_text)}")
        except Exception as e:
            # Best effort: one failing transcript must not stop the others.
            print(f"  Error summarizing {video_id}: {e}")
    print("\nSummarization process completed.")
else:
    print("Summarization prerequisites (model, tokenizer, data, or device) not met. Skipping summary generation.")

## Save Video Summaries

In [None]:
# Persist the generated summaries as JSON Lines (one object per line).
output_summaries_file = '../data/video_summaries.jsonl'
summaries_saved_count = 0
if not video_summaries:
    print("No summaries were generated to save.")
else:
    try:
        with open(output_summaries_file, 'w', encoding='utf-8') as f:
            for entry in video_summaries:
                # Only dictionary entries are written; anything else is ignored.
                if not isinstance(entry, dict):
                    continue
                json.dump(entry, f, ensure_ascii=False)
                f.write('\n')
                summaries_saved_count += 1
        print(f"Saved {summaries_saved_count} summaries to {output_summaries_file}")
    except Exception as e:
        print(f"Error saving summaries: {e}")

## Load Video Summaries for GPT-2 Fine-tuning

In [None]:
import pandas as pd 
import json

input_summaries_file = '../data/video_summaries.jsonl'
# Columns of the placeholder DataFrame used whenever nothing usable is loaded.
SUMMARY_COLUMNS = ['video_id', 'title', 'original_transcript_length', 'summary']
loaded_summaries_list = []
try:
    with open(input_summaries_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Ignore blank lines so a stray empty line does not abort the load.
            if line.strip():
                loaded_summaries_list.append(json.loads(line))
    df_summaries = pd.DataFrame(loaded_summaries_list)
    if df_summaries.empty:
        print(f"Loaded summaries file {input_summaries_file}, but it resulted in an empty DataFrame.")
        df_summaries = pd.DataFrame(columns=SUMMARY_COLUMNS)
    elif 'summary' not in df_summaries.columns:
        # Without this check, filtering on the missing column raises KeyError.
        print(f"Error: No 'summary' field found in {input_summaries_file}. No summaries loaded.")
        df_summaries = pd.DataFrame(columns=SUMMARY_COLUMNS)
    else:
        print(f"Successfully loaded {len(df_summaries)} summaries from {input_summaries_file}.")
        # Keep only non-blank string summaries. This drops NaN/None and also
        # non-string values, which the .str accessor cannot process.
        df_summaries = df_summaries[
            df_summaries['summary'].apply(lambda s: isinstance(s, str) and s.strip() != '')
        ]
        print(f"Number of summaries after filtering empty/NaN: {len(df_summaries)}")
except FileNotFoundError:
    print(f"Error: Summaries file {input_summaries_file} not found. Please run the summarization part first.")
    df_summaries = pd.DataFrame(columns=SUMMARY_COLUMNS)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON from {input_summaries_file}: {e}")
    df_summaries = pd.DataFrame(columns=SUMMARY_COLUMNS)

## Tokenize Summaries for GPT-2

In [None]:
from transformers import AutoTokenizer 

encodings_summaries_input_ids = []
tokenizer_gpt2_summaries = None

# Tokenize the filtered summaries with the GPT-2 tokenizer; bail out early
# when there is nothing to tokenize.
if 'df_summaries' not in globals() or df_summaries.empty or 'summary' not in df_summaries.columns:
    print("DataFrame of summaries is empty or 'summary' column is missing.")
else:
    summary_list = df_summaries['summary'].tolist()
    if not summary_list:
        print("Summary list is empty after filtering. No summaries to tokenize.")
    else:
        try:
            tokenizer_gpt2_summaries = AutoTokenizer.from_pretrained('gpt2')
            # Reuse EOS as the padding token when the tokenizer defines none.
            if tokenizer_gpt2_summaries.pad_token is None:
                tokenizer_gpt2_summaries.pad_token = tokenizer_gpt2_summaries.eos_token
                print(f"Set pad_token for GPT-2 summaries tokenizer to: {tokenizer_gpt2_summaries.eos_token}")

            # NOTE(review): only input_ids are kept below, so padding ids are
            # later reused as labels. Confirm whether the attention mask
            # should be carried along as well.
            encodings_summaries = tokenizer_gpt2_summaries(summary_list, truncation=True, padding=True, max_length=256)
            encodings_summaries_input_ids = encodings_summaries['input_ids']
            print(f"Tokenized {len(encodings_summaries_input_ids)} summaries.")
            if encodings_summaries_input_ids:
                print(f"Example tokenized summary (first 10 tokens): {encodings_summaries_input_ids[0][:10]}")
        except Exception as e:
            print(f"Error during summary tokenization: {e}")

## Create PyTorch Dataset and DataLoader for Summaries

In [None]:
import torch 
from torch.utils.data import Dataset, DataLoader

dataset_gpt2_summaries = None
dataloader_gpt2_summaries = None


class SummaryDatasetGPT2(Dataset):
    """Causal language-modelling dataset over pre-tokenized summaries.

    Each sample is a dict with ``input_ids`` and ``labels`` (a copy of the
    input ids). When attention masks are supplied, positions whose mask is 0
    (padding) get the label ``IGNORE_INDEX`` and the mask is returned under
    ``attention_mask``.

    Args:
        tokenized_input_ids_list: Sequence of token-id sequences (lists,
            tuples or 1-D tensors), one per summary.
        attention_mask_list: Optional sequence of 0/1 masks aligned with
            ``tokenized_input_ids_list``. Omit it to keep the plain
            ``input_ids``/``labels`` samples.

    Raises:
        ValueError: If ``attention_mask_list`` is given but its length differs
            from ``tokenized_input_ids_list``.
    """

    # Label value ignored by the cross-entropy loss (PyTorch's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, tokenized_input_ids_list, attention_mask_list=None):
        if attention_mask_list is not None and len(attention_mask_list) != len(tokenized_input_ids_list):
            raise ValueError("attention_mask_list must have one mask per tokenized summary.")
        self.input_ids_list = tokenized_input_ids_list
        self.attention_mask_list = attention_mask_list

    def __len__(self):
        return len(self.input_ids_list)

    def __getitem__(self, idx):
        item = self.input_ids_list[idx]
        if isinstance(item, torch.Tensor):
            # Copy so the caller's tensor is never aliased or mutated.
            input_ids = item.detach().clone().to(dtype=torch.long)
        elif isinstance(item, (list, tuple)):
            input_ids = torch.tensor(item, dtype=torch.long)
        else:
            # Best effort: keep the historical empty-sample fallback, but say
            # what actually happens instead of claiming the item is skipped.
            print(f"Warning: Item at index {idx} has unsupported type {type(item)}. Returning an empty sample.")
            return {'input_ids': torch.tensor([], dtype=torch.long),
                    'labels': torch.tensor([], dtype=torch.long)}

        # Labels are an independent copy; the model shifts them internally.
        labels = input_ids.clone()
        sample = {'input_ids': input_ids, 'labels': labels}

        if self.attention_mask_list is not None:
            mask = torch.as_tensor(self.attention_mask_list[idx], dtype=torch.long)
            # Exclude padded positions from the loss.
            labels[mask == 0] = self.IGNORE_INDEX
            sample['attention_mask'] = mask

        return sample

# Wrap the tokenized summaries in a Dataset and expose them through a
# DataLoader that yields one shuffled sample per batch.
if 'encodings_summaries_input_ids' not in globals() or not encodings_summaries_input_ids:
    print("No tokenized summaries available. Cannot create Dataset/DataLoader for summaries.")
else:
    dataset_gpt2_summaries = SummaryDatasetGPT2(encodings_summaries_input_ids)
    print(f"Created summary dataset with {len(dataset_gpt2_summaries)} samples.")

    if len(dataset_gpt2_summaries) == 0:
        print("Summary dataset is empty. DataLoader not created.")
    else:
        dataloader_gpt2_summaries = DataLoader(dataset_gpt2_summaries, batch_size=1, shuffle=True)
        print("Created summary DataLoader with batch_size=1.")

## Load Pre-trained GPT-2 Model for Summary Fine-tuning

In [None]:
from transformers import AutoModelForCausalLM 

# Load the base GPT-2 checkpoint that will be fine-tuned on the summaries.
model_name_gpt2_summaries = 'gpt2'
model_gpt2_summaries = None

try:
    model_gpt2_summaries = AutoModelForCausalLM.from_pretrained(model_name_gpt2_summaries)
    if 'device_summaries' not in globals():
        print(f"Pre-trained model '{model_name_gpt2_summaries}' loaded, but device_summaries not defined. Model remains on CPU.")
    else:
        model_gpt2_summaries.to(device_summaries)
        print(f"Pre-trained model '{model_name_gpt2_summaries}' loaded and moved to {device_summaries}.")
except Exception as e:
    # Covers both loading the checkpoint and moving it to the device.
    print(f"Error loading pre-trained model '{model_name_gpt2_summaries}': {e}")

## Fine-tuning GPT-2 on Video Summaries

In [None]:
from torch.optim import AdamW 

# Flipped to True only after every epoch has finished.
gpt2_summary_fine_tuning_done = False

# The condition is parenthesised rather than continued with backslashes: a
# backslash followed by trailing whitespace is a SyntaxError.
if (
    'device_summaries' in globals()
    and 'model_gpt2_summaries' in globals() and model_gpt2_summaries is not None
    and 'dataloader_gpt2_summaries' in globals() and dataloader_gpt2_summaries is not None
):

    print(f"Starting fine-tuning of GPT-2 on summaries using {device_summaries}...")
    optimizer_gpt2_summaries = AdamW(model_gpt2_summaries.parameters(), lr=5e-5)
    num_epochs_gpt2_summaries = 3
    # The batch count is constant across epochs, so compute it once.
    num_batches_gpt2_summaries = len(dataloader_gpt2_summaries)

    model_gpt2_summaries.train()
    for epoch in range(num_epochs_gpt2_summaries):
        total_loss_summaries = 0
        print(f"Starting Summary Fine-Tuning Epoch {epoch+1}/{num_epochs_gpt2_summaries}")
        for batch_idx, batch in enumerate(dataloader_gpt2_summaries):
            optimizer_gpt2_summaries.zero_grad()
            input_ids = batch['input_ids'].to(device_summaries)
            labels = batch['labels'].to(device_summaries)
            # Forward the attention mask when the dataset provides one.
            # Passing None matches the model's default.
            attention_mask = batch.get('attention_mask')
            if attention_mask is not None:
                attention_mask = attention_mask.to(device_summaries)

            outputs = model_gpt2_summaries(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            if loss is not None:
                loss.backward()
                optimizer_gpt2_summaries.step()
                total_loss_summaries += loss.item()
                if batch_idx % 10 == 0:
                    print(f"  Epoch {epoch+1}, Batch {batch_idx}/{num_batches_gpt2_summaries}, Loss: {loss.item():.4f}")
            else:
                print(f"Warning: Loss is None for summary batch {batch_idx}. Skipping.")

        avg_loss_summaries = total_loss_summaries / num_batches_gpt2_summaries if num_batches_gpt2_summaries > 0 else 0
        print(f"Epoch {epoch+1}/{num_epochs_gpt2_summaries} - Average Training Loss on Summaries: {avg_loss_summaries:.4f}")

    print("Fine-tuning GPT-2 on summaries completed.")
    gpt2_summary_fine_tuning_done = True
else:
    print("Required variables (device, model, dataloader for summaries) not available. Skipping fine-tuning on summaries.")

## Save Summary-Tuned GPT-2 Model

In [None]:
import os 
output_dir_gpt2_summaries = './fine_tuned_gpt2_youtube_summaries'

# Branches are ordered so that each precondition is checked once and no
# backslash continuations are needed (a backslash followed by trailing
# whitespace is a SyntaxError).
if not ('gpt2_summary_fine_tuning_done' in globals() and gpt2_summary_fine_tuning_done):
    print("Summary fine-tuning was not performed or completed. Skipping saving model.")
elif (
    'model_gpt2_summaries' not in globals() or model_gpt2_summaries is None
    or 'tokenizer_gpt2_summaries' not in globals() or tokenizer_gpt2_summaries is None
):
    print("Summary-tuned GPT-2 Model or its Tokenizer not available. Skipping saving.")
else:
    try:
        # Model and tokenizer go to the same directory so that they can be
        # reloaded together with from_pretrained().
        os.makedirs(output_dir_gpt2_summaries, exist_ok=True)
        model_gpt2_summaries.save_pretrained(output_dir_gpt2_summaries)
        tokenizer_gpt2_summaries.save_pretrained(output_dir_gpt2_summaries)
        print(f"Summary-tuned GPT-2 Model and tokenizer saved to {output_dir_gpt2_summaries}")
    except Exception as e:
        print(f"Error saving summary-tuned GPT-2 model/tokenizer: {e}")

## Load Summary-Tuned GPT-2 for Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer # Ensure imports
import torch
import os

fine_tuned_output_dir_summaries = './fine_tuned_gpt2_youtube_summaries' # Matches saving directory
inference_model_summaries = None
inference_tokenizer_summaries = None

# Ensure device_summaries is available (it's defined in an early cell)
if 'device_summaries' not in globals():
    # torch.backends.mps is absent on PyTorch builds without the MPS backend;
    # probe for it first so such builds fall through to CUDA/CPU instead of
    # raising AttributeError.
    if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        device_summaries = torch.device("mps")
        print("Re-initialized MPS device for inference.")
    elif torch.cuda.is_available():
        device_summaries = torch.device("cuda")
        print("Re-initialized CUDA device for inference.")
    else:
        device_summaries = torch.device("cpu")
        print("Re-initialized CPU device for inference.")

if os.path.exists(fine_tuned_output_dir_summaries):
    try:
        inference_model_summaries = AutoModelForCausalLM.from_pretrained(fine_tuned_output_dir_summaries)
        inference_tokenizer_summaries = AutoTokenizer.from_pretrained(fine_tuned_output_dir_summaries)
        print(f"Fine-tuned summary model and tokenizer loaded from {fine_tuned_output_dir_summaries}.")

        inference_model_summaries.to(device_summaries)
        # Put the model in evaluation mode for inference.
        inference_model_summaries.eval()
        print(f"Summary inference model moved to {device_summaries} and set to eval mode.")
    except Exception as e:
        print(f"Error loading fine-tuned summary model or tokenizer: {e}")
        # Do not leave a half-initialised pair behind (for example a model
        # without its tokenizer, or one never moved to the device).
        inference_model_summaries = None
        inference_tokenizer_summaries = None
else:
    print(f"Directory not found: {fine_tuned_output_dir_summaries}. Ensure model was trained and saved.")

## Text Generation with Summary-Tuned GPT-2

In [None]:
def generate_text_from_summary_model(prompt, model, tokenizer, device, max_length=100, num_return_sequences=1):
    """Generate continuations of ``prompt`` with beam search.

    Args:
        prompt: Text the generated sequences start from.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the encoded prompt is moved to; it should be the
            device the model lives on.
        max_length: Maximum length of each returned sequence in tokens,
            prompt included.
        num_return_sequences: Number of sequences to return.

    Returns:
        A list of decoded strings. The list is empty when the model or
        tokenizer is missing, or when generation fails for any reason (the
        error is printed rather than raised).
    """
    if not model or not tokenizer:
        print("Inference model or tokenizer for summaries not available.")
        return []
    try:
        # Call the tokenizer directly to obtain the attention mask as well.
        # Because pad_token_id is set to the EOS id below, generate() cannot
        # reliably infer the mask on its own.
        encoded = tokenizer(prompt, return_tensors='pt')
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # temperature/top_k were dropped: they only take effect when sampling
        # is enabled, and this function has always used plain beam search.
        output_sequences = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            # Encourage more than a few words, but never ask for more than max_length.
            min_length=min(20, max_length),
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            # Beam search needs at least as many beams as returned sequences.
            num_beams=max(5, num_return_sequences),
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        return [tokenizer.decode(seq, skip_special_tokens=True) for seq in output_sequences]
    except Exception as e:
        print(f"Error during text generation with summary-tuned model: {e}")
        return []

In [None]:
# Demo: run a handful of prompts through the summary-tuned model.
# The condition is parenthesised rather than continued with backslashes: a
# backslash followed by trailing whitespace is a SyntaxError.
if (
    'inference_model_summaries' in globals() and inference_model_summaries
    and 'inference_tokenizer_summaries' in globals() and inference_tokenizer_summaries
    and 'device_summaries' in globals()
):

    prompts_for_summaries = [
        "The main topic of this video is",
        "Key takeaways include",
        "This video explains how to",
        "An interesting point made was about",
        "To learn more about this, you should"
    ]

    print("\n--- Summary-Tuned GPT-2 Text Generation Demo ---")
    for p in prompts_for_summaries:
        generated_texts_summary_model = generate_text_from_summary_model(
            p,
            inference_model_summaries,
            inference_tokenizer_summaries,
            device_summaries,
            max_length=75 # Keep generated sequence length reasonable for summary-like output
        )
        print(f"\nPrompt: {p}...")
        if generated_texts_summary_model:
            for i, g in enumerate(generated_texts_summary_model):
                print(f"Generated {i+1}: {g}")
        else:
            print("  (No text generated or error occurred)")
        print("-" * 30)
else:
    print("Summary-tuned inference model, tokenizer, or device not loaded/defined. Cannot demonstrate generation.")