# LLM Fine-tuning on YouTube Video Transcripts

In [None]:
# Installs the 'youtube-transcript-api' package, which the import cell below relies on.
# Skip this cell if the package is already installed in your environment.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which is not necessarily
# the one belonging to this kernel's environment; IPython's `%pip install ...` is the usual
# alternative — confirm before changing.
!pip install youtube-transcript-api

In [None]:
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound # Updated import
import json
import os
import re # For basic cleaning
import time

## Load Video Watch History

In [None]:
# Read the cleaned watch history written by the data-cleaning notebook. If the CSV is missing,
# fall back to an empty frame with the expected columns so the following cells still run.
cleaned_data_path = '../data/cleaned_watch_history.csv'
try:
    df_history = pd.read_csv(cleaned_data_path, parse_dates=['timestamp_utc'])
except FileNotFoundError:
    print(f"Error: The file {cleaned_data_path} was not found. Please ensure 01_data_cleaning.ipynb ran.")
    df_history = pd.DataFrame(columns=['title', 'video_url', 'channel_name', 'timestamp_utc'])
else:
    # Only reached when the CSV was read successfully.
    print(f"Successfully loaded {cleaned_data_path}")
    print(f"DataFrame shape: {df_history.shape}")

## Fetch Video Transcripts (Limited Subset)

In [None]:
# Shared state for the transcript-fetching loop in this cell.
ytt_api = YouTubeTranscriptApi()  # API client object used for the transcript requests
transcript_data = []  # one dict per successfully fetched transcript
MAX_VIDEOS_TO_PROCESS = 5  # upper bound on the number of watch-history rows attempted

def get_video_id(url):
    """Extract the YouTube video ID from a watch-history URL.

    Supported shapes:
      * ``.../watch?v=<id>`` — the ``v`` parameter may sit anywhere in the query string
      * ``youtu.be/<id>`` short links

    Args:
        url: The URL to inspect. Anything that is not a string (``None``, NaN, numbers, ...)
            is treated as "no URL".

    Returns:
        The video ID as a string, or ``None`` when no ID can be extracted.
    """
    # Function-scope import keeps this block self-contained within the notebook.
    from urllib.parse import parse_qs, urlparse

    # Missing values arrive as None/NaN; rejecting every non-string also avoids a TypeError
    # from substring tests on numbers or other objects.
    if not isinstance(url, str):
        return None

    try:
        parsed = urlparse(url.strip())
    except ValueError:
        # urlparse rejects a few malformed inputs (e.g. an unbalanced IPv6 bracket).
        return None

    # Without a scheme, urlparse puts the host into `path`; joining both parts lets the checks
    # below work for "https://youtu.be/x" and "youtu.be/x" alike. Query and fragment are
    # already split off, so "&t=10s" or "#t=30" can never leak into the ID.
    location = parsed.netloc + parsed.path

    if location.rstrip('/').rsplit('/', 1)[-1] == 'watch':
        # parse_qs drops blank values, so "watch?v=" yields no ID.
        candidates = parse_qs(parsed.query).get('v', [])
        return candidates[0] if candidates else None

    if 'youtu.be/' in location:
        short_id = location.split('youtu.be/')[-1].split('/')[0]
        return short_id or None

    return None

def _fetch_transcript_text_parts(video_id, languages):
    """Fetch one video's transcript and report which language was actually served.

    Args:
        video_id: YouTube video ID.
        languages: Language codes in priority order.

    Returns:
        A ``(text_parts, language_code)`` tuple: the list of caption strings and the
        language code of the transcript they came from.

    Raises:
        Whatever the transcript library raises (e.g. ``TranscriptsDisabled``,
        ``NoTranscriptFound``); the caller decides how to report it.
    """
    if hasattr(ytt_api, 'fetch'):
        # NOTE(review): youtube-transcript-api >= 1.0 is understood to expose an instance-level
        # fetch() whose result iterates over snippet objects (with a `.text` attribute) and
        # carries `.language_code` — confirm against the installed version.
        fetched = ytt_api.fetch(video_id, languages=languages)
        return [snippet.text for snippet in fetched], fetched.language_code

    # Older releases: list the transcripts once, pick the best match for `languages`, and fetch
    # from that same object. The language code therefore always describes the transcript that
    # was downloaded, and no second listing request is needed.
    transcript = YouTubeTranscriptApi.list_transcripts(video_id).find_transcript(languages)
    return [part['text'] for part in transcript.fetch()], transcript.language_code


if not df_history.empty:
    print(f"Starting transcript fetching for up to {MAX_VIDEOS_TO_PROCESS} videos...")
    preferred_languages = ['en', 'fr']  # tried in this order

    for index, row in df_history.head(MAX_VIDEOS_TO_PROCESS).iterrows():
        video_url = row.get('video_url')
        title = row.get('title', 'N/A')
        if pd.isna(title):
            # row.get() only applies its default when the column is absent; a present-but-empty
            # cell comes back as NaN, which would otherwise end up in the saved JSON.
            title = 'N/A'
        video_id = get_video_id(video_url)

        if not video_id:
            print(f"Could not extract video_id from URL: {video_url}")
            continue

        print(f"\nProcessing video: {title} (ID: {video_id})")
        try:
            text_parts, lang_code = _fetch_transcript_text_parts(video_id, preferred_languages)
        except TranscriptsDisabled:
            print(f"Transcripts disabled for {video_id}")
        except NoTranscriptFound:
            print(f"No transcript found for {video_id} in specified languages.")
        except Exception as e:
            # Best effort: one failing video must not abort the whole run.
            print(f"Error fetching transcript for {video_id}: {e}")
        else:
            if text_parts:
                # Collapse every whitespace run (including newlines inside captions) to a
                # single space.
                full_transcript_text = re.sub(r'\s+', ' ', " ".join(text_parts)).strip()
                lang_code = lang_code or 'unknown'

                transcript_data.append({
                    'video_id': video_id,
                    'title': title,
                    'transcript': full_transcript_text,
                    'language_code': lang_code
                })
                print(f"Fetched transcript for {video_id} (Attempted Langs: en, fr; Detected: {lang_code})")
            else:
                print(f"No transcript content returned for {video_id}")

        # Short pause between videos to avoid hammering the transcript endpoint.
        time.sleep(0.5)
    print("\nFinished transcript fetching process.")
else:
    print("Watch history DataFrame is empty. No transcripts to fetch.")

## Save Collected Transcripts

In [None]:
# Persist the collected transcripts as JSON Lines: one JSON object per line.
output_transcripts_file = '../data/video_transcripts.jsonl'
processed_count = 0
try:
    with open(output_transcripts_file, 'w', encoding='utf-8') as jsonl_out:
        for record in transcript_data:
            if not isinstance(record, dict):
                continue  # anything that is not a dict is left out of the file
            json.dump(record, jsonl_out, ensure_ascii=False)
            jsonl_out.write('\n')
            processed_count += 1
    print(f"Saved {processed_count} transcripts to {output_transcripts_file}")
except Exception as save_error:
    print(f"Error saving transcripts: {save_error}")

## Load Saved Transcripts

In [None]:
import pandas as pd
import json

# Read the JSON Lines transcript file back into a DataFrame. On a missing or corrupt file the
# cell falls back to an empty frame that still has the expected columns.
input_transcripts_file = '../data/video_transcripts.jsonl'
loaded_transcripts_list = []
_transcript_columns = ['video_id', 'title', 'transcript', 'language_code']
try:
    with open(input_transcripts_file, 'r', encoding='utf-8') as jsonl_in:
        for raw_line in jsonl_in:
            record = json.loads(raw_line)
            loaded_transcripts_list.append(record)
    df_transcripts = pd.DataFrame(loaded_transcripts_list)
    print(f"Successfully loaded {len(df_transcripts)} transcripts from {input_transcripts_file}.")
except FileNotFoundError:
    print(f"Error: Transcript file {input_transcripts_file} not found. Please run the fetching part first.")
    df_transcripts = pd.DataFrame(columns=_transcript_columns)
except json.JSONDecodeError as decode_error:
    print(f"Error decoding JSON from {input_transcripts_file}: {decode_error}")
    df_transcripts = pd.DataFrame(columns=_transcript_columns)

## Prepare Transcript Text for GPT-2

In [None]:
from transformers import AutoTokenizer

# Outputs of this cell, consumed by later cells: fixed-length token chunks and the tokenizer.
chunked_input_ids = []
tokenizer_gpt2_transcripts = None

if df_transcripts.empty or 'transcript' not in df_transcripts.columns:
    print("Transcript DataFrame is empty or 'transcript' column is missing.")
else:
    # Keep only transcripts longer than 50 characters.
    is_long_enough = df_transcripts['transcript'].str.len() > 50
    df_transcripts_valid = df_transcripts[is_long_enough].copy()
    print(f"Number of transcripts after filtering short ones: {len(df_transcripts_valid)}")

    if df_transcripts_valid.empty:
        print("No valid transcripts long enough for processing after filtering.")
    else:
        # All transcripts are concatenated into one corpus, separated by a blank line.
        full_text_corpus = "\n\n".join(df_transcripts_valid['transcript'].tolist())
        try:
            tokenizer_gpt2_transcripts = AutoTokenizer.from_pretrained('gpt2')
            if tokenizer_gpt2_transcripts.pad_token is None:
                # Reuse the end-of-sequence token as the padding token when none is set.
                tokenizer_gpt2_transcripts.pad_token = tokenizer_gpt2_transcripts.eos_token
                print(f"Set pad_token for GPT-2 transcript tokenizer to: {tokenizer_gpt2_transcripts.eos_token}")

            tokenized_corpus = tokenizer_gpt2_transcripts.encode(full_text_corpus)
            print(f"Total tokens in corpus: {len(tokenized_corpus)}")

            # Cut the token stream into back-to-back windows of exactly max_seq_length tokens;
            # a shorter remainder at the end is not turned into a chunk.
            max_seq_length = 128
            last_start_exclusive = len(tokenized_corpus) - max_seq_length + 1
            chunked_input_ids.extend(
                tokenized_corpus[start:start + max_seq_length]
                for start in range(0, last_start_exclusive, max_seq_length)
            )
            print(f"Number of chunks created: {len(chunked_input_ids)}")
            if chunked_input_ids:
                print(f"Example chunk (first 10 tokens): {chunked_input_ids[0][:10]}")
        except Exception as tokenization_error:
            print(f"Error during tokenization or chunking: {tokenization_error}")

## Create PyTorch Dataset and DataLoader for Transcripts

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Both start as None so later cells can tell whether they were actually created.
dataset_transcripts_gpt2 = dataloader_transcripts_gpt2 = None

class TranscriptDatasetGPT2(Dataset):
    """Dataset over pre-tokenized transcript chunks.

    Each sample is a dict with two independent ``torch.long`` tensors holding the same token
    IDs: ``input_ids`` and ``labels``.
    """

    def __init__(self, chunked_input_ids):
        # Sequence of token-ID lists, one entry per chunk.
        self.chunked_input_ids = chunked_input_ids

    def __len__(self):
        """Return the number of chunks."""
        return len(self.chunked_input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids``/``labels`` pair for chunk ``idx``."""
        token_ids = torch.tensor(self.chunked_input_ids[idx], dtype=torch.long)
        # The labels are a separate copy, so the two tensors do not share storage.
        return {'input_ids': token_ids, 'labels': token_ids.clone()}

# Wrap the token chunks in a Dataset and a shuffling DataLoader (one chunk per batch).
if not chunked_input_ids:
    print("No chunked input_ids available. Cannot create Dataset/DataLoader for transcripts.")
else:
    dataset_transcripts_gpt2 = TranscriptDatasetGPT2(chunked_input_ids)
    print(f"Created transcript dataset with {len(dataset_transcripts_gpt2)} samples (chunks).")

    if len(dataset_transcripts_gpt2) == 0:
        print("Dataset for transcripts is empty, DataLoader not created.")
    else:
        dataloader_transcripts_gpt2 = DataLoader(
            dataset_transcripts_gpt2,
            batch_size=1,
            shuffle=True,
        )
        print(f"Created transcript DataLoader with batch_size=1.")

## Device Configuration (Apple Silicon MPS, CUDA, or CPU Fallback)

In [None]:
import torch 

# Pick the training device once per session: MPS first, then CUDA, then CPU. A value left
# over from an earlier run of this cell is kept as-is.
if 'device_transcripts' in globals():
    print(f"Device for transcript training already defined: {device_transcripts}")
else:
    if torch.backends.mps.is_available():
        _device_name = "mps"
        _device_message = "MPS device found for transcript training. Using MPS."
    elif torch.cuda.is_available():
        _device_name = "cuda"
        _device_message = "CUDA device found for transcript training. Using CUDA."
    else:
        _device_name = "cpu"
        _device_message = "MPS or CUDA not available for transcript training. Using CPU."
    device_transcripts = torch.device(_device_name)
    print(_device_message)
    print(f"Selected device for transcript training: {device_transcripts}")

## Load Pre-trained GPT-2 Model for Transcript Fine-tuning

In [None]:
from transformers import AutoModelForCausalLM

# Load the pre-trained GPT-2 weights and, when a device has been selected, move the model to it.
model_name_gpt2_transcripts = 'gpt2'
model_gpt2_transcripts = None  # stays None if loading fails

try:
    model_gpt2_transcripts = AutoModelForCausalLM.from_pretrained(model_name_gpt2_transcripts)
    if 'device_transcripts' not in globals():
        print(f"Pre-trained model '{model_name_gpt2_transcripts}' loaded, but device_transcripts not defined. Model on CPU.")
    else:
        model_gpt2_transcripts.to(device_transcripts)
        print(f"Pre-trained model '{model_name_gpt2_transcripts}' loaded and moved to {device_transcripts}.")
except Exception as load_error:
    print(f"Error loading pre-trained model '{model_name_gpt2_transcripts}': {load_error}")

## Fine-tuning GPT-2 on Transcripts

In [None]:
from torch.optim import AdamW

# Fine-tune GPT-2 on the transcript chunks, provided the device, model and dataloader cells
# have all run successfully. The flag below tells the save cell whether training completed.
gpt2_transcript_fine_tuning_done = False

# The multi-line condition is parenthesised: a backslash continuation followed by trailing
# whitespace is a SyntaxError, whereas line breaks inside parentheses are always valid.
if (
    'device_transcripts' in globals()
    and 'model_gpt2_transcripts' in globals() and model_gpt2_transcripts is not None
    and 'dataloader_transcripts_gpt2' in globals() and dataloader_transcripts_gpt2 is not None
):

    print(f"Starting fine-tuning of GPT-2 on transcripts using {device_transcripts}...")
    optimizer_gpt2_transcripts = AdamW(model_gpt2_transcripts.parameters(), lr=5e-5)
    num_epochs_gpt2_transcripts = 1

    model_gpt2_transcripts.train()
    for epoch in range(num_epochs_gpt2_transcripts):
        total_loss_transcripts = 0
        print(f"Starting Epoch {epoch+1}/{num_epochs_gpt2_transcripts}")
        for batch_idx, batch in enumerate(dataloader_transcripts_gpt2):
            optimizer_gpt2_transcripts.zero_grad()
            input_ids = batch['input_ids'].to(device_transcripts)
            labels = batch['labels'].to(device_transcripts)

            # Passing labels makes the model return the language-modelling loss directly.
            outputs = model_gpt2_transcripts(input_ids, labels=labels)
            loss = outputs.loss

            if loss is not None:
                loss.backward()
                optimizer_gpt2_transcripts.step()
                total_loss_transcripts += loss.item()
                if batch_idx % 20 == 0:
                    print(f"  Epoch {epoch+1}, Batch {batch_idx}/{len(dataloader_transcripts_gpt2)}, Loss: {loss.item():.4f}")
            else:
                print(f"Warning: Loss is None for batch {batch_idx}. Skipping.")

        # Guard against an empty dataloader to avoid a division by zero.
        avg_loss_transcripts = total_loss_transcripts / len(dataloader_transcripts_gpt2) if len(dataloader_transcripts_gpt2) > 0 else 0
        print(f"Epoch {epoch+1}/{num_epochs_gpt2_transcripts} - Average Training Loss: {avg_loss_transcripts:.4f}")

    print("Fine-tuning GPT-2 on transcripts completed.")
    gpt2_transcript_fine_tuning_done = True
else:
    print("Required variables (device, model, dataloader for transcripts) not available. Skipping fine-tuning.")

## Save Transcript-Tuned GPT-2 Model

In [None]:
import os 
# Save the fine-tuned model and its tokenizer, but only if fine-tuning actually completed.
output_dir_gpt2_transcripts = './fine_tuned_gpt2_youtube_transcripts'

# globals().get() covers both "name never defined" and "defined but falsy/None" in one step.
_fine_tuning_finished = bool(globals().get('gpt2_transcript_fine_tuning_done'))

# The multi-line condition is parenthesised: a backslash continuation followed by trailing
# whitespace is a SyntaxError, whereas line breaks inside parentheses are always valid.
if (
    _fine_tuning_finished
    and globals().get('model_gpt2_transcripts') is not None
    and globals().get('tokenizer_gpt2_transcripts') is not None
):
    try:
        os.makedirs(output_dir_gpt2_transcripts, exist_ok=True)
        model_gpt2_transcripts.save_pretrained(output_dir_gpt2_transcripts)
        tokenizer_gpt2_transcripts.save_pretrained(output_dir_gpt2_transcripts)
        print(f"Transcript-tuned GPT-2 Model and tokenizer saved to {output_dir_gpt2_transcripts}")
    except Exception as e:
        print(f"Error saving transcript-tuned GPT-2 model/tokenizer: {e}")
elif not _fine_tuning_finished:
    print("Transcript fine-tuning was not performed or completed. Skipping saving model.")
else:
    print("Transcript-tuned GPT-2 Model or its Tokenizer not available. Skipping saving.")

## Load Transcript-Tuned GPT-2 for Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer 
import torch
import os

# Reload the saved model and tokenizer from disk for inference. Both stay None when the
# directory is missing or loading fails.
fine_tuned_output_dir_transcripts = './fine_tuned_gpt2_youtube_transcripts'
inference_model_transcripts = inference_tokenizer_transcripts = None

# This cell can run in a fresh session, so the device is re-selected if it is not defined yet.
if 'device_transcripts' not in globals():
    if torch.backends.mps.is_available():
        _inference_device_name = "mps"
        _inference_device_message = "Re-initialized MPS device for inference."
    elif torch.cuda.is_available():
        _inference_device_name = "cuda"
        _inference_device_message = "Re-initialized CUDA device for inference."
    else:
        _inference_device_name = "cpu"
        _inference_device_message = "Re-initialized CPU device for inference."
    device_transcripts = torch.device(_inference_device_name)
    print(_inference_device_message)

if not os.path.exists(fine_tuned_output_dir_transcripts):
    print(f"Directory not found: {fine_tuned_output_dir_transcripts}. Ensure model was saved.")
else:
    try:
        inference_model_transcripts = AutoModelForCausalLM.from_pretrained(fine_tuned_output_dir_transcripts)
        inference_tokenizer_transcripts = AutoTokenizer.from_pretrained(fine_tuned_output_dir_transcripts)
        print(f"Fine-tuned transcript model and tokenizer loaded from {fine_tuned_output_dir_transcripts}.")

        inference_model_transcripts.to(device_transcripts)
        inference_model_transcripts.eval()
        print(f"Transcript inference model moved to {device_transcripts} and set to eval mode.")
    except Exception as load_error:
        print(f"Error loading fine-tuned transcript model or tokenizer: {load_error}")

## Text Generation with Transcript-Tuned GPT-2

In [None]:
def generate_text_from_transcripts_model(prompt, model, tokenizer, device, max_length=100, num_return_sequences=1):
    """Generate text from ``prompt`` with the transcript-tuned model.

    Args:
        prompt: Text the generation starts from.
        model: Causal language model exposing ``generate``; ``None`` means "not loaded".
        tokenizer: Tokenizer matching ``model``; ``None`` means "not loaded".
        device: Device the encoded prompt is moved to (should match the model's device).
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: Number of sequences requested from ``model.generate``.

    Returns:
        A list of decoded strings, one per generated sequence. An empty list is returned when
        the model or tokenizer is missing, or when generation fails (the error is printed).
    """
    # Explicit None checks: truthiness of model/tokenizer objects depends on __len__/__bool__.
    if model is None or tokenizer is None:
        print("Inference model or tokenizer for transcripts not available.")
        return []
    try:
        # Calling the tokenizer (rather than .encode) also yields the attention mask, so
        # generate() does not have to guess it when pad and EOS share one token ID.
        encoded = tokenizer(prompt, return_tensors='pt').to(device)

        output_sequences = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            # temperature/top_k only take effect when sampling is enabled. Sampling also lets
            # num_return_sequences > 1 return distinct sequences.
            # NOTE(review): early_stopping was dropped because it is understood to apply to
            # beam search only — confirm against the installed transformers version.
            do_sample=True,
            temperature=0.7,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id,
        )

        return [tokenizer.decode(seq, skip_special_tokens=True) for seq in output_sequences]
    except Exception as e:
        # Best effort for the demo: report the problem and hand back an empty result.
        print(f"Error during text generation with transcript model: {e}")
        return []

In [None]:
# Demo: run a handful of prompts through the transcript-tuned model, if it was loaded.
# The multi-line condition is parenthesised: a backslash continuation followed by trailing
# whitespace is a SyntaxError, whereas line breaks inside parentheses are always valid.
if (
    'inference_model_transcripts' in globals() and inference_model_transcripts
    and 'inference_tokenizer_transcripts' in globals() and inference_tokenizer_transcripts
    and 'device_transcripts' in globals()
):

    prompts_for_transcripts = [
        "Today we're going to talk about",
        "The key thing to remember is",
        "What if I told you that",
        "In this video, I will show you",
        "Let's dive into the world of"
    ]

    print("\n--- Transcript-Tuned GPT-2 Text Generation Demo ---")
    for p in prompts_for_transcripts:
        generated_texts = generate_text_from_transcripts_model(
            p,
            inference_model_transcripts,
            inference_tokenizer_transcripts,
            device_transcripts,
            max_length=75
        )
        print(f"\nPrompt: {p}...")
        if generated_texts:
            for i, g in enumerate(generated_texts):
                print(f"Generated {i+1}: {g}")
        else:
            # The helper returns an empty list both when nothing was produced and on error.
            print("  (No text generated or error occurred)")
        print("-" * 30)
else:
    print("Transcript-tuned inference model, tokenizer, or device not loaded/defined. Cannot demonstrate generation.")