# LLM Training for Virtual Double

In [None]:
import pandas as pd

# Relative path of the cleaned watch-history CSV.
cleaned_data_path = '../data/cleaned_watch_history.csv'

# Read the CSV with 'timestamp_utc' parsed as datetimes. If the file is
# absent, report it and continue with an empty frame that has the expected
# columns, so later cells still have a DataFrame to work with.
try:
    df = pd.read_csv(cleaned_data_path, parse_dates=['timestamp_utc'])
except FileNotFoundError:
    print(f"Error: The file {cleaned_data_path} was not found. Please ensure the 01_data_cleaning notebook ran successfully.")
    df = pd.DataFrame(
        columns=['title', 'video_url', 'channel_name', 'timestamp_utc', 'cleaned_title']
    )
    # Give the empty timestamp column a datetime dtype.
    df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'])
else:
    print(f"Successfully loaded {cleaned_data_path}")

# Show the first rows, followed by the frame's summary.
print("DataFrame head:", df.head(), sep="\n")

print("\nDataFrame info:")
df.info()

## Text Data Preparation for LLM

In [None]:
# Build 'cleaned_title' only when it is absent or entirely null; otherwise the
# existing column is kept as-is.
if 'cleaned_title' not in df.columns or df['cleaned_title'].isnull().all():
    print("'cleaned_title' not found or is empty. Running preprocessing...")
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    # Ensure the NLTK resources are available, downloading them on first use.
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords', quiet=True)
    try:
        word_tokenize('test')  # Probe: raises LookupError if tokenizer data is missing
    except LookupError:
        # Recent NLTK releases load the tokenizer data from 'punkt_tab', while
        # older ones use 'punkt'. Request both so the probe succeeds on either.
        for resource in ('punkt', 'punkt_tab'):
            nltk.download(resource, quiet=True)

    stop_words_set = set(stopwords.words('english'))

    def preprocess_text_for_llm(text):
        """Return *text* lowercased, tokenized, and stripped of noise tokens.

        Keeps only purely alphabetic tokens that are not English stop words
        and joins them with single spaces. Missing values (NaN/None) yield
        an empty string.
        """
        if pd.isna(text):
            return ""
        tokens = word_tokenize(str(text).lower())
        # isalpha() already rejects punctuation and numeric tokens, so no
        # separate punctuation filter is needed.
        return ' '.join(
            word for word in tokens
            if word.isalpha() and word not in stop_words_set
        )

    if 'title' in df.columns:
        df['cleaned_title'] = df['title'].apply(preprocess_text_for_llm)
        print("Finished preprocessing 'title' into 'cleaned_title'.")
        print(df[['title', 'cleaned_title']].head())
    else:
        print("Error: 'title' column missing, cannot generate 'cleaned_title'.")
        # Ensure 'cleaned_title' exists with empty strings if title was missing, to prevent downstream errors
        if 'cleaned_title' not in df.columns:
            df['cleaned_title'] = ""
else:
    print("'cleaned_title' column found.")

In [None]:
from transformers import AutoTokenizer

model_name = 'bert-base-uncased'  # Example model
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer for '{model_name}' loaded successfully.")
except Exception as e:
    # Deliberately broad: any load failure is reported here and tokenization
    # is skipped below instead of aborting the notebook.
    print(f"Error loading tokenizer: {e}")
    tokenizer = None


def _has_text(value):
    """Return True if *value* is a string with non-whitespace content."""
    # The isinstance check excludes NaN/None (and any other non-string), so
    # missing values are never mistaken for text and .strip() cannot fail.
    return isinstance(value, str) and bool(value.strip())


# Populate 'tokenized_input_ids'. Every skip path still creates the column:
# assigning an empty Series aligns on df's index, so existing rows get NaN.
if tokenizer is None:
    print("Tokenizer not loaded. Skipping tokenization.")
    df['tokenized_input_ids'] = pd.Series(dtype='object')
elif 'cleaned_title' not in df.columns or df['cleaned_title'].isnull().all():
    print("'cleaned_title' column not found or is all null. Skipping tokenization.")
    df['tokenized_input_ids'] = pd.Series(dtype='object')
elif not df['cleaned_title'].map(_has_text).any():
    print("'cleaned_title' column contains no actual text to tokenize. Skipping tokenization.")
    df['tokenized_input_ids'] = pd.Series(dtype='object')
else:
    # Rows without usable text get an empty id list rather than being encoded.
    df['tokenized_input_ids'] = df['cleaned_title'].apply(
        lambda x: tokenizer.encode(x, truncation=True, max_length=512) if _has_text(x) else []
    )
    print("\nTokenized 'cleaned_title' into 'tokenized_input_ids':")
    print(df[['cleaned_title', 'tokenized_input_ids']].head())

## LLM Model Loading and Fine-tuning (Placeholder)

In [None]:
# 1. Select an appropriate pre-trained model for sequence classification or generation
#    (e.g., AutoModelForSequenceClassification or AutoModelForCausalLM from Hugging Face)
# from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# 2. Define training arguments
# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=3,              # total number of training epochs
#     per_device_train_batch_size=8,   # batch size per device during training
#     per_device_eval_batch_size=16,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     logging_steps=10,
# )

# 3. Create a custom Dataset object if necessary, or prepare data in a format
#    suitable for the Trainer API. This would involve the 'tokenized_input_ids'
#    and appropriate labels if doing supervised fine-tuning.

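# 4. Load the model and initialize the Trainer
#    NOTE: `model_name` is set to 'bert-base-uncased' earlier in this notebook, which is an
#    encoder-only (masked-LM) checkpoint. For text generation, switch to a causal-LM
#    checkpoint (e.g. 'gpt2') and reload the tokenizer to match; for classification,
#    use AutoModelForSequenceClassification instead.
# model = AutoModelForCausalLM.from_pretrained(model_name)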
# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=your_train_dataset,    # training dataset
#     eval_dataset=your_eval_dataset       # evaluation dataset (optional)
# )

# 5. Start fine-tuning
# trainer.train()

# 6. Evaluate the model
# trainer.evaluate()

# 7. Save the model
# model.save_pretrained('./fine_tuned_youtube_llm')
# tokenizer.save_pretrained('./fine_tuned_youtube_llm')