# Semantic Analysis of YouTube Watch History

In [None]:
import pandas as pd

# Location of the cleaned watch-history CSV that this notebook reads
cleaned_data_path = '../data/cleaned_watch_history.csv'

# Read the CSV with 'timestamp_utc' parsed as datetimes. If the file is absent,
# report it and continue with an empty frame that has the expected columns, so
# the guarded cells further down can skip instead of failing.
try:
    df = pd.read_csv(cleaned_data_path, parse_dates=['timestamp_utc'])
except FileNotFoundError:
    print(
        f"Error: The file {cleaned_data_path} was not found. "
        "Please ensure the previous notebook ran successfully and created this file."
    )
    df = pd.DataFrame(columns=['title', 'video_url', 'channel_name', 'timestamp_utc'])
    # Give the placeholder timestamp column a datetime dtype
    df = df.assign(timestamp_utc=pd.to_datetime(df['timestamp_utc']))

# Preview the first rows
print("DataFrame head:", df.head(), sep="\n")

# Summarise columns, dtypes and non-null counts
print("\nDataFrame info:")
df.info()

## Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set a pleasant style for plots
plt.style.use('seaborn-v0_8-whitegrid')

### Top Watched Channels

In [None]:
# Horizontal bar chart of the channels that occur most often in the history.
if not df.empty and 'channel_name' in df.columns and not df['channel_name'].isnull().all():
    top_n_channels = 15
    channel_counts = df['channel_name'].value_counts().nlargest(top_n_channels)

    plt.figure(figsize=(10, 8))
    # seaborn 0.13 deprecated passing `palette` without `hue` (removal is
    # announced for 0.14). Mapping `hue` to the same categorical variable keeps
    # one viridis colour per bar; `dodge=False` keeps bars full-width on
    # releases where hue would otherwise shrink them.
    channel_ax = sns.barplot(
        x=channel_counts.values,
        y=channel_counts.index,
        hue=channel_counts.index,
        dodge=False,
        palette='viridis',
    )
    # Some seaborn releases draw a legend whenever `hue` is set; it would only
    # repeat the y-axis labels here, so remove it if present.
    channel_legend = channel_ax.get_legend()
    if channel_legend is not None:
        channel_legend.remove()
    plt.title(f'Top {top_n_channels} Most Watched Channels')
    plt.xlabel('Number of Videos Watched')
    plt.ylabel('Channel Name')
    plt.tight_layout()
    plt.show()
else:
    print("DataFrame is empty, 'channel_name' column is missing, or all channel names are NaN. Skipping Top Watched Channels plot.")

### Viewing Activity Over Time

In [None]:
# Bar chart of how many videos were watched in each calendar month.
if not df.empty and 'timestamp_utc' in df.columns and pd.api.types.is_datetime64_any_dtype(df['timestamp_utc']) and not df['timestamp_utc'].isnull().all():
    # Videos per month
    watch_timestamps = df['timestamp_utc']
    if watch_timestamps.dt.tz is not None:
        # to_period() discards timezone information and emits a UserWarning
        # when given tz-aware values; drop the timezone explicitly instead
        # (the wall-clock values, and therefore the months, are unchanged).
        watch_timestamps = watch_timestamps.dt.tz_localize(None)
    df['watch_month_year'] = watch_timestamps.dt.to_period('M')
    videos_per_month = df.groupby('watch_month_year').size()

    # Months with no watched videos are absent from the groupby result, which
    # would make the x-axis skip them silently. Zero-fill the whole span so the
    # timeline is continuous, matching the hour/weekday plots' reindexing.
    all_months = pd.period_range(
        videos_per_month.index.min(), videos_per_month.index.max(), freq='M'
    )
    videos_per_month = videos_per_month.reindex(all_months, fill_value=0)
    # String labels ('YYYY-MM') render cleanly as bar tick labels
    videos_per_month.index = videos_per_month.index.astype(str)

    plt.figure(figsize=(12, 6))
    videos_per_month.plot(kind='bar', colormap='cividis')
    plt.title('Videos Watched per Month-Year')
    plt.xlabel('Month-Year')
    plt.ylabel('Number of Videos Watched')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("DataFrame is empty or 'timestamp_utc' column is missing, not datetime, or all values are NaT. Skipping Videos per month plot.")

In [None]:
# Distribution of watch events over the 24 hours of the day.
# NOTE(review): hours are taken directly from 'timestamp_utc', so they are
# presumably UTC hours rather than the viewer's local time - confirm.
if (
    df.empty
    or 'timestamp_utc' not in df.columns
    or not pd.api.types.is_datetime64_any_dtype(df['timestamp_utc'])
    or df['timestamp_utc'].isnull().all()
):
    print("DataFrame is empty or 'timestamp_utc' column is missing, not datetime, or all values are NaT. Skipping Videos per hour plot.")
else:
    df['watch_hour'] = df['timestamp_utc'].dt.hour
    hourly_sizes = df.groupby('watch_hour').size()
    # Force all 24 hours onto the axis, with 0 for hours that never occur
    videos_per_hour = hourly_sizes.reindex(range(24), fill_value=0)

    plt.figure(figsize=(12, 6))
    hour_ax = videos_per_hour.plot(kind='bar', colormap='plasma')
    hour_ax.set(
        title='Videos Watched per Hour of the Day',
        xlabel='Hour of the Day (0-23)',
        ylabel='Number of Videos Watched',
    )
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:
# Distribution of watch events over the days of the week.
if (
    df.empty
    or 'timestamp_utc' not in df.columns
    or not pd.api.types.is_datetime64_any_dtype(df['timestamp_utc'])
    or df['timestamp_utc'].isnull().all()
):
    print("DataFrame is empty or 'timestamp_utc' column is missing, not datetime, or all values are NaT. Skipping Videos per day of week plot.")
else:
    df['watch_dayofweek_name'] = df['timestamp_utc'].dt.day_name()
    # Calendar order for the x-axis (a plain groupby would sort alphabetically)
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_sizes = df.groupby('watch_dayofweek_name').size()
    # Days that never occur are shown with a count of 0
    videos_per_day = weekday_sizes.reindex(days_order, fill_value=0)

    plt.figure(figsize=(10, 6))
    day_ax = videos_per_day.plot(kind='bar', colormap='summer')
    day_ax.set(
        title='Videos Watched per Day of the Week',
        xlabel='Day of the Week',
        ylabel='Number of Videos Watched',
    )
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Semantic Analysis of Video Titles

### Text Preprocessing

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK data used below is available, downloading it on first use.
# Each probe raises LookupError when its resource is not installed.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
try:
    word_tokenize('test')
except LookupError:
    # Older NLTK releases load the tokenizer from the 'punkt' package, newer
    # ones from 'punkt_tab'. Fetch both so word_tokenize finds its data
    # regardless of the installed NLTK version.
    for punkt_resource in ('punkt', 'punkt_tab'):
        nltk.download(punkt_resource, quiet=True)

# English stop words as a set for O(1) membership tests during filtering
stop_words = set(stopwords.words('english'))
# ASCII punctuation characters, as a single string
punctuations = string.punctuation

def preprocess_text(text):
    """Turn a raw title into a list of lowercase, alphabetic, non-stop-word tokens.

    Missing values (anything ``pd.isna`` reports as NA) yield an empty list.
    Everything else is converted to ``str``, lower-cased and tokenized with
    ``word_tokenize`` before filtering.
    """
    if pd.isna(text):
        return []

    lowered = str(text).lower()
    # A token survives only if it is not found in the punctuation string, is
    # purely alphabetic, and is not a stop word.
    return [
        token
        for token in word_tokenize(lowered)
        if token not in punctuations and token.isalpha() and token not in stop_words
    ]

# Tokenize every title, keeping both the token list and a space-joined string.
if df.empty or 'title' not in df.columns:
    print("DataFrame is empty or 'title' column is missing. Skipping text preprocessing.")
    # Later cells look these columns up, so create empty placeholders for
    # whichever of them does not exist yet.
    for placeholder_name, placeholder_dtype in (
        ('cleaned_title_tokens', 'object'),
        ('cleaned_title', 'str'),
    ):
        if placeholder_name not in df.columns:
            df[placeholder_name] = pd.Series(dtype=placeholder_dtype)
else:
    title_tokens = df['title'].apply(preprocess_text)
    df['cleaned_title_tokens'] = title_tokens
    df['cleaned_title'] = df['cleaned_title_tokens'].map(' '.join)
    print("Sample cleaned titles:")
    print(df[['title', 'cleaned_title']].head())

### TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Both stay None unless vectorization succeeds; later cells test for None.
tfidf_matrix = None  # TF-IDF document-term matrix returned by fit_transform
vectorizer = None  # The fitted TfidfVectorizer (needed later for feature names)

# Drop NaN *before* testing for text: astype(bool) treats NaN as True, so an
# all-NaN placeholder column would otherwise pass the "has text" check and only
# fail inside fit_transform.
if not df.empty and 'cleaned_title' in df.columns:
    title_corpus = df['cleaned_title'].dropna()
else:
    title_corpus = None

if title_corpus is not None and title_corpus.astype(bool).any():  # at least one non-empty title
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(title_corpus)
        print(f"Shape of TF-IDF matrix: {tfidf_matrix.shape}")
    except ValueError as e:
        # Do not leave an unfitted vectorizer behind for later cells
        vectorizer = None
        print(f"Error during TF-IDF vectorization: {e}. This might happen if 'cleaned_title' is all empty strings or NaNs.")
else:
    print("DataFrame is empty, 'cleaned_title' column is missing, or contains no text data. Skipping TF-IDF Vectorization.")

### Topic Modeling with LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of each topic in ``model``.

    Args:
        model: Object whose ``components_`` attribute is iterated row by row;
            each row must support ``argsort()`` (one row of weights per topic).
        feature_names: Sequence indexed by column position to obtain a term.
        no_top_words: Number of top-weighted terms to print per topic.

    For every topic a ``Topic N:`` header (1-based) is printed, followed by a
    line with the terms in descending order of weight.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"Topic {topic_number}:")
        # argsort is ascending: reverse it, then keep the leading columns
        ranked_columns = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[column] for column in ranked_columns]
        print(" " + " ".join(top_terms))

# Fit an LDA topic model on the TF-IDF matrix and print each topic's top terms.
# NOTE(review): LDA is commonly fitted on raw term counts rather than TF-IDF
# weights - confirm that TF-IDF input is intended here.
if tfidf_matrix is None or vectorizer is None:
    print("TF-IDF matrix or vectorizer is not available. Skipping Topic Modeling with LDA.")
else:
    n_topics = 5  # Number of topics to extract
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)

    # Only fit when there is at least one document and one stored non-zero entry
    if tfidf_matrix.shape[0] == 0 or tfidf_matrix.nnz == 0:
        print("TF-IDF matrix is empty or all zero. Skipping LDA fitting. This might be due to no text data or all text data being filtered out.")
    else:
        lda.fit(tfidf_matrix)
        print(f"\nTop words for {n_topics} topics found by LDA:")
        display_topics(lda, vectorizer.get_feature_names_out(), 10)