# Import Packages and Data

## Load in Packages

In [1]:
# import libraries to navigate within directories
import os
import sys
import re

# the usual suspects
import pandas as pd
import numpy as np

# NLP libraries
import nltk

# NLP pre-processing from nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# let's make pre-processing more efficient
## batch processing
from typing import List
## parallel processing
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

# Jupyter-specific imports
from IPython.display import clear_output
%matplotlib inline 

# Suppress warnings (optional)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\felic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\felic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\felic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\felic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data

In [2]:
df_metadata_transcript = pd.read_csv('metadata_transcript.csv')
df_metadata_transcript.head(5)

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content
0,Black Girls Code,Have You Ever Thought About Working at Google?...,https://www.youtube.com/watch?v=ahQQpYG0Lhk,4/26/24,6:26,199330,5625,4,haveyoueverthoughtaboutworkingatgoogleepisode8...,1,hello everyone and welcome back to codal along...
1,Black Girls Code,Tutorial 4: How to Code Animated Art In JavaSc...,https://www.youtube.com/watch?v=I49CXUIXvM8,4/28/23,11:18,84000,179,8,tutorial4howtocodeanimatedartinjavascriptcodea...,1,what's up for the encoders welcome back to the...
2,Black Girls Code,CODE Along: Win $2500!,https://www.youtube.com/watch?v=fZKyNSH2mmM,8/25/23,2:19,139194,344,18,codealongwin2500.txt,1,hey what's up everyone I'm Cheyenne and I'm yo...
3,Black Girls Code,Don't Know What Career to Choose? Actress and ...,https://www.youtube.com/watch?v=ohceCkLK8Wo,8/21/23,0:44,1143,40,2,dontknowwhatcareertochooseactressandgamerellab...,1,this question means so much to me because when...
4,Black Girls Code,Watch Actress Ella Balinska Recap Her Career i...,https://www.youtube.com/watch?v=jAYK6vJbhxU,8/19/23,0:48,3493,117,10,watchactressellabalinskarecaphercareerin60seco...,1,60 seconds okay so I started off uh with a cou...


# Text Pre-Processing
We employ different methods to pre-process the text before using NLP to analyze it:
- Lowercasing ---> seems like we've done this!
- Tokenization, as a pre-processing step for Term Frequency Distributions
- Remove Stop Words!
- Remove leftover contraction tokens ("'s", "'m", etc...)
- Lemmatization, to simplify tokens further by removing plurals and such

Takes a long time to run... 
- VF Record: 3m 43.8s, 11/28/24 11:52pm WOW WHAT HAPPENED HERE WHY IS IT BETTER
- VF Longest Run: 12m 27.6s, 11/14/24 10:15am

In [3]:
# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define custom filler words
custom_fillers = {
    'uh', 'um', 'na', 'like', 'actually', 'basically', 'literally',
    'you know', 'i mean', 'sort of', 'kind of', 'yeah', 'okay',
    'right', 'well', 'so', 'just', 'going', 'oh', 'ah', 'gon'
} # i just add onto this when i see something repetitive or weird
# Pre-compute lowercase stop words and unwanted tokens
stop_words_lower = {word.lower() for word in stop_words}
all_filtered_words = stop_words_lower.union(custom_fillers)
unwanted_tokens = {"'s", "'m", "'re", "'ve", "'d", "'ll", "n't"}
all_filtered_words.update(unwanted_tokens)

def optimized_preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return []
    
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Combine operations
    tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.lower() not in all_filtered_words
    ]
    return tokens

# Apply preprocessing to the 'content' column
df_metadata_transcript['preprocessed_content'] = df_metadata_transcript['content'].apply(optimized_preprocess_text)

# Verify columns are preserved
print("Columns in DataFrame:", df_metadata_transcript.columns.tolist())
print("\nFirst few rows of content and preprocessed content:")
print(df_metadata_transcript[['content', 'preprocessed_content']].head())

Columns in DataFrame: ['channel_name', 'video_title', 'video_url', 'publish_date', 'video_length', 'views', 'likes', 'comments', 'transcript_filename', 'Tier (1, 2, or 3)', 'content', 'preprocessed_content']

First few rows of content and preprocessed content:
                                             content  \
0  hello everyone and welcome back to codal along...   
1  what's up for the encoders welcome back to the...   
2  hey what's up everyone I'm Cheyenne and I'm yo...   
3  this question means so much to me because when...   
4  60 seconds okay so I started off uh with a cou...   

                                preprocessed_content  
0  [hello, everyone, welcome, back, codal, along,...  
1  [whats, encoders, welcome, back, fourth, final...  
2  [hey, whats, everyone, im, cheyenne, im, resid...  
3  [question, mean, much, whenever, anything, mak...  
4  [second, started, couple, writer, pass, tv, sh...  


# Get Summary Statistics per YouTube Channel, per YouTube Video. Specifically, we'll get the Total Words & Unique Words across all dimensions.

In [4]:
# Get statistics from the preprocessed DataFrame
def get_video_statistics(df):
    video_stats = {}
    for _, row in df.iterrows():
        words = row['preprocessed_content']  # Already a list, no need to split
        video_stats[row['video_title']] = {
            'total_words': len(words),
            'unique_words': len(set(words))
        }
    return video_stats

def get_channel_statistics(df):
    channel_stats = {}
    
    # Group by channel
    for channel_name, channel_group in df.groupby('channel_name'):
        all_words = []
        unique_words = set()
        
        for word_list in channel_group['preprocessed_content']:
            all_words.extend(word_list)
            unique_words.update(word_list)
            
        channel_stats[channel_name] = {
            'total_words': len(all_words),
            'unique_words': len(unique_words),
            'video_count': len(channel_group)
        }
    return channel_stats

def get_corpus_statistics(df):
    """Calculate statistics for the entire corpus"""
    all_words = []
    unique_words = set()
    
    for word_list in df['preprocessed_content']:
        all_words.extend(word_list)
        unique_words.update(word_list)
    
    return {
        'total_words': len(all_words),
        'unique_words': len(unique_words),
        'total_videos': len(df),
        'total_channels': len(df['channel_name'].unique())
    }

# Calculate statistics
video_statistics = get_video_statistics(df_metadata_transcript)
channel_statistics = get_channel_statistics(df_metadata_transcript)
corpus_statistics = get_corpus_statistics(df_metadata_transcript)

# Convert video statistics to DataFrame
video_stats_df = pd.DataFrame.from_dict(video_statistics, orient='index')
video_stats_df.index.name = 'video_title'
video_stats_df = video_stats_df.reset_index()

# Merge statistics with original DataFrame
df_metadata_transcript = df_metadata_transcript.merge(
    video_stats_df, 
    on='video_title', 
    how='left'
)

# Calculate channel statistics DataFrame
channel_stats_df = pd.DataFrame.from_dict(channel_statistics, orient='index')

# Display results
print("\nCorpus-Level Statistics:")
print("-" * 50)
for key, value in corpus_statistics.items():
    print(f"{key}: {value:,}")

print("\nUpdated DataFrame with word statistics:")
print(df_metadata_transcript[['video_title', 'channel_name', 'total_words', 'unique_words']].head())

print("\nPer Channel Statistics:")
print(channel_stats_df.sort_values('total_words', ascending=False))

print("\nSummary Statistics:")
print("\nVideo Level:")
print(df_metadata_transcript[['total_words', 'unique_words']].describe())
print("\nChannel Level:")
print(channel_stats_df.describe())


Corpus-Level Statistics:
--------------------------------------------------
total_words: 36,516,986
unique_words: 128,028
total_videos: 16,206
total_channels: 65

Updated DataFrame with word statistics:
                                         video_title      channel_name  \
0  Have You Ever Thought About Working at Google?...  Black Girls Code   
1  Tutorial 4: How to Code Animated Art In JavaSc...  Black Girls Code   
2                             CODE Along: Win $2500!  Black Girls Code   
3  Don't Know What Career to Choose? Actress and ...  Black Girls Code   
4  Watch Actress Ella Balinska Recap Her Career i...  Black Girls Code   

   total_words  unique_words  
0          546           281  
1          711           271  
2          209           152  
3           61            53  
4           64            56  

Per Channel Statistics:
                          total_words  unique_words  video_count
freeCodeCamp.org             15671181         78140         1600
The Coding

## Save the pre-processed data as a new .csv

In [5]:
# Save as CSV (existing code)
df_metadata_transcript.to_csv('clean_metadata_transcript.csv', index=False)

# Also save as pickle for faster loading and preserved data types
df_metadata_transcript.to_pickle('clean_metadata_transcript.pkl')