In [1]:
# import libraries to navigate within directories
import os
import sys
import re

# the usual suspects
import pandas as pd
import numpy as np
import math
from collections import Counter, defaultdict

# Import sparse to help process NL data into chunks
from scipy import sparse # if we don't use this your computer's memory will see heaven

# NLP libraries
import nltk
## LLR calculation import
from scipy.stats import chi2_contingency
## TF-IDF calculation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize # normalize TF/TF-IDF scores

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter-specific imports
from IPython.display import clear_output
%matplotlib inline 

# Suppress warnings (optional)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Load the cleaned metadata/transcript CSV and preview its first five rows.
df_metadata_transcript = pd.read_csv(filepath_or_buffer='clean_metadata_transcript.csv')
df_metadata_transcript.head(n=5)

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content,preprocessed_content,total_words,unique_words
0,Black Girls Code,Have You Ever Thought About Working at Google?...,https://www.youtube.com/watch?v=ahQQpYG0Lhk,4/26/24,6:26,199330,5625,4,haveyoueverthoughtaboutworkingatgoogleepisode8...,1,hello everyone and welcome back to codal along...,"['hello', 'everyone', 'welcome', 'back', 'coda...",546,281
1,Black Girls Code,Tutorial 4: How to Code Animated Art In JavaSc...,https://www.youtube.com/watch?v=I49CXUIXvM8,4/28/23,11:18,84000,179,8,tutorial4howtocodeanimatedartinjavascriptcodea...,1,what's up for the encoders welcome back to the...,"['whats', 'encoders', 'welcome', 'back', 'four...",711,271
2,Black Girls Code,CODE Along: Win $2500!,https://www.youtube.com/watch?v=fZKyNSH2mmM,8/25/23,2:19,139194,344,18,codealongwin2500.txt,1,hey what's up everyone I'm Cheyenne and I'm yo...,"['hey', 'whats', 'everyone', 'im', 'cheyenne',...",209,152
3,Black Girls Code,Don't Know What Career to Choose? Actress and ...,https://www.youtube.com/watch?v=ohceCkLK8Wo,8/21/23,0:44,1143,40,2,dontknowwhatcareertochooseactressandgamerellab...,1,this question means so much to me because when...,"['question', 'mean', 'much', 'whenever', 'anyt...",61,53
4,Black Girls Code,Watch Actress Ella Balinska Recap Her Career i...,https://www.youtube.com/watch?v=jAYK6vJbhxU,8/19/23,0:48,3493,117,10,watchactressellabalinskarecaphercareerin60seco...,1,60 seconds okay so I started off uh with a cou...,"['second', 'started', 'couple', 'writer', 'pas...",64,56


# Log-Odds Ratio vs. TFDs Visualization per YouTube Channel
[DEF]: Statistical measure that compares the probability of a word occurring in one corpus versus another, expressed as a logarithmic ratio. It helps identify words that are disproportionately used in one channel compared to others. It is computed in three steps:

Step 1: Calculate the Odds for Each Corpus
For Corpus 1: Take the probability of a word occurring (p1) divided by the probability of it not occurring (1-p1)
For Corpus 2: Take the probability of a word occurring (p2) divided by the probability of it not occurring (1-p2)
Step 2: Create the Ratio
Divide the odds from Corpus 1 by the odds from Corpus 2
Step 3: Take the Logarithm
Apply the natural logarithm to this ratio
 
* In the context of our data, it would help:
    * Compare word usage patterns between different YouTube channels
    * Identify characteristic vocabulary unique to specific channels
    * Measure the relative frequency of terms between different corpora
    * Visualize word distribution patterns across channels
* How does this differ from LLR and other metrics?
    * Uses probability ratios instead of statistical significance testing
    * Provides a more straightforward interpretation of word frequency differences
    * Can be combined with Term Frequency Distribution (TFD) for richer analysis
    * Better for quick comparative analysis without assuming statistical significance
* Interpretation:
    * Positive log-odds indicate words more characteristic of the target channel
    * Negative log-odds show words underrepresented in the target channel
    * The magnitude indicates the strength of the association
    * Can be visualized against TFD for better understanding of word importance
* Implementation:
    * Requires calculating word frequencies in both corpora. Applies smoothing to handle zero frequencies. Uses numpy for logarithmic calculations. Can be visualized using scatter plots of log-odds vs TFD

In [3]:
def _parse_token_list(doc):
    """Return the tokens of one ``preprocessed_content`` cell as a list of strings.

    Args:
        doc: A list/tuple of tokens, the string form of such a list
            (e.g. ``"['hello', 'everyone']"``), or a missing value
            (``None`` / ``NaN``).

    Returns:
        list[str]: The non-empty tokens. Missing or unsupported input yields
        ``[]`` instead of raising, so one bad row cannot abort the analysis.
    """
    import ast  # local import so this cell does not depend on an extra top-level import

    if isinstance(doc, str):
        try:
            parsed = ast.literal_eval(doc)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            doc = parsed
        else:
            # Not a Python list literal: strip brackets/quotes by hand instead.
            doc = doc.strip('[]').replace("'", "").split(', ')
    elif not isinstance(doc, (list, tuple)):
        # NaN (a float), None, or any other non-sequence value: no tokens.
        return []
    return [str(word) for word in doc if word]


def _log_odds_vs_tfd_table(channel_freq, corpus_freq):
    """Build the per-term log-odds-ratio / TFD table for one channel.

    The channel is compared against the *rest* of the corpus (corpus counts
    minus the channel's own counts). Every count is smoothed by 0.5 so that
    terms absent from the rest of the corpus still get finite odds.

    Args:
        channel_freq (Counter): term -> count within the channel (non-empty).
        corpus_freq (Counter): term -> count within the whole corpus,
            including the channel itself.

    Returns:
        pd.DataFrame: indexed by term, with columns
            ``log_odds`` = ln( odds(term | channel) / odds(term | rest) ) and
            ``tfd``      = term count / total channel tokens.
    """
    terms = list(channel_freq)
    in_channel = np.array([channel_freq[term] for term in terms], dtype=float)
    in_rest = np.array([corpus_freq[term] for term in terms], dtype=float) - in_channel

    channel_total = in_channel.sum()
    rest_total = sum(corpus_freq.values()) - channel_total

    # odds = p / (1 - p); with 0.5 smoothing this is (c + 0.5) / (N - c + 0.5).
    odds_channel = (in_channel + 0.5) / (channel_total - in_channel + 0.5)
    odds_rest = (in_rest + 0.5) / (rest_total - in_rest + 0.5)

    return pd.DataFrame(
        {
            'log_odds': np.log(odds_channel / odds_rest),
            'tfd': in_channel / channel_total,
        },
        index=terms,
    )


def calculate_log_odds_vs_tfd(df):
    """Plot and export log-odds ratio vs. term-frequency distribution per channel.

    For every distinct ``channel_name`` in ``df`` the channel's vocabulary is
    compared against all other rows of ``df``. One scatter plot (PNG) and one
    table (CSV) per channel are written to the ``Log_Odds_Analysis`` directory,
    which is created if it does not exist. The ten terms with the highest
    log-odds are annotated on each plot.

    Args:
        df (pd.DataFrame): Must contain the columns ``channel_name`` and
            ``preprocessed_content``. Each ``preprocessed_content`` cell is a
            token list or the string form of one; missing cells are ignored.

    Returns:
        None. Results are written to disk and a status line is printed per
        channel. A failure in one channel is reported and does not stop the
        remaining channels.
    """
    output_dir = 'Log_Odds_Analysis'
    os.makedirs(output_dir, exist_ok=True)

    # Tokenise every row exactly once, then derive both the corpus-wide and the
    # per-channel counts from the same parsed tokens.
    token_lists = [_parse_token_list(doc) for doc in df['preprocessed_content']]
    corpus_freq = Counter(token for tokens in token_lists for token in tokens)

    channel_freqs = {}
    for channel, tokens in zip(df['channel_name'], token_lists):
        if pd.isna(channel):
            # Rows without a channel still count towards the corpus, but they
            # do not form a channel of their own.
            continue
        channel_freqs.setdefault(channel, Counter()).update(tokens)

    for channel in sorted(channel_freqs, key=str):
        fig = None
        try:
            channel_freq = channel_freqs[channel]
            if not channel_freq:
                raise ValueError("No valid tokens found in channel")

            results_df = _log_odds_vs_tfd_table(channel_freq, corpus_freq)
            results_df.index.name = 'term'

            # Create visualization
            fig, ax = plt.subplots(figsize=(12, 8))
            ax.scatter(results_df['tfd'], results_df['log_odds'], alpha=0.5)
            ax.set_xlabel('Term Frequency Distribution (TFD)')
            ax.set_ylabel('Log-Odds Ratio')
            ax.set_title(f'Log-Odds vs TFD for {channel}')

            # Label the terms most characteristic of this channel
            for term, row in results_df.nlargest(10, 'log_odds').iterrows():
                ax.annotate(
                    term,
                    xy=(row['tfd'], row['log_odds']),
                    xytext=(5, 5),
                    textcoords='offset points'
                )

            # Save plot and data under a file-system-safe channel name
            safe_channel_name = "".join(
                x for x in str(channel) if x.isalnum() or x in (' ', '-', '_')
            )
            fig.savefig(os.path.join(output_dir, f'{safe_channel_name}_log_odds_tfd.png'),
                        bbox_inches='tight', dpi=300)
            results_df.to_csv(
                os.path.join(output_dir, f'{safe_channel_name}_log_odds_tfd.csv'),
                index=True
            )

            print(f"Created Log-Odds vs TFD analysis for {channel}")

        except Exception as e:
            # Best effort: report the failing channel and move on to the next one.
            print(f"Error processing channel {channel}: {str(e)}")
        finally:
            # Always release the figure, even when plotting or saving failed.
            if fig is not None:
                plt.close(fig)

# Run the log-odds vs. TFD analysis on the full metadata/transcript table.
# Side effects: creates the 'Log_Odds_Analysis' directory if needed, writes one
# PNG scatter plot and one CSV per channel into it, and prints a status line
# per channel.
calculate_log_odds_vs_tfd(df_metadata_transcript)

Created Log-Odds vs TFD analysis for AlvinBlox
Created Log-Odds vs TFD analysis for Beginner's Code Zone
Created Log-Odds vs TFD analysis for Black Girls Code
Created Log-Odds vs TFD analysis for Brackeys
Created Log-Odds vs TFD analysis for Bro Code
Created Log-Odds vs TFD analysis for CS Dojo
Created Log-Odds vs TFD analysis for CS50 Made Easy with Rahul
Created Log-Odds vs TFD analysis for Cave of Programming
Created Log-Odds vs TFD analysis for Clear Code
Created Log-Odds vs TFD analysis for Clever Programmer
Created Log-Odds vs TFD analysis for Code Monkey
Created Log-Odds vs TFD analysis for Code With Conner
Created Log-Odds vs TFD analysis for Code.org
Created Log-Odds vs TFD analysis for CodeBeauty
Created Log-Odds vs TFD analysis for CodeMonkey - Coding Games for Kids
Created Log-Odds vs TFD analysis for CodeWithChris
Created Log-Odds vs TFD analysis for Codecademy
Created Log-Odds vs TFD analysis for Coding Kids
Created Log-Odds vs TFD analysis for Coding With Kids
Created Lo

# Log-Likelihood Ratio (LLR), Fightin' Words, and Frequency Shifts
I think these will probably do better for visualizing relationships between different YouTube videos and channels.


## Log-Likelihood Ratio (LLR)
- [DEF]: Statistical method that compares frequency of words in two different corpora, determining which words are more strongly associated with one corpus; essentially, we find distinctive words between two groups of texts, where that set of distinctive words is more associated with one group than another (positive vs. negative review language)! what words are "over-represented" and "under-represented" in one group of texts compared to another?
- In the context of our data, it would help: 
    - Determine which words are significantly overrepresented or underrepresented in one group of video transcripts compared to another
    - Identify characteristic vocabulary that distinguishes different channels or video categories
    - Quantify the strength of association between specific words and YouTube Channels
- How does this differ from what we've done before (TFs, TFDs, TF-IDFs...)?
    - Compares word frequencies between two different corpora statistically, instead of multiplying TF by IDF
    - Tests whether word frequency differences between groups are statistically significant
    - Better suited for comparing entire collections against each other
    - Accounts for both presence and absence of terms
- Interpretation:
    - A higher LLR score for a word in a specific channel indicates the word is distinctively associated with that channel's content
    - Simultaneously, the score measures how unlikely it is that the word's frequency in that channel occurred by chance compared to its appearance across all channels
    - And it's statistically significant!!!
- Implementation: use just python's math library OR scipy.stats to manually code an implementation, or we can just find one online and adapt it for our uses... because it's literally just a bunch of math to code LLR tbh

We will produce comparisons between YouTube channels, with BGC being our northstar comparison.

NOTE: LLR vs. frequency visualization

In [None]:
# def process_channel_content(texts):
#     """Process content for a single channel"""
#     processed = []
#     for text in texts:
#         if isinstance(text, str):
#             words = text.strip('[]').replace("'", "").split(',')
#             processed.extend([word.strip() for word in words 
#                             if len(word.strip()) > 1 and word.strip().isalnum()])
#     return processed

# def calculate_pairwise_llr(texts1, texts2):
#     """Calculate LLR scores between two channels"""
#     # Process texts for both channels
#     words1 = process_channel_content(texts1)
#     words2 = process_channel_content(texts2)
    
#     # Get word counts for both channels
#     counts1 = Counter(words1)
#     counts2 = Counter(words2)
    
#     total_words1 = sum(counts1.values())
#     total_words2 = sum(counts2.values())
    
#     # Calculate LLR for each word
#     word_llrs = {}
#     vocab = set(counts1.keys()) | set(counts2.keys())
    
#     for word in vocab:
#         if (counts1.get(word, 0) + counts2.get(word, 0)) >= 2:
#             # Contingency table values
#             a = counts1.get(word, 0)  # word count in channel 1
#             b = total_words1 - a      # non-word count in channel 1
#             c = counts2.get(word, 0)  # word count in channel 2
#             d = total_words2 - c      # non-word count in channel 2
            
#             contingency = np.array([[a, b], [c, d]])
            
#             try:
#                 llr_stat = chi2_contingency(contingency, lambda_="log-likelihood")[0]
#                 word_llrs[word] = {
#                     'llr_score': llr_stat,
#                     'count_channel1': a,
#                     'count_channel2': c
#                 }
#             except ValueError:
#                 continue
    
#     return word_llrs

# def analyze_bgc_pairwise_llr(df):
#     """Analyze Black Girls Code against all other channels using LLR"""
#     # Create output directory
#     output_dir = 'black_girls_code_llr_comparisons'
#     os.makedirs(output_dir, exist_ok=True)
    
#     # Get Black Girls Code content
#     bgc_texts = df[df['channel_name'] == 'Black Girls Code']['preprocessed_content'].tolist()
#     if not bgc_texts:
#         raise ValueError("No Black Girls Code content found in dataset")
    
#     results = {}
#     all_comparisons = []
    
#     # Compare with each other channel
#     for channel in df['channel_name'].unique():
#         if channel != 'Black Girls Code':
#             try:
#                 # Get channel texts
#                 other_texts = df[df['channel_name'] == channel]['preprocessed_content'].tolist()
                
#                 # Calculate LLR scores
#                 word_llrs = calculate_pairwise_llr(bgc_texts, other_texts)
                
#                 if word_llrs:
#                     # Convert to DataFrame
#                     comparison = pd.DataFrame.from_dict(word_llrs, orient='index')
#                     comparison['word'] = comparison.index
#                     comparison = comparison.sort_values('llr_score', ascending=False)
                    
#                     # Save individual comparison
#                     safe_channel = "".join(x for x in channel if x.isalnum() or x in (' ', '-', '_'))
#                     filename = f'BGC_vs_{safe_channel}_llr.csv'
#                     filepath = os.path.join(output_dir, filename)
#                     comparison.to_csv(filepath, index=False)
                    
#                     results[channel] = comparison
#                     all_comparisons.append(comparison.assign(compared_channel=channel))
                    
#                     print(f"\nCompleted LLR comparison: Black Girls Code vs {channel}")
#                     if len(comparison) > 0:
#                         print("\nTop 10 words by LLR score:")
#                         print(comparison[['word', 'llr_score', 'count_channel1', 'count_channel2']].head(10))
                
#             except Exception as e:
#                 print(f"Error comparing with {channel}: {str(e)}")
#                 continue
    
#     # Save combined results
#     if all_comparisons:
#         combined = pd.concat(all_comparisons, ignore_index=True)
#         combined.to_csv(os.path.join(output_dir, 'all_llr_comparisons.csv'), index=False)
#         print("\nSaved combined LLR comparisons file")
    
#     return results

# def main(df):
#     """Main analysis pipeline"""
#     if df['preprocessed_content'].isna().all():
#         raise ValueError("No valid content found in preprocessed_content column")
    
#     print("Starting Black Girls Code LLR comparisons...")
#     results = analyze_bgc_pairwise_llr(df)
#     print("LLR Analysis complete!")
#     return results

# # Run the analysis
# if __name__ == "__main__":
#     results = main(df_metadata_transcript)

Starting Black Girls Code LLR comparisons...

Completed LLR comparison: Black Girls Code vs Codecademy

Top 10 words by LLR score:
          word    llr_score  count_channel1  count_channel2
girl      girl  1107.230000             246              27
black    black   911.628811             245              85
block    block   749.612132             250             177
art        art   384.290527             118              65
woman    woman   377.511784             111              53
data      data   364.893311              29            3607
sprite  sprite   348.185160              86              19
afro      afro   273.434983              54               0
game      game   268.524805             217             618
young    young   192.461704              56              25

Completed LLR comparison: Black Girls Code vs Coding for kids

Top 10 words by LLR score:
          word   llr_score  count_channel1  count_channel2
print    print  540.548529               2             419


## Fighting Words
- [DEF] Extension of LLR. Uses z-scores to adjust word frequency differences, highlighting over- and under-represented words between two corpora (in other words, highlighting polarized or distinctive words; FightingWords has historically been used for political speeches, for example). Typically used when there is a need to focus on identifying polarized language or specific distinctions in word usage between corpora.
- Interpretation: FightingWords calculates a Z-Score for each word. At 95% confidence the critical Z-Score range is [-1.96, 1.96]; any Z-Score outside of that range is statistically significant, where a positive Z-Score is more indicative of Channel A and a negative Z-Score is more indicative of Channel B. As a practical example, let's say we were comparing word usage between Black Girls Code versus code.org. If we looked up that specific comparison .csv and looked at the word "black": the Z-Score is +25.37, meaning that the word "Black" is very strongly associated with Black Girls Code in this particular channel comparison.
- In the context of our data, it would help:
    - Highlight polarized or distinctive language between different channels
    - Identifying words that are strongly associated with specific content types
    - **Detecting significant vocabulary differences between different groups of videos**
- How does this differ from what we've done before (TFs, TFDs, TF-IDFs...)?
    - Incorporates statistical significance through using z-scores
    - Accounts for sampling variability in word frequencies
    - More robust when dealing with varying transcript lengths
- Implementation: we'd basically need to "borrow" the code from open-source repositories, adapt it for our own purposes, save it as a Python script... like this one (https://github.com/jmhessel/FightingWords) or this one from Myra Cheng, a PhD in CS advised by Dan Jurafsky (https://github.com/myracheng/markedpersonas/blob/main/marked_words.py). ***Our implementation ended up taking from both.***

In [None]:
# from collections import defaultdict
# from typing import List, Dict

# class BlackGirlsCodeComparison:
#     """Class to perform FightingWords analysis comparing Black Girls Code with other channels"""
    
#     def __init__(self, prior: float = 0.01):
#         """
#         Initialize with smoothing prior
#         Args:
#             prior (float): Smoothing parameter for word frequencies
#         """
#         self.prior = prior
        
#     def process_text(self, text: str) -> List[str]:
#         """
#         Process text to get clean word list
#         Args:
#             text (str): Input text to process
#         Returns:
#             List[str]: List of cleaned words
#         """
#         if isinstance(text, str):
#             # Handle string representation of list
#             if text.startswith('[') and text.endswith(']'):
#                 text = text.strip('[]').replace("'", "").replace(",", " ")
#             return [word.lower() for word in text.split() if word.isalnum()]
#         return []
        
#     def calculate_zscores(self, texts1: List[str], texts2: List[str], 
#                          channel_name: str) -> pd.DataFrame:
#         """
#         Calculate z-scores between two text corpora using Monroe et al.'s method
#         Args:
#             texts1 (List[str]): Black Girls Code texts
#             texts2 (List[str]): Comparison channel texts
#             channel_name (str): Name of channel being compared
#         Returns:
#             pd.DataFrame: DataFrame with word comparison statistics
#         """
#         # Initialize word count dictionaries
#         counts1 = defaultdict(int)  # BGC word counts
#         counts2 = defaultdict(int)  # Other channel word counts
        
#         # Count words in each corpus
#         for text in texts1:
#             for word in self.process_text(text):
#                 counts1[word] += 1
                
#         for text in texts2:
#             for word in self.process_text(text):
#                 counts2[word] += 1
        
#         # Skip if either corpus is empty
#         if not counts1 or not counts2:
#             return pd.DataFrame()
        
#         # Calculate statistics
#         results = []
#         n1 = sum(counts1.values())  # Total words in BGC
#         n2 = sum(counts2.values())  # Total words in other channel
#         vocab = set(counts1.keys()) | set(counts2.keys())
        
#         # Calculate z-scores for each word
#         for word in vocab:
#             # Add smoothing
#             c1 = counts1[word] + self.prior
#             c2 = counts2[word] + self.prior
            
#             # Calculate rates and variance
#             rate1 = c1 / (n1 + len(vocab) * self.prior)
#             rate2 = c2 / (n2 + len(vocab) * self.prior)
#             var = (1/c1) + (1/c2)
            
#             if var > 0:
#                 # Calculate z-score using Monroe et al.'s method
#                 delta = np.log(rate1) - np.log(rate2)
#                 zscore = delta / np.sqrt(var)
                
#                 results.append({
#                     'word': word,
#                     'z_score': zscore,
#                     'bgc_count': counts1[word],
#                     'other_count': counts2[word],
#                     'compared_channel': channel_name
#                 })
        
#         return pd.DataFrame(results) if results else pd.DataFrame()

# def analyze_bgc_comparisons(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
#     """
#     Analyze Black Girls Code against all other channels
#     Args:
#         df (pd.DataFrame): DataFrame with channel content
#     Returns:
#         Dict[str, pd.DataFrame]: Dictionary of comparison results per channel
#     """
#     # Create output directory
#     output_dir = 'black_girls_code_comparisons'
#     os.makedirs(output_dir, exist_ok=True)
    
#     # Get Black Girls Code texts
#     bgc_texts = df[df['channel_name'] == 'Black Girls Code']['preprocessed_content'].tolist()
#     if not bgc_texts:
#         raise ValueError("No Black Girls Code content found in dataset")
    
#     # Initialize analyzer and results
#     bgc = BlackGirlsCodeComparison()
#     results = {}
#     all_comparisons = []
    
#     # Compare with each channel
#     for channel in df['channel_name'].unique():
#         if channel != 'Black Girls Code':
#             try:
#                 # Get channel texts
#                 other_texts = df[df['channel_name'] == channel]['preprocessed_content'].tolist()
                
#                 # Calculate comparison
#                 comparison = bgc.calculate_zscores(bgc_texts, other_texts, channel)
                
#                 if not comparison.empty:
#                     # Sort by absolute z-score
#                     comparison['abs_z_score'] = comparison['z_score'].abs()
#                     comparison = comparison.sort_values('abs_z_score', ascending=False)
#                     comparison = comparison.drop('abs_z_score', axis=1)
                    
#                     # Save individual comparison
#                     safe_channel = "".join(x for x in channel if x.isalnum() or x in (' ', '-', '_'))
#                     filename = f'BGC_vs_{safe_channel}_comparison.csv'
#                     filepath = os.path.join(output_dir, filename)
#                     comparison.to_csv(filepath, index=False)
                    
#                     results[channel] = comparison
#                     all_comparisons.append(comparison)
                    
#                     # Print progress and top words
#                     print(f"\nCompleted comparison: Black Girls Code vs {channel}")
#                     if len(comparison) > 0:
#                         print("\nTop 10 words distinctive to Black Girls Code:")
#                         print(comparison[comparison['z_score'] > 0].head(10)[['word', 'z_score']])
#                         print(f"\nTop 10 words distinctive to {channel}:")
#                         print(comparison[comparison['z_score'] < 0].head(10)[['word', 'z_score']])
                
#             except Exception as e:
#                 print(f"Error comparing with {channel}: {str(e)}")
#                 continue
    
#     # Save combined results
#     if all_comparisons:
#         combined = pd.concat(all_comparisons, ignore_index=True)
#         combined.to_csv(os.path.join(output_dir, 'all_comparisons.csv'), index=False)
#         print("\nSaved combined comparisons file")
    
#     return results

# def main(df: pd.DataFrame):
#     """
#     Main analysis pipeline
#     Args:
#         df (pd.DataFrame): Input DataFrame with channel content
#     Returns:
#         Dict[str, pd.DataFrame]: Analysis results
#     """
#     # Verify data
#     if df['preprocessed_content'].isna().all():
#         raise ValueError("No valid content found in preprocessed_content column")
    
#     print("Starting Black Girls Code comparisons...")
#     results = analyze_bgc_comparisons(df)
#     print("Analysis complete!")
#     return results

# # Run the analysis
# if __name__ == "__main__":
#     results = main(df_metadata_transcript)

Starting Black Girls Code comparisons...

Completed comparison: Black Girls Code vs Codecademy

Top 10 words distinctive to Black Girls Code:
         word    z_score
8514    block  29.256177
11731   black  28.496958
3565     girl  23.372806
1872      art  20.231130
7842    woman  19.572805
7417     game  18.781732
5847   sprite  15.933149
4365     love  15.876108
7780   circle  15.269409
3390    level  14.633201

Top 10 words distinctive to Codecademy:
          word    z_score
6793      data -12.308662
6578      kind  -9.538185
11730      see  -9.142460
5425    number  -8.852147
6408       bit  -8.687348
12901   little  -7.682166
952      might  -7.563243
3751      list  -7.523329
8965      file  -7.345323
585    example  -7.170319

Completed comparison: Black Girls Code vs Coding for kids

Top 10 words distinctive to Black Girls Code:
          word    z_score
5003      know  11.941572
4403    people  11.850270
1008     block   9.649677
6474     black   9.379422
4109     thank   8.6