# Data Imports

In [1]:
# Core data libraries
import ast
import os
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd
from scipy import sparse

# Visualization basics
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download

# Interactive components
from ipywidgets import interact, widgets

# Analysis tools
from sklearn.manifold import TSNE
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer

# Display settings
import plotly.io as pio
pio.renderers.default = 'notebook'

# run a virtual environment of Python 3.9.0 .venv\Scripts\activate

In [2]:
# Load the metadata/transcript table from CSV (used here in place of the pickle file).
df_metadata_transcript = pd.read_csv('clean_metadata_transcript.csv')

# NOTE: if the preprocessed_content column contains string representations of lists,
# it has to be converted back to actual lists before any token-level analysis.
# The conversion is left disabled in this cell:
# df_metadata_transcript['preprocessed_content'] = df_metadata_transcript['preprocessed_content'].apply(eval)

# We're going to create a series of visualizations that allow us to better sift through the data and do our own analysis:

## 1. Topical Focus -- Topic Distribution Dashboard
Leverage the Topic Distribution Dashboard to reveal:
* Channel specializations
* Common themes across different teaching styles
* Educational approach variations

## 2. Channel Relationships -- Log-Odds Ratio Comparison
Utilize Log-Odds comparisons to demonstrate:
* Teaching style differences
* Target audience variations
* Technical depth variations

## 3. Educational Approaches -- NLP Metrics Channel Comparison
Utilize TF-IDF, Normalized TF-IDF, LLR, and Fightin' Words to identify:
* Teaching style differences
* Complexity level variations
* Target audience adaptations

## For preliminary results, we compare Black Girls Code (identity-oriented) against Code.org (identity-agnostic).

# 1. Topic Distribution Dashboard
The x-axis seems to run from "Technical Content" on the left to "Community Focus" on the right, suggesting a contrast between theoretical topics in CS and practical application; the y-axis seems to run from "Beginner Topics" at the bottom to "Advanced Topics" at the top. Unfortunately, BERTopic won't allow us to directly edit the axis labels for readability.

We can see this in the comparison between BGC and Code.org: on the distance map, BGC trends towards the bottom-right while Code.org trends towards the top-left. You can verify the pattern by looking at the table of topic terms on the right, where we see that BGC is more community-focused than Code.org, which seems to focus more on teaching computer science.

In [None]:
def load_channel_models(base_dir='./channel_topic_analysis'):
    """Load the pickled topic model and topic distributions of every channel.

    Each sub-directory of ``base_dir`` is treated as one channel and must
    contain ``topic_model.pkl`` and ``topic_distributions.pkl``. A channel is
    registered only when BOTH files load and the model exposes
    ``topic_embeddings_``, so the two returned dictionaries always share the
    same keys. Channels that cannot be loaded are reported with ``print`` and
    skipped.

    Args:
        base_dir: Directory holding one sub-directory per channel.

    Returns:
        ``(channel_models, topic_distributions)``: two dictionaries keyed by
        the channel's directory name.

    Raises:
        ValueError: If ``base_dir`` does not exist, or if no channel could be
            loaded from it.
    """
    if not os.path.exists(base_dir):
        raise ValueError(f"Directory {base_dir} does not exist")

    channel_models = {}
    topic_distributions = {}

    for channel_dir in os.listdir(base_dir):
        channel_path = os.path.join(base_dir, channel_dir)
        if not os.path.isdir(channel_path):
            continue

        model_path = os.path.join(channel_path, 'topic_model.pkl')
        dist_path = os.path.join(channel_path, 'topic_distributions.pkl')
        if not os.path.exists(model_path) or not os.path.exists(dist_path):
            print(f"Missing files for {channel_dir}")
            continue

        # SECURITY NOTE: pickle.load can execute arbitrary code. Only point
        # base_dir at files produced by a trusted pipeline.
        try:
            with open(model_path, 'rb') as f:
                model = pickle.load(f)
        except Exception as e:
            print(f"Model loading error for {channel_dir}: {str(e)}")
            continue

        # The dashboard needs the topic embeddings; reject anything without them.
        if not hasattr(model, 'topic_embeddings_'):
            print(f"Invalid model format for {channel_dir}")
            continue

        try:
            with open(dist_path, 'rb') as f:
                distributions = pickle.load(f)
        except Exception as e:
            print(f"Error loading {channel_dir}: {str(e)}")
            continue

        # Register the channel only once both artefacts are in hand, so a
        # broken distributions file can no longer leave a model without its
        # matching distributions entry.
        channel_models[channel_dir] = model
        topic_distributions[channel_dir] = distributions

    if not channel_models:
        raise ValueError("No valid models found in directory")

    return channel_models, topic_distributions

# Load every channel's topic model and topic distributions from the default directory.
channel_models, channel_distributions = load_channel_models()
# NOTE(review): load_channel_models() is called again at the end of this cell, right
# before the dashboard is built, so this earlier call looks redundant (its
# "Missing files" messages are printed twice) - confirm whether
# `channel_distributions` is still needed anywhere.

def _topic_terms(model, topic_id):
    """Return the (term, weight) pairs of ``topic_id``, or [] when the model has none."""
    terms = model.get_topic(topic_id)
    return terms if isinstance(terms, list) else []


def _topic_label(model, topic_id, terms, separator):
    """Return the topic's custom label, else its top three terms joined by ``separator``."""
    custom_labels = getattr(model, 'custom_labels_', None)
    # custom_labels_ may exist but be None; a membership test on None would raise.
    if custom_labels is not None and topic_id in custom_labels:
        return custom_labels[topic_id]
    if not terms:
        return f"Topic {topic_id}"
    return separator.join(word for word, _ in terms[:3])


def create_topic_distribution_dashboard(channel_models, topic_distributions, selected_channels=None, n_topics=10):
    """Build a figure with an intertopic distance map and a table of topic terms.

    The left panel projects each selected channel's ``topic_embeddings_`` to 2-D
    with t-SNE (one scatter trace per channel). The right panel lists, per
    topic, its label, top five terms and their weights.

    Args:
        channel_models: Mapping of channel name to a fitted topic model exposing
            ``topic_embeddings_``, ``get_topic`` and ``get_topic_freq``.
        topic_distributions: Accepted for interface compatibility; not read here.
        selected_channels: A channel name, a sequence of channel names, or
            ``None``/empty. ``None``, empty, or the name ``'All'`` selects every
            channel in ``channel_models``.
        n_topics: Accepted for interface compatibility; not read here.

    Returns:
        The assembled Plotly figure.

    Raises:
        KeyError: If a selected channel is not a key of ``channel_models``.
    """
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Intertopic Distance Map', 'Topic Terms'),
        specs=[[{'type': 'scatter'}, {'type': 'table'}]],
        horizontal_spacing=0.1
    )

    colors = px.colors.qualitative.Set3

    if isinstance(selected_channels, str):
        requested = [selected_channels]
    elif selected_channels:
        requested = list(selected_channels)
    else:
        requested = ['All']

    # 'All' means every loaded channel. Previously it showed only the first
    # model in the dictionary while labelling it "Combined".
    channels_to_process = []
    for channel in requested:
        expanded = list(channel_models) if channel == 'All' else [channel]
        for name in expanded:
            if name not in channels_to_process:
                channels_to_process.append(name)

    for idx, channel in enumerate(channels_to_process):
        model = channel_models[channel]
        if not hasattr(model, 'topic_embeddings_'):
            continue

        coords = model.topic_embeddings_
        n_points = coords.shape[0]
        if n_points < 2:
            # t-SNE needs perplexity < number of samples, which is impossible
            # for fewer than two topics; place the lone topic at the origin.
            coords_2d = np.zeros((n_points, 2))
        else:
            tsne = TSNE(
                n_components=2,
                random_state=42,
                perplexity=min(30, n_points - 1)
            )
            coords_2d = tsne.fit_transform(coords)

        point_labels = [
            _topic_label(model, topic_id, _topic_terms(model, topic_id), ' | ')
            for topic_id in range(n_points)
        ]

        fig.add_trace(
            go.Scatter(
                x=coords_2d[:, 0],
                y=coords_2d[:, 1],
                mode='markers+text',
                text=point_labels,
                textposition="top center",
                name=channel,
                marker=dict(
                    size=10,
                    color=colors[idx % len(colors)],
                    line=dict(width=1)
                ),
                hovertemplate='<b>%{text}</b><br>Channel: ' + channel + '<extra></extra>'
            ),
            row=1, col=1
        )

    # One table row per topic that actually has terms.
    all_topics_terms = []
    for channel in channels_to_process:
        model = channel_models[channel]

        for topic_id in range(len(model.get_topic_freq())):
            try:
                terms = _topic_terms(model, topic_id)
                if not terms:
                    continue
                topic_label = _topic_label(model, topic_id, terms, ', ')
                terms_string = ', '.join([term for term, _ in terms[:5]])
                weight_string = ', '.join([f"{weight:.3f}" for _, weight in terms[:5]])
                all_topics_terms.append([
                    f"{channel} - {topic_label}",
                    terms_string,
                    weight_string
                ])
            except (TypeError, IndexError):
                # Best effort: a malformed topic must not break the whole table.
                continue

    fig.add_trace(
        go.Table(
            header=dict(
                values=['Topic', 'Top Terms', 'Weights'],
                font=dict(size=12, color='white'),
                fill_color='rgb(55, 83, 109)'
            ),
            cells=dict(
                # Transpose rows into the column-wise layout go.Table expects.
                values=list(zip(*all_topics_terms)),
                font=dict(size=11),
                align=['left'] * 3
            )
        ),
        row=1, col=2
    )

    fig.update_layout(
        height=800,
        width=1400,
        showlegend=True,
        legend_title_text="Channels",
        title_text="Channel Topic Comparison Dashboard",
        title_x=0.5,
        template="plotly_white",
        margin=dict(t=100, b=50, l=50, r=50)
    )

    return fig

def create_interactive_dashboard(channel_models, topic_distributions):
    """Display a multi-select channel picker wired to the topic dashboard.

    Args:
        channel_models: Mapping of channel name to topic model.
        topic_distributions: Passed through unchanged to
            ``create_topic_distribution_dashboard``.

    Raises:
        ValueError: If ``channel_models`` is empty.
    """
    if not channel_models:
        raise ValueError("No channel models provided")

    # 'All' is offered first and pre-selected.
    channel_selector = widgets.SelectMultiple(
        options=['All', *channel_models],
        value=['All'],
        description='Channels:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='50%', height='100px')
    )

    def render(c):
        # Selecting 'All' (alone or with others) means "no filter".
        selected = None if 'All' in c else list(c)
        return create_topic_distribution_dashboard(
            channel_models,
            topic_distributions,
            selected
        )

    interact(render, c=channel_selector)

def verify_models(channel_models):
    """Print, for each channel, which attributes the dashboard needs are present.

    Args:
        channel_models: Mapping of channel name to a loaded model object.
    """
    for channel in channel_models:
        model = channel_models[channel]
        has_embeddings = hasattr(model, 'topic_embeddings_')
        has_get_topic = hasattr(model, 'get_topic')
        has_topic_freq = hasattr(model, 'get_topic_freq')

        print(f"\nVerifying {channel}:")
        print(f"  Has topic_embeddings_: {has_embeddings}")
        print(f"  Has get_topic method: {has_get_topic}")
        print(f"  Has get_topic_freq method: {has_topic_freq}")

        # The topic count is only known when the embeddings are available.
        if not has_embeddings:
            continue
        print(f"  Number of topics: {len(model.topic_embeddings_)}")

# Load the per-channel models, print a sanity report for each one, then
# display the interactive topic dashboard.
channel_models, topic_distributions = load_channel_models()
verify_models(channel_models)
create_interactive_dashboard(channel_models, topic_distributions)

Missing files for Coding with Mat
Missing files for Computer World Kids
Missing files for Low Level Learning
Missing files for Coding with Mat
Missing files for Computer World Kids
Missing files for Low Level Learning

Verifying AlvinBlox:
  Has topic_embeddings_: True
  Has get_topic method: True
  Has get_topic_freq method: True
  Number of topics: 5

Verifying Black Girls Code:
  Has topic_embeddings_: True
  Has get_topic method: True
  Has get_topic_freq method: True
  Number of topics: 5

Verifying Brackeys:
  Has topic_embeddings_: True
  Has get_topic method: True
  Has get_topic_freq method: True
  Number of topics: 5

Verifying Bro Code:
  Has topic_embeddings_: True
  Has get_topic method: True
  Has get_topic_freq method: True
  Number of topics: 5

Verifying Cave of Programming:
  Has topic_embeddings_: True
  Has get_topic method: True
  Has get_topic_freq method: True
  Number of topics: 5

Verifying Clear Code:
  Has topic_embeddings_: True
  Has get_topic method: True


interactive(children=(SelectMultiple(description='Channels:', index=(0,), layout=Layout(height='100px', width=…

# 2. Create Log-Odds Visualization
A more granular analysis of the words most characteristic of each channel, compared across channels. It's interesting that BGC, compared to Code.org, uniquely uses words like "afro" and "afrotech". However, because Log-Odds draws out the most characteristic words of a channel, it also draws out the most unique words. Some of these words are incredibly specific to their channel, and I think I'd have to watch the video associated with the word to understand the context.

In [6]:
def load_log_odds_data(directory='Log_Odds_Analysis'):
    """Load every ``<channel>_log_odds_tfd.csv`` file found in ``directory``.

    Files are read in sorted filename order so the resulting dictionary (and
    therefore the dashboard's default selection) is the same on every run and
    platform; ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        directory: Folder containing the per-channel CSV files. Each file must
            have ``term``, ``log_odds`` and ``tfd`` columns.

    Returns:
        Mapping of channel name to a dict with the NumPy arrays ``'log_odds'``,
        ``'tfd'`` and ``'terms'`` (row-aligned).
    """
    suffix = '_log_odds_tfd.csv'
    log_odds_results = {}

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(suffix):
            continue

        # Strip only the trailing suffix; str.replace would also remove the
        # same text from the middle of a channel name.
        channel_name = filename[:-len(suffix)]

        # NOTE(review): with read_csv defaults, terms such as "null" or "nan"
        # may be parsed as missing values - confirm against the exported files.
        df = pd.read_csv(os.path.join(directory, filename), index_col='term')

        log_odds_results[channel_name] = {
            'log_odds': df['log_odds'].values,
            'tfd': df['tfd'].values,
            'terms': df.index.values
        }

    return log_odds_results

def create_log_odds_visualization(log_odds_results, n_terms=20):
    """Build a two-panel figure of each channel's most extreme log-odds terms.

    Left panel: scatter of term frequency ("tfd") against log-odds for the
    ``n_terms`` terms with the largest absolute log-odds. Right panel:
    horizontal bars for the ten terms with the largest absolute log-odds.

    Args:
        log_odds_results: Mapping of channel name to a dict with row-aligned
            ``'log_odds'``, ``'tfd'`` and ``'terms'`` sequences.
        n_terms: Number of terms per channel shown in the scatter panel.

    Returns:
        The assembled Plotly figure.
    """
    # Qualitative palette for up to 12 channels, sampled viridis beyond that.
    channel_count = len(log_odds_results)
    if channel_count > 12:
        palette = px.colors.sample_colorscale('viridis', channel_count)
    else:
        palette = px.colors.qualitative.Set3[:channel_count]
    color_for = dict(zip(log_odds_results.keys(), palette))

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=('Channel Term Comparison', 'Top Distinctive Terms'),
        specs=[[{'type': 'scatter'}, {'type': 'bar'}]],
        horizontal_spacing=0.2,
    )

    for channel, data in log_odds_results.items():
        channel_color = color_for.get(channel, '#666666')

        # Rank terms by the magnitude of their log-odds, largest first.
        ranked = pd.DataFrame(
            {column: data[column] for column in ('log_odds', 'tfd', 'terms')}
        ).sort_values('log_odds', key=abs, ascending=False)

        scatter_rows = ranked.head(n_terms)
        scatter_trace = go.Scatter(
            x=scatter_rows['tfd'],
            y=scatter_rows['log_odds'],
            mode='markers+text',
            name=channel,
            text=scatter_rows['terms'],
            textposition="top center",
            marker=dict(size=10, color=channel_color),
        )
        fig.add_trace(scatter_trace, row=1, col=1)

        bar_rows = ranked.head(10)
        bar_trace = go.Bar(
            x=bar_rows['log_odds'],
            y=bar_rows['terms'],
            orientation='h',
            name=f"{channel} Top Terms",
            marker_color=channel_color,
        )
        fig.add_trace(bar_trace, row=1, col=2)

    fig.update_layout(
        height=800,
        width=1400,
        showlegend=True,
        title_text="Channel Content Analysis",
        template="plotly_white",
    )

    # (x title, y title) for each subplot column.
    axis_titles = {
        1: ("Term Frequency Distribution", "Log-Odds Ratio"),
        2: ("Log-Odds Ratio", "Terms"),
    }
    for col, (x_title, y_title) in axis_titles.items():
        fig.update_xaxes(title_text=x_title, row=1, col=col)
        fig.update_yaxes(title_text=y_title, row=1, col=col)

    return fig

# Add interactive controls
def create_interactive_log_odds_dashboard(log_odds_results):
    """Display a channel picker and a term-count slider driving the log-odds figure.

    This single definition replaces two identical copies of the function.

    Args:
        log_odds_results: Mapping of channel name to log-odds data, as accepted
            by ``create_log_odds_visualization``.

    Raises:
        ValueError: If ``log_odds_results`` is empty (there would be no channel
            to pre-select).
    """
    if not log_odds_results:
        raise ValueError("No log-odds results provided")

    channel_names = list(log_odds_results.keys())

    instructions = widgets.HTML(
        value="<p style='margin-left:15px; padding-top:5px'><i>Hold Shift to select multiple consecutive channels<br>or click individual channels to toggle selection</i></p>"
    )

    # The first channel is pre-selected so the figure is never empty on load.
    channel_selector = widgets.SelectMultiple(
        options=channel_names,
        value=[channel_names[0]],
        description='Channels:',
        layout=widgets.Layout(width='250px', height='200px'),
        rows=10,
        style={'description_width': 'initial'}
    )

    terms_slider = widgets.IntSlider(
        value=20,
        min=5,
        max=50,
        step=5,
        description='Top Terms:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    # NOTE(review): this layout (including the instructions text) is built but
    # never displayed; interact() below renders its own copy of the controls.
    # Confirm whether the instructions were meant to be shown.
    controls = widgets.VBox([
        widgets.HBox([channel_selector, instructions]),
        terms_slider
    ])

    def update_viz(channels, n_terms):
        selected_results = {k: log_odds_results[k] for k in channels}
        return create_log_odds_visualization(selected_results, n_terms)

    interact(update_viz, channels=channel_selector, n_terms=terms_slider)

# Read the per-channel log-odds CSVs from the default directory and display
# the interactive log-odds dashboard for them.
log_odds_results = load_log_odds_data()
create_interactive_log_odds_dashboard(log_odds_results)

interactive(children=(SelectMultiple(description='Channels:', index=(0,), layout=Layout(height='200px', width=…

# 3. NLP Metrics Channel Comparison

## Interesting Trends:

### Log-Likelihood Ratio
For Black Girls Code (BGC), there is a strong focus on identity markers, with the words “girl”, “black”, “woman”, and “afro” being amongst the top LLR-scored words.

Code.org LLR scores produced words related to academics, specifically computer science education, with words like “computer”, “science”, “school”, and “education” being top words. BGC also has some of these terms, but it seems that they’re more characteristic for Code.org.

### Fightin' Words
By the z-scores, it looks like BGC compared to Code.org distinctively has terms relating to identity and empowerment (“black”, z=14.171; “girl”, z=14.051) and some technical programming words (“script”, z=7.644; “block”, z=7.869). 

If you switch the comparison to Channel 1: Code.org and Channel 2: BGC, there seems to be a strong emphasis on formal computer science education for Code.org (“computer”, z=-49.91; “student”, z=-46.431; “science”, z=-41.637; and “school”, z=-22.336).

In [5]:
try:
    # Try loading pickle first
    df_metadata_transcript = pd.read_pickle('clean_metadata_transcript.pkl')
except Exception as exc:
    # `except Exception` (rather than a bare `except:`) still covers a missing
    # or unreadable pickle but lets KeyboardInterrupt/SystemExit propagate.
    print(f"Could not load pickle ({exc!r}); falling back to CSV")
    # If pickle fails, load CSV and convert preprocessed_content back to lists
    df_metadata_transcript = pd.read_csv('clean_metadata_transcript.csv')
    # literal_eval only parses Python literals, so a malformed or malicious
    # cell cannot execute code the way eval() would.
    df_metadata_transcript['preprocessed_content'] = (
        df_metadata_transcript['preprocessed_content'].apply(ast.literal_eval)
    )

# TF-IDF calculation function
def calculate_tfidf_comparison(texts1, texts2):
    """Compare two channels by the mean TF-IDF weight of every term.

    Args:
        texts1: Token lists (one per document) of the first channel.
        texts2: Token lists (one per document) of the second channel.

    Returns:
        DataFrame with columns ``word``, ``tfidf_diff`` (channel 1 minus
        channel 2), ``channel1_tfidf`` and ``channel2_tfidf``, sorted by
        ``tfidf_diff`` in descending order.
    """
    # The vectorizer works on strings, so re-join each document's tokens.
    documents = [' '.join(tokens) for tokens in texts1 + texts2]

    # Fit on both channels together so they share one vocabulary.
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(documents)

    # Rows before `split` belong to channel 1, the remainder to channel 2.
    split = len(texts1)
    mean_first = np.mean(weights[:split].toarray(), axis=0)
    mean_second = np.mean(weights[split:].toarray(), axis=0)

    vocabulary = vectorizer.get_feature_names_out()
    records = [
        {
            'word': term,
            'tfidf_diff': first - second,
            'channel1_tfidf': first,
            'channel2_tfidf': second,
        }
        for term, first, second in zip(vocabulary, mean_first, mean_second)
    ]

    comparison = pd.DataFrame(records)
    return comparison.sort_values('tfidf_diff', ascending=False)

# Row-normalization helper used by the normalized TF-IDF comparison
def normalize_sparse_matrix(matrix):
    """Scale every row of a sparse matrix in place so that it sums to 1.

    Rows whose sum is 0 are left unchanged. The matrix's stored values are
    modified in place and the same object is returned.

    Args:
        matrix: SciPy sparse matrix with a floating-point ``data`` array.

    Returns:
        ``matrix`` itself, after normalization.
    """
    row_sums = np.array(matrix.sum(axis=1)).ravel()
    row_sums[row_sums == 0] = 1  # Avoid division by zero
    # Row index of every STORED entry, aligned with matrix.data. Unlike
    # matrix.nonzero(), this also covers explicitly stored zeros, which
    # otherwise make the index array shorter than matrix.data.
    entry_rows = matrix.tocoo().row
    matrix.data /= row_sums[entry_rows]
    return matrix


def calculate_normalized_tfidf_comparison(texts1, texts2):
    """Compare two channels by the mean row-normalized TF-IDF weight of every term.

    Each document's TF-IDF vector is rescaled to sum to 1 before averaging.
    This is the single effective definition: an earlier duplicate of this
    function (using ``TfidfVectorizer(norm='l1')``) was shadowed by this one
    and has been removed.

    Args:
        texts1: Token lists (one per document) of the first channel.
        texts2: Token lists (one per document) of the second channel.

    Returns:
        DataFrame with columns ``word``, ``norm_tfidf_diff`` (channel 1 minus
        channel 2), ``channel1_norm_tfidf`` and ``channel2_norm_tfidf``, sorted
        by ``norm_tfidf_diff`` in descending order.
    """
    # Join tokens back into strings
    all_texts = [' '.join(text) for text in texts1 + texts2]

    # The token pattern keeps single-character tokens, which the vectorizer's
    # default pattern would drop.
    tfidf_vectorizer = TfidfVectorizer(
        min_df=1,
        max_df=1.0,
        token_pattern=r'(?u)\b\w+\b',
        stop_words=None,
        dtype=np.float32
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

    # Normalize the matrix (in place) so every document row sums to 1
    normalized_matrix = normalize_sparse_matrix(tfidf_matrix)

    # Rows before n1 belong to channel 1, the remainder to channel 2
    n1 = len(texts1)
    channel1_norm = np.mean(normalized_matrix[:n1].toarray(), axis=0)
    channel2_norm = np.mean(normalized_matrix[n1:].toarray(), axis=0)

    results = [
        {
            'word': term,
            'norm_tfidf_diff': first - second,
            'channel1_norm_tfidf': first,
            'channel2_norm_tfidf': second,
        }
        for term, first, second in zip(
            tfidf_vectorizer.get_feature_names_out(), channel1_norm, channel2_norm
        )
    ]

    return pd.DataFrame(results).sort_values('norm_tfidf_diff', ascending=False)

# LLR calculation function
def calculate_llr(count1, count2, n1, n2):
    """Return the log-likelihood ratio of a word's counts in two corpora.

    Args:
        count1: Occurrences of the word in corpus 1.
        count2: Occurrences of the word in corpus 2.
        n1: Total number of tokens in corpus 1.
        n2: Total number of tokens in corpus 2.

    Returns:
        ``2 * (count1 * ln(count1 / e1) + count2 * ln(count2 / e2))`` where
        ``e1`` and ``e2`` are the counts expected if the word were equally
        frequent in both corpora. The score carries no sign: it does not say
        which corpus over-uses the word.
    """
    # Tiny smoothing constant keeping every ratio and logarithm finite.
    epsilon = 1e-10

    total_tokens = n1 + n2
    pooled_count = count1 + count2 + epsilon

    # Expected counts under the "same relative frequency" hypothesis.
    expected1 = n1 * pooled_count / total_tokens
    expected2 = n2 * pooled_count / total_tokens

    term1 = count1 * np.log((count1 + epsilon) / (expected1 + epsilon))
    term2 = count2 * np.log((count2 + epsilon) / (expected2 + epsilon))
    return 2 * (term1 + term2)

# Fighting Words calculation function
def calculate_fighting_words(texts1, texts2, prior=0.01):
    """Score every word by a smoothed two-proportion z-statistic between two corpora.

    For each word the smoothed relative frequencies ``p1`` and ``p2`` are
    compared as ``z = (p1 - p2) / se``; positive scores mark words that are
    more frequent in corpus 1.

    NOTE(review): this is a smoothed difference-of-proportions z-score; whether
    it is meant to reproduce the published "Fightin' Words" statistic exactly
    should be confirmed.

    Args:
        texts1: Token lists (one per document) of the first channel.
        texts2: Token lists (one per document) of the second channel.
        prior: Additive smoothing applied to every word count.

    Returns:
        DataFrame with columns ``word``, ``z_score``, ``count_channel1`` and
        ``count_channel2``, sorted by ``z_score`` in descending order. It is
        empty (with the same columns) when either corpus has no tokens, since
        the statistic is undefined there.
    """
    columns = ['word', 'z_score', 'count_channel1', 'count_channel2']

    counts1 = Counter(word for text in texts1 for word in text)
    counts2 = Counter(word for text in texts2 for word in text)
    total1 = sum(counts1.values())
    total2 = sum(counts2.values())

    # With an empty corpus the standard error would divide by zero.
    if total1 == 0 or total2 == 0:
        return pd.DataFrame(columns=columns)

    results = []
    for word in counts1.keys() | counts2.keys():
        count1 = counts1[word]
        count2 = counts2[word]

        p1 = (count1 + prior) / (total1 + 2 * prior)
        p2 = (count2 + prior) / (total2 + 2 * prior)

        se = np.sqrt(p1 * (1 - p1) / total1 + p2 * (1 - p2) / total2)
        z_score = (p1 - p2) / se

        results.append({
            'word': word,
            'z_score': z_score,
            'count_channel1': count1,
            'count_channel2': count2
        })

    return pd.DataFrame(results, columns=columns).sort_values('z_score', ascending=False)

# Comparison function
def compare_channels(channel1, channel2, method='llr'):
    """Compare the vocabulary of two channels with the chosen metric.

    Reads the module-level ``df_metadata_transcript`` table, using its
    ``channel_name`` and ``preprocessed_content`` columns.

    Args:
        channel1: Name of the first channel.
        channel2: Name of the second channel.
        method: One of ``'tfidf'``, ``'normalized_tfidf'``, ``'llr'`` or
            ``'fighting_words'``.

    Returns:
        DataFrame of per-word scores; the column names depend on ``method``.

    Raises:
        ValueError: If ``method`` is not one of the supported names. Previously
            any unrecognised name silently ran the Fighting Words comparison.
    """
    channel_column = df_metadata_transcript['channel_name']
    texts1 = df_metadata_transcript.loc[channel_column == channel1, 'preprocessed_content'].tolist()
    texts2 = df_metadata_transcript.loc[channel_column == channel2, 'preprocessed_content'].tolist()

    if method == 'tfidf':
        return calculate_tfidf_comparison(texts1, texts2)
    if method == 'normalized_tfidf':
        return calculate_normalized_tfidf_comparison(texts1, texts2)
    if method == 'fighting_words':
        return calculate_fighting_words(texts1, texts2)
    if method != 'llr':
        raise ValueError(
            f"Unknown method {method!r}; expected 'tfidf', 'normalized_tfidf', "
            "'llr' or 'fighting_words'"
        )

    counts1 = Counter(word for text in texts1 for word in text)
    counts2 = Counter(word for text in texts2 for word in text)
    total1 = sum(counts1.values())
    total2 = sum(counts2.values())

    results = [
        {
            'word': word,
            'llr_score': calculate_llr(counts1[word], counts2[word], total1, total2),
            'count_channel1': counts1[word],
            'count_channel2': counts2[word]
        }
        for word in counts1.keys() | counts2.keys()
    ]

    # Explicit columns keep sort_values working when there are no words at all.
    columns = ['word', 'llr_score', 'count_channel1', 'count_channel2']
    return pd.DataFrame(results, columns=columns).sort_values('llr_score', ascending=False)

# Update visualization function
def create_comparison_dashboard():
    """Display two channel dropdowns and a method selector driving a score table.

    Reads the module-level ``df_metadata_transcript`` table for the list of
    channels and delegates the scoring to ``compare_channels``.
    """
    # UI label -> (compare_channels method key, score column,
    #              channel-1 value column, channel-2 value column)
    method_specs = {
        'TF-IDF': ('tfidf', 'tfidf_diff', 'channel1_tfidf', 'channel2_tfidf'),
        'Normalized TF-IDF': ('normalized_tfidf', 'norm_tfidf_diff',
                              'channel1_norm_tfidf', 'channel2_norm_tfidf'),
        'LLR': ('llr', 'llr_score', 'count_channel1', 'count_channel2'),
        'Fighting Words': ('fighting_words', 'z_score', 'count_channel1', 'count_channel2')
    }

    # The default must be a UI label: the previous default 'llr' was not a key
    # of the label mapping and raised KeyError when the argument was omitted.
    def update_comparison(channel1, channel2, method='LLR'):
        method_key, score_column, count_col1, count_col2 = method_specs[method]

        # Get comparison data
        comparison_data = compare_channels(channel1, channel2, method_key)

        fig = go.Figure(data=[
            go.Table(
                header=dict(
                    values=['Word', f'{method} Score',
                           f'{channel1} Score', f'{channel2} Score'],
                    font=dict(size=12),
                    align='left'
                ),
                cells=dict(
                    values=[
                        comparison_data['word'],
                        comparison_data[score_column].round(3),
                        comparison_data[count_col1].round(3),
                        comparison_data[count_col2].round(3)
                    ],
                    font=dict(size=11),
                    align='left'
                )
            )
        ])

        fig.update_layout(
            height=600,
            width=1000,
            title_text=f'{method} Comparison: {channel1} vs {channel2}'
        )

        return fig

    # Create widgets
    channels = df_metadata_transcript['channel_name'].unique()

    channel_selector1 = widgets.Dropdown(
        options=channels,
        description='Channel 1'
    )

    channel_selector2 = widgets.Dropdown(
        options=channels,
        description='Channel 2'
    )

    method_selector = widgets.RadioButtons(
        options=list(method_specs),
        description='Method'
    )

    interact(
        update_comparison,
        channel1=channel_selector1,
        channel2=channel_selector2,
        method=method_selector
    )

# Display the interactive NLP-metric comparison dashboard.
create_comparison_dashboard()

interactive(children=(Dropdown(description='Channel 1', options=('Black Girls Code', 'Codecademy', 'Coding for…