# Setup

## Load in Packages

In [1]:
# import libraries to navigate within directories
import os
import sys
import re

# the usual suspects
import pandas as pd
import numpy as np

# Jupyter-specific imports
from IPython.display import clear_output
%matplotlib inline 

# Suppress warnings (optional)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Access Local Git Repo


In [2]:
# Directory & file hierarchy.
# Every path is derived from the notebook's working directory: the project
# root is taken to be the parent of the current working directory.
proj_dir = os.path.abspath('../')
analysis_dir = os.getcwd()
data_dir = os.path.join(proj_dir,'data')
transcript_dir = os.path.join(data_dir, 'transcripts')
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')

# Add helpers to python path (guarded so re-running the cell does not append
# the same entry again)
helpers_dir = os.path.join(proj_dir,'analysis','python')
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

# Create the output folders. exist_ok=True makes the call a no-op when the
# folder is already there, which removes the separate exists() check and the
# window between checking and creating.
for out_dir in (results_dir, plot_dir, csv_dir):
    os.makedirs(out_dir, exist_ok=True)

## Import Metadata and Transcript data from repo

### Import Metadata

In [3]:
# Location of the data directory, two levels above the working directory.
# NOTE(review): this rebinds `data_dir`, which the directory-setup cell built
# from proj_dir ('../' + 'data'); the two paths differ — confirm which one is
# intended.
data_dir = os.path.join("..", "..", "data") # go up two directory levels

# Load the metadata for all channels and tidy it in one pass:
#   * discard the stray 'Unnamed: 9' column when the CSV contains it
#   * store the tier as an integer
#   * move the 'Black Girls Code' rows to the top
all_channels = (
    pd.read_csv(os.path.join(data_dir,"master_metadata.csv"))
    .drop(columns=['Unnamed: 9'], errors='ignore')
    .astype({'Tier (1, 2, or 3)': int})
    .sort_values(
        by='channel_name',
        key=lambda names: names == 'Black Girls Code',
        ascending=[False]
    )
)

all_channels.head(3) # preview the first rows

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)"
14067,Black Girls Code,Have You Ever Thought About Working at Google?...,https://www.youtube.com/watch?v=ahQQpYG0Lhk,4/26/24,6:26,199330,5625,4,haveyoueverthoughtaboutworkingatgoogleepisode8...,1
14107,Black Girls Code,Tutorial 4: How to Code Animated Art In JavaSc...,https://www.youtube.com/watch?v=I49CXUIXvM8,4/28/23,11:18,84000,179,8,tutorial4howtocodeanimatedartinjavascriptcodea...,1
14080,Black Girls Code,CODE Along: Win $2500!,https://www.youtube.com/watch?v=fZKyNSH2mmM,8/25/23,2:19,139194,344,18,codealongwin2500.txt,1


### Import Transcripts

In [4]:
def load_txt_files_to_dataframe(root_folder):
    """Read every machine-readable transcript below ``root_folder`` into a DataFrame.

    The directory tree is walked recursively. Only folders whose path ends
    with ``_for_machines`` are considered, and inside them only files whose
    name ends with ``.txt`` are read (as UTF-8).

    Parameters
    ----------
    root_folder : str or path-like
        Directory at which the walk starts.

    Returns
    -------
    pandas.DataFrame
        One row per transcript file with the columns

        * ``transcript_filename`` -- file name (no directory part)
        * ``content`` -- full text of the file
        * ``channel_name_processed`` -- name of the containing folder with the
          ``_transcripts_for_machines`` suffix removed; used to tell apart
          videos from different channels that share a file name

        The three columns are present even when no transcript is found, so
        callers can merge on them without a ``KeyError``.
    """
    columns = ['transcript_filename', 'content', 'channel_name_processed']
    records = []

    # Traverse the directory tree
    for subdir, dirs, files in os.walk(root_folder):
        # os.walk yields entries in arbitrary order; sorting in place makes
        # the traversal (and therefore the row order) reproducible.
        dirs.sort()

        if not subdir.endswith("_for_machines"):
            continue

        # Extract the channel name (so transcripts can be properly matched in
        # cases of videos with the same name)
        channel_name = os.path.basename(subdir).replace('_transcripts_for_machines', '')

        # Process each .txt file in the folder
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue
            file_path = os.path.join(subdir, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            records.append({
                'transcript_filename': file,
                'content': content,
                'channel_name_processed': channel_name,
            })

    # Passing `columns` guarantees the schema even for an empty record list
    return pd.DataFrame(records, columns=columns)

# Read every machine-readable transcript found below data_dir
transcripts = load_txt_files_to_dataframe(root_folder=data_dir)

# Preview the first three rows
transcripts.head(n=3)

Unnamed: 0,transcript_filename,content,channel_name_processed
0,anewwaytomakemoneyonrobloxpremiumpayouts.txt,hello and welcome back to another video just w...,alvinblox
1,animationsrobloxscriptingtutorial.txt,how to create and play animations in Roblox st...,alvinblox
2,attributesrobloxscriptingtutorial.txt,in this video I'm going to show you how to use...,alvinblox


# Dataset Pre-processing: Merge Metadata and Transcript Data, then save the .csv

## Merge Metadata and Transcript data:

In [5]:
# Process the channel names in 'all_channels' so that they match the format of
# those in the 'transcripts' df (to allow for the merge): convert to lowercase
# and change ':' to '-' (needed for the kidicode ninja channel).
# regex=False makes the replacement a literal one regardless of pandas version
# (the default of `regex` differs between pandas 1.x and 2.x).
all_channels['channel_name_processed'] = (
    all_channels['channel_name'].str.lower().str.replace(':', '-', regex=False)
)

# Create a dataframe that merges metadata and transcript content.
# The merge key is (filename, channel name) because different channels can
# have videos with the same file name. validate='many_to_one' raises a
# MergeError if that key is duplicated in 'transcripts', instead of silently
# adding rows to the result.
df_metadata_transcript = pd.merge(
    all_channels,
    transcripts,
    on=['transcript_filename', 'channel_name_processed'],
    how='left',
    validate='many_to_one',
)

# Drop processed channel name (no longer relevant)
df_metadata_transcript = df_metadata_transcript.drop(columns=['channel_name_processed'])

# View merged df
df_metadata_transcript.head(5)

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content
0,Black Girls Code,Have You Ever Thought About Working at Google?...,https://www.youtube.com/watch?v=ahQQpYG0Lhk,4/26/24,6:26,199330,5625,4,haveyoueverthoughtaboutworkingatgoogleepisode8...,1,hello everyone and welcome back to codal along...
1,Black Girls Code,Tutorial 4: How to Code Animated Art In JavaSc...,https://www.youtube.com/watch?v=I49CXUIXvM8,4/28/23,11:18,84000,179,8,tutorial4howtocodeanimatedartinjavascriptcodea...,1,what's up for the encoders welcome back to the...
2,Black Girls Code,CODE Along: Win $2500!,https://www.youtube.com/watch?v=fZKyNSH2mmM,8/25/23,2:19,139194,344,18,codealongwin2500.txt,1,hey what's up everyone I'm Cheyenne and I'm yo...
3,Black Girls Code,Don't Know What Career to Choose? Actress and ...,https://www.youtube.com/watch?v=ohceCkLK8Wo,8/21/23,0:44,1143,40,2,dontknowwhatcareertochooseactressandgamerellab...,1,this question means so much to me because when...
4,Black Girls Code,Watch Actress Ella Balinska Recap Her Career i...,https://www.youtube.com/watch?v=jAYK6vJbhxU,8/19/23,0:48,3493,117,10,watchactressellabalinskarecaphercareerin60seco...,1,60 seconds okay so I started off uh with a cou...


## Ensure no rows have been lost or gained

Check that the merged df, the original transcripts df, and the original metadata df all have the same number of rows, and that this number equals the number of transcript files in the data folder.

In [6]:
# Count the files in every folder (below data_dir) whose path ends with
# 'for_humans'. All files are counted, whatever their extension.
total_files = sum(
    len(filenames)
    for folder, _, filenames in os.walk(data_dir)
    if folder.endswith('for_humans')
)

# The merged frame, both source frames and the file count must all agree
row_counts = {
    len(df_metadata_transcript),
    len(transcripts),
    len(all_channels),
    total_files,
}

if len(row_counts) == 1:
    print(f'All lengths the same! There are {len(df_metadata_transcript)} videos with transcripts.')
else:
    print('Discrepency detected. Check for merge errors.')

All lengths the same! There are 16206 videos with transcripts.


## Ensure that rows without transcripts are dropped:

In [7]:
# Drop rows that don't have transcripts
df_metadata_transcript = df_metadata_transcript.dropna(subset = 'content')
df_metadata_transcript.shape

(16206, 11)

## Save "df_metadata_transcript" as .csv

In [8]:
# Write the merged table to the current working directory, without the index
df_metadata_transcript.to_csv(path_or_buf='metadata_transcript.csv', index=False)

## Rows of the df with a wrong `transcript_filename` entry (TODO, Nora's note to self: delete this section once fixed)

In [9]:
# Metadata row(s) whose transcript file is 'loopswiththebee.txt'
all_channels.loc[all_channels['transcript_filename'].eq('loopswiththebee.txt')]

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",channel_name_processed
16205,Code.org,Loops with the Bee,https://www.youtube.com/watch?v=QseP5KHrc_k,10/2/15,0:31,477,1,0,loopswiththebee.txt,1,code.org


In [10]:
# Merged row(s) whose transcript file is 'loopswiththebee.txt'
df_metadata_transcript.loc[df_metadata_transcript['transcript_filename'].eq('loopswiththebee.txt')]

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content
16205,Code.org,Loops with the Bee,https://www.youtube.com/watch?v=QseP5KHrc_k,10/2/15,0:31,477,1,0,loopswiththebee.txt,1,A loop is a repetition of activity. Going into...


In [11]:
# Merged row(s) whose transcript file is 'thehourofcodeishere.txt'
df_metadata_transcript.loc[df_metadata_transcript['transcript_filename'].eq('thehourofcodeishere.txt')]

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content
5435,Code.org,The Hour of Code is Here,https://www.youtube.com/watch?v=pNiECaVMStY,12/7/15,0:57,57107,197,0,thehourofcodeishere.txt,1,"Hi, I'm Leigha, and I'm Tanya, and we're lucky..."


In [12]:
# Merged row(s) whose transcript file is 'sideprojectsaturday.txt'
df_metadata_transcript.loc[df_metadata_transcript['transcript_filename'].eq('sideprojectsaturday.txt')]

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content
5434,The Coding Train,Side Project Saturday!,https://www.youtube.com/watch?v=6dMRzNaNQz0,12/5/20,2:46:01,21818,609,13,sideprojectsaturday.txt,2,do do so do audio sound check now uh my audio ...


In [13]:
# Merged row(s) whose transcript file is 'howtofadebetweenscenesinunity.txt'
df_metadata_transcript.loc[df_metadata_transcript['transcript_filename'].eq('howtofadebetweenscenesinunity.txt')]

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content
5432,Brackeys,How to Fade Between Scenes in Unity,https://www.youtube.com/watch?v=Oadq-IrOazg,5/20/18,13:28,342868,8550,483,howtofadebetweenscenesinunity.txt,2,thanks for tuning in at braies hello everyone ...


In [14]:
# Merged row(s) whose transcript file is '21variablesinp5jsmousexmouseyp5jstutorial.txt'
df_metadata_transcript.loc[df_metadata_transcript['transcript_filename'].eq('21variablesinp5jsmousexmouseyp5jstutorial.txt')]

Unnamed: 0,channel_name,video_title,video_url,publish_date,video_length,views,likes,comments,transcript_filename,"Tier (1, 2, or 3)",content
5433,The Coding Train,"2.1: Variables in p5.js (mouseX, mouseY) - p5....",https://www.youtube.com/watch?v=7A5tKW9HGoM,5/19/21,10:41,103472,1894,78,21variablesinp5jsmousexmouseyp5jstutorial.txt,2,"Hello, variables, p5.js, JavaScript, I'm just ..."
