# Downloading Transcripts

This page downloads all transcripts that have not already been downloaded, processing and saving them for use later.

In [68]:
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import urllib
import urllib.request

In [69]:
def TranscriptFolder(section_no, subsection_no):
    return f'section{"{:03d}".format(section_no)}/subsection{"{:03d}".format(subsection_no)}'
    
def TranscriptFilePath(section_no, subsection_no, episode_no):
    return f'{TranscriptFolder(section_no, subsection_no)}/episode{"{:03d}".format(episode_no)}.csv'

In [67]:
def ParseLine(line):
    return [x.strip() for x in line.split(':', 1)] if ':' in line else ['NOSPEAKER', line]

def ProcessTranscript(transcript_url):
    with urllib.request.urlopen(transcript_url) as response:
        main_div = BeautifulSoup(response.read(), 'html.parser').find_all('div', {'class': 'mw-parser-output'})[0]
    current_section = None
    current_section_no = 0
    current_line_no = 0
    transcript = []
    for child in main_div.children:
        if child.name == "h2":
            current_section     = child.text[:-2]
            current_section_no += 1
            current_line_no     = 0
        elif child.name == "p" and current_section is not None:
            current_line_no += 1
            transcript.append([current_section_no, current_line_no, current_section, *ParseLine(child.text)])
    return pd.DataFrame(
        transcript,
        columns = ['section_no', 'line_no', 'section', 'speaker', 'line']
    )
    
def SaveTranscript(section_no, subsection_no, episode_no, transcript_url):
    Path(f'../../data/{TranscriptFolder(section_no, subsection_no)}').mkdir(parents=True, exist_ok=True)
    out_file = TranscriptFilePath(section_no, subsection_no, episode_no)
    ProcessTranscript(transcript_url).to_csv(f'../../data/{out_file}', index = False)
    return out_file

Read in the transcript database, containing episode transcript links and metadata. Make sure the new columns we need, `download_date` and `transcript_file`, are present.

In [71]:
transcript_df                         = pd.read_csv('../../data/transcript_database.csv')
transcript_df['download_date']        = pd.NaT#transcript_df.get('download_date', pd.NaT)
transcript_df['transcript_file']      = transcript_df.get('transcript_file', '')
transcript_df

Unnamed: 0,section_no,subsection_no,episode_no,section,subsection,episode,link,download_date,transcript_file
0,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival_at_Kraghammer,https://criticalrole.fandom.com/wiki/Arrival_a...,NaT,section001/subsection001/episode001.csv
1,1,1,2,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Into_the_Greyspine_Mines,https://criticalrole.fandom.com/wiki/Into_the_...,NaT,section001/subsection001/episode002.csv
2,1,1,3,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Strange_Bedfellows,https://criticalrole.fandom.com/wiki/Strange_B...,NaT,section001/subsection001/episode003.csv
3,1,1,4,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Attack_on_the_Duergar_Warcamp,https://criticalrole.fandom.com/wiki/Attack_on...,NaT,section001/subsection001/episode004.csv
4,1,1,5,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,The_Trick_about_Falling,https://criticalrole.fandom.com/wiki/The_Trick...,NaT,section001/subsection001/episode005.csv
...,...,...,...,...,...,...,...,...,...
464,6,1,13,Miscellaneous,Candela Obscura,Candela_Obscura_Live_-_The_Circle_of_the_Silve...,https://criticalrole.fandom.com/wiki/Candela_O...,NaT,section006/subsection001/episode013.csv
465,6,3,1,Miscellaneous,Undeadwood,"UnDeadwood_Part_I:_Stay_Close,_Reverend",https://criticalrole.fandom.com/wiki/UnDeadwoo...,NaT,section006/subsection003/episode001.csv
466,6,3,2,Miscellaneous,Undeadwood,UnDeadwood_Part_II:_God_Don%27t_Play_Cards,https://criticalrole.fandom.com/wiki/UnDeadwoo...,NaT,section006/subsection003/episode002.csv
467,6,3,3,Miscellaneous,Undeadwood,UnDeadwood_Part_III:_I_Got_My_Wish,https://criticalrole.fandom.com/wiki/UnDeadwoo...,NaT,section006/subsection003/episode003.csv


In [72]:
for index, row in transcript_df.iterrows():
    print(row['section'], "---", row['subsection'], "---", row['episode'])
    if pd.isnull(row['download_date']):
        print('Processing...')
        transcript_file = SaveTranscript(row['section_no'], row['subsection_no'], row['episode_no'], row['link'])
        transcript_df.loc[index, 'download_date'] = pd.Timestamp.now()
        transcript_df.loc[index, 'transcript_file'] = transcript_file
        print(f'Completed processing, saved to {transcript_file}' )
    else:
        print(f'Skipping... processed previously ({row["download_date"]})')

Campaign 1: Vox Machina --- Arc 1: Kraghammer and Vasselheim --- Arrival_at_Kraghammer
Processing...
Completed processing, saved to section001/subsection001/episode001.csv
Campaign 1: Vox Machina --- Arc 1: Kraghammer and Vasselheim --- Into_the_Greyspine_Mines
Processing...
Completed processing, saved to section001/subsection001/episode002.csv
Campaign 1: Vox Machina --- Arc 1: Kraghammer and Vasselheim --- Strange_Bedfellows
Processing...
Completed processing, saved to section001/subsection001/episode003.csv
Campaign 1: Vox Machina --- Arc 1: Kraghammer and Vasselheim --- Attack_on_the_Duergar_Warcamp
Processing...
Completed processing, saved to section001/subsection001/episode004.csv
Campaign 1: Vox Machina --- Arc 1: Kraghammer and Vasselheim --- The_Trick_about_Falling
Processing...
Completed processing, saved to section001/subsection001/episode005.csv
Campaign 1: Vox Machina --- Arc 1: Kraghammer and Vasselheim --- Breaching_the_Emberhold
Processing...
Completed processing, saved

Completed processing, saved to section001/subsection003/episode013.csv
Campaign 1: Vox Machina --- Arc 3: The Chroma Conclave --- The_Kill_Box
Processing...
Completed processing, saved to section001/subsection003/episode014.csv
Campaign 1: Vox Machina --- Arc 3: The Chroma Conclave --- At_Dawn,_We_Plan!
Processing...
Completed processing, saved to section001/subsection003/episode015.csv
Campaign 1: Vox Machina --- Arc 3: The Chroma Conclave --- In_the_Belly_of_the_Beast
Processing...
Completed processing, saved to section001/subsection003/episode016.csv
Campaign 1: Vox Machina --- Arc 3: The Chroma Conclave --- Umbrasyl_(episode)
Processing...
Completed processing, saved to section001/subsection003/episode017.csv
Campaign 1: Vox Machina --- Arc 3: The Chroma Conclave --- Hope
Processing...
Completed processing, saved to section001/subsection003/episode018.csv
Campaign 1: Vox Machina --- Arc 3: The Chroma Conclave --- Duskmeadow
Processing...
Completed processing, saved to section001/su

Completed processing, saved to section001/subsection005/episode004.csv
Campaign 1: Vox Machina --- Arc 5: Vecna --- Elysium
Processing...
Completed processing, saved to section001/subsection005/episode005.csv
Campaign 1: Vox Machina --- Arc 5: Vecna --- The_Fear_of_Isolation
Processing...
Completed processing, saved to section001/subsection005/episode006.csv
Campaign 1: Vox Machina --- Arc 5: Vecna --- The_Endless_Atheneeum
Processing...
Completed processing, saved to section001/subsection005/episode007.csv
Campaign 1: Vox Machina --- Arc 5: Vecna --- Scaldseat_(episode)
Processing...
Completed processing, saved to section001/subsection005/episode008.csv
Campaign 1: Vox Machina --- Arc 5: Vecna --- The_Core_Anvil
Processing...
Completed processing, saved to section001/subsection005/episode009.csv
Campaign 1: Vox Machina --- Arc 5: Vecna --- The_Ominous_March
Processing...
Completed processing, saved to section001/subsection005/episode010.csv
Campaign 1: Vox Machina --- Arc 5: Vecna ---

Completed processing, saved to section002/subsection002/episode006.csv
Campaign 2: The Mighty Nein --- Arc 2: The Bad Guys --- A_Hole_In_the_Plan
Processing...
Completed processing, saved to section002/subsection002/episode007.csv
Campaign 2: The Mighty Nein --- Arc 2: The Bad Guys --- In_Hot_Water
Processing...
Completed processing, saved to section002/subsection002/episode008.csv
Campaign 2: The Mighty Nein --- Arc 2: The Bad Guys --- The_Diver%27s_Grave
Processing...
Completed processing, saved to section002/subsection002/episode009.csv
Campaign 2: The Mighty Nein --- Arc 2: The Bad Guys --- The_Stowaway
Processing...
Completed processing, saved to section002/subsection002/episode010.csv
Campaign 2: The Mighty Nein --- Arc 2: The Bad Guys --- A_Storm_of_Memories
Processing...
Completed processing, saved to section002/subsection002/episode011.csv
Campaign 2: The Mighty Nein --- Arc 2: The Bad Guys --- The_Second_Seal
Processing...
Completed processing, saved to section002/subsection0

Completed processing, saved to section002/subsection004/episode022.csv
Campaign 2: The Mighty Nein --- Arc 5: Family Ties --- Home_is_Where_the_Heart_Is
Processing...
Completed processing, saved to section002/subsection005/episode001.csv
Campaign 2: The Mighty Nein --- Arc 5: Family Ties --- Misery_Loves_Company
Processing...
Completed processing, saved to section002/subsection005/episode002.csv
Campaign 2: The Mighty Nein --- Arc 5: Family Ties --- With_Great_Power...
Processing...
Completed processing, saved to section002/subsection005/episode003.csv
Campaign 2: The Mighty Nein --- Arc 5: Family Ties --- Blessing_in_Disguise
Processing...
Completed processing, saved to section002/subsection005/episode004.csv
Campaign 2: The Mighty Nein --- Arc 5: Family Ties --- Family_Shatters
Processing...
Completed processing, saved to section002/subsection005/episode005.csv
Campaign 2: The Mighty Nein --- Arc 5: Family Ties --- The_Fancy_and_the_Fooled
Processing...
Completed processing, saved to

Completed processing, saved to section003/subsection001/episode001.csv
Campaign Three: Bells Hells --- Arc 1: Jrusar --- Trial_by_Firelight
Processing...
Completed processing, saved to section003/subsection001/episode002.csv
Campaign Three: Bells Hells --- Arc 1: Jrusar --- The_Trail_and_the_Toll
Processing...
Completed processing, saved to section003/subsection001/episode003.csv
Campaign Three: Bells Hells --- Arc 1: Jrusar --- On_the_Trail_of_a_Killer
Processing...
Completed processing, saved to section003/subsection001/episode004.csv
Campaign Three: Bells Hells --- Arc 1: Jrusar --- The_Threat_Between_the_Walls
Processing...
Completed processing, saved to section003/subsection001/episode005.csv
Campaign Three: Bells Hells --- Arc 1: Jrusar --- Growing_Bonds_and_Teasing_Threads
Processing...
Completed processing, saved to section003/subsection001/episode006.csv
Campaign Three: Bells Hells --- Arc 1: Jrusar --- Behind_the_Curtain
Processing...
Completed processing, saved to section003

Completed processing, saved to section003/subsection003/episode002.csv
Campaign Three: Bells Hells --- Arc 3: Separations and Explorations --- Treacherous_Toys
Processing...
Completed processing, saved to section003/subsection003/episode003.csv
Campaign Three: Bells Hells --- Arc 3: Separations and Explorations --- Hope_Within_History
Processing...
Completed processing, saved to section003/subsection003/episode004.csv
Campaign Three: Bells Hells --- Arc 3: Separations and Explorations --- By_Goat_or_By_Boat
Processing...
Completed processing, saved to section003/subsection003/episode005.csv
Campaign Three: Bells Hells --- Arc 3: Separations and Explorations --- The_Sorrow_of_Molaesmyr
Processing...
Completed processing, saved to section003/subsection003/episode006.csv
Campaign Three: Bells Hells --- Arc 3: Separations and Explorations --- Escape_From_The_Past
Processing...
Completed processing, saved to section003/subsection003/episode007.csv
Campaign Three: Bells Hells --- Arc 3: Sepa

Completed processing, saved to section004/subsection000/episode005.csv
Exandria Unlimited --- Exandria Unlimited --- The_Gift_Among_the_Green
Processing...
Completed processing, saved to section004/subsection000/episode006.csv
Exandria Unlimited --- Exandria Unlimited --- Beyond_the_Heart_City
Processing...
Completed processing, saved to section004/subsection000/episode007.csv
Exandria Unlimited --- Exandria Unlimited --- What_Comes_Next
Processing...
Completed processing, saved to section004/subsection000/episode008.csv
Exandria Unlimited --- Exandria Unlimited --- Exandria_Unlimited:_Kymal,_Part_1
Processing...
Completed processing, saved to section004/subsection000/episode009.csv
Exandria Unlimited --- Exandria Unlimited --- Exandria_Unlimited:_Kymal,_Part_2
Processing...
Completed processing, saved to section004/subsection000/episode010.csv
Exandria Unlimited --- Exandria Unlimited --- Excelsior
Processing...
Completed processing, saved to section004/subsection000/episode011.csv
Ex

Completed processing, saved to section005/subsection000/episode046.csv
Specials --- Specials --- The_Adventures_of_the_Darrington_Brigade
Processing...
Completed processing, saved to section005/subsection000/episode047.csv
Specials --- Specials --- End_of_2019_Fireside_Chat
Processing...
Completed processing, saved to section005/subsection000/episode048.csv
Specials --- Specials --- Cinderbrush:_A_Monsterhearts_Story
Processing...
Completed processing, saved to section005/subsection000/episode049.csv
Specials --- Specials --- Doom_Eternal_One-Shot
Processing...
Completed processing, saved to section005/subsection000/episode050.csv
Specials --- Specials --- Explorer%27s_Guide_to_Wildemount_Q%26A_and_Fireside_Chat_with_Matthew_Mercer
Processing...
Completed processing, saved to section005/subsection000/episode051.csv
Specials --- Specials --- Diablo_One_Shot
Processing...
Completed processing, saved to section005/subsection000/episode052.csv
Specials --- Specials --- The_Elder_Scrolls_On

Completed processing, saved to section006/subsection003/episode004.csv


In [73]:
transcript_df.to_csv('../../data/transcript_database.csv', index = False)

In [74]:
transcript_df

Unnamed: 0,section_no,subsection_no,episode_no,section,subsection,episode,link,download_date,transcript_file
0,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival_at_Kraghammer,https://criticalrole.fandom.com/wiki/Arrival_a...,2024-07-09 12:37:35.639781,section001/subsection001/episode001.csv
1,1,1,2,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Into_the_Greyspine_Mines,https://criticalrole.fandom.com/wiki/Into_the_...,2024-07-09 12:37:35.886419,section001/subsection001/episode002.csv
2,1,1,3,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Strange_Bedfellows,https://criticalrole.fandom.com/wiki/Strange_B...,2024-07-09 12:37:36.346142,section001/subsection001/episode003.csv
3,1,1,4,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Attack_on_the_Duergar_Warcamp,https://criticalrole.fandom.com/wiki/Attack_on...,2024-07-09 12:37:36.628495,section001/subsection001/episode004.csv
4,1,1,5,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,The_Trick_about_Falling,https://criticalrole.fandom.com/wiki/The_Trick...,2024-07-09 12:37:37.091840,section001/subsection001/episode005.csv
...,...,...,...,...,...,...,...,...,...
464,6,1,13,Miscellaneous,Candela Obscura,Candela_Obscura_Live_-_The_Circle_of_the_Silve...,https://criticalrole.fandom.com/wiki/Candela_O...,2024-07-09 12:40:26.431343,section006/subsection001/episode013.csv
465,6,3,1,Miscellaneous,Undeadwood,"UnDeadwood_Part_I:_Stay_Close,_Reverend",https://criticalrole.fandom.com/wiki/UnDeadwoo...,2024-07-09 12:40:26.615051,section006/subsection003/episode001.csv
466,6,3,2,Miscellaneous,Undeadwood,UnDeadwood_Part_II:_God_Don%27t_Play_Cards,https://criticalrole.fandom.com/wiki/UnDeadwoo...,2024-07-09 12:40:26.821617,section006/subsection003/episode002.csv
467,6,3,3,Miscellaneous,Undeadwood,UnDeadwood_Part_III:_I_Got_My_Wish,https://criticalrole.fandom.com/wiki/UnDeadwoo...,2024-07-09 12:40:27.015226,section006/subsection003/episode003.csv
