# Building the Transcript Database

We want to build a database that catalogs every available transcript. For each one we capture as much of the campaign, arc, and episode information as possible, so that this metadata is available for future use.

In [51]:
from bs4 import BeautifulSoup
import pandas as pd

import urllib
import urllib.request

In [2]:
# Wiki page whose HTML is downloaded and parsed below.
transcripts_url = 'https://criticalrole.fandom.com/wiki/Transcripts'

# Download the raw page bytes; the `with` block closes the HTTP response
# as soon as the body has been read.
with urllib.request.urlopen(transcripts_url) as response:
    html = response.read()

# Parse the page with Python's built-in 'html.parser' backend.
transcripts = BeautifulSoup(html, 'html.parser')

In [47]:
# Take the first <div class="mw-parser-output"> on the page; indexing with [0]
# raises IndexError if the page contains no such div.
main_div = transcripts.find_all('div', class_='mw-parser-output')[0]

In [60]:
current_section       = None
current_section_no    = 0
current_subsection    = None
current_subsection_no = 0
current_episode_no    = 0
transcript_data       = []
for child in main_div.find_all():
    match child.name:
        case 'h2':
            child_span = child.find('span', {'class': 'mw-headline'})
            if child_span is not None:
                current_section       = child_span.text
                current_section_no   += 1
                current_subsection_no = 0
        case 'h3':
            child_span = child.find('span', {'class': 'mw-headline'})
            if child_span is not None:
                current_subsection     = child_span.text
                current_subsection_no += 1
                current_episode_no     = 0
        case 'a':
            if child.text == 'Transcript':
                current_episode_no += 1
                transcript_data.append([
                    current_section_no, current_subsection_no, current_episode_no,
                    current_section, current_subsection, child['href'][6:-11],
                    'https://criticalrole.fandom.com' + child['href']
                ])

transcript_df = pd.DataFrame(
    transcript_data,
    columns = ['section_no', 'subsection_no', 'episode_no', 'section', 'subsection', 'episode', 'link']
)
transcript_df

Unnamed: 0,section_no,subsection_no,episode_no,section,subsection,episode,link
0,1,1,1,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Arrival_at_Kraghammer,https://criticalrole.fandom.com/wiki/Arrival_a...
1,1,1,2,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Into_the_Greyspine_Mines,https://criticalrole.fandom.com/wiki/Into_the_...
2,1,1,3,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Strange_Bedfellows,https://criticalrole.fandom.com/wiki/Strange_B...
3,1,1,4,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,Attack_on_the_Duergar_Warcamp,https://criticalrole.fandom.com/wiki/Attack_on...
4,1,1,5,Campaign 1: Vox Machina,Arc 1: Kraghammer and Vasselheim,The_Trick_about_Falling,https://criticalrole.fandom.com/wiki/The_Trick...
...,...,...,...,...,...,...,...
464,6,1,13,Miscellaneous,Candela Obscura,Candela_Obscura_Live_-_The_Circle_of_the_Silve...,https://criticalrole.fandom.com/wiki/Candela_O...
465,6,3,1,Miscellaneous,Undeadwood,"UnDeadwood_Part_I:_Stay_Close,_Reverend",https://criticalrole.fandom.com/wiki/UnDeadwoo...
466,6,3,2,Miscellaneous,Undeadwood,UnDeadwood_Part_II:_God_Don%27t_Play_Cards,https://criticalrole.fandom.com/wiki/UnDeadwoo...
467,6,3,3,Miscellaneous,Undeadwood,UnDeadwood_Part_III:_I_Got_My_Wish,https://criticalrole.fandom.com/wiki/UnDeadwoo...
