In [76]:
from bs4 import BeautifulSoup
import os
import requests

In [87]:
def scrape_tma(url, suffix):
    """Download one TMA transcript page and flatten it to speaker-prefixed text.

    The page at ``url + suffix + '.html'`` is fetched and parsed. Inside the
    ``<div class="entry-content">`` container, every ``<h4>`` starts a new
    speaker turn and every following ``<p>`` is appended to that turn.

    Args:
        url: Base URL that the episode suffix is appended to.
        suffix: Episode identifier, without the ``.html`` extension.

    Returns:
        A string with one ``'SPEAKER: speech '`` line per speaker turn, each
        terminated by ``'\\n'``. Empty if the container holds no ``<h4>``.

    Raises:
        AttributeError: if the page has no ``entry-content`` div (``find``
            returns None in that case).
    """
    url_with_suffix = url + suffix + '.html'
    r = requests.get(url_with_suffix)
    soup = BeautifulSoup(r.text, 'html.parser')
    transcript = soup.find('div', class_='entry-content')  # get the transcript, not marginal content
    lines = transcript.find_all(['h4', 'p'])  # h5 is sound effects; ignore.
    turns = []
    this_line = None  # the turn currently being assembled; None until the first speaker is seen
    for line in lines:
        if line.name == 'h4':  # h4 == name; new speaker
            # get_text() rather than .string: .string is None as soon as the tag
            # contains inline markup, which would lose the speaker name.
            speaker = line.get_text()
            if not speaker.strip():
                continue  # empty heading: keep appending to the current speaker
            if this_line is not None:
                turns.append(this_line)  # deposit last line so we can start a new one
            this_line = speaker + ': '
        elif this_line is not None:  # p == speech
            # get_text() keeps the words inside inline tags like <em>.
            this_line += line.get_text() + ' '
        # a <p> before the first <h4> has no speaker to attach to and is skipped
    if this_line is not None:
        turns.append(this_line)  # the final turn has no following <h4> to trigger its deposit
    return ''.join(turn + '\n' for turn in turns)

In [121]:
def scrape_arsparadoxica(url, suffix):
    """Download one ars PARADOXICA transcript page and return it as text.

    The page at ``url + suffix`` is fetched and parsed, and every ``<p>``
    inside the page's ``<main>`` element becomes one output line.

    Args:
        url: Base URL that the episode suffix is appended to.
        suffix: Episode identifier appended verbatim to ``url``.

    Returns:
        The transcript lines joined with ``'\\n'``, with ``':  '`` (colon plus
        two spaces) collapsed to ``': '``.

    Raises:
        AttributeError: if the page has no ``<main>`` element (``soup.main``
            is None in that case).
    """
    url_with_suffix = url + suffix
    r = requests.get(url_with_suffix)
    soup = BeautifulSoup(r.text, 'html.parser')
    lines = soup.main.find_all('p')
    out_lines = []
    # this one's easy; each line is a separate <p>, and is already formatted the way we want it to be.
    for line in lines:
        if line.string is None:  # has child, like <em></em>, in addition to normal text
            this_line = []
            for child in line.children:
                if isinstance(child, str):
                    this_line.append(child)
                else:
                    # .string is None for a child that itself contains nested
                    # markup, which would silently drop its words; get_text()
                    # returns the full text and '' for empty tags such as <br>.
                    child_text = child.get_text()
                    if child_text:
                        this_line.append(child_text)
            # NOTE(review): text nodes usually carry their own surrounding
            # whitespace, so joining with ' ' can yield doubled spaces; only the
            # after-colon case is collapsed below. Confirm against the site's HTML.
            out_lines.append(' '.join(this_line))
        # if a line starts and ends with [], it's sfx and should not be included
        elif not (line.string.startswith('[') and line.string.endswith(']')):
            out_lines.append(line.string)
    return '\n'.join(out_lines).replace(':  ', ': ')

In [79]:
def scrape_batch(url, suffixes, scrape_fn, dir_name=None):
    """Scrape every suffix with ``scrape_fn`` and save each result as a text file.

    Args:
        url: Base URL passed through to ``scrape_fn``.
        suffixes: Iterable of suffix strings; each is passed to ``scrape_fn``
            and also used as the output file's base name.
        scrape_fn: Callable ``(url, suffix) -> str`` returning the scraped text.
        dir_name: Output directory. Defaults to ``scrape_fn.__name__``.

    The directory is created if it does not exist. Each result is written as
    UTF-8 to ``<dir_name>/<suffix>.txt`` after typographic punctuation has been
    replaced with plain ASCII equivalents.
    """
    out_dir = scrape_fn.__name__ if dir_name is None else dir_name
    os.makedirs(out_dir, exist_ok=True)
    # typographic character -> ASCII stand-in, applied in this order
    substitutions = (
        ('…', '...'),
        ('’', '\''),
        ('“', '"'),
        ('”', '"'),
        ('‘', '\''),
        ('–', '-'),
    )
    for suffix in suffixes:
        cleaned = scrape_fn(url, suffix)
        for fancy, plain in substitutions:
            cleaned = cleaned.replace(fancy, plain)
        out_path = os.path.join(out_dir, suffix) + '.txt'
        with open(out_path, 'w', encoding='utf-8') as out_file:
            out_file.write(cleaned)

In [88]:
# Episode suffixes '001' through '200', zero-padded to three digits.
tma_nums = [str(episode).zfill(3) for episode in range(1, 201)]
scrape_batch(
    'https://snarp.github.io/magnus_archives_transcripts/episode/',
    tma_nums,
    scrape_tma,
    dir_name='tma',
)

In [122]:
# the permissions on their website are weird, so i had to run this twice on a few of the transcripts.
# Suffixes in listing order: episodes 03 and 10 are split into two parts, and
# the 'recorder' transcript sits between episodes 21 and 22.
arspara_nums = (
    ['01', '02', '03-1', '03-2']
    + [str(episode).zfill(2) for episode in range(4, 10)]
    + ['10-1', '10-2']
    + [str(episode) for episode in range(11, 22)]
    + ['recorder']
    + [str(episode) for episode in range(22, 36)]
)
scrape_batch(
    'https://arsparadoxica.com/transcript/',
    arspara_nums,
    scrape_fn=scrape_arsparadoxica,
    dir_name='ars_paradoxica',
)

In [None]:
# Rewrite every file in 'ars_paradoxica' in place, replacing typographic
# punctuation with plain ASCII equivalents.
for f in os.listdir('ars_paradoxica'):
    path = os.path.join('ars_paradoxica', f)
    with open(path, 'r', encoding='utf-8') as fi:
        text = fi.read()
    # typographic character -> ASCII stand-in, applied in this order
    for fancy, plain in (('…', '...'), ('’', '\''), ('“', '"'), ('”', '"'), ('‘', '\''), ('–', '-')):
        text = text.replace(fancy, plain)
    with open(path, 'w', encoding='utf-8') as fi:
        fi.write(text)

In the TMA transcripts, note that ARCHIVIST (STATEMENT) != ARCHIVIST; under the former label, the archivist is reading someone else's writing aloud rather than speaking in their own voice. I'm going to save those lines under the speaker name 'STATEMENT'. Ideally, we'd save each statement under the statement-giver's name, but (1) that's one more thing we have to extract, and (2) almost all characters give only one statement, so that wouldn't be very informative.