In [1]:
from bs4 import BeautifulSoup
from random import randint
import requests
import urllib.request
import time
import pandas as pd
import os

In [3]:
class AppURLopener:
    """URL opener that identifies itself with a browser-like User-Agent.

    Built on ``urllib.request.build_opener`` instead of subclassing
    ``urllib.request.FancyURLopener``, which has emitted a DeprecationWarning
    since Python 3.3 and no longer exists in Python 3.14.

    Note: HTTP error statuses (for example 404) are raised as
    ``urllib.error.HTTPError`` by the underlying opener, whereas the legacy
    ``FancyURLopener`` returned the error page as a normal response.

    Attributes:
        version: value sent in the ``User-Agent`` request header.
    """

    version = "Mozilla/5.0"

    def __init__(self, proxies=None):
        """Create the underlying opener.

        Args:
            proxies: optional mapping of scheme to proxy URL; when omitted the
                default proxy handling of ``build_opener`` is used.
        """
        handlers = []
        if proxies is not None:
            handlers.append(urllib.request.ProxyHandler(proxies))
        self._opener = urllib.request.build_opener(*handlers)
        # build_opener() would otherwise announce itself as "Python-urllib/x.y".
        self._opener.addheaders = [("User-Agent", self.version), ("Accept", "*/*")]

    def open(self, fullurl, data=None):
        """Open ``fullurl`` and return the file-like response object.

        Args:
            fullurl: URL string to fetch.
            data: optional request body, passed through to the opener.

        Returns:
            The response returned by the underlying opener; it supports
            ``read()`` and ``close()``.
        """
        return self._opener.open(fullurl, data)

In [4]:
# Single shared opener instance; the scraping code below calls opener.open(url) on it.
opener = AppURLopener()

  opener = AppURLopener()


In [36]:
class FormatDataOnePage():
    """Converts the table rows of one scene page into original/modern text pairs."""

    def collect_text(self, trs):
        """Extract the paired original and modern text from each row.

        Args:
            trs: iterable of row elements exposing ``find_all``.

        Returns:
            A tuple ``(sequences_original, sequences_modern)`` of equally long
            lists. A row contributes only when both its original and its
            modern text are non-empty.
        """
        def joined(row, css_class):
            # All matching <div> texts of the row, separated by single spaces.
            return " ".join(div.text for div in row.find_all('div', class_=css_class))

        pairs = []
        for row in trs:
            original = joined(row, 'noFear__line noFear__line--original')
            modern = joined(row, 'noFear__line noFear__line--modern')
            if not (original and modern):
                # Skip rows missing either side so the two lists stay aligned.
                continue
            pairs.append((original, modern))

        sequences_original = [original for original, _ in pairs]
        sequences_modern = [modern for _, modern in pairs]
        return sequences_original, sequences_modern

    def make_dataframe(self, trs):
        """Build a DataFrame with columns ``original`` and ``modern`` from the rows."""
        originals, moderns = self.collect_text(trs)
        columns = {'original': originals, 'modern': moderns}
        return pd.DataFrame(columns)
    

## Hamlet

In [37]:
# Hamlet: act number -> range of scene numbers, built from the scene count of each act.
acts_and_scenes = {
    act: range(1, n_scenes + 1)
    for act, n_scenes in enumerate((5, 2, 4, 7, 2), start=1)
}

In [41]:
# URL template for one Hamlet scene page; the two {} slots take the act and scene numbers.
base_url = 'https://www.sparknotes.com/nofear/shakespeare/hamlet/act-{}-scene-{}/'

In [42]:
# Formatter instance used by the Hamlet scraping loop to build one DataFrame per scene page.
formatDataOnePage = FormatDataOnePage()

In [44]:
# Download every Hamlet scene page and collect one DataFrame per scene in temp_dfs.
temp_dfs = []
for act, scenes in acts_and_scenes.items():
    for scene in scenes:
        url = base_url.format(act, scene)
        html = opener.open(url)
        try:
            soup = BeautifulSoup(html, 'html.parser')
        finally:
            # Close the response even if parsing fails, so connections are
            # not left open across the requests made by this loop.
            html.close()
        lines = soup.find_all('tr')
        df_act = formatDataOnePage.make_dataframe(lines)
        temp_dfs.append(df_act)
        print(f"act {act} scene {scene} done, {len(df_act)} lines")


act 1 scene 1 done, 59 lines
act 1 scene 2 done, 73 lines
act 1 scene 3 done, 27 lines
act 1 scene 4 done, 30 lines
act 1 scene 5 done, 60 lines
act 2 scene 1 done, 37 lines
act 2 scene 2 done, 156 lines
act 3 scene 1 done, 48 lines
act 3 scene 2 done, 133 lines
act 3 scene 3 done, 12 lines
act 3 scene 4 done, 57 lines
act 4 scene 1 done, 11 lines
act 4 scene 2 done, 15 lines
act 4 scene 3 done, 26 lines
act 4 scene 4 done, 17 lines
act 4 scene 5 done, 63 lines
act 4 scene 6 done, 9 lines
act 4 scene 7 done, 41 lines
act 5 scene 1 done, 103 lines
act 5 scene 2 done, 140 lines


In [45]:
# Stack the per-scene DataFrames into a single frame for the whole play.
df_hamlet = pd.concat(temp_dfs)

In [47]:
# Replace the index with one continuous 0..n-1 numbering across all scenes.
df_hamlet.index = range(len(df_hamlet))

In [49]:
# Write the Hamlet pairs to books/hamlet.csv. The path is built with
# os.path.join because the former literal 'books\hamlet.csv' relied on the
# invalid escape sequence '\h' and on the Windows-only backslash separator.
os.makedirs('books', exist_ok=True)  # the output folder may not exist yet
df_hamlet.to_csv(os.path.join('books', 'hamlet.csv'))

## Macbeth

In [70]:
class ScrapFullBook():
    """Scrapes every act and scene of one play into an original/modern DataFrame.

    Pages are fetched through the module-level ``opener`` and parsed with
    ``BeautifulSoup``; both names must be defined before the fetching methods
    are called.

    Attributes:
        title: play identifier inserted into the site URL (e.g. ``'macbeth'``).
        main_url: landing-page URL of the play, set by ``make_urls``.
        text_url: URL template with act and scene slots, set by ``make_urls``.
        acts_and_scenes: mapping of act number to a range of scene numbers,
            set by ``get_all_text``.
    """

    def __init__(self, title):
        self.title = title

    def make_urls(self):
        """Derive the landing-page URL and the per-scene URL template from the title."""
        self.main_url = 'https://www.sparknotes.com/nofear/shakespeare/{}/'.format(self.title)
        self.text_url = self.main_url + 'act-{}-scene-{}/'

    def soup_from_url(self, url):
        """Fetch ``url`` and return its parsed ``BeautifulSoup`` document."""
        html = opener.open(url)
        try:
            return BeautifulSoup(html, 'html.parser')
        finally:
            # Always release the response, also when parsing raises.
            html.close()

    def collect_text(self, trs):
        """Extract the paired original and modern text from table rows.

        Args:
            trs: iterable of row elements exposing ``find_all``.

        Returns:
            A tuple ``(sequences_original, sequences_modern)`` of equally long
            lists; rows lacking either side are skipped.
        """
        sequences_modern = []
        sequences_original = []
        for line in trs:
            divs_original = line.find_all('div', class_='noFear__line noFear__line--original')
            text_original = " ".join(i.text for i in divs_original)

            divs_modern = line.find_all('div', class_='noFear__line noFear__line--modern')
            text_modern = " ".join(i.text for i in divs_modern)

            # Keep only complete pairs so both lists stay aligned.
            if text_original and text_modern:
                sequences_modern.append(text_modern)
                sequences_original.append(text_original)

        return sequences_original, sequences_modern

    def make_dataframe_scene(self, trs):
        """Build a DataFrame with columns ``original`` and ``modern`` for one scene."""
        sequences_original, sequences_modern = self.collect_text(trs)
        df = pd.DataFrame({'original': sequences_original,
                           'modern': sequences_modern
                           })
        return df

    def get_acts_and_scenes_number(self):
        """Read the play's landing page and count the scenes of each act.

        Returns:
            Dict mapping the 1-based act number to ``range(1, n_scenes + 1)``,
            where ``n_scenes`` is the number of table-of-contents list
            elements found inside that act's section.
        """
        full_page_soup = self.soup_from_url(self.main_url)
        acts_html = full_page_soup.find_all('div', class_='texts-landing-page__toc__section')

        acts_and_scenes = {}
        for act, act_html in enumerate(acts_html, start=1):
            n_max_scenes = len(act_html.find_all('ul', class_='texts-landing-page__toc__section__list'))
            acts_and_scenes[act] = range(1, n_max_scenes + 1)

        return acts_and_scenes

    def get_all_text(self, output_dir='books'):
        """Scrape the whole play, save it as CSV and return the combined frame.

        Args:
            output_dir: folder that receives ``<title>.csv``; it is created
                when missing. Defaults to ``'books'``.

        Returns:
            DataFrame with columns ``original`` and ``modern`` and a
            continuous 0..n-1 index covering all scenes.

        Raises:
            ValueError: if no scene could be discovered for the play.
        """
        self.make_urls()
        self.acts_and_scenes = self.get_acts_and_scenes_number()
        temp_dfs = []
        for act, scenes in self.acts_and_scenes.items():
            for scene in scenes:
                url = self.text_url.format(act, scene)
                soup_scene = self.soup_from_url(url)
                lines = soup_scene.find_all('tr')
                # Use this class's own builder rather than an unrelated
                # module-level formatter object.
                df_scene = self.make_dataframe_scene(lines)
                temp_dfs.append(df_scene)
                print(f"act {act} scene {scene} done, {len(df_scene)} lines")

        if not temp_dfs:
            # pd.concat would fail with a message that hides the real cause.
            raise ValueError(f"no acts or scenes found for {self.title!r} at {self.main_url}")

        df_full = pd.concat(temp_dfs, ignore_index=True)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # os.path.join avoids the invalid '\{' escape and the Windows-only
        # separator of the former f'books\{self.title}.csv' literal.
        df_full.to_csv(os.path.join(output_dir, f'{self.title}.csv'))
        return df_full

In [71]:
# Scraper for Macbeth; the title string is inserted into the site URL.
scrapMacbeth = ScrapFullBook('macbeth')

In [72]:
# Fetch every scene, write the CSV named after the title, and keep the combined frame.
df_macbeth = scrapMacbeth.get_all_text()

act 1 scene 1 done, 8 lines
act 1 scene 2 done, 18 lines
act 1 scene 3 done, 50 lines
act 1 scene 4 done, 12 lines
act 1 scene 5 done, 15 lines
act 1 scene 6 done, 7 lines
act 1 scene 7 done, 13 lines
act 2 scene 1 done, 17 lines
act 2 scene 2 done, 29 lines
act 2 scene 3 done, 60 lines
act 2 scene 4 done, 22 lines
act 3 scene 1 done, 34 lines
act 3 scene 2 done, 13 lines
act 3 scene 3 done, 20 lines
act 3 scene 4 done, 54 lines
act 3 scene 5 done, 5 lines
act 3 scene 6 done, 5 lines
act 4 scene 1 done, 55 lines
act 4 scene 2 done, 37 lines
act 4 scene 3 done, 65 lines
act 5 scene 1 done, 27 lines
act 5 scene 2 done, 10 lines
act 5 scene 3 done, 19 lines
act 5 scene 4 done, 10 lines
act 5 scene 5 done, 14 lines
act 5 scene 6 done, 3 lines
act 5 scene 7 done, 12 lines
act 5 scene 8 done, 21 lines


In [76]:
# Same pipeline for Othello: build the scraper, then fetch all scenes and save the CSV.
scrapOthello = ScrapFullBook('othello')
df_othello = scrapOthello.get_all_text()

act 1 scene 1 done, 45 lines
act 1 scene 2 done, 31 lines
act 1 scene 3 done, 89 lines
act 2 scene 1 done, 85 lines
act 2 scene 2 done, 1 lines
act 2 scene 3 done, 96 lines
act 3 scene 1 done, 23 lines
act 3 scene 2 done, 3 lines
act 3 scene 3 done, 164 lines
act 3 scene 4 done, 84 lines
act 4 scene 1 done, 136 lines
act 4 scene 2 done, 98 lines
act 4 scene 3 done, 37 lines
act 5 scene 1 done, 74 lines
act 5 scene 2 done, 164 lines


In [83]:

# Same pipeline for King Lear; 'lear' is the title segment used in the site URL.
scraplear= ScrapFullBook('lear')
df_lear = scraplear.get_all_text()

act 1 scene 1 done, 85 lines
act 1 scene 2 done, 42 lines
act 1 scene 3 done, 7 lines
act 1 scene 4 done, 112 lines
act 1 scene 5 done, 25 lines
act 2 scene 1 done, 42 lines
act 2 scene 2 done, 58 lines
act 2 scene 3 done, 1 lines
act 2 scene 4 done, 96 lines
act 3 scene 1 done, 11 lines
act 3 scene 2 done, 15 lines
act 3 scene 3 done, 4 lines
act 3 scene 4 done, 56 lines
act 3 scene 5 done, 8 lines
act 3 scene 6 done, 34 lines
act 3 scene 7 done, 53 lines
act 4 scene 1 done, 29 lines
act 4 scene 2 done, 30 lines
act 4 scene 3 done, 21 lines
act 4 scene 4 done, 6 lines
act 4 scene 5 done, 18 lines
act 4 scene 6 done, 86 lines
act 4 scene 7 done, 42 lines
act 5 scene 1 done, 28 lines
act 5 scene 2 done, 6 lines
act 5 scene 3 done, 127 lines
