# Testing Scraping on a Singular URL

In [10]:
import requests
r = requests.get('https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s01e01')

In [6]:
print(r.text[0:500]) #check if url is correct

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">

<title>David Attenborough's Natural Curiosities (2013) s01e01 Episode Script | SS</title>

<meta name="description" content="David Attenborough's Natural Curiosities (2013) s01e01 - Stretched to the Limit Episode Script. SS is dedicated to The Simpsons and host to thousands of free TV show episode scripts and screencaps, cartoon framegrabs and movie scripts.">

<meta property="og:title" content="David Attenborough's Natural Curiosities (2013)


In [7]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')

In [8]:
results = soup.find_all('div', {'class':'scrolling-script-container'}) #grab script bodies
len(results) #check if length makes sense

In [10]:
results[0] #check if Beautiful Soup scraped the correct portion

<div class="scrolling-script-container">


                    			DAVID ATTENBOROUGH:
 The natural world is full
  of extraordinarily shaped creatures,
  but how have the stretched bodies
 of some given them an edge?
  I have had the fortune to meet
  some of the planet's
 most enchanting creatures, but some
  stand out more than others,
 because of their intriguing biology.<br/>
  Our knowledge of some of these creatures
 extends back centuries.<br/>
  Others we have discovered more recently.<br/>
  In this series, I share their stories
  and reveal why they are considered
 natural curiosities.<br/>
  In this programme,
 I investigate two creatures
  that have taken the ordinary
 and made it extraordinary,
  the chameleon that has
 an extra-long tongue to catch prey
   and the giraffe with a neck so long
 it can reach the top of trees.<br/>
  How and why have these animals
 stretched nature to the limit?
   The chameleon is a truly bizarre creature,
  both in its behaviour and its app

In [11]:
fr = results[0]
len(fr.contents)

295

In [12]:
fr.contents #see formatting characters present within data

["\r\n\r\n\r\n                    \t\t\tDAVID ATTENBOROUGH:\r The natural world is full\r  of extraordinarily shaped creatures,\r  but how have the stretched bodies\r of some given them an edge?\r  I have had the fortune to meet\r  some of the planet's\r most enchanting creatures, but some\r  stand out more than others,\r because of their intriguing biology.",
 <br/>,
 '\r  Our knowledge of some of these creatures\r extends back centuries.',
 <br/>,
 '\r  Others we have discovered more recently.',
 <br/>,
 '\r  In this series, I share their stories\r  and reveal why they are considered\r natural curiosities.',
 <br/>,
 '\r  In this programme,\r I investigate two creatures\r  that have taken the ordinary\r and made it extraordinary,\r  the chameleon that has\r an extra-long tongue to catch prey\r   and the giraffe with a neck so long\r it can reach the top of trees.',
 <br/>,
 '\r  How and why have these animals\r stretched nature to the limit?\r   The chameleon is a truly bizarre creat

Here I removed and replaced vestigial formatting that would interfere with the desired format

In [13]:
replaceString = "" # remove various characters to improve formatting
cleaned = BeautifulSoup(str(fr).replace("\t", replaceString))
cleaned = BeautifulSoup(str(cleaned).replace("\r", replaceString))
cleaned = BeautifulSoup(str(cleaned).replace("\n", replaceString))

replaceString = "\n" # replace each <br/> tag with "\n"
cleaned = BeautifulSoup(str(cleaned).replace("<br/>", replaceString))


In [15]:
print (cleaned.text) #examine cleaned contents

                    DAVID ATTENBOROUGH: The natural world is full  of extraordinarily shaped creatures,  but how have the stretched bodies of some given them an edge?  I have had the fortune to meet  some of the planet's most enchanting creatures, but some  stand out more than others, because of their intriguing biology.
  Our knowledge of some of these creatures extends back centuries.
  Others we have discovered more recently.
  In this series, I share their stories  and reveal why they are considered natural curiosities.
  In this programme, I investigate two creatures  that have taken the ordinary and made it extraordinary,  the chameleon that has an extra-long tongue to catch prey   and the giraffe with a neck so long it can reach the top of trees.
  How and why have these animals stretched nature to the limit?   The chameleon is a truly bizarre creature,  both in its behaviour and its appearance unlike anything else on earth.
  So, not surprisingly, it's given rise to all kinds o

In [16]:
soup_string = str(cleaned.text) #convert to string and ensure formatting remains
print(soup_string[:150])

                    DAVID ATTENBOROUGH: The natural world is full  of extraordinarily shaped creatures,  but how have the stretched bodies of some giv


# Actual assembly of corpus using Beautiful Soup

In [1]:
#create list of URLS
lists = ['https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e06',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e07',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e08',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e09',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s02e10',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s03e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s03e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s03e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s03e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s03e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-natural-curiosities-2013&episode=s03e06'
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e06',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e07',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e08',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e09',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=bbc-life-2009&episode=s01e10',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=natures-great-events&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=natures-great-events&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=natures-great-events&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=natures-great-events&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=natures-great-events&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=natures-great-events&episode=s01e06',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e06',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e07',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e08',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=frozen-planet-2011&episode=s01e09',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=blue-planet-ii-2017&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=blue-planet-ii-2017&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=blue-planet-ii-2017&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=blue-planet-ii-2017&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=blue-planet-ii-2017&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=blue-planet-ii-2017&episode=s01e06',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=blue-planet-ii-2017&episode=s01e07',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=planet-earth-ii-2016&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=planet-earth-ii-2016&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=planet-earth-ii-2016&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=planet-earth-ii-2016&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=planet-earth-ii-2016&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=planet-earth-ii-2016&episode=s01e06',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=dynasties-2018&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=dynasties-2018&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=dynasties-2018&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=dynasties-2018&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=dynasties-2018&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=the-hunt-2015&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=the-hunt-2015&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=the-hunt-2015&episode=s01e03',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=the-hunt-2015&episode=s01e04',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=the-hunt-2015&episode=s01e05',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=the-hunt-2015&episode=s01e06',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=the-hunt-2015&episode=s01e07',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-conquest-of-the-skies-2015&episode=s01e01',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-conquest-of-the-skies-2015&episode=s01e02',
        'https://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=david-attenboroughs-conquest-of-the-skies-2015&episode=s01e03',
        ]

Loop for assembling corpus of data

In [2]:
#Setup
import requests
from bs4 import BeautifulSoup

corpus = ""

#Text Loop
for link in lists:
    #parse HTML
    #print (link)
    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')
    #grab script bodies
    results = soup.find_all('div', {'class':'scrolling-script-container'}) 
    fr = results[0]
    replaceString = "" # remove various characters to improve formatting
    cleaned = BeautifulSoup(str(fr).replace("\t", replaceString))
    cleaned = BeautifulSoup(str(cleaned).replace("\r", replaceString))
    cleaned = BeautifulSoup(str(cleaned).replace("\n", replaceString))
    replaceString = "\n" # replace each <br/> tag with "\n"
    cleaned = BeautifulSoup(str(cleaned).replace("<br/>", replaceString))
    soup_string = str(cleaned.text)  
    corpus += soup_string


In [3]:
len(corpus)

1413476

In [7]:
print (corpus)

1334721

In [9]:
# save corpus to .txt file
print(corpus,  file=open('corpus.txt', 'w'))