In [4]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests

In [5]:
# 2. url: we start with the 'second' page
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=51&ref_=adv_nxt"

In [6]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [93]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
# 4.2. check that the html code looks like it should
#soup

In [8]:
iterations = range(1, 631, 50)

for i in iterations:
    start_at= str(i)
    url = "https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=" + start_at + "&ref_=adv_nxt"
    print(url)

https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=1&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=51&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=101&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=151&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=201&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=251&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=301&ref_=adv_nxt
https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating

Respectful scraping:

There we have it, all the URLs we need! Before starting with the actual scraping, though, there's something we need to note when sending massive, automated requests to websites: it's rude. 

We just have 13 of them, which is not too many, but it's still a good practice to let a few seconds pass in between requests. 

Some pages don't like being scraped and will block your IP if they detect it's sending automated requests. Others might have a small server for the traffic they handle, and sending too many requests might crash the site. The sleep module will help us with that. 

Here's how it works, waiting 2 seconds between each iteration in a for loop:

In [9]:
#we need a few more tools for this one 
from random import randint
import time
time.sleep(2)

In [10]:
#basic use of sleep in for loop 
for i in range(5):
    print(i)
    time.sleep(2)

0
1
2
3
4


In [11]:
#more human use of sleep 

for i in range(5):
    print(i)
    wait_time = randint(1,4)
    print("I will sleep for " + str(wait_time) + " seconds.")
    time.sleep(wait_time)

0
I will sleep for 3 seconds.
1
I will sleep for 2 seconds.
2
I will sleep for 3 seconds.
3
I will sleep for 2 seconds.
4
I will sleep for 4 seconds.


to make this more interactive 
- you can split the below scripts out into separate cells and run them yourself or re-write them in your own python style 

In [12]:
#Assembling the script to send and store multiple requests

pages = []

for i in iterations:
    # assemble the url:
    start_at= str(i)
    url = "https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=" + start_at + "&ref_=adv_nxt"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # store response into "pages" list
    pages.append(response)

    # respectful nap:
    wait_time = randint(1,4)
    print("I will sleep for " + str(wait_time) + " second/s.")
    time.sleep(wait_time)

Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 3 second/s.


In [92]:
# print the object pages after running the code above, you'll just see the response code messages, but the html code is still accessible and you can parse it the same way we've always done:

#BeautifulSoup(pages[0].content, "html.parser")

In [33]:
# Parse just the first page, for testing purposes
soup = BeautifulSoup(pages[0].content, "html.parser")

# Paste the Selector from the first movie title copied from Chrome Dev Tools
soup.select("#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > h3 > a")

# Trim the selection: now it grabs all the titles
title_list = soup.select("div.lister-item-content > h3 > a")

In [34]:
title_list

[<a href="/title/tt0099785/">Kevin - Allein zu Haus</a>,
 <a href="/title/tt0099685/">GoodFellas - Drei Jahrzehnte in der Mafia</a>,
 <a href="/title/tt0099674/">Der Pate 3</a>,
 <a href="/title/tt0102926/">Das Schweigen der Lämmer</a>,
 <a href="/title/tt0100802/">Total Recall - Die totale Erinnerung</a>,
 <a href="/title/tt0104952/">Mein Vetter Winnie</a>,
 <a href="/title/tt0101507/">Boyz n the Hood - Jungs im Viertel</a>,
 <a href="/title/tt0103064/">Terminator 2: Tag der Abrechnung</a>,
 <a href="/title/tt0105323/">Der Duft der Frauen</a>,
 <a href="/title/tt0104691/">Der letzte Mohikaner</a>,
 <a href="/title/tt0099487/">Edward mit den Scherenhänden</a>,
 <a href="/title/tt0099810/">Jagd auf Roter Oktober</a>,
 <a href="/title/tt0099348/">Der mit dem Wolf tanzt</a>,
 <a href="/title/tt0105236/">Reservoir Dogs - Wilde Hunde</a>,
 <a href="/title/tt0104257/">Eine Frage der Ehre</a>,
 <a href="/title/tt0105695/">Erbarmungslos</a>,
 <a href="/title/tt0102138/">JFK: Tatort Dallas</a>,

In [32]:
titles = []
for title in title_list:
    print(title.text)
    
titles_set = {}
titles_set = titles
titles_set

Kevin - Allein zu HausKevin - Allein zu HausKevin - Allein zu HausKevin - Allein zu Haus
GoodFellas - Drei Jahrzehnte in der MafiaGoodFellas - Drei Jahrzehnte in der MafiaGoodFellas - Drei Jahrzehnte in der MafiaGoodFellas - Drei Jahrzehnte in der Mafia
Der Pate 3Der Pate 3Der Pate 3Der Pate 3
Das Schweigen der LämmerDas Schweigen der LämmerDas Schweigen der LämmerDas Schweigen der Lämmer
Total Recall - Die totale ErinnerungTotal Recall - Die totale ErinnerungTotal Recall - Die totale ErinnerungTotal Recall - Die totale Erinnerung
Mein Vetter WinnieMein Vetter WinnieMein Vetter WinnieMein Vetter Winnie
Boyz n the Hood - Jungs im ViertelBoyz n the Hood - Jungs im ViertelBoyz n the Hood - Jungs im ViertelBoyz n the Hood - Jungs im Viertel
Terminator 2: Tag der AbrechnungTerminator 2: Tag der AbrechnungTerminator 2: Tag der AbrechnungTerminator 2: Tag der Abrechnung
Der Duft der FrauenDer Duft der FrauenDer Duft der FrauenDer Duft der Frauen
Der letzte MohikanerDer letzte MohikanerDer let

[]

In [18]:
# Paste the Selector from the first movie title copied from Chrome Dev Tools
#soup.select("#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content ")

# Trim the selection: now it grabs all the titles
soup.select("div.lister-item-content > p:nth-child(4)")

[<p class="text-muted">
     An eight-year-old troublemaker must protect his house from a pair of burglars when he is accidentally left home alone by his family during Christmas vacation.</p>,
 <p class="text-muted">
     The story of <a href="/name/nm1453737">Henry Hill</a> and his life in the mob, covering his relationship with his wife Karen Hill and his mob partners Jimmy Conway and Tommy DeVito in the Italian-American crime syndicate.</p>,
 <p class="text-muted">
     Follows Michael Corleone, now in his 60s, as he seeks to free his family from crime and find a suitable successor to his empire.</p>,
 <p class="text-muted">
     A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.</p>,
 <p class="text-muted">
     When a man goes for virtual vacation memories of the planet Mars, an unexpected and harrowing series of events forces him to go to the planet for real - or is he

One of the ugliest things about the code above is that the HTML element containing the synopsis does not have any combination of tag and attribute that makes it unique. We've had to use select("p:nth-child(4)") and simply grab the 4th <p> element. Not very elegant... potentially will break... but, for now, it works.

We have noticed how both the title and the synopsis are children of div.lister-item-content. That will make our looping task a bit simpler.

There are many approaches to do this. The one we'll follow is:

Loop through the pages we collected, parse them ("create the soup") and store the parsed pages in a list.
For each parsed page, select the "blocks of HTML elements" that contain all the information of each movie (the title, the synopsis and other stuff).
For each one of the "blocks" we collected in the previous step:
Get the movie titles and store them in a list
Get the synopsis and store them in a list
Here's the code that does that:

In [21]:
pages_parsed = []
titles = []
synopsis = []

for i in range(len(pages)):
    # parse all pages
    pages_parsed.append(BeautifulSoup(pages[i].content, "html.parser"))
    # select only the info about the movies
    movies_html = pages_parsed[i].select("div.lister-item-content")
    # for movie, store titles and reviews into lists
    for j in range(len(movies_html)):
        titles.append(movies_html[j].select("h3 > a")[0].get_text())
        synopsis.append(movies_html[j].select("p:nth-child(4)")[0].get_text().strip())

In [28]:
content = []

for i in iterations:
    # assemble the url:
    start_at= str(i)
    url = "https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=" + start_at + "&ref_=adv_nxt"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    soup = BeautifulSoup(response.content, "html.parser")
    content.append(soup)

    # respectful nap:
    wait_time = randint(1,4)
    print("I will sleep for " + str(wait_time) + " second/s.")
    time.sleep(wait_time)

Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 3 second/s.


In [57]:
film_title = []
for page in content:
    film_title.extend(page.select("div.lister-item-content > h3 > a"))

In [58]:
#content[1].select("div.lister-item-content > h3 > a")

In [59]:
len(film_title)

631

In [90]:
long_list = []
for film in film_title:
    #print(film.text)
    long_list.append(film.text)

In [91]:
#long_list

In [73]:
synopsis = []
for page in content:
    synopsis.extend(page.select("p:nth-child(4)"))

In [72]:
content[0].select("p:nth-child(4)")

#select("p:nth-child(4)").get_text().strip())

[<p class="text-muted">
     An eight-year-old troublemaker must protect his house from a pair of burglars when he is accidentally left home alone by his family during Christmas vacation.</p>,
 <p class="text-muted">
     The story of <a href="/name/nm1453737">Henry Hill</a> and his life in the mob, covering his relationship with his wife Karen Hill and his mob partners Jimmy Conway and Tommy DeVito in the Italian-American crime syndicate.</p>,
 <p class="text-muted">
     Follows Michael Corleone, now in his 60s, as he seeks to free his family from crime and find a suitable successor to his empire.</p>,
 <p class="text-muted">
     A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.</p>,
 <p class="text-muted">
     When a man goes for virtual vacation memories of the planet Mars, an unexpected and harrowing series of events forces him to go to the planet for real - or is he

In [75]:
len(synopsis)

631

In [83]:
synopsis_list = []
for thing in synopsis:
    synopsis_list.append((thing.text).strip('\n '))

In [84]:
import pandas as pd

all = pd.DataFrame()

In [85]:
all['FiltTitle'] = long_list

In [86]:
all['Synopsis'] = synopsis_list

In [95]:
all.head(25)

Unnamed: 0,FiltTitle,Synopsis
0,Kevin - Allein zu Haus,An eight-year-old troublemaker must protect hi...
1,GoodFellas - Drei Jahrzehnte in der Mafia,The story of Henry Hill and his life in the mo...
2,Der Pate 3,"Follows Michael Corleone, now in his 60s, as h..."
3,Das Schweigen der Lämmer,A young F.B.I. cadet must receive the help of ...
4,Total Recall - Die totale Erinnerung,When a man goes for virtual vacation memories ...
5,Mein Vetter Winnie,Two New Yorkers accused of murder in rural Ala...
6,Boyz n the Hood - Jungs im Viertel,Follows the lives of three young males living ...
7,Terminator 2: Tag der Abrechnung,"A cyborg, identical to the one who failed to k..."
8,Der Duft der Frauen,A prep school student needing money agrees to ...
9,Der letzte Mohikaner,Three trappers protect the daughters of a Brit...


In [97]:
all.tail(25)

Unnamed: 0,FiltTitle,Synopsis
606,Razluchnitsa,Add a Plot
607,Igra na milliony,Two gangs of professional racketeers are engag...
608,Golmaal Radhakrishna,"Radhakrishna, a player, dates two women simult..."
609,État critique,Long métrage documentaire sur les critiques cu...
610,Feldberg,Add a Plot
611,Curious George,Add a Plot
612,Haladhar,Add a Plot
613,"Märkische Heide, märkischer Sand",Add a Plot
614,Lina,Add a Plot
615,"What Ignites Me, Extinguishes Me",Add a Plot
