# Collecting Data From IMDB

#### This Notebook uses beautiful soup to scrape a list of films from IMDb advanced search.
Search terms are:
animated films, time 2001 to 2021, runtime >= 40 mins 

In [1]:
import imdb
print(imdb.__version__)

2022.04.25


In [2]:
from imdb import Cinemagoer

In [3]:
ia = Cinemagoer()

In [4]:
ia.search_movie(title='Shrek')

[<Movie id:0126029[http] title:_Shrek (2001)_>,
 <Movie id:2460164[http] title:_Shrek (2001) (VG)_>,
 <Movie id:2828044[http] title:_"VIP People" Shrek (2012)_>,
 <Movie id:0298148[http] title:_Shrek 2 (2004)_>,
 <Movie id:0413267[http] title:_Shrek the Third (2007)_>,
 <Movie id:0892791[http] title:_Shrek Forever After (2010)_>,
 <Movie id:3070936[http] title:_Shrek the Musical (2013)_>,
 <Movie id:9334162[http] title:_Shrek Retold (2018)_>,
 <Movie id:10167324[http] title:_Shrek is Love, Shrek is Life (2014) (V)_>,
 <Movie id:0897387[http] title:_Shrek the Halls (2007)_>,
 <Movie id:6950338[http] title:_"DreamWorks Shrek's Swamp Stories" (2010)_>,
 <Movie id:0360985[http] title:_Shrek: The Ghost of Lord Farquaad (2003) (V)_>,
 <Movie id:15040192[http] title:_Shrek5 (2021) (V)_>,
 <Movie id:6113186[http] title:_Untitled Shrek Reboot (None)_>,
 <Movie id:2486724[http] title:_Shrek's Thrilling Tales (2012) (V)_>,
 <Movie id:10524954[http] title:_"TablePop" Shrek (2019)_>,
 <Movie id:095

In [5]:
ia.get_movie_list('ls027345371')

[<Movie id:0245429[http] title:_Spirited Away (2001)_>,
 <Movie id:2948372[http] title:_Soul (2020)_>,
 <Movie id:2948356[http] title:_Zootopia (2016)_>,
 <Movie id:2096673[http] title:_Inside Out (2015)_>,
 <Movie id:0910970[http] title:_WALL·E (2008)_>,
 <Movie id:0435761[http] title:_Toy Story 3 (2010)_>,
 <Movie id:1979376[http] title:_Toy Story 4 (2019)_>,
 <Movie id:1049413[http] title:_Up (2009)_>,
 <Movie id:2380307[http] title:_Coco (2017)_>,
 <Movie id:8097030[http] title:_Turning Red (2022)_>,
 <Movie id:2262227[http] title:_The Book of Life (2014)_>,
 <Movie id:2953050[http] title:_Encanto (2021)_>,
 <Movie id:0317705[http] title:_The Incredibles (2004)_>,
 <Movie id:3606756[http] title:_Incredibles 2 (2018)_>,
 <Movie id:7979580[http] title:_The Mitchells vs the Machines (2021)_>,
 <Movie id:4633694[http] title:_Spider-Man: Into the Spider-Verse (2018)_>,
 <Movie id:0382932[http] title:_Ratatouille (2007)_>,
 <Movie id:3521164[http] title:_Moana (2016)_>,
 <Movie id:510928

In [6]:
ia.get_movie_critic_reviews('0293416')

{'data': {'metascore': '75',
  'metacritic url': 'https://www.metacritic.com/movie/metropolis-2002?ftag=MCD-06-10aaa1c'},
 'titlesRefs': {},
 'namesRefs': {}}

In [7]:
import requests

In [8]:
print(requests.__version__)

2.27.1


In [9]:
import bs4
from bs4 import BeautifulSoup

In [10]:
base_url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2001-01-01,2021-12-31&genres=animation&runtime=40,&view=simple&sort=release_date,asc'

In [11]:
# for a given page url on imdb, this function will gather all the links for the searched films and return a list of 
# film ids of the form '/titls/tt########'

def gather_id_urls(url):
    id_urls = []
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    movies= []
    for a in soup.find_all('a', href=True):
        if ('/title/tt' in a['href']) & (a['href'] not in id_urls):
            id_urls.append(a['href'])    
    return id_urls

In [12]:
# the first page of search results

id_urls = gather_id_urls(base_url)

In [13]:
# this is what the results look like

print(id_urls)

['/title/tt0388130/', '/title/tt0273772/', '/title/tt0243017/', '/title/tt0291559/', '/title/tt0277909/', '/title/tt0265632/', '/title/tt0277955/', '/title/tt0166276/', '/title/tt0206367/', '/title/tt0832449/', '/title/tt0306741/', '/title/tt0287058/', '/title/tt0199898/', '/title/tt0831848/', '/title/tt1073223/', '/title/tt0126029/', '/title/tt0363277/', '/title/tt0293416/', '/title/tt0284394/', '/title/tt10956882/', '/title/tt0230011/', '/title/tt0173840/', '/title/tt0287635/', '/title/tt0290067/', '/title/tt0810831/', '/title/tt15864378/', '/title/tt0245429/', '/title/tt0291046/', '/title/tt0291959/', '/title/tt0306474/', '/title/tt0291350/', '/title/tt0310790/', '/title/tt0181739/', '/title/tt0293854/', '/title/tt0275876/', '/title/tt0275277/', '/title/tt0259929/', '/title/tt1612776/', '/title/tt0294677/', '/title/tt0274919/', '/title/tt0298109/', '/title/tt2700240/', '/title/tt0293849/', '/title/tt0367172/', '/title/tt0473271/', '/title/tt0295152/', '/title/tt0296162/', '/title/tt

In [None]:
# search results between 51 and 3900 in groups of 50 note that film 3901 is not included but 
# does it seems reasonable that it is excluded due to the lack of information about the film.

for page in range(51, 3901, 50):
    url_next = '&start=' + str(page) + '&ref_=adv_nxtpage'
    url = base_url + url_next
    id_urls.extend(gather_id_urls(url))

In [112]:
# sanity check, 3900 films is expected length
len(id_urls)

3900

In [137]:
# this is a fair number of movies but spot checks look correct

id_urls

['/title/tt0388130/',
 '/title/tt0273772/',
 '/title/tt0243017/',
 '/title/tt0291559/',
 '/title/tt0277909/',
 '/title/tt0265632/',
 '/title/tt0277955/',
 '/title/tt0166276/',
 '/title/tt0206367/',
 '/title/tt0832449/',
 '/title/tt0306741/',
 '/title/tt0287058/',
 '/title/tt0199898/',
 '/title/tt0831848/',
 '/title/tt1073223/',
 '/title/tt0126029/',
 '/title/tt0363277/',
 '/title/tt0293416/',
 '/title/tt0284394/',
 '/title/tt10956882/',
 '/title/tt0230011/',
 '/title/tt0173840/',
 '/title/tt0287635/',
 '/title/tt0290067/',
 '/title/tt0810831/',
 '/title/tt15864378/',
 '/title/tt0245429/',
 '/title/tt0291046/',
 '/title/tt0291959/',
 '/title/tt0306474/',
 '/title/tt0291350/',
 '/title/tt0310790/',
 '/title/tt0181739/',
 '/title/tt0293854/',
 '/title/tt0275876/',
 '/title/tt0275277/',
 '/title/tt0259929/',
 '/title/tt1612776/',
 '/title/tt0294677/',
 '/title/tt0274919/',
 '/title/tt0298109/',
 '/title/tt2700240/',
 '/title/tt0293849/',
 '/title/tt0367172/',
 '/title/tt0473271/',
 '/title

In [138]:
#cleans the begining '/title/tt' and the ending '/' from the ids

imdb_ids = []
for ids in id_urls:
    imdb_ids.append(ids[9:-1])

print(imdb_ids)

['0388130', '0273772', '0243017', '0291559', '0277909', '0265632', '0277955', '0166276', '0206367', '0832449', '0306741', '0287058', '0199898', '0831848', '1073223', '0126029', '0363277', '0293416', '0284394', '10956882', '0230011', '0173840', '0287635', '0290067', '0810831', '15864378', '0245429', '0291046', '0291959', '0306474', '0291350', '0310790', '0181739', '0293854', '0275876', '0275277', '0259929', '1612776', '0294677', '0274919', '0298109', '2700240', '0293849', '0367172', '0473271', '0295152', '0296162', '0198781', '0287547', '0405096', '0282708', '0268397', '0318819', '0299619', '0312941', '0322645', '0446566', '0299040', '0306444', '0311439', '0237687', '0296074', '0304230', '0320282', '0280030', '0302758', '0303318', '4130762', '4130788', '0287335', '0308220', '0309114', '0997084', '1147526', '0268380', '0311618', '1171265', '0355692', '0300576', '0814296', '0990372', '1086280', '2180443', '11644204', '0321566', '0166813', '0299878', '0314166', '0275847', '0328327', '02894

In [141]:
# sanity check, last film in list is "Marriage on Fire"

ia.get_movie('0277955')

<Movie id:0277955[http] title:_Pic-Pic, André et leurs amis (2001)_>

## IMDb ids to Film Titles

In [143]:
str(imdb_ids[6])
ia.get_movie(str(imdb_ids[6]))

<Movie id:0277955[http] title:_Pic-Pic, André et leurs amis (2001)_>

In [160]:
animated_titles = []
for imdb_id in imdb_ids:
    animated_titles.append(ia.get_movie(str(imdb_id)))
    print(animated_titles[-1])

Initial D: Third Stage
Don't Make Trouble!
Waking Life
Wave Twisters
Mutant Aliens
Recess: School's Out
Pic-Pic, André et leurs amis
Monkeybone
The Trumpet of the Swan
One Piece: Clockwork Island Adventure
Doraemon: Nobita and the Winged Braves
Tootletubs & Jyro
Old Master Q 2001
Shin Chan: The Adult Empire Strikes Back
Detective Conan: Countdown to Heaven
Shrek
Malice@Doll
Metropolis
Petit Potam
Mahadena Muththai Golayo Roththai
Atlantis: The Lost Empire
Final Fantasy: The Spirits Within
Pokemon 4Ever: Celebi - Voice of the Forest
Serafín: La película
Digimon Tamers: Battle of Adventurers
Sore Ike! Anpanman: Gomira no hoshi
Spirited Away
Blue Remains
The Happy Cricket from the Amazon
Princess Arete
Millennium Actress
The Living Forest
Osmosis Jones
The Little Bear Movie
10 + 2: The Great Secret
Cowboy Bebop: The Movie
Christmas Carol: The Movie
Short6
Kommando Störtebeker
A szalmabábuk lázadása
A Dog Called Pain
Les contes de la mère poule
The Little Polar Bear
Tryumf pana Kleksa
TV-m

Hanuman
El color de los sentidos
The Three Musketeers
Chicken Little
Lunacy
Disaster!
El guerrero sin nombre
Pettson & Findus - Tomtemaskinen
Olentzero and the Magic Log
Stormy Night
Hoodwinked
Black Jack: Two Doctors in Black
Heidi
Xuxinha and Guto Against the Space Monsters
Thru the Moebius Strip
Slipdream
José Martí, ése soy yo
The Complete Works of Yuri Norstein
Anpanman: the Adventure of Happie
25 pärlor - animerade filmer för folk med humor
Children's Favorites Christmas Treasure
PottSau & WahnWitz
Drift
Christmas in New York
Ali Baba and the Forty Thieves: The Lost Scimitar of Arabia
Turma da Mônica: CineGibi 2
Stanley's Dinosaur Round-Up
Origin: Spirits of the Past
McDull, the Alumni
Children of the Moon
Deep Imagination
The Race Begins
Curious George
Happy Tree Friends: Ski Patrol
Happy Tree Friends
Live Freaky Die Freaky
Blood Tea and Red String
Felix 2 - Der Hase und die verflixte Zeitmaschine
Prince Vladimir
Fimfárum 2
Dougal: American Edition
Robin Hood III, Forever Enemie

2022-05-19 11:23:55,291 CRITICAL [imdbpy] C:\Users\jacly\anaconda3\envs\Erdos\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt15757512/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': URLError(ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))},); kwds: {}
Traceback (most recent call last):
  File "C:\Users\jacly\anaconda3\envs\Erdos\lib\urllib\request.py", line 1346, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "C:\Users\jacly\anaconda3\envs\Erdos\lib\http\client.py", line 1285, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\Users\jacly\anaconda3\envs\Erdos\lib\http\client.py", line 1331, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\Users\jacly\anaconda3\envs\Erdos\li

IMDbDataAccessError: {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt15757512/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': URLError(ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))}

In [162]:
len(animated_titles)

382

In [149]:
ia.get_movie('10060')

<Movie id:0010060[http] title:_The Delicious Little Devil (1919)_>

In [159]:
ia.get_movie(str(imdb_ids[166]))

<Movie id:0418867[http] title:_Hammerboy (2003)_>

In [156]:
imdb_ids[165]

'0388473'