# Collecting Data From IMDB

#### This notebook uses Beautiful Soup to scrape a list of films from IMDb advanced search results.
The search criteria are:
animated feature films, released 2001 to 2021, runtime >= 40 minutes

In [1]:
import re
import time

import bs4
import imdb
import requests
from bs4 import BeautifulSoup
from imdb import Cinemagoer

In [2]:
# Shared Cinemagoer client; later cells use it for ia.get_movie(...) and
# ia.get_movie_awards(...) lookups.
ia = Cinemagoer()

In [3]:
# First page of the IMDb advanced title search. Filters encoded in the query string:
#   title_type=feature, release_date=2001-01-01..2021-12-31, genres=animation,
#   runtime=40, (lower bound only), view=simple, sort=release_date ascending.
# Later pages are requested by appending '&start=<n>' to this url.
base_url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2001-01-01,2021-12-31&genres=animation&runtime=40,&view=simple&sort=release_date,asc'

In [4]:
# For a given IMDb search-results page url, this function gathers the links to the listed films and returns a list of
# film links of the form '/title/tt#######/'

def gather_id_urls(url):
    """Collect the unique film links found on one IMDb search-results page.

    Parameters
    ----------
    url : str
        Address of the search-results page to download.

    Returns
    -------
    list of str
        Every distinct ``href`` on the page that contains ``'/title/tt'``,
        in order of first appearance.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be parsed and quietly yield an empty or partial list.
    """
    r = requests.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    id_urls = []
    seen = set()  # constant-time duplicate check; the list preserves page order
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/title/tt' in href and href not in seen:
            seen.add(href)
            id_urls.append(href)
    return id_urls

In [5]:
# given a list of imdb ids of the form '/title/tt######/' returns id of the form ######

def clean_ids_from_urls(id_urls):
    """Extract the numeric IMDb id from each title link.

    Parameters
    ----------
    id_urls : iterable of str
        Links containing ``/title/tt<digits>``, e.g. ``'/title/tt0388130/'``.
        A missing trailing slash, a trailing query string or a host prefix
        are all tolerated.

    Returns
    -------
    list of str
        The digits that follow ``tt`` for each link, in input order
        (``'/title/tt0388130/'`` -> ``'0388130'``). Leading zeros are kept.

    Raises
    ------
    ValueError
        If a link contains no ``/title/tt<digits>`` segment. A fixed-position
        slice would silently return a meaningless string in that case.
    """
    title_id = re.compile(r'/title/tt(\d+)')
    imdb_ids = []
    for id_url in id_urls:
        match = title_id.search(id_url)
        if match is None:
            raise ValueError(f'no IMDb title id found in {id_url!r}')
        imdb_ids.append(match.group(1))
    return imdb_ids

In [6]:
# Page 1 of the search results comes straight from the base url.
id_urls = gather_id_urls(base_url)

# Each later page is addressed by the rank of its first film: 51, 101, ..., 3851,
# which covers results 51 through 3900 in groups of 50. Film 3901 is not included;
# leaving it out seems reasonable given the lack of information about that film.

for first_rank in range(51, 3901, 50):
    id_urls += gather_id_urls(f'{base_url}&start={first_rank}&ref_=adv_nxtpage')

In [7]:
# sanity check, 3900 films is expected length
len(id_urls)

3900

In [8]:
# Reduce the scraped links to bare IMDb id strings: one entry per animated film
# from 2001 to 2021 with a runtime of at least 40 mins. These ids are what
# ia.get_movie(...) is called with in later cells.

imdb_ids = clean_ids_from_urls(id_urls)

In [9]:
# sanity check, last film in list is "Marriage on Fire"

ia.get_movie(imdb_ids[-1])

<Movie id:15764078[http] title:_Marriage on Fire (2021)_>

In [269]:
# Manually add one film id that the scraped list does not contain.
# NOTE(review): presumably this is the 3901st search result that the paging loop
# leaves out — confirm.
# Guarded so that re-running this cell cannot append the same id twice.
if '8430054' not in imdb_ids:
    imdb_ids.append('8430054')

In [270]:
len(imdb_ids)

3901

In [121]:
import time 

In [230]:
# takes an imdb movie id and returns a tuple of lists containing the movie director(s) and producers

def get_directors_producers_for_movies(imdb_id):
    """Look up one film and return the person ids of its directors and producers.

    The film is fetched with ``ia.get_movie(imdb_id)``. The result is a 2-tuple
    ``(director_ids, producer_ids)``; each element is a list of ``personID``
    values and is empty when the film record has no such key.
    """
    movie = ia.get_movie(imdb_id)
    available = movie.keys()

    def person_ids(role):
        # A credit that is absent from the record maps to an empty list.
        if role not in available:
            return []
        return [person.personID for person in movie[role]]

    return person_ids('director'), person_ids('producer')

In [123]:
# takes a list of imdb movie ids and returns a list of tuples containing the movies director(s) and producer(s)

def get_directors_producers(imdb_ids, delay=2):
    """Fetch director and producer ids for every film in ``imdb_ids``.

    Parameters
    ----------
    imdb_ids : iterable of str
        IMDb movie ids, each passed to ``get_directors_producers_for_movies``.
    delay : int or float, optional
        Seconds to pause before each lookup (default 2). Pass 0 to disable.

    Returns
    -------
    list of tuple
        One ``(director_ids, producer_ids)`` tuple per input id, in input order.
        An empty input returns an empty list without sleeping.
    """
    results = []
    for imdb_id in imdb_ids:
        # Pause before every lookup. A single sleep ahead of the whole batch
        # does not space the individual requests out at all.
        time.sleep(delay)
        results.append(get_directors_producers_for_movies(imdb_id))
    return results

In [233]:
# Fetch the director/producer ids for every film in imdb_ids.
# NOTE(review): the saved output of this cell is an IMDbDataAccessError
# (URLError, 'getaddrinfo failed'), so that run aborted before
# directors_and_producers was assigned.
directors_and_producers = get_directors_producers(imdb_ids)

2022-05-22 16:06:09,484 CRITICAL [imdbpy] C:\Users\jacly\anaconda3\envs\Erdos\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt2243275/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': URLError(gaierror(11001, 'getaddrinfo failed'))},); kwds: {}
Traceback (most recent call last):
  File "C:\Users\jacly\anaconda3\envs\Erdos\lib\urllib\request.py", line 1346, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "C:\Users\jacly\anaconda3\envs\Erdos\lib\http\client.py", line 1285, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\Users\jacly\anaconda3\envs\Erdos\lib\http\client.py", line 1331, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\Users\jacly\anaconda3\envs\Erdos\lib\http\client.py", line 1280, in endheaders
    self._send_output(message_b

IMDbDataAccessError: {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt2243275/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': URLError(gaierror(11001, 'getaddrinfo failed'))}

In [237]:
'8430054' in imdb_ids

False

In [248]:
ia.get_movie_awards('0198781')

{'data': {'awards': [{'award': 'Oscar',
    'year': 2002,
    'result': 'Winner',
    'category': 'Academy Awards, USA',
    'notes': 'Best Music, Original Song',
    'to': [<Person id:0005271[http] name:_Randy Newman_>]},
   {'award': 'Oscar',
    'year': 2002,
    'result': 'Nominee',
    'category': 'Academy Awards, USA',
    'notes': 'Best Music, Original Score',
    'to': [<Person id:0005271[http] name:_Randy Newman_>]},
   {'award': 'Oscar',
    'year': 2002,
    'result': 'Nominee',
    'category': 'Academy Awards, USA',
    'notes': 'Best Sound Editing',
    'to': [<Person id:0003977[http] name:_Gary Rydstrom_>,
     <Person id:0799011[http] name:_Michael Silvers_>]},
   {'award': 'Oscar',
    'year': 2002,
    'result': 'Nominee',
    'category': 'Academy Awards, USA',
    'notes': 'Best Animated Feature',
    'to': [<Person id:0230032[http] name:_Pete Docter_>,
     <Person id:0005124[http] name:_John Lasseter_>]},
   {'award': "BAFTA Children's Award",
    'year': 2002,
    

In [249]:
imdb_ids

['0388130',
 '0273772',
 '0243017',
 '0291559',
 '0277909',
 '0265632',
 '0277955',
 '0166276',
 '0206367',
 '0832449',
 '0306741',
 '0287058',
 '0199898',
 '0831848',
 '1073223',
 '0126029',
 '0363277',
 '0293416',
 '0284394',
 '10956882',
 '0230011',
 '0173840',
 '0287635',
 '0290067',
 '0810831',
 '15864378',
 '0245429',
 '0291046',
 '0291959',
 '0306474',
 '0291350',
 '0310790',
 '0181739',
 '0293854',
 '0275876',
 '0275277',
 '0259929',
 '1612776',
 '0294677',
 '0274919',
 '0298109',
 '2700240',
 '0293849',
 '0367172',
 '0473271',
 '0295152',
 '0296162',
 '0198781',
 '0287547',
 '0405096',
 '0282708',
 '0268397',
 '0318819',
 '0299619',
 '0312941',
 '0322645',
 '0446566',
 '0299040',
 '0306444',
 '0311439',
 '0237687',
 '0296074',
 '0304230',
 '0320282',
 '0280030',
 '0302758',
 '0303318',
 '4130762',
 '4130788',
 '0287335',
 '0308220',
 '0309114',
 '0997084',
 '1147526',
 '0268380',
 '0311618',
 '1171265',
 '0355692',
 '0300576',
 '0814296',
 '0990372',
 '1086280',
 '2180443',
 '

In [273]:
# NOTE(review): no cell visible here assigns `temp` at notebook level (the only
# `temp` is a local inside get_directors_producers_for_movies), so this cell
# appears to rely on leftover kernel state and would presumably raise NameError
# after Restart & Run All. TODO: fetch the intended film explicitly first.
temp['distributors']

[<Company id:0010224[http] name:_20th Century Fox Home Entertainment_>,
 <Company id:0107227[http] name:_The Animation Show_>]