In [1]:
from imdb import IMDb
import re
import pandas as pd
import os
from downloader import download_file_from_google_drive as dl
from IPython.display import clear_output

### Get list of movies to scrape info for

In [2]:
# Google Drive id of the dataset and the local path its copy is stored under.
fileid = '1hCCn31z4HM4IzQi59DP-vvUpYKhlvo2S'
datadir = 'data'
fname = os.path.join(datadir, 'data.csv')

# The download target folder has to exist before anything is written into it.
have_datadir = os.path.exists(datadir)
if not have_datadir:
    os.makedirs(datadir)

# Reuse the local copy when there is one; otherwise fetch it.
if os.path.exists(fname):
    print('Data found!')
else:
    print('Fetching file from the interwebz!')
    dl(fileid, fname)

Data found!


In [3]:
# Load the downloaded CSV; its `title` column supplies the movies to scrape.
data = pd.read_csv(fname)

In [4]:
# Preview the first rows to check the file was parsed as expected.
data.head()

Unnamed: 0,id,title,url,writers,genres,script
0,1,10 Things I Hate About You,http://www.imsdb.com/scripts/10-Things-I-Hate-...,Karen McCullah Lutz;Kirsten Smith;William Shak...,Comedy;Romance,\n\n\n<b><!--\n</b>if (window!= top)\ntop.loca...
1,2,12,http://www.imsdb.com/scripts/12.html,Lawrence Bridges,Comedy,\n\n \n 12 - Script\n
2,3,12 and Holding,http://www.imsdb.com/scripts/12-and-Holding.html,Anthony Cipriano,Drama,\n \n \n ...
3,4,12 Monkeys,http://www.imsdb.com/scripts/12-Monkeys.html,David Peoples;Janet Peoples,Drama;Sci-Fi;Thriller,\n\n\n<b><!--\n</b>if (window!= top)\ntop.loca...
4,5,12 Years a Slave,http://www.imsdb.com/scripts/12-Years-a-Slave....,John Ridley,Drama,\r\n\r\n\r\n 12 YE...


### Define useful functions

In [5]:
def get_movie_info(title, infocat):
    """Look up ``title`` on IMDb and load the requested info sets for the best match.

    Parameters
    ----------
    title : str
        Movie title to search for.
    infocat : list of str
        Info-set names passed to ``IMDb.update`` through its ``info`` argument.

    Returns
    -------
    The first search result, updated in place with ``infocat`` when the update
    succeeds (it is returned un-updated when the update fails). An empty list
    is returned when the search yields no usable result, so callers can test
    ``len(result) > 0``.
    """
    i = IMDb()
    # The search returns movies with the same or a similar title; the first
    # entry is taken as the best match.
    try:
        movie = i.search_movie(title)[0]
    except Exception:
        # Covers an empty result list (IndexError) as well as a failed lookup.
        print('Error: did not find movie for ' + title + '!')
        # There is nothing to update, so return the empty placeholder right
        # away instead of issuing a second call that can only fail and print
        # a second, misleading message.
        return []
    # Only the important information is retrieved by default, so the requested
    # info sets have to be fetched explicitly.
    try:
        i.update(movie, info=infocat)
    except Exception:
        print('Error: did not find movie info for ' + title + '!')
    return movie

In [6]:
def _as_native_text(value):
    """Return ``value`` as the interpreter's native ``str`` where possible.

    Values that already are ``str`` and values without an ``encode`` method are
    returned unchanged; anything else (``unicode`` on Python 2) is encoded to
    UTF-8.
    """
    if isinstance(value, str) or not hasattr(value, 'encode'):
        return value
    return value.encode('utf-8')


def dic_to_pandas(dictionary, keystowrite, dataframe):
    """Write selected fields of one movie record into ``dataframe`` as a row.

    Parameters
    ----------
    dictionary : mapping
        Record supporting ``dictionary[key]`` lookups; must contain ``'title'``.
    keystowrite : iterable of str
        Keys to copy; each one is used as the column label.
    dataframe : pandas.DataFrame
        Table to fill. The row labelled with the record's title is created, or
        reset if it already exists, and then filled cell by cell.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, modified in place.

    Notes
    -----
    Keys missing from ``dictionary`` are reported on stdout and stored as the
    string ``'NA'``.
    """
    title = _as_native_text(dictionary['title'])
    # Start from a blank row. An explicit object-dtype Series keeps the columns
    # able to hold lists and dicts regardless of pandas' string inference.
    dataframe.loc[title] = pd.Series('', index=dataframe.columns, dtype=object)
    for key in keystowrite:
        try:
            value = dictionary[key]
        except KeyError:
            print(dictionary['title'] + ': ' + key + ' not found')
            value = 'NA'
        else:
            if isinstance(value, list):
                value = [_as_native_text(item) for item in value]
        # `.at` writes exactly one cell (and accepts list/dict values), unlike
        # chained `dataframe[key][title] = ...`, which may write to a copy.
        dataframe.at[title, key] = value
    return dataframe

### Define variables of interest

In [7]:
# List every info set the IMDb client can fetch, as plain strings.
i = IMDb()
infocat = [str(x) for x in i.get_movie_infoset()]  # will get all available info
# Function-call form of print: same output on Python 2, and valid on Python 3.
print(infocat)

['airing', 'akas', 'alternate versions', 'awards', 'connections', 'crazy credits', 'critic reviews', 'episodes', 'external reviews', 'external sites', 'faqs', 'full credits', 'goofs', 'keywords', 'locations', 'main', 'misc sites', 'news', 'official sites', 'parents guide', 'photo sites', 'plot', 'quotes', 'release dates', 'release info', 'reviews', 'sound clips', 'soundtrack', 'synopsis', 'taglines', 'technical', 'trivia', 'tv schedule', 'video clips', 'vote details']


In [8]:
# Info sets requested for every movie (a subset of the full list printed above).
infocat = [
    'critic reviews',
    'locations',
    'vote details',
    'keywords',
    'plot',
    'main',
]

In [9]:
# Fields copied from each fetched movie; they also serve as the result columns.
infolist = [
    'metascore',
    'locations',
    'arithmetic mean',
    'median',
    'number of votes',
    'keywords',
    'plot',
    'rating',
    'year',
    'votes',
    'title',
    'genres',
    'original air date',
    'box office',
]

### Test scraping on a toy example

In [10]:
# Single title used to try the scraping helpers before the full run.
title = 'Alien 3'

In [11]:
# Fetch the best IMDb match for the test title with the selected info sets.
test = get_movie_info(title, infocat)

In [12]:
# Show which keys each requested info set contributed to the fetched record.
print(test.infoset2keys)

{'plot': [u'plot', u'synopsis'], 'critic reviews': [u'metacritic url', u'metascore'], 'locations': [u'locations'], 'vote details': [u'arithmetic mean', u'demographics', u'median', u'number of votes'], 'keywords': [u'keywords'], 'main': [u'production managers', u'rating', u'special effects companies', u'distributors', u'music department', u'runtimes', u'special effects', u'thanks', u'year', u'production companies', u'color info', u'composers', u'costume designers', u'votes', u'visual effects', u'title', u'writer', u'editors', u'languages', u'cinematographers', u'writers', u'camera department', u'certificates', u'country codes', u'language codes', u'cover url', u'director', u'casting department', u'editorial department', u'assistant directors', u'sound mix', u'location management', u'genres', u'miscellaneous', u'producers', u'animation department', u'set decorators', u'original air date', u'costume departmen', u'akas', u'aspect ratio', u'sound department', u'stunts', u'kind', u'make up d

In [13]:
# Empty table with one column per field of interest.
df = pd.DataFrame(columns=infolist)

In [14]:
# Write the test record into the frame (it is modified in place), then inspect the row.
dic_to_pandas(test, infolist, df)
#pd.DataFrame.to_csv(df, 'test.csv')
df.head()

Unnamed: 0,metascore,locations,arithmetic mean,median,number of votes,keywords,plot,rating,year,votes,title,genres,original air date,box office
Alien³,59,"[Blyth Power Station, Northumberland, England,...",6.3,7,"{1: 8042, 2: 6221, 3: 8420, 4: 13436, 5: 28611...","[alien, prison, android, cryogenics, cult-film...","[After her last encounter, Ellen Ripley crash-...",6.5,1992,257615,Alien³,"[Action, Horror, Sci-Fi]",22 May 1992 (USA),"{u'Opening Weekend United States': u'$23,141,1..."


### Scrape movies

In [15]:
# Start the full run from an empty results table and the complete title column.
df = pd.DataFrame(columns=infolist)
titles = data['title']

In [16]:
# Number of titles the scraping loop will go through.
len(titles)

1118

In [17]:
# Titles for which no IMDb match came back. They are collected here because
# the error messages printed during the run are wiped by `clear_output`.
failed_titles = []

# `position` is 1-based and deliberately not called `i`, so the loop does not
# overwrite the IMDb handle created earlier under that name.
for position, title in enumerate(titles, 1):
    print('currently scraping ' + title)
    fetchedinfo = get_movie_info(title, infocat)
    if len(fetchedinfo) > 0:
        df = dic_to_pandas(fetchedinfo, infolist, df)
        clear_output(wait=True)
        # Floor division keeps the percentage an integer on Python 2 and 3 alike.
        print(str(position * 100 // len(titles)) + '% completed')
    else:
        failed_titles.append(title)

if failed_titles:
    print(str(len(failed_titles)) + ' title(s) could not be scraped: '
          + ', '.join(failed_titles))

100% completed


In [20]:
# Preview the first scraped rows.
df.head()

Unnamed: 0,metascore,locations,arithmetic mean,median,number of votes,keywords,plot,rating,year,votes,title,genres,original air date,box office
10 Things I Hate About You,70,"[Stadium High School - 111 N. E Street, Tacoma...",7.5,7,"{1: 1652, 2: 1429, 3: 2456, 4: 5244, 5: 14191,...","[protective-father, dating, shakespeare-adapta...","[A pretty, popular teenager can't go out on a ...",7.3,1999,272380,10 Things I Hate About You,"[Comedy, Drama, Romance]",31 Mar 1999 (USA),"{u'Opening Weekend United States': u'$8,330,68..."
Twelve,22,"[New York City, New York, USA, Studio City, Lo...",6.0,6,"{1: 517, 2: 382, 3: 611, 4: 1050, 5: 1967, 6: ...","[younger-version-of-character, murder, generat...",[A young drug dealer watches as his high-rolli...,5.7,2010,11574,Twelve,"[Action, Crime, Drama, Thriller]",06 Aug 2010 (USA),"{u'Opening Weekend United States': u'$110,238,..."
12 and Holding,65,"[Rochelle Park, New Jersey, USA, Haledon, New ...",7.7,8,"{1: 79, 2: 31, 3: 56, 4: 69, 5: 226, 6: 610, 7...","[male-objectification, shirtless-wood-chopping...",[After his twin brother is accidentally killed...,7.5,2005,6510,12 and Holding,[Drama],07 Jul 2006 (Sweden),"{u'Opening Weekend United States': u'$11,456, ..."
Twelve Monkeys,74,[Eastern State Penitentiary - 2124 Fairmont Av...,8.0,8,"{1: 4086, 2: 2108, 3: 3096, 4: 5573, 5: 12495,...","[time-travel, virus, mental-institution, under...","[In a future world devastated by disease, a co...",8.0,1995,542653,Twelve Monkeys,"[Mystery, Sci-Fi, Thriller]",05 Jan 1996 (USA),"{u'Opening Weekend United States': u'$14,200,0..."
12 Years a Slave,96,"[Madame John's Legacy, the French Quarter, New...",8.1,8,"{1: 5391, 2: 2109, 3: 2797, 4: 4659, 5: 11224,...","[torture, kidnapping, racism, whipping, slaver...","[In the antebellum United States, Solomon Nort...",8.1,2013,582893,12 Years a Slave,"[Biography, Drama, History]",08 Nov 2013 (USA),"{u'Opening Weekend United States': u'$923,715,..."


In [21]:
# Persist the scraped table as UTF-8 encoded CSV.
df.to_csv('IMDBscraped.csv', encoding='utf-8')