In [1]:
from imdb import IMDb
import re
import pandas as pd
import os
from IPython.display import clear_output

In [2]:
# Directory that holds the input CSV and receives the output CSVs.
datadir = '../data/'
datadir_exists = os.path.exists(datadir)
if not datadir_exists:
    # makedirs also creates any missing parent directories
    os.makedirs(datadir)

In [3]:
# File names of the CSVs read from / written to `datadir`.
datafname = 'data.csv'
resultsfname = 'IMDBscraped.csv'
combinedfname = 'data_wmetrics.csv'

# Log of titles and fields that could not be fetched; opened in "w" mode,
# so it is truncated every time this cell runs.
errorlogfile = open("scrape_imdb_missinginfo.txt", "w")

### Get list of movies to scrape info for

In [4]:
# Load the table whose `title` column drives the IMDb lookups.
datapath = datadir + datafname
data = pd.read_csv(datapath)

In [5]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,id,title,url,writers,genres,script
0,1,10 Things I Hate About You,http://www.imsdb.com/scripts/10-Things-I-Hate-...,Karen McCullah Lutz;Kirsten Smith;William Shak...,Comedy;Romance,\n\n\n<b><!--\n</b>if (window!= top)\ntop.loca...
1,2,12,http://www.imsdb.com/scripts/12.html,Lawrence Bridges,Comedy,\n\n \n 12 - Script\n
2,3,12 and Holding,http://www.imsdb.com/scripts/12-and-Holding.html,Anthony Cipriano,Drama,\n \n \n ...
3,4,12 Monkeys,http://www.imsdb.com/scripts/12-Monkeys.html,David Peoples;Janet Peoples,Drama;Sci-Fi;Thriller,\n\n\n<b><!--\n</b>if (window!= top)\ntop.loca...
4,5,12 Years a Slave,http://www.imsdb.com/scripts/12-Years-a-Slave....,John Ridley,Drama,\r\n\r\n\r\n 12 YE...


### Define useful functions

In [6]:
def get_movie_info(title, infocat, errorlog=errorlogfile):
    """Search IMDb for `title` and fetch the requested info sets for the best match.

    Parameters
    ----------
    title : str
        Movie title to search for.
    infocat : list of str
        Info-set names handed to ``IMDb.update`` (e.g. 'plot', 'main').
    errorlog : file-like
        Object with a ``write`` method; receives one line per failure.
        Defaults to the notebook-wide error log.

    Returns
    -------
    The first search result, updated in place with `infocat`, or an empty
    list when the search raised or returned no results. If only the update
    step fails, the un-updated search result is still returned.
    """
    i = IMDb()
    # search_movie returns a list of movies with the same or a similar title;
    # the first entry is taken as the best match
    try:
        movie = i.search_movie(title)[0]
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C can still stop a long scrape
        errorlog.write(title + ': Did not find movie\n')
        # nothing to update, so return the empty marker straight away
        return []
    # only the important information is retrieved by default, so we need to update:
    try:
        i.update(movie, info=infocat)
    except Exception:
        errorlog.write(title + ': Did not find movie info\n')
    return movie

In [7]:
def dic_to_pandas(dictionary, keystowrite, dataframe, errorlog=errorlogfile):
    """Write one movie into `dataframe` as a row indexed by its title.

    Parameters
    ----------
    dictionary : mapping
        Movie record; must contain 'title' and is indexed with each entry
        of `keystowrite`.
    keystowrite : iterable of str
        Keys to copy; each one is also used as the column label.
    dataframe : pandas.DataFrame
        Frame to fill. It is modified in place.
    errorlog : file-like
        Object with a ``write`` method; receives one line per key that
        could not be copied. Defaults to the notebook-wide error log.

    Returns
    -------
    pandas.DataFrame
        The same `dataframe` object that was passed in.
    """
    # Python 2: the UTF-8 encoded byte string serves as the row label
    title = str(dictionary['title'].encode('utf-8'))
    # create the row (or blank an existing row with the same title)
    dataframe.loc[title] = ''
    for key in keystowrite:
        try:
            value = dictionary[key]
            if isinstance(value, list):
                value = [x.encode('utf-8') for x in value]
        except Exception:
            # best effort: a missing or un-encodable field becomes 'NA'
            errorlog.write(title + ': Did not find ' + key + '\n')
            value = 'NA'
        # single-cell setter; chained `dataframe[key][title] = ...` may write
        # to a temporary copy and leave `dataframe` unchanged
        dataframe.at[title, key] = value
    return dataframe

### Define variables of interest

In [8]:
# List every info set the IMDb access object offers, as plain strings.
i = IMDb()
infocat = [str(x) for x in i.get_movie_infoset()]  # will get all available info
# function-call form of print, matching the other cells; same output for a single argument
print(infocat)

['airing', 'akas', 'alternate versions', 'awards', 'connections', 'crazy credits', 'critic reviews', 'episodes', 'external reviews', 'external sites', 'faqs', 'full credits', 'goofs', 'keywords', 'locations', 'main', 'misc sites', 'news', 'official sites', 'parents guide', 'photo sites', 'plot', 'quotes', 'release dates', 'release info', 'reviews', 'sound clips', 'soundtrack', 'synopsis', 'taglines', 'technical', 'trivia', 'tv schedule', 'video clips', 'vote details']


In [9]:
# Info sets requested from IMDb for each movie.
infocat = [
    'critic reviews',
    'locations',
    'vote details',
    'keywords',
    'plot',
    'main',
]

In [10]:
# Keys copied out of each fetched movie; they also become the column labels
# of the results table.
infolist = [
    'metascore',
    'locations',
    'arithmetic mean',
    'median',
    'number of votes',
    'keywords',
    'plot',
    'rating',
    'year',
    'votes',
    'title',
    'genres',
    'original air date',
    'box office',
]

### Test scraping on a toy example

In [11]:
# Single title used to try the scraping helpers before the full run.
title = 'Alien 3'

In [12]:
# Fetch the toy title with the reduced set of info categories.
test = get_movie_info(title, infocat)

In [13]:
# Show which keys each fetched info set contributed to the movie record.
print(test.infoset2keys)

{'plot': [u'plot', u'synopsis'], 'critic reviews': [u'metacritic url', u'metascore'], 'locations': [u'locations'], 'vote details': [u'arithmetic mean', u'demographics', u'median', u'number of votes'], 'keywords': [u'keywords'], 'main': [u'production managers', u'rating', u'special effects companies', u'distributors', u'music department', u'runtimes', u'special effects', u'thanks', u'year', u'production companies', u'color info', u'composers', u'costume designers', u'votes', u'visual effects', u'title', u'writer', u'editors', u'languages', u'cinematographers', u'writers', u'camera department', u'certificates', u'country codes', u'language codes', u'cover url', u'director', u'casting department', u'editorial department', u'assistant directors', u'sound mix', u'location management', u'genres', u'miscellaneous', u'producers', u'animation department', u'set decorators', u'original air date', u'costume departmen', u'akas', u'aspect ratio', u'sound department', u'stunts', u'kind', u'make up d

In [14]:
# Empty results table with one column per key in `infolist`.
df = pd.DataFrame(columns=infolist)

In [15]:
# Write the toy movie into the empty table, save it, and preview the row.
dic_to_pandas(test, infolist, df)
# built from `datadir` (same '../data/test.csv' location) so the data folder is set in one place
df.to_csv(datadir + 'test.csv', encoding='utf-8')
df.head()

Unnamed: 0,metascore,locations,arithmetic mean,median,number of votes,keywords,plot,rating,year,votes,title,genres,original air date,box office
Alien³,59,"[Blyth Power Station, Northumberland, England,...",6.3,7,"{1: 8042, 2: 6221, 3: 8419, 4: 13436, 5: 28615...","[alien, prison, android, cryogenics, cult-film...","[After her last encounter, Ellen Ripley crash-...",6.5,1992,257624,Alien³,"[Action, Horror, Sci-Fi]",22 May 1992 (USA),"{u'Opening Weekend United States': u'$23,141,1..."


### Scrape movies

In [16]:
# Start the full run from a fresh, empty results table and the titles to look up.
df = pd.DataFrame(columns=infolist)
titles = data['title']

In [17]:
# Number of titles that will be looked up.
len(titles)

1118

In [18]:
# Look up every title and append the movies that were found to `df`.
n_titles = len(titles)
# `idx` instead of `i`, so the loop does not overwrite the IMDb handle named `i`
for idx, title in enumerate(titles):
    print('currently scraping ' + title)
    fetchedinfo = get_movie_info(title, infocat)
    # an empty result means the search failed; the failure is already logged
    if len(fetchedinfo) > 0:
        df = dic_to_pandas(fetchedinfo, infolist, df)
    # refresh the progress line for every title, including the ones not found
    clear_output(wait=True)
    # floor division keeps the percentage an integer on Python 2 and 3 alike
    print(str((idx + 1) * 100 // n_titles) + '% completed')
# write buffered log lines to disk; the handle stays open so cells can be re-run
errorlogfile.flush()

100% completed


In [19]:
# Replace the title row labels (set in dic_to_pandas) with a positional index;
# drop=True discards the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,metascore,locations,arithmetic mean,median,number of votes,keywords,plot,rating,year,votes,title,genres,original air date,box office
0,70,"[Stadium High School - 111 N. E Street, Tacoma...",7.5,7,"{1: 1652, 2: 1429, 3: 2456, 4: 5244, 5: 14191,...","[protective-father, dating, shakespeare-adapta...","[A pretty, popular teenager can't go out on a ...",7.3,1999,272400,10 Things I Hate About You,"[Comedy, Drama, Romance]",31 Mar 1999 (USA),"{u'Opening Weekend United States': u'$8,330,68..."
1,22,"[New York City, New York, USA, Studio City, Lo...",6.0,6,"{1: 517, 2: 382, 3: 611, 4: 1050, 5: 1967, 6: ...","[younger-version-of-character, murder, generat...",[A young drug dealer watches as his high-rolli...,5.7,2010,11574,Twelve,"[Action, Crime, Drama, Thriller]",06 Aug 2010 (USA),"{u'Opening Weekend United States': u'$110,238,..."
2,65,"[Rochelle Park, New Jersey, USA, Haledon, New ...",7.7,8,"{1: 79, 2: 31, 3: 56, 4: 69, 5: 226, 6: 610, 7...","[male-objectification, shirtless-wood-chopping...",[After his twin brother is accidentally killed...,7.5,2005,6510,12 and Holding,[Drama],07 Jul 2006 (Sweden),"{u'Opening Weekend United States': u'$11,456, ..."
3,74,[Eastern State Penitentiary - 2124 Fairmont Av...,8.0,8,"{1: 4087, 2: 2108, 3: 3096, 4: 5576, 5: 12496,...","[time-travel, virus, mental-institution, under...","[In a future world devastated by disease, a co...",8.0,1995,542681,Twelve Monkeys,"[Mystery, Sci-Fi, Thriller]",05 Jan 1996 (USA),"{u'Opening Weekend United States': u'$14,200,0..."
4,96,"[Madame John's Legacy, the French Quarter, New...",8.1,8,"{1: 5391, 2: 2109, 3: 2798, 4: 4659, 5: 11224,...","[torture, kidnapping, racism, whipping, slaver...","[In the antebellum United States, Solomon Nort...",8.1,2013,582921,12 Years a Slave,"[Biography, Drama, History]",08 Nov 2013 (USA),"{u'Opening Weekend United States': u'$923,715,..."


In [20]:
# Save the scraped IMDb fields on their own, without the positional index.
df.to_csv(datadir + resultsfname, index=False, encoding='utf-8')

In [21]:
# Both tables have a 'genres' column; give each a distinct name before merging.
# 'title' keeps its name in both tables because the merge joins on it.
# NOTE(review): "imbd" / "imsbd" look like misspellings of IMDb / IMSDb, but they
# are the column names that end up in the combined CSV - confirm before changing.
df = df.rename(columns={"genres": "genres_imbd"})#, "title": "title_imbd"})
data = data.rename(columns={"genres": "genres_imsbd"})#, "title": "title_imsbd"})

In [22]:
# Join the scraped fields to the script table on the exact title string
# (pd.merge defaults to an inner join).
# NOTE(review): df['title'] holds the title IMDb returned, data['title'] the title
# from the script list. Rows where the two differ (e.g. 'Twelve Monkeys' vs
# '12 Monkeys' in the previews) are dropped here - confirm this is intended.
result = pd.merge(df, data, on='title')
result.head()

Unnamed: 0,metascore,locations,arithmetic mean,median,number of votes,keywords,plot,rating,year,votes,title,genres_imbd,original air date,box office,id,url,writers,genres_imsbd,script
0,70,"[Stadium High School - 111 N. E Street, Tacoma...",7.5,7,"{1: 1652, 2: 1429, 3: 2456, 4: 5244, 5: 14191,...","[protective-father, dating, shakespeare-adapta...","[A pretty, popular teenager can't go out on a ...",7.3,1999,272400,10 Things I Hate About You,"[Comedy, Drama, Romance]",31 Mar 1999 (USA),"{u'Opening Weekend United States': u'$8,330,68...",1,http://www.imsdb.com/scripts/10-Things-I-Hate-...,Karen McCullah Lutz;Kirsten Smith;William Shak...,Comedy;Romance,\n\n\n<b><!--\n</b>if (window!= top)\ntop.loca...
1,65,"[Rochelle Park, New Jersey, USA, Haledon, New ...",7.7,8,"{1: 79, 2: 31, 3: 56, 4: 69, 5: 226, 6: 610, 7...","[male-objectification, shirtless-wood-chopping...",[After his twin brother is accidentally killed...,7.5,2005,6510,12 and Holding,[Drama],07 Jul 2006 (Sweden),"{u'Opening Weekend United States': u'$11,456, ...",3,http://www.imsdb.com/scripts/12-and-Holding.html,Anthony Cipriano,Drama,\n \n \n ...
2,96,"[Madame John's Legacy, the French Quarter, New...",8.1,8,"{1: 5391, 2: 2109, 3: 2798, 4: 4659, 5: 11224,...","[torture, kidnapping, racism, whipping, slaver...","[In the antebellum United States, Solomon Nort...",8.1,2013,582921,12 Years a Slave,"[Biography, Drama, History]",08 Nov 2013 (USA),"{u'Opening Weekend United States': u'$923,715,...",5,http://www.imsdb.com/scripts/12-Years-a-Slave....,John Ridley,Drama,\r\n\r\n\r\n 12 YE...
3,82,"[Moab, Utah, USA, Utah, USA, USA]",7.6,8,"{1: 2231, 2: 1258, 3: 2112, 4: 4158, 5: 10471,...","[survival, alone, based-on-autobiography, aron...",[An adventurous mountain climber becomes trapp...,7.6,2010,326676,127 Hours,"[Biography, Drama]",28 Jan 2011 (USA),"{u'Opening Weekend United States': u'$264,851,...",6,http://www.imsdb.com/scripts/127-Hours.html,Simon Beaufoy;Danny Boyle,Adventure;Drama;Thriller,\r\n\r\n \r\n ...
4,47,"[Playa Herradura, Puntarenas, Costa Rica, Pine...",6.5,7,"{1: 534, 2: 283, 3: 598, 4: 1268, 5: 3011, 6: ...","[new-world, renaissance, year-1492, caribbean,...",[Christopher Columbus' discovery of the Americ...,6.5,1992,26326,1492: Conquest of Paradise,"[Adventure, Biography, Drama, History]",09 Oct 1992 (USA),"{u'Budget': u'$47,000,000 (estimated)', u'Cumu...",7,http://www.imsdb.com/scripts/1492-Conquest-of-...,Roslyne Bosch,Adventure;Drama,\n\n\n<b><!--\n</b>if (window!= top)\ntop.loca...


In [23]:
# Save the merged table (scraped fields plus script data), without the index.
result.to_csv(datadir + combinedfname, index=False, encoding='utf-8')