In [32]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
# Configure how pandas renders frames: output width, the maximum number of
# columns shown, and the HTML representation inside the notebook.
for option_name, option_value in (
        ('display.width', 500),
        ('display.max_columns', 100),
        ('display.notebook_repr_html', True)):
    pd.set_option(option_name, option_value)
import seaborn as sns
# Plot appearance for the whole notebook: the "whitegrid" style and the
# "poster" context preset.
sns.set_style(style="whitegrid")
sns.set_context(context="poster")

In [33]:
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

In [34]:
# PARAMETERS
# First and last year sent to the IMDb title search; the search URL receives
# them as "year=<YEAR_START>,<YEAR_END>".
YEAR_START = 2014
YEAR_END = 2014
# Value of the "title_type" query parameter in the search URL.
TYPE = "feature"

In [35]:
# Download the first page of the alphabetical title search for the configured
# title type and year range.
search_url = ("http://www.imdb.com/search/title?at=0&sort=alpha"
              "&title_type=%s&year=%s,%s" % (TYPE, YEAR_START, YEAR_END))
page = requests.get(search_url)
# Stop here with a clear HTTP error if the request was rejected, rather than
# failing later with an AttributeError while parsing an error page.
page.raise_for_status()

In [36]:
# Read the total number of search results from the first results page.
soup = BeautifulSoup(page.text, "html.parser")
leftright = soup.find("div", attrs={"class": "leftright"})
summary = leftright.find_all("div", attrs={"id": "left"})[0].get_text()
# The summary text looks like "\n1-50 of 8,476\ntitles.\n": the third
# whitespace-separated token is the total, written with a thousands separator.
total_token = summary.split()[2]
NumResults = int(total_token.replace(',', ''))   # "8,476" -> 8476
print(NumResults)

8997


In [37]:
# Walk the paginated search results (the start index advances by 50 per
# request) and collect the relative URL of every listed title.

# NOTE(review): `cleaner` is not referenced by any cell visible here. It is
# kept in case a later cell relies on it, but is now defined once instead of
# being rebuilt on every loop iteration.
cleaner = lambda r: [int(r[0].get_text()), r[1].get_text(), r[2].get_text(), r[2].find("a").get("href")]

titles = []
for i in xrange(1, NumResults, 50):
    page = requests.get("http://www.imdb.com/search/title?at=0&sort=alpha&start="+str(i)+"&title_type="+TYPE+"&year="+str(YEAR_START)+","+str(YEAR_END))
    soup = BeautifulSoup(page.text, "html.parser")
    # The first <tr> of the results table is skipped (presumably a header
    # row -- confirm against the page markup).
    rows = soup.find("table", attrs={"class": "results"}).find_all("tr")[1:]
    # extend() appends in place; the previous `titles = titles + [...]`
    # copied the whole accumulated list for every page.
    titles.extend(row.find("td", attrs={"class": "title"}).find("a").get("href") for row in rows)

print(len(titles))

8997


In [38]:
# Cache of visited URLs, filled by get_page: each key is a URL and each value
# is either the page text (HTTP 200) or a failure sentinel (1 = non-200
# response, 2 = the request raised).
# Re-running this cell rebinds the name to an empty dict, discarding
# everything cached so far.
urlcache={} # a dict to store the visited urls

In [39]:
def get_page(url):
    """Return the page for `url`, downloading it unless already cached.

    `url` is a site-relative path such as "/title/tt2637844/". Results are
    stored in the module-level `urlcache` dict:

    * the response text when the server answered with HTTP 200,
    * 1 when the server answered with any other status code,
    * 2 when the request itself failed (connection error, timeout, ...).

    Entries holding 1 or 2 are treated as "not fetched yet", so calling the
    function again retries them. The function sleeps for one second before
    every network request.

    Returns the value stored in `urlcache[url]` (page text, 1 or 2).
    """
    # Cache hit: only a real page (not a failure sentinel) counts.
    if url in urlcache and urlcache[url] not in (1, 2):
        return urlcache[url]

    time.sleep(1)
    try:
        # Cached keys already begin with "/", so strip it to avoid
        # requesting "http://www.imdb.com//title/...".
        r = requests.get("http://www.imdb.com/" + url.lstrip("/"))
    except requests.exceptions.RequestException:
        # Only network-level failures are recorded as 2; KeyboardInterrupt
        # and programming errors now propagate instead of being swallowed.
        urlcache[url] = 2
    else:
        urlcache[url] = r.text if r.status_code == 200 else 1
    return urlcache[url]

In [40]:
# Fetch every title page through get_page and report how many seconds of
# wall-clock time the whole pass took.
start = time.time()
for title in titles:
    get_page(title)
elapsed = time.time() - start
print(elapsed)

14735.039


In [41]:
# Number of cache entries still holding a failure sentinel (1 or 2);
# 0 means every page was downloaded successfully.
failed_flags = [urlcache[k] in (1, 2) for k in urlcache]
print(np.sum(failed_flags))
# True when the cache holds exactly as many entries as there are title URLs.
print(len(titles) == len(urlcache))

0
True


In [45]:
def _tag_text(parent, name, attrs):
    """Return the text of the first `name` tag under `parent` matching `attrs`, or None if there is none."""
    tag = parent.find(name, attrs=attrs)
    if tag is None:
        return None
    return tag.get_text()


def _fetch_locations(url):
    """Download the filming-locations sub-page of `url`.

    Returns the list of location strings found on the page, or the string
    "fail" when the page cannot be downloaded, answers with a non-200 status,
    or does not have the expected markup.
    """
    loc_url = url + 'locations?ref_=tt_ql_dt_6'
    try:
        # `url` starts with "/", so strip it to avoid a double slash.
        r = requests.get("http://www.imdb.com/" + loc_url.lstrip("/"))
    except requests.exceptions.RequestException:
        return "fail"
    time.sleep(1)  # pause after each request that completed
    if r.status_code != 200:
        return "fail"

    soup2 = BeautifulSoup(r.text, "html.parser")
    names = []
    for location in soup2.find_all("div", attrs={"class": "soda"}):
        link = location.find("a")
        if link is None:
            # An entry without a link is not the markup we expect; report
            # failure for the whole page, as the previous catch-all did.
            return "fail"
        names.append(link.get_text().replace("\n", " "))
    return names


def _add_locations(info, url, soup):
    """Set info['location_page'] from the "Filming Locations" entry of the sub-navigation box, if present."""
    box = soup.find("div", attrs={"id": "full_subnav"})
    if box is None:
        return
    for row in box.find_all("li"):
        link = row.find("a")
        if link is None or link.get_text() != "Filming Locations":
            continue
        if row.find("a", attrs={"class": "link ghost"}) is not None:
            # A "link ghost" anchor is recorded as the string "None" and no
            # locations page is requested for it.
            info['location_page'] = "None"
        else:
            info['location_page'] = _fetch_locations(url)


def _add_infobar_fields(info, topbar):
    """Copy content rating, duration, genres and release dates from the info bar."""
    infobar = topbar.find("div", attrs={"class": "infobar"})
    if infobar is None:
        return

    rating_text = _tag_text(infobar, "meta", {"itemprop": "contentRating"})
    if rating_text is not None:
        words = rating_text.split()
        if words:  # a tag without text used to raise IndexError here
            info['contentRating'] = words[0]

    duration_text = _tag_text(infobar, "time", {"itemprop": "duration"})
    if duration_text is not None:
        info['duration'] = duration_text.strip()

    # Both lists are always set once an info bar exists, even when empty.
    info['genre'] = [genre.get_text()
                     for genre in infobar.find_all("span", attrs={"itemprop": "genre"})]
    info['release_dates'] = [
        dates.get_text().replace("\n", " ")
        for dates in infobar.find_all("a", attrs={"title": "See all release dates"})]


def _add_title_fields(info, topbar):
    """Copy the movie name and year from the <h1> of the overview cell."""
    overview = topbar.find("td", attrs={"id": "overview-top"})
    heading = overview.find("h1") if overview is not None else None
    if heading is None:
        return

    name_text = _tag_text(heading, "span", {"itemprop": "name"})
    if name_text is not None:
        info['name'] = name_text

    year_text = _tag_text(heading, "span", {"class": "nobr"})
    if year_text is not None:
        # Drop the first and last character (the sample data suggests these
        # are the parentheses around the year).
        info['year'] = year_text[1:-1]


def _add_rating_fields(info, topbar):
    """Copy user rating, rating count and critic-review text from the star box."""
    starbox = topbar.find("div", attrs={"class": "star-box-details"})
    if starbox is None:
        return

    rating = _tag_text(starbox, "span", {"itemprop": "ratingValue"})
    if rating is not None:
        info['user_ratings'] = rating

    count = _tag_text(starbox, "span", {"itemprop": "ratingCount"})
    if count is not None:
        info['user_ratings_count'] = count

    critics = _tag_text(starbox, "a", {"href": "criticreviews?ref_=tt_ov_rt"})
    if critics is not None:
        info['critic_ratings'] = critics.strip()


def _add_details_fields(info, soup):
    """Copy country, language, budget, opening weekend and gross from the details box."""
    detailsbox = soup.find("div", attrs={"id": "titleDetails"})
    if detailsbox is None:
        return

    # Heading text -> (info key, index of the whitespace-separated word of
    # the block's text that holds the value).
    word_fields = {
        "Budget:": ('budget', 1),
        "Opening Weekend:": ('opening_weekend', 2),
        "Gross:": ('gross', 1),
    }
    # Heading text -> info key, for values taken from the block's first link.
    link_fields = {"Country:": 'country', "Language:": 'language'}

    for block in detailsbox.find_all("div", attrs={"class": "txt-block"}):
        heading = block.find("h4")
        if heading is None:
            continue
        label = heading.get_text()
        if label in link_fields:
            link = block.find("a")
            if link is not None:
                info[link_fields[label]] = link.get_text()
        elif label in word_fields:
            key, index = word_fields[label]
            words = block.get_text().split()
            if len(words) > index:
                info[key] = words[index]


def movie_info(url,page_text):
    """Extract a dict of movie attributes from the HTML of a title page.

    Parameters
    ----------
    url : site-relative URL of the title page; stored under 'url' and used to
        build the address of the filming-locations sub-page.
    page_text : HTML text of the title page.

    Returns
    -------
    dict that always contains 'url' and, depending on what the page holds,
    any of: 'location_page' (list of strings, "None" or "fail"),
    'contentRating', 'duration', 'genre' (list), 'release_dates' (list),
    'name', 'year', 'user_ratings', 'user_ratings_count', 'critic_ratings',
    'country', 'language', 'budget', 'opening_weekend', 'gross'.

    Sections missing from the page are skipped instead of raising. May issue
    one extra HTTP request (followed by a one-second sleep) for the
    filming-locations page.
    """
    info = {'url': url}
    soup = BeautifulSoup(page_text, "html.parser")

    _add_locations(info, url, soup)

    topbar = soup.find("table", attrs={"id": "title-overview-widget-layout"})
    # Everything below lives inside the overview table; without it there is
    # nothing to read (previously this raised AttributeError on None).
    if topbar is not None:
        _add_infobar_fields(info, topbar)
        _add_title_fields(info, topbar)
        _add_rating_fields(info, topbar)

    _add_details_fields(info, soup)
    return info

#Testing Code
#k = '/title/tt1951264/'
#r = requests.get("http://www.imdb.com"+k)
#v = r.text

#movie_info(k, v)

In [46]:
# Parse every cached page (URL -> page text) into a dict of movie attributes.
movie_info_list = [movie_info(k, v) for k, v in urlcache.items()]

In [47]:
# Show only the first few records; displaying the full list would write
# thousands of dicts into the notebook output.
movie_info_list[:3]

[{'budget': u'\u20ac1,500,000',
  'country': u'Kazakhstan',
  'duration': u'110 min',
  'genre': [u'Drama'],
  'language': u'Russian',
  'location_page': [u'Minsk, Belarus ',
   u'Almaty, Kazakhstan ',
   u'St. Petersburg, Russia '],
  'name': u'Ya ne vernus',
  'release_dates': [u' 1 March 2014 (Russia) '],
  'url': u'/title/tt2637844/',
  'user_ratings': u'6.9',
  'user_ratings_count': u'320',
  'year': u'2014'},
 {'country': u'Turkey',
  'duration': u'75 min',
  'genre': [u'Animation', u'Adventure', u'Comedy'],
  'language': u'Turkish',
  'location_page': 'None',
  'name': u'Rimolar ve Zimolar: Kasabada Baris',
  'release_dates': [u' October 2014 (Turkey) '],
  'url': u'/title/tt4067378/',
  'user_ratings': u'6.7',
  'user_ratings_count': u'22',
  'year': u'2014'},
 {'budget': u'$3,000',
  'country': u'India',
  'duration': u'81 min',
  'genre': [u'Drama', u'Romance'],
  'language': u'Hindi',
  'location_page': [u'West Bengal, India ',
   u'Kolkata, West Bengal, India ',
   u'Howrah

In [48]:
# Persist the scraped records as JSON. The `with` block closes the file even
# if json.dump raises part-way through.
with open("tempdata/movieinfo.json", "w") as fd:
    json.dump(movie_info_list, fd)

In [49]:
# Build a DataFrame from the list of per-movie dicts.
df = pd.DataFrame(movie_info_list)

In [50]:
# Number of parsed movie records.
print(len(movie_info_list))

8997
