# Predict Animation Movies' Revenues

### Scrape from IMDB

In [1]:
from __future__ import print_function, division

In [None]:
# Import packages

import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from pprint import pprint
import pandas as pd
import collections
import matplotlib.pyplot as plt
import re
from datetime import datetime
import pickle
import sklearn
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
import numpy as np
from sklearn import linear_model, preprocessing

import seaborn as sns

import pandas as pd

%matplotlib inline

In [None]:
# Sample Url
# Template for the IMDB search of animation feature films, sorted by US box-office
# gross in descending order. The `{}` after `page=` is a placeholder that can be
# filled with a results-page number via str.format.

sample_url = 'http://www.imdb.com/search/title?genres=animation&title_type=feature&sort=boxoffice_gross_us,desc&page={}&ref_=adv_prv'

In [None]:
# Convert links from tag to list

def imdb_find_all_links(url):
    """Download ``url`` and return every ``<a>`` tag on the page as a string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    list of str
        ``str()`` of each anchor tag found by the parser, in the order the
        parser yields them.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    response = requests.get(url)
    # Fail loudly on an HTTP error status rather than scanning an error page
    # for links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    return [str(link) for link in soup.find_all('a')]

In [None]:
# Extract 50 movie links from each page

def imdb_find_movie_link(url):
    """Return the unique movie-page URLs linked from a search-results page.

    An anchor is kept when it contains ``a href="/title/`` and contains
    neither ``vote`` nor ``plot`` anywhere in its markup. The href is cut at
    its closing quote, any query string (from ``?`` onwards) is dropped, and
    the result is prefixed with ``http://www.imdb.com``.

    Parameters
    ----------
    url : str
        Address of the search-results page.

    Returns
    -------
    list of str
        Absolute title URLs, de-duplicated, in first-seen order.
    """
    marker = 'a href="/title/'
    href_offset = len('a href="')  # the href value starts right after the opening quote

    title_list = []
    seen = set()
    for link in imdb_find_all_links(url):
        start = link.find(marker)
        if start == -1 or 'vote' in link or 'plot' in link:
            continue

        href_start = start + href_offset
        # Bound the href by its closing quote first, so a missing '?' no longer
        # truncates the last character, and a '?' in the link text is ignored.
        href_end = link.find('"', href_start)
        href = link[href_start:] if href_end == -1 else link[href_start:href_end]

        title = 'http://www.imdb.com' + href.split('?', 1)[0]
        if title not in seen:
            seen.add(title)
            title_list.append(title)
    return title_list

In [None]:
# Add movie title

def movie_title(url_base):
    """Return the text of the page's ``<title>`` element as a string.

    Parameters
    ----------
    url_base : str
        Address of the movie page.

    Returns
    -------
    str
        ``str()`` of the ``.string`` of the first ``<title>`` tag.
    """
    # The context manager closes the HTTP handle once the page has been parsed.
    with urlopen(url_base) as html:
        soup = BeautifulSoup(html, 'lxml')
    return str(soup.find('title').string)

In [None]:
# Explore movie rating

def movie_rating(url_base):
    """Return the markup of the page's rating element as a string.

    Parameters
    ----------
    url_base : str
        Address of the movie page.

    Returns
    -------
    str
        ``str()`` of the first ``<span class="rating">`` tag, i.e. the raw
        markup rather than a number; ``'None'`` when no such tag exists.
    """
    # The context manager closes the HTTP handle once the page has been parsed.
    with urlopen(url_base) as html:
        soup = BeautifulSoup(html, 'lxml')
    return str(soup.find('span', class_="rating"))

In [None]:
# Explore movie MPAA

def movie_mpaa(url_base):
    """Return the content rating advertised by the page's metadata.

    Reads the ``content`` attribute of the first
    ``<meta itemprop="contentRating">`` tag. Slicing two characters out of the
    tag's string form would truncate longer values and pick up the closing
    quote of one-letter values, so the attribute is read directly.

    Parameters
    ----------
    url_base : str
        Address of the movie page.

    Returns
    -------
    str
        The full ``content`` value, or ``''`` when the tag or the attribute
        is absent.
    """
    # The context manager closes the HTTP handle once the page has been parsed.
    with urlopen(url_base) as html:
        soup = BeautifulSoup(html, 'lxml')
    tag = soup.find('meta', itemprop="contentRating")
    if tag is None:
        return ''
    return str(tag.get('content', ''))

In [None]:
# Explore awards

def movie_awards(url_base):
    """Return the page's award blurbs as the string form of a list.

    Parameters
    ----------
    url_base : str
        Address of the movie page.

    Returns
    -------
    str
        ``str()`` of a list holding the text of every
        ``<span itemprop="awards">``, with newlines and runs of three spaces
        removed; ``'[]'`` when there are none.
    """
    # The context manager closes the HTTP handle once the page has been parsed.
    with urlopen(url_base) as html:
        soup = BeautifulSoup(html, 'lxml')
    awards = [
        span.getText().replace('\n', '').replace('   ', '')
        for span in soup.find_all('span', itemprop="awards")
    ]
    return str(awards)

In [None]:
# Explore the 'Details' section
# Extract info about budget, country, gross, language, production co, release date, runtime

def movie_details(url_base):
    """Scrape the labelled fields of a movie page into a dictionary.

    Every ``<h4 class="inline">`` heading inside a ``<div class="txt-block">``
    is taken as a label (colons removed). Its value is the node that follows
    the heading; when that node is only whitespace, the text of the node after
    it is used instead.

    Parameters
    ----------
    url_base : str
        Address of the movie page.

    Returns
    -------
    dict
        Maps each label to its stripped value string. When a label occurs more
        than once the last value wins. Headings with nothing after them are
        skipped.
    """
    # The context manager closes the HTTP handle once the page has been parsed.
    with urlopen(url_base) as html:
        soup = BeautifulSoup(html, 'lxml')

    details = {}
    for block in soup.find_all('div', class_='txt-block'):
        for heading in block.find_all('h4', class_='inline'):
            label = heading.getText().replace(':', '')
            text = heading.nextSibling
            if text is None:
                # Heading is the last node of its parent: nothing to record.
                continue
            if len(text.strip()) == 0:
                # Whitespace-only text node: the value lives in the next node.
                following = text.nextSibling
                if following is None:
                    continue
                value = following.getText().strip()
            else:
                value = text.replace('        \n\n      ', '')
                value = value.replace('(USA)', '').strip()
            details[label] = value
    return details

In [None]:
# Compile title, details into data frame
# NOT USED, KEPT FOR REFERENCE

#def compile_info(x):
#    new_df = pd.DataFrame()
#    for n in range(x):
#        for link in all_web_pages[x]:
#            data = pd.DataFrame.from_dict(movie_details(link),orient='index', dtype=None)
#            data = data.transpose()
#            data['Movie Title'] = movie_title(link)
#            data['Rating Score'] = movie_rating(link)
#            data['MPAA'] = movie_mpaa(link)
#            data['Awards'] = movie_awards(link)
#            new_df = new_df.append(data)
#    return new_df

In [None]:
def compile_data_info(x):
    """Scrape every movie linked from search-results page ``x`` into one frame.

    For each link returned by ``imdb_find_movie_link(x)`` a one-row frame is
    built from ``movie_details`` and extended with the 'Movie Title',
    'Rating Score', 'MPAA' and 'Awards' columns.

    Parameters
    ----------
    x : str
        Address of the search-results page.

    Returns
    -------
    pandas.DataFrame
        One row per movie, each keeping the index label of its own one-row
        frame; an empty frame when the page yields no movie links.
    """
    frames = []
    for link in imdb_find_movie_link(x):
        data = pd.DataFrame.from_dict(movie_details(link), orient='index', dtype=None)
        data = data.transpose()
        data['Movie Title'] = movie_title(link)
        data['Rating Score'] = movie_rating(link)
        data['MPAA'] = movie_mpaa(link)
        data['Awards'] = movie_awards(link)
        frames.append(data)

    # Concatenate once instead of growing a frame row by row: DataFrame.append
    # copied the whole frame on every call and no longer exists in pandas 2.x.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

#### Render pages into dataframes

In [None]:
# Results page 1: fill the page number into the shared URL template.
imdb_page_1 = compile_data_info(sample_url.format(1))

In [None]:
# Results page 2: fill the page number into the shared URL template.
imdb_page_2 = compile_data_info(sample_url.format(2))

In [None]:
# Results page 3: fill the page number into the shared URL template.
imdb_page_3 = compile_data_info(sample_url.format(3))

In [None]:
# Results page 4: fill the page number into the shared URL template.
imdb_page_4 = compile_data_info(sample_url.format(4))

In [None]:
# Results page 5: fill the page number into the shared URL template.
imdb_page_5 = compile_data_info(sample_url.format(5))

In [None]:
# Results page 6: fill the page number into the shared URL template.
imdb_page_6 = compile_data_info(sample_url.format(6))

In [None]:
# Combine 6 dataframes into 1
# pd.concat stacks all pages in a single call; DataFrame.append is not
# available in pandas 2.x.

movies_data = pd.concat([
    imdb_page_1,
    imdb_page_2,
    imdb_page_3,
    imdb_page_4,
    imdb_page_5,
    imdb_page_6,
])

In [None]:
movies_data.head()

In [None]:
# Number of data points whose Country is 'USA'

len(movies_data[movies_data['Country'] == 'USA'])

In [None]:
# Save the data frame into a pickle file
# Writes `movies_data` to 'my_data_2.pkl' in the current working directory,
# overwriting any existing file of that name ('wb' mode).

with open('my_data_2.pkl', 'wb') as picklefile:
    pickle.dump(movies_data, picklefile)

In [None]:
# Load pickle file
# The loaded object is bound to `my_old_data`; `movies_data` is left untouched.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# produced by this notebook.

with open('my_data_2.pkl', 'rb') as picklefile: 
    my_old_data = pickle.load(picklefile)

### Process the Data

In [None]:
# Make a copy of movies_data
# .copy() is required: plain assignment would only bind a second name to the
# same frame, so the cleaning steps below would also modify movies_data.

imdb_data = movies_data.copy()

In [None]:
# Display dataframe info

imdb_data.info()

In [None]:
# Drop data points outside of USA
# .copy() makes the filtered rows an independent frame, so the column
# assignments in the following cells modify it directly instead of a
# selection of the unfiltered frame (SettingWithCopyWarning).

imdb_data = imdb_data[imdb_data['Country'] == 'USA'].copy()

In [None]:
imdb_data.head()

In [None]:
# Remove contents in parenthesis for release date
# Keep only the text before the first '(' and assign the whole column at once.
# Writing element-wise through `imdb_data['Release Date'].iloc[n] = ...` is
# chained assignment, which pandas may apply to a temporary object instead of
# the frame.

imdb_data['Release Date'] = imdb_data['Release Date'].str.split('(', n=1).str[0]

In [None]:
# Convert release date to date object and extract year, month and day
# After pd.to_datetime the column holds datetimes, so the `.dt` accessor is
# available for deriving the three integer part columns below.

imdb_data['Release Date'] = pd.to_datetime(imdb_data['Release Date'])
imdb_data['Release Year'] =imdb_data['Release Date'].dt.year
imdb_data['Release Month'] = imdb_data['Release Date'].dt.month
imdb_data['Release Day'] = imdb_data['Release Date'].dt.day

In [None]:
# Remove contents in brackets for rating score
# The column is assigned as a whole; writing element-wise through
# `imdb_data['Rating Score'].iloc[n] = ...` is chained assignment, which
# pandas may apply to a temporary object instead of the frame.

def _strip_rating_markup(raw):
    """Return the text between the first '>' of ``raw`` and the next '<'.

    Values that contain no '>' are returned unchanged.
    """
    start = raw.find('>')
    if start == -1:
        return raw
    end = raw.find('<', start + 1)
    return raw[start + 1:end]

imdb_data['Rating Score'] = imdb_data['Rating Score'].map(_strip_rating_markup)

In [None]:
# Remove contents in parenthesis for opening weekend
# Values are inspected through str() because the column may hold non-string
# entries; only values containing '(' are rewritten, everything else is kept
# as is. The column is assigned as a whole rather than through chained
# `imdb_data['Opening Weekend'].iloc[n] = ...` writes.

imdb_data['Opening Weekend'] = imdb_data['Opening Weekend'].map(
    lambda value: str(value).split('(', 1)[0] if '(' in str(value) else value
)

In [None]:
# Convert data types
# Regex patterns are raw strings: '\$' and '\£' are invalid escape sequences
# in a normal string literal (the compiled patterns are unchanged).
# The Rating Score conversion goes through str() so that it no longer fails
# on values that are already numeric, e.g. when the cell is run again.

imdb_data['Budget'] = imdb_data['Budget'].replace(r'[\$,]', '', regex=True).astype(float)
imdb_data['Gross'] = imdb_data['Gross'].replace(r'[\$,]', '', regex=True).astype(float)
imdb_data['Runtime'] = imdb_data['Runtime'].replace('min', '', regex=True).astype(float)
# Exact-match replacement of one specific un-stripped markup value, then keep
# the part before any '/'.
imdb_data['Rating Score'] = imdb_data['Rating Score'].replace('<span class="rating">7.9<span class="ofTen">', '7.9').apply(lambda x: float(str(x).split('/')[0]))
# NOTE(review): amounts marked 'UK'/'£' only have the marker removed; no
# currency conversion is applied, so they end up on the same scale as '$' values.
imdb_data['Opening Weekend'] = imdb_data['Opening Weekend'].replace(r'[\$,]', '', regex=True).replace('UK', '', regex=True).replace(r'[\£,]', '', regex=True).astype(float)

In [None]:
# Remove contents in parenthesis for movie title
# Keep only the text before the first '(' and assign the whole column at once.
# Writing element-wise through `imdb_data['Movie Title'].iloc[n] = ...` is
# chained assignment, which pandas may apply to a temporary object instead of
# the frame.

imdb_data['Movie Title'] = imdb_data['Movie Title'].str.split('(', n=1).str[0]

In [None]:
imdb_data.info()

In [None]:
imdb_data.head()

In [None]:
# reset_index() returns a new frame and leaves imdb_data untouched unless the
# result is assigned back; drop=True discards the old row labels instead of
# adding them as an 'index' column.
imdb_data = imdb_data.reset_index(drop=True)
imdb_data.head()

#### Process Categorical Data

In [None]:
# Group months into quarters
# Derive the calendar quarter from the parsed release date and store it in its
# own column, leaving 'Release Month' intact.

imdb_data['Release Date Quarter'] = imdb_data['Release Date'].dt.quarter

In [None]:
len(imdb_data['Awards'])

In [None]:
a = 'n'
a.find('nominated')

In [None]:
imdb_data['Awards Result'] = ''

In [None]:
# First character of the first row's 'Awards' string. Positional .iloc is used
# because the .ix indexer is no longer available in pandas.
imdb_data.iloc[0]['Awards'][0]

In [None]:
type(imdb_data['Awards Result'].iloc[0])

In [None]:
# NOTE(review): the `dt` alias imported here is not referenced by the statement
# below; `.dt` on a Series is the pandas datetime accessor, not this module.
import datetime as dt

# Calendar quarter of each release date, stored in a new column.
imdb_data['Release Date Quarter'] = imdb_data['Release Date'].dt.quarter

In [None]:
# NOT USED, KEPT FOR REFERENCE
#imdb_data.loc[imdb_data['Awards'].str.contains('Won'), 'Awards Result'] = 'W'

In [None]:
imdb_data

In [None]:
# Add yes/no column for awards
# Matching is done on the lower-cased text, so the search terms must be lower
# case as well ('Win' could never match). 'nominat' covers both "nominated"
# and "nomination(s)". Wins take precedence over nominations.

awards_text = imdb_data['Awards'].str.lower()
has_win = (
    awards_text.str.contains('won', regex=False)
    | awards_text.str.contains('win', regex=False)
)
has_nomination = awards_text.str.contains('nominat', regex=False)

# np.select picks the first matching condition per row; the column is assigned
# as a whole instead of through chained `.iloc[n] = ...` writes.
imdb_data['Awards Result'] = np.select(
    [has_win, has_nomination],
    ['Won', 'Nominated'],
    default='Not',
)

In [None]:
# Group by production co and sort by gross to see what production cos are major players
# sort_values replaces DataFrame.sort, which is no longer available in pandas;
# numeric_only=True keeps text and date columns out of the sum.

imdb_data.groupby(['Production Co']).sum(numeric_only=True).sort_values('Gross', ascending = False).head()

In [None]:
# Group production cos into large, medium and small based on gross
# The ranking is computed once: companies ordered by total gross, descending.
# sort_values replaces DataFrame.sort, which is no longer available in pandas.

gross_ranking = imdb_data.groupby(['Production Co']).sum(numeric_only=True).sort_values('Gross', ascending = False).index

Large_Cos = gross_ranking[0:10]     # top 10 companies
Medium_Cos = gross_ranking[10:21]   # next 11 companies
Small_Cos = gross_ranking[21:]      # everyone else

In [None]:
imdb_data.iloc[0]['Production Co']

In [None]:
def co_size(data):
    """Add a 'Production Co Size' column to ``data`` in place.

    Each row is labelled 'Large' when its 'Production Co' is in the
    module-level ``Large_Cos``, 'Medium' when it is in ``Medium_Cos``, and
    'Small' otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Production Co' column; it is modified in place.

    Returns
    -------
    None
    """
    companies = data['Production Co']
    # Assign the whole column on the frame itself: writing through
    # `data.iloc[n].loc[...] = ...` only changes a temporary copy of the row.
    data['Production Co Size'] = np.where(
        companies.isin(Large_Cos),
        'Large',
        np.where(companies.isin(Medium_Cos), 'Medium', 'Small'),
    )

In [None]:
copy = imdb_data

In [None]:
copy.iloc[0].loc['Production Co']

In [None]:
co_size(copy)

In [None]:
copy

In [None]:
copy['Production Co Size']

### Exploratory Data Analysis

In [None]:
# Scatter of production budget against domestic total gross

fig, ax = plt.subplots(figsize=(10,8))
ax.scatter(imdb_data['Budget'], imdb_data['Gross'])
ax.set_xlabel('Budget')
ax.set_ylabel('Domestic Total Gross')
ax.set_title('Budget vs Domestic Total Gross')

In [None]:
# Scatter of rating score against domestic total gross

fig, ax = plt.subplots(figsize=(10,8))
ax.scatter(imdb_data['Rating Score'], imdb_data['Gross'])
ax.set_xlabel('Rating Score')
ax.set_ylabel('Domestic Total Gross')
ax.set_title('Rating Score vs Domestic Total Gross')

In [None]:
# Bar plots of production co counts
# Uses imdb_data, the frame built above; no `imdb_df` is defined in this notebook.

plt.figure(figsize=(10,8))
imdb_data['Production Co'].value_counts().plot(kind='bar')

### Linear Regression

#### First Model

In [None]:
# Create feature matrix (X) and target vector (y)
# NOTE(review): `patsy`, `sm` and `df` are not imported or defined anywhere in
# the visible notebook, and the formula uses placeholder names (Y, X1..X6)
# rather than columns of imdb_data. As written this cell presumably raises
# NameError on a fresh kernel; confirm the intended imports, frame and columns.

y, X = patsy.dmatrices('Y ~ X1 + X2 + X3 + X4 + X5 + X6', data=df, return_type="dataframe")

# Create the model

model = sm.OLS(y, X)

# Fit the model to the training set
# NOTE(review): the model is fitted on the full X and y; no train/test split
# has been made at this point.

fit = model.fit()

# Print summary statistics of the model's performance

fit.summary()

#### Cross Validation

In [None]:
lr = LinearRegression()

# Hold out 30% of the rows for testing. The split is seeded so the reported
# score is reproducible from run to run (same seed as the Lasso cell below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lr.fit(X_train, y_train)
# Evaluate the model against the testing data
lr.score(X_test, y_test)

In [None]:
# Lasso
# Standardise the features, then fit a Lasso with 5-fold cross-validation.

X_scaled = preprocessing.scale(X)

# NOTE(review): the `normalize` argument is only accepted by older
# scikit-learn releases; confirm it against the installed version.
lasso_cv = linear_model.LassoCV(cv = 5, normalize = False, verbose = True, random_state = 42)
lasso_cv.fit(X_scaled, y)

# Only the last expression of a cell is displayed, so this value is computed
# but not shown.
lasso_cv.mse_path_

# Fitted coefficients (the cell's displayed output).
lasso_cv.coef_