In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [None]:
class movie:
    """Simple record of one movie, defined for practice with Python objects.

    Each instance stores its own ``title``, ``year``, ``credits`` and
    ``rating`` exactly as passed to the constructor.
    """

    def __init__(self, title, year, credits, rating):
        # Assign to ``self``: writing ``movie.title = ...`` would set a single
        # class-level attribute shared (and overwritten) by every instance.
        self.title = title
        self.year = year
        self.credits = credits
        self.rating = rating

In [None]:
def _digits_to_int(text):
    """Join every digit character found in ``text`` and return it as an int.

    Raises ValueError when ``text`` contains no digits.
    """
    return int(''.join(ch for ch in text if ch.isdigit()))


def _first_credit(credits_dict, *labels):
    """Return the first name stored under the first of ``labels`` that has names.

    Raises KeyError (with the preferred label) when none of the labels is present.
    """
    for label in labels:
        names = credits_dict.get(label)
        if names:
            return names[0]
    raise KeyError(labels[0])


def _parse_metacritic(page_content):
    """Return ``(meta_rating, num_critics)`` read from the parsed title page.

    ``meta_rating`` is the Metacritic score divided by 10; ``(None, 0)`` is
    returned when the page has no Metacritic score block.
    """
    for verdict in ('favorable', 'mixed', 'unfavorable'):
        score_divs = page_content.find_all(
            'div', class_='metacriticScore score_' + verdict + ' titleReviewBarSubItem')
        if score_divs:
            meta_rating = int(score_divs[0].text) / 10
            reviews_link = page_content.find('a', {'href': re.compile(r'^externalreviews.*')})
            return meta_rating, _digits_to_int(reviews_link.text)
    return None, 0


def _parse_details(div_content):
    """Return ``(budget, usa_gross, release_month, country)`` from 'txt-block' divs.

    Every value defaults to None when its heading is not found. Values are
    picked by whitespace-token position inside the div text.
    """
    budget = usa_gross = release_month = country = None
    for block in div_content:
        if block.h4 is None:
            continue
        heading = block.h4.text
        tokens = block.text.split()
        if 'Budget' in heading:
            # first token carries the amount, e.g. 'Budget:$25,000,000'
            budget = _digits_to_int(tokens[0])
        elif 'Gross USA' in heading:
            # tokens: 'Gross', 'USA:', '<amount>'
            usa_gross = _digits_to_int(tokens[2])
        elif 'Release Date' in heading:
            # tokens: 'Release', 'Date:', day, month, year, '(country)'
            release_month = tokens[3]
            country = tokens[5].replace('(', '').replace(')', '')
    return budget, usa_gross, release_month, country


def scrape_movie(movie_num, debugging=False):
    """Scrape one movie's details from its IMDB title page.

    Parameters
    ----------
    movie_num : str
        IMDB title id as it appears in the URL (e.g. ``'tt2345759'``).
    debugging : bool, optional
        When True, also return the parsed pages for inspection.

    Returns
    -------
    dict
        Keys: title, year, release_month, imdb_num, parental_guidance,
        runtime, box_office_country, star, director, genres (first genre
        only), imdb_rating, meta_rating, num_critics, budget, usa_gross.
        Fields that could not be found are None (``num_critics`` is 0).
        ``parental_guidance`` is the list returned by ``re.findall``.
    tuple
        With ``debugging=True``:
        ``(movie_dict, page_content, div_content, tech_content)``;
        ``tech_content`` is None if the technical page could not be loaded.

    Raises
    ------
    ValueError
        If the title heading contains no ``(`` or no digits follow it.
    KeyError
        If the page lists neither a star nor a director credit.
    Other lookup errors (AttributeError, IndexError) propagate when required
    page elements are missing.
    """
    base_link = 'https://www.imdb.com/title/' + str(movie_num)
    page_response = requests.get(base_link, timeout=1000)
    page_content = BeautifulSoup(page_response.content, 'lxml')

    # Title and year: the heading reads '<title> (<year>)'
    title_str = page_content.h1.text
    paren_index = title_str.find('(')
    if paren_index < 0:
        raise ValueError('no "(year)" part in title heading: ' + repr(title_str))
    movie_title = title_str[:paren_index].rstrip()
    movie_year = _digits_to_int(title_str[paren_index:])

    # Runtime, best effort: read from this movie's own technical-specs page.
    # tech_content is pre-set so the debugging return works even on failure.
    tech_content = None
    try:
        tech_link = base_link + '/technical?ref_=tt_dt_spec'
        tech_response = requests.get(tech_link, timeout=1000)
        tech_content = BeautifulSoup(tech_response.content, 'lxml')
        runtime_cell = tech_content.table.find_all('td')[1].text.strip()
        runtime = int(re.findall(r'\(\d+', runtime_cell)[0].replace('(', ''))
    except Exception:
        runtime = None

    # Parental Guidance, best effort (kept as the list re.findall returns)
    try:
        subtext = page_content.find_all('div', class_='subtext')[0].text.strip()
        parental_guidance = re.findall(r'^.+\b', subtext)
    except Exception:
        parental_guidance = None

    # Credits: heading text minus its trailing character -> list of linked names
    credits_dict = {
        item.h4.text[:-1]: [link.text for link in item.find_all('a')]
        for item in page_content.find_all('div', class_='credit_summary_item')
    }

    # Genre: words of the last 'see-more' block, minus the label and separators
    genre_divs = page_content.find_all('div', class_='see-more inline canwrap')
    genres = [word for word in genre_divs[-1].text.split() if word not in ('Genres:', '|')]

    # Rating
    imdb_rating = None
    rating_divs = page_content.find_all('div', class_='ratingValue')
    if rating_divs:
        imdb_rating = float(rating_divs[0].find('span').text)

    meta_rating, num_critics = _parse_metacritic(page_content)

    # Budget, USA box office and release information
    div_content = page_content.find_all('div', class_='txt-block')
    budget, usa_gross, release_month, country = _parse_details(div_content)

    movie_dict = {
        'title': movie_title,
        'year': movie_year,
        'release_month': release_month,
        'imdb_num': movie_num,
        'parental_guidance': parental_guidance,
        'runtime': runtime,
        'box_office_country': country,
        # Headings are assumed singular/plural depending on how many names are listed - TODO confirm
        'star': _first_credit(credits_dict, 'Stars', 'Star'),
        'director': _first_credit(credits_dict, 'Director', 'Directors'),
        'genres': genres[0],
        'imdb_rating': imdb_rating,
        'meta_rating': meta_rating,
        'num_critics': num_critics,
        'budget': budget,
        'usa_gross': usa_gross,
    }

    if debugging:
        return movie_dict, page_content, div_content, tech_content
    return movie_dict

In [None]:
def get_actor_movies(actor_num, debugging=False):
    """Scrape every eligible film in an actor's IMDB filmography.

    Parameters
    ----------
    actor_num : int or str
        Numeric part of the IMDB name id; it is left-padded with zeros to
        seven digits to build the profile URL (129 -> ``nm0000129``).
    debugging : bool, optional
        When True, also return the collected titles and title ids.

    Returns
    -------
    list of dict
        One ``scrape_movie`` result per film that could be scraped. Films
        whose scrape raises are reported with ``print`` and skipped.
    tuple
        With ``debugging=True``: ``(movie_dict_list, movies, movie_nums)``.
    """
    # zfill pads to the seven-digit width; longer numbers are left unchanged.
    page_link = 'https://www.imdb.com/name/nm' + str(actor_num).zfill(7)
    page_response = requests.get(page_link, timeout=1000)
    page_content = BeautifulSoup(page_response.content, 'lxml')
    filmography = page_content.find_all('div', {'class': re.compile(r'^.*\b(filmo-row)\b.*$'),
                                                'id': re.compile(r'^.*\b(actor|actress)\b.*$')})

    # Rows mentioning any of these markers are not scraped.
    excluded_markers = ('TV Series', 'pre-production', 'post-production')
    movies = []
    movie_nums = []
    for row in filmography:
        if any(marker in row.text for marker in excluded_markers):
            continue
        movies.append(row.a.text)
        # href is split on '/' and the third piece is taken as the title id
        movie_nums.append(row.a['href'].split('/')[2])

    movie_dict_list = []
    for movie_num in movie_nums:
        print(movie_num)
        try:
            movie_dict_list.append(scrape_movie(movie_num))
        except Exception as err:
            # Best effort: one unparseable page must not abort the whole actor.
            print('Skipping ' + str(movie_num) + ': ' + repr(err))

    if debugging:
        return movie_dict_list, movies, movie_nums
    return movie_dict_list

In [None]:
# Box office performance = intercept 
    # + B1*month_of_release + B2*weather + B3*meta_rating + B4*unemployment_rate 
    # + B5*genre + B6*budget + B7*is_sequel + B8*interaction_term(for whether previous was good)
    # + B9*age_of_star + B10*rating_of_movie + B11*past_ROI_of_star + B12*past_critic_of_director
    # + B13*decade

In [None]:
#pd.DataFrame([movie_dict_list[0]])

# IMDB name numbers of the actors whose filmographies are scraped below.
# Each value is passed to get_actor_movies, which builds the actor's profile
# URL from it; the trailing comment names the actor the number refers to.
actor_nums = [129, #Tom Cruise
              210, #Julia Roberts
              413168, #Hugh Jackman
              1083271, #Megan Fox
              158, #Tom Hanks
              149, #Jodie Foster
              216, #Arnold Schwarzenegger
              148, #Harrison Ford
              246, #Bruce Willis
              230, #Sylvester Stallone
              4266, #Anne Hathaway
              243, #Denzel Washington
              424060, #Scarlett Johansson
              138, #Leonardo DiCaprio
              1401, #Angelina Jolie
              237, #John Travolta
              658, #Meryl Streep
              212, #Meg Ryan
              288, #Christian Bale
              1191, #Adam Sandler
              1706767, #Jonah Hill
              226, #Will Smith
              113, #Sandra Bullock
              331516, #Ryan Gosling
              425005, #Dwayne "The Rock" Johnson
              168, #Samuel L Jackson
              199, #Al Pacino
              151, #Morgan Freeman
              553, #Liam Neeson
              93, #Brad Pitt
              123, #George Clooney
              354, #Matt Damon
              1567113, #Jessica Chastain
              204, #Natalie Portman
              1557, #Viggo Mortensen
              120, #Jim Carrey
              195, #Bill Murray
              316079, #Paul Giamatti
              190, #Matthew McConaughey
              350453, #Jake Gyllenhaal
              332, #Don Cheadle
              205626, #Viola Davis
              156, #Jeff Goldblum
              1475594, #Channing Tatum
              194, #Julianne Moore
              358, #Daniel Day Lewis
              949, #Cate Blanchett
              234, #Charlize Theron
              173, #Nicole Kidman
              2225369, #Jennifer Lawrence
              163, #Dustin Hoffman
              191, #Ewan McGregor
              228, #Kevin Spacey
              128, #Russell Crowe
              564215, #James McAvoy
              1297015, #Emma Stone
              702, #Reese Witherspoon
              179, #Jude Law
              569, #Gwyneth Paltrow
              982, #Josh Brolin
              182, #Jennifer Lopez
              255, #Ben Affleck
              5028, #Kate Hudson
              177896, #Bradley Cooper
              695435, #Chris Pratt
              245, #Robin Williams
              914612, #Emma Watson
              136797, #Steve Carell
              5562 #Owen Wilson
             ]

In [None]:
# Scrape every listed actor's filmography and collect the per-movie dicts.
# Records are gathered in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
movie_records = []
for actor_num in actor_nums:
    print('Now on Actor Number: ' + str(actor_num))
    movie_dict_list = get_actor_movies(actor_num)
    movie_records.extend(movie_dict_list)

# Keep only movies for which every field was found, numbered 0..n-1.
movie_df = pd.DataFrame(movie_records).dropna().reset_index(drop=True)

In [None]:
import pickle

# Pickle files

# open a file, where you want to store the data
# file = open('movie_df_additional', 'wb')
# dump information to that file
# pickle.dump(movie_df, file)
# close the file
# file.close()

In [None]:
# Load pickles

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so it is closed even if
    unpickling raises.

    NOTE(review): ``pickle.load`` can execute arbitrary code from the file;
    only load pickles that come from a trusted source.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Three previously saved frames, combined in a later cell.
# Presumably scraping checkpoints (file names reference actor numbers) - TODO confirm.
df1 = _load_pickle('movie_df_to_148_HFord')
df2 = _load_pickle('movie_df_to_331516_RGosling')
df3 = _load_pickle('movie_df_to_last')

In [None]:
# Combine into one dataframe
# pd.concat replaces the chained DataFrame.append calls (append was removed
# in pandas 2.0); ignore_index=True renumbers the rows 0..n-1.
final_df = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# Keep the first row for each IMDB title id and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly, so no temporary
# 'index' column has to be removed afterwards, and re-running this cell is safe.
final_df = final_df.drop_duplicates(subset='imdb_num').reset_index(drop=True)

In [None]:
# Number of movies per genre. scrape_movie stores only the first genre it
# finds for each movie, so every movie is counted exactly once.
final_df['genres'].value_counts()

In [None]:
# For Debugging
# With debugging=True scrape_movie returns the result dict together with the
# parsed title page, its 'txt-block' divs and the parsed technical page, so
# the raw markup can be inspected when a field is parsed incorrectly.
movie_dict, page_content, div_content, tech_content = scrape_movie('tt2345759', debugging=True)
movie_dict