In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [None]:
class movie:
    """Minimal movie record, defined for practice with Python objects.

    Each instance keeps its own ``title``, ``year``, ``credits`` and
    ``rating`` values.
    """

    def __init__(self, title, year, credits, rating):
        """Store the constructor arguments on the new instance.

        Args:
            title: Value stored as ``self.title``.
            year: Value stored as ``self.year``.
            credits: Value stored as ``self.credits``.
            rating: Value stored as ``self.rating``.
        """
        # Assign to ``self``, not to the class ``movie``: class-level
        # assignment would make every instance share (and overwrite) the
        # same four values.
        self.title = title
        self.year = year
        self.credits = credits
        self.rating = rating

In [None]:
def scrape_movie(movie_num, timeout=1000):
    """Scrape movie info from an IMDB title page.

    Args:
        movie_num: IMDB movie number; it is converted with ``str`` and
            appended to ``https://www.imdb.com/title/``.
        timeout: Forwarded to ``requests.get(..., timeout=...)`` (seconds).
            The default of 1000 keeps the value this function always used.

    Returns:
        dict with the keys ``title``, ``year``, ``release_month``,
        ``imdb_num``, ``box_office_country``, ``star``, ``genres``,
        ``imdb_rating``, ``meta_rating``, ``budget`` and ``usa_gross``.
        ``star`` is the first linked name of the star credit and ``genres``
        is the first genre token. Any field that cannot be found on the page
        is ``None`` rather than raising, so a caller can filter incomplete
        records afterwards.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        AttributeError: if the returned page has no ``<h1>`` element.
    """
    page_link = 'https://www.imdb.com/title/' + str(movie_num)
    page_response = requests.get(page_link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a movie.
    page_response.raise_for_status()
    page_content = BeautifulSoup(page_response.content, 'lxml')

    movie_title, movie_year = _split_title_and_year(page_content.h1.text)
    credits_dict = _collect_credits(page_content)
    genres = _collect_genres(page_content)
    details = _collect_details(page_content)

    # The credit heading is 'Stars' when several are listed; a single-star
    # heading would read 'Star', so accept both.
    stars = credits_dict.get('Stars') or credits_dict.get('Star') or []

    rating_blocks = page_content.find_all('div', class_='ratingValue')
    imdb_rating = None
    if rating_blocks:
        imdb_rating = float(rating_blocks[0].find('span').text)

    movie_dict = {}
    movie_dict['title'] = movie_title
    movie_dict['year'] = movie_year
    movie_dict['release_month'] = details['release_month']
    movie_dict['imdb_num'] = movie_num
    movie_dict['box_office_country'] = details['country']
    movie_dict['star'] = stars[0] if stars else None
    movie_dict['genres'] = genres[0] if genres else None
    movie_dict['imdb_rating'] = imdb_rating
    movie_dict['meta_rating'] = _find_meta_rating(page_content)
    movie_dict['budget'] = details['budget']
    movie_dict['usa_gross'] = details['usa_gross']

    return movie_dict


def _split_title_and_year(title_str):
    """Split heading text such as 'Name (1999)' into (title, year or None)."""
    # Anchor on the first parenthetical that contains a four-digit run, so a
    # title that itself starts with '(' (e.g. '(500) Days ...') is not cut.
    year_match = re.search(r'\(\D*(\d{4})', title_str)
    if year_match is None:
        return title_str.strip(), None
    return title_str[:year_match.start()].strip(), int(year_match.group(1))


def _collect_credits(page_content):
    """Map each credit heading (minus its last character) to its link texts."""
    credits_dict = {}
    for block in page_content.find_all('div', class_='credit_summary_item'):
        if block.h4 is None:
            continue
        # Drops the final character of the heading (presumably the ':').
        heading = block.h4.text[:-1]
        credits_dict[heading] = [link.text for link in block.find_all('a')]
    return credits_dict


def _collect_genres(page_content):
    """Return the whitespace tokens of the last 'see-more' block, minus labels."""
    blocks = page_content.find_all('div', class_='see-more inline canwrap')
    if not blocks:
        return []
    return [token for token in blocks[-1].text.split()
            if token != 'Genres:' and token != '|']


def _find_meta_rating(page_content):
    """Return the Metacritic score divided by 10, or None if no score block."""
    for tone in ('favorable', 'mixed', 'unfavorable'):
        css_class = 'metacriticScore score_' + tone + ' titleReviewBarSubItem'
        hits = page_content.find_all('div', class_=css_class)
        if hits:
            return int(hits[0].text) / 10
    return None


def _digits_as_int(token):
    """Return the digits in ``token`` joined as an int, or None if none exist."""
    digits = ''.join(ch for ch in token if ch.isdigit())
    return int(digits) if digits else None


def _collect_details(page_content):
    """Read budget, USA gross, release month and country from 'txt-block' divs."""
    details = {'budget': None, 'usa_gross': None,
               'release_month': None, 'country': None}
    for block in page_content.find_all('div', class_='txt-block'):
        if block.h4 is None:
            continue
        heading = block.h4.text
        words = block.text.split()
        # The token positions below mirror the layout this scraper was
        # written against; length checks keep a changed layout from raising.
        if 'Budget' in heading:
            if words:
                details['budget'] = _digits_as_int(words[0])
        elif 'Gross USA' in heading:
            if len(words) > 2:
                details['usa_gross'] = _digits_as_int(words[2])
        elif 'Release Date' in heading:
            if len(words) > 3:
                details['release_month'] = words[3]
            if len(words) > 5:
                details['country'] = words[5].replace('(', '').replace(')', '')
    return details

In [None]:
def get_actor_movies(actor_num):
    """Scrape every eligible title listed on an actor's IMDB page.

    Args:
        actor_num: Numeric part of the IMDB name id. It is converted with
            ``str`` and zero-padded to seven digits to form ``nmXXXXXXX``.

    Returns:
        list of the dicts returned by ``scrape_movie``, one per title that
        was scraped without raising. Titles whose scrape fails are reported
        on stdout and skipped.
    """
    # Pad to seven digits; a fixed 'nm0000' prefix is only right for
    # three-digit actor numbers.
    page_link = 'https://www.imdb.com/name/nm' + str(actor_num).zfill(7)
    page_response = requests.get(page_link, timeout=5)
    page_content = BeautifulSoup(page_response.content, 'lxml')
    filmography = page_content.find_all('div', {'class': re.compile(r'^.*\b(filmo-row)\b.*$')})

    # Rows whose text contains any of these markers are not scraped.
    skip_markers = ('TV Series', 'pre-production', 'post-production')
    movie_nums = []
    for row in filmography:
        if row.a is None:
            continue
        if any(marker in row.text for marker in skip_markers):
            continue
        # Third '/'-separated piece of the link, e.g. '/title/<id>/...'.
        movie_nums.append(row.a['href'].split('/')[2])

    movie_dict_list = []
    for movie_num in movie_nums:
        print(movie_num)
        try:
            movie_dict_list.append(scrape_movie(movie_num))
        except Exception as exc:
            # Best effort: one bad page must not abort the whole run, but
            # Ctrl-C (KeyboardInterrupt) is no longer swallowed and the
            # reason for the skip is shown.
            print('skipping {}: {!r}'.format(movie_num, exc))

    return movie_dict_list

In [None]:
# Box office performance = intercept 
    # + B1*month_of_release + B2*weather + B3*meta_rating + B4*unemployment_rate 
    # + B5*genre + B6*budget + B7*is_sequel

In [None]:
# Actor numbers to pass to get_actor_movies; empty until filled in.
actor_nums = []

# Collect every scraped record first and build the frame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
movie_records = []
for actor_num in actor_nums:
    movie_records.extend(get_actor_movies(actor_num))

# Keep only rows with no missing values, renumbered from 0.
movie_df = pd.DataFrame(movie_records).dropna().reset_index(drop=True)