In [1]:
# Box Office Mojo - daily box office data for the top wide releases of 2021 and 2022

In [2]:
import subprocess
import sys
def install(package):
    """Install ``package`` with pip into the running interpreter's environment.

    Executes ``<sys.executable> -m pip install <package>`` as a subprocess.
    ``subprocess.check_call`` raises ``subprocess.CalledProcessError`` when
    pip exits with a non-zero status.
    """
    pipCommand = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pipCommand)

# Install the third-party packages one at a time, in this order.
for requiredPackage in ("pandas", "numpy", "xlwt", "openpyxl", "selenium", "webdriver-manager"):
    install(requiredPackage)



In [3]:

import pandas as pd
import numpy as np
from datetime import date, datetime
from time import sleep
import pytz
import random
from openpyxl import *
from openpyxl.utils.dataframe import dataframe_to_rows
from string import *


# import selenium/webscraping libs
from urllib.request import urlopen
import requests
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

# Access webdriver for Chrome
# NOTE(review): driver_path and service are not used by any code visible in this
# file; the driver below is built from a second Service that wraps the binary
# path returned by ChromeDriverManager().install().
driver_path = '/Users/zacharywong/Downloads/chromedriver'
service = Service(driver_path)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Pause length in seconds (an integer from 4 to 6). It is drawn once here and
# the same value is reused by every sleep(sleepTime) call below.
sleepTime = random.randrange(4, 7)

# excel
# NOTE(review): path is not referenced by the visible code; writetoExcel and
# run() save to totalFileName as given, so the file lands in the working directory.
path = "/Users/zacharywong/github/zacharywong2023/BoxOffice/"
totalFileName = "DecayCurveData.xlsx"
# Single workbook object shared by every writetoExcel call.
wb = Workbook()

# keep track of day/time
est = pytz.timezone('US/Eastern')
# Day/month/year followed by a 24-hour clock time.
fmt = '%d/%m/%Y %H:%M:%S'
# datetime.now() is naive; astimezone() treats it as system-local time before
# converting it to US/Eastern. After the next line, now is a formatted string.
now = datetime.now()
now = now.astimezone(est).strftime(fmt)
print("day/time: " + now)

# Page that run() loads before runYear() clicks the link for a given year.
urlYearly = 'https://www.boxofficemojo.com/year/?ref_=bo_nb_hm_secondarytab'
# Column labels assigned positionally by renameDailyTable / renameYearlyTable.
dailyColumns = ["Date", "DOW", "Rank", "Daily", "%+/-YR", "%+/-LW", "Theaters", "Avg", "To Date", "Day", "Title", "Genre", "Distributor"]
yearlyColumns = ["Rank", "Release", "Gross", "Max Th", "Opening", "% of Total", "Open Th", "Open", "Close", "Distributor"]

# Module-level result frames: getAllDailyData appends to dfDailyTable and
# runYear replaces dfYearlyTable (both through `global`).
dfDailyTable = pd.DataFrame()
dfYearlyTable = pd.DataFrame()

# Years scraped by run(), in this order.
years = [2022, 2021]


def writetoExcel(df, sheetName):
    """Write ``df`` to a new sheet of the shared workbook and save the file.

    A worksheet is created in the module-level workbook ``wb`` and given the
    title ``sheetName``. The header row and then every data row of ``df`` are
    appended (the index is not written), after which the workbook is saved
    to ``totalFileName``.
    """
    sheet = wb.create_sheet()
    sheet.title = sheetName
    for record in dataframe_to_rows(df, index=False, header=True):
        sheet.append(record)
    wb.save(totalFileName)
        
 

# Filter by In Year Releases and Wide Releases
def clickFilters():
    """Restrict the yearly table to in-year, wide releases.

    Chooses "In-year releases" in the first filter drop-down, pauses for
    ``sleepTime`` seconds, then chooses "Wide releases" in the second one.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by
            ``WebDriverWait.until`` when a drop-down is not present within
            10 seconds.
    """
    XPATHinYear = "/html/body/div[1]/main/div/div/div[1]/div[2]/span/form/span/select"
    XPATHwideRelease = "/html/body/div[1]/main/div/div/div[1]/div[3]/span/form/span/select"

    def waitForSelect(xpath):
        # until() returns the located element, so a single explicit wait
        # replaces the old "find, bare except, wait, find again" sequence.
        # The bare except also caught KeyboardInterrupt and dead-session
        # errors before retrying; those now propagate directly.
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        return Select(element)

    waitForSelect(XPATHinYear).select_by_visible_text("In-year releases")
    # Pause before touching the second drop-down (presumably the first
    # selection reloads the table; verify against the live page).
    sleep(sleepTime)
    waitForSelect(XPATHwideRelease).select_by_visible_text("Wide releases")
        

def getMovieLink(index):
    """Locate the title link in row ``index`` of the yearly results table.

    Args:
        index: position of the ``tr`` inside the table body, as used in the
            XPath (XPath positions are 1-based). getAllDailyData starts at 2.

    Returns:
        A ``(element, text)`` tuple: the anchor element and its visible text.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by
            ``WebDriverWait.until`` when the link is not present within
            10 seconds.
    """
    # e.g. //*[@id="table"]/div/table[2]/tbody/tr[2]/td[2]/a
    XPATHMovie = "//*[@id='table']/div/table[2]/tbody/tr[{index}]/td[2]/a".format(index = index)
    # until() returns the element once it is present. This replaces the old
    # bare `except:` retry, which also swallowed KeyboardInterrupt.
    movie = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, XPATHMovie))
    )
    return movie, movie.text

def getRowColumnNumber():
    """Count the data rows of the results table on the current page.

    Returns:
        A ``(rowNumber, colNumber)`` tuple. ``rowNumber`` is the number of
        ``tr`` elements in the second table's body minus one for the header
        row. ``colNumber`` is always 0: the header-cell XPath is defined but
        never queried.
    """
    headerRows = 1
    rowXPATH = "//*[@id='table']/div/table[2]/tbody/tr"
    colXPATH = "//*[@id='table']/div/table[2]/tbody/tr[1]/th"  # defined only; not used below
    tableRows = driver.find_elements(By.XPATH, rowXPATH)
    return len(tableRows) - headerRows, 0


def getTableData():
    """Parse the current page and return its second HTML table.

    Returns:
        The DataFrame at position 1 of the list produced by ``pd.read_html``
        for ``driver.page_source``.

    Raises:
        ValueError: raised by ``pd.read_html`` when the page has no tables.
        IndexError: when the page has fewer than two tables.
    """
    from io import StringIO  # local import keeps this fix self-contained

    # pandas 2.1 deprecated passing literal HTML text to read_html; a
    # file-like wrapper is accepted by old and new versions alike.
    pageTables = pd.read_html(StringIO(driver.page_source))
    return pageTables[1]

def cleanTable(df):
    """Return a cleaned copy of a scraped table; ``df`` itself is not modified.

    Steps, in order:
      1. every cell equal to ``'-'`` becomes NaN;
      2. columns that are entirely NaN are dropped;
      3. the column at position 10 of what remains is dropped
         (presumably the "Estimated" flag column of the scraped tables;
         TODO confirm against the live page).

    The previous version overwrote the columns of the caller's frame during
    step 1 and then returned a different frame, leaving the input half-cleaned.

    Raises:
        IndexError: when fewer than 11 columns remain after step 2.
    """
    cleaned = df.replace(['-'], np.nan)
    cleaned = cleaned.dropna(axis = 1, how = 'all')
    return cleaned.drop(cleaned.columns[[10]], axis = 1)

def renameYearlyTable(df):
    """Label the columns of ``df`` with the module-level ``yearlyColumns``.

    The labels are assigned positionally and in place; the same frame object
    is returned. pandas raises ``ValueError`` when the number of labels does
    not match the number of columns.
    """
    yearlyLabels = yearlyColumns
    df.columns = yearlyLabels
    return df

def renameDailyTable(df):
    """Label the columns of ``df`` with the module-level ``dailyColumns``.

    The labels are assigned positionally and in place; the same frame object
    is returned. pandas raises ``ValueError`` when the number of labels does
    not match the number of columns.
    """
    dailyLabels = dailyColumns
    df.columns = dailyLabels
    return df

def reorderDailyTable(df):
    """Return ``df`` with the descriptive columns moved to the front.

    The result starts with Distributor, Title and Genre, followed by the
    daily figures in their existing relative order. pandas raises
    ``KeyError`` if any of the thirteen expected columns is missing.
    """
    descriptive = ["Distributor", "Title", "Genre"]
    figures = ["Date", "DOW", "Rank", "Daily", "%+/-YR", "%+/-LW", "Theaters", "Avg", "To Date", "Day"]
    return df[descriptive + figures]

def cleanDailyDistributor(df):
    """Keep only the first line of every "Distributor" value.

    Each value is split on newline characters and the first piece is kept.
    The column is overwritten in place and the same frame is returned.
    """
    distributorLines = df["Distributor"].str.split('\n')
    df["Distributor"] = distributorLines.str.get(0)
    return df

def cleanDate(df):
    """Strip the "COVID-19 Pandemic" annotation from the "Date" column.

    Each value is split on the text 'COVID-19 Pandemic' and only the part
    before it is kept. The column is overwritten in place and the same frame
    is returned.
    """
    dateParts = df["Date"].str.split('COVID-19 Pandemic')
    df["Date"] = dateParts.str.get(0)
    return df

def getDailyMovieData(movieTitle):
    """Read the daily table of the movie page that is currently open.

    Args:
        movieTitle: title stored in every row of the returned table.

    Returns:
        The DataFrame from ``getTableData()`` with three constant columns
        appended in this order: "Title", "Genre", "Distributor". The order
        matters because renameDailyTable labels columns by position.

    The earlier version also built an empty, row-indexed DataFrame that was
    immediately overwritten, issued a webdriver query only to size it, and
    carried unused locals and a commented-out cell-by-cell scraper. All of
    that dead code is removed; the returned data is unchanged.
    """
    # NOTE(review): both XPaths are positional (div[6] / div[1]); if a movie
    # page lists a different set of summary rows they may point at another
    # field. Confirm against the live page.
    XPATHGenre = "//*[@id='a-page']/main/div/div[3]/div[4]/div[6]/span[2]"
    XPATHDistributor = "//*[@id='a-page']/main/div/div[3]/div[4]/div[1]/span[2]"

    dailyTable = getTableData()
    dailyTable["Title"] = movieTitle
    dailyTable["Genre"] = driver.find_element(By.XPATH, XPATHGenre).text
    dailyTable["Distributor"] = driver.find_element(By.XPATH, XPATHDistributor).text
    return dailyTable
        
                

def getAllDailyData(rowNumber):
    """Visit every movie of the yearly table and collect its daily data.

    For each table row from 2 to ``rowNumber + 1`` the movie link is clicked,
    the daily table is read with ``getDailyMovieData``, and the browser goes
    back one page, pausing ``sleepTime`` seconds before and after. The
    collected tables are appended to the module-level ``dfDailyTable``.

    Args:
        rowNumber: number of movies (data rows) in the yearly table.

    Fixes relative to the earlier version:
      * ``DataFrame.append`` (removed in pandas 2.0) is replaced by
        ``pd.concat``, which keeps each table's own index as append did.
      * A movie whose page cannot be read is skipped. Previously the append
        still ran, so it either raised ``NameError`` (first movie) or
        appended the previous movie's rows a second time.
      * Only ``Exception`` is caught, so Ctrl-C is no longer swallowed.
    """
    global dfDailyTable
    print("Number of Movies:" + str(rowNumber))
    collectedFrames = []
    try:
        # Movie rows start at tr[2] (getRowColumnNumber subtracts one header
        # row), so they occupy positions 2 .. rowNumber + 1.
        for index in range(2, rowNumber + 2):
            print("index: " + str(index))
            startNow = datetime.now()
            movieDriver, movieTitle = getMovieLink(index)
            print(movieTitle)
            movieDriver.click()
            try:
                collectedFrames.append(getDailyMovieData(movieTitle))
            except Exception as error:
                print("cannot read movies further - most likely website overload")
                print("skipping " + str(movieTitle) + ": " + repr(error))

            sleep(sleepTime)
            driver.execute_script("window.history.go(-1)")
            sleep(sleepTime)
            endNow = datetime.now()
            timeElapsed = endNow - startNow
            print("time elapsed: " + str(timeElapsed))
    finally:
        # Flush in `finally` so rows gathered before an unexpected failure
        # still reach dfDailyTable, as they did with per-iteration appends.
        if collectedFrames:
            existingFrames = [] if dfDailyTable.empty else [dfDailyTable]
            dfDailyTable = pd.concat(existingFrames + collectedFrames)


def runYear(year):
    """Scrape one year: the yearly summary first, then every movie's daily data.

    Clicks the link whose text is ``year``, applies the release filters,
    stores the cleaned and renamed yearly table in the module-level
    ``dfYearlyTable``, displays it, writes it to a sheet titled
    ``"<year>Summary.xlsx"`` and finally calls ``getAllDailyData``.

    Args:
        year: year shown as link text on the page loaded by ``run()``.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by
            ``WebDriverWait.until`` when the year link is not present within
            10 seconds.
    """
    global dfYearlyTable

    XPATH_Year = "//*[@class='a-link-normal' and text() = '{year}']".format(year = year)
    # until() returns the element once present. This replaces the old bare
    # `except:` retry, which also swallowed KeyboardInterrupt.
    yearListDriver = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, XPATH_Year))
    )
    yearListDriver.click()
    clickFilters()
    # Only the row count is needed; the column count is unused here.
    rowNumber, _ = getRowColumnNumber()

    dfYearlyTable = getTableData()
    dfYearlyTable = cleanTable(dfYearlyTable)
    dfYearlyTable = renameYearlyTable(dfYearlyTable)

    display(dfYearlyTable)

    # The ".xlsx" suffix is part of the worksheet title, not a file name; it
    # is kept so the workbook's tab names stay the same.
    sheetName = "{year}Summary.xlsx".format(year = year)
    writetoExcel(dfYearlyTable, sheetName)
    getAllDailyData(rowNumber)
    

def run():
    """Scrape every year in ``years`` and write the combined daily data.

    For each year the yearly page is loaded and ``runYear`` is called. The
    accumulated ``dfDailyTable`` is then written once as scraped, cleaned,
    renamed, reordered and tidied, and written again.

    Raises:
        Exception: whatever ``driver.get`` raised when the yearly page could
            not be loaded, after the "Cannot access ..." message is printed.
            Previously the handler itself crashed with ``AttributeError``
            because ``.format`` was called on the ``None`` returned by
            ``print``, so the message was never shown correctly.
    """
    global dfDailyTable

    for year in years:
        try:
            driver.get(urlYearly)
        except Exception:
            # Format the text first, then print it, and stop as the message
            # promises by re-raising the real error.
            print("Cannot access {url}...quitting now".format(url = urlYearly))
            raise
        runYear(year)
        sleep(sleepTime)
        driver.execute_script("window.history.go(-1)")

    display(dfDailyTable)
    sheetName = "DailyMovieData"
    # First write: the data exactly as scraped.
    writetoExcel(dfDailyTable, sheetName)
    dfDailyTable = cleanTable(dfDailyTable)
    dfDailyTable = renameDailyTable(dfDailyTable)
    dfDailyTable = reorderDailyTable(dfDailyTable)
    dfDailyTable = cleanDailyDistributor(dfDailyTable)
    dfDailyTable = cleanDate(dfDailyTable)

    display(dfDailyTable)

    # NOTE(review): the same sheet name is reused for the cleaned data;
    # openpyxl is expected to de-duplicate the title with a numeric suffix.
    # Confirm which sheet downstream readers expect.
    writetoExcel(dfDailyTable, sheetName)
    wb.save(filename = totalFileName)

# Run the whole scrape and report how long it took.
startNow = datetime.now()
run()
endNow = datetime.now()
elapsedTime = endNow - startNow
summaryLine = "Total Time Elapsed: " + str(elapsedTime)
print(summaryLine)
#result.head()



Current google-chrome version is 103.0.5060
Get LATEST chromedriver version for 103.0.5060 google-chrome
Driver [/Users/zacharywong/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache


day/time: 11/07/2022 11:23:22
