In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
from datetime import datetime
import seaborn as sns
import pickle
sns.set_style("whitegrid")

Our next task is to get the "opening" cutoff week for each movie based on the threshold from the box office data

# Loading and Cleaning Box Office Data

In [2]:
boxOffice = pickle.load( open( "LeapCleanedBoxofficeData.p", "rb" ) )

titles = list(boxOffice.keys())
stringVal = False
badOrder = 0
for title in titles:
    movieData = boxOffice[title]
    # Remove entries with no weekly gross data, number of theaters, or with fewer than three reported weeks
    if ('Weekly' not in movieData.columns) or ('-' in movieData.Theaters.values) or (True in pd.isna(movieData.Theaters.values)) or (int(max(movieData.Week))<=3) or (movieData.Week.iloc[0]!=1):
        del boxOffice[title]
    # A few movies have special opening weekends which open very small. I've excluded them
    elif (type(movieData.Week.values[0])==str):
        print(title)
        del boxOffice[title]

# pickle.dump( boxOffice, open( "CleanedBoxofficeData.p", "wb" ) )

# Compute the Opening Week Cutoff and Multiplier for each movie

In [3]:
multiplierDataFrame = pd.DataFrame(columns=('title', 'Opening Week', 'Opening Week Number', 'Multiplier'))
i=0
threshold = 0.6
for title in boxOffice:
    if title=='Charlie Bartlett (2008)':
        pass
    cleanTitle = title.split(' (')[0] # Remove Year Tag
    widestWeek = max(boxOffice[title].Theaters)
    startingWeek = 0
    currentWeek = 0
    startingWeek = boxOffice[title][boxOffice[title].Theaters>=threshold*widestWeek].Week.min()
    # weekCutoffs[cleanTitle] = pd.to_datetime(boxOffice[title].at[startingWeek, 'Date'])
    opening = sum(boxOffice[title].Weekly.iloc[0:startingWeek].values)
    total = sum(boxOffice[title].Weekly.values)
    multiplier = total/opening
    multiplierDataFrame.loc[i] = [cleanTitle, pd.to_datetime(boxOffice[title].at[startingWeek, 'Date']), startingWeek, multiplier]
    i+=1
multiplierDataFrame

Unnamed: 0,title,Opening Week,Opening Week Number,Multiplier
0,Alien,1979-06-22,5,2.416036
1,Raging Bull,1980-12-19,6,1.000000
2,Star Wars: Episode V - The Empire Strikes Back,1980-06-20,6,1.590517
3,Modern Problems,1981-12-25,1,3.600396
4,On Golden Pond,1982-02-12,11,3.246197
...,...,...,...,...
6144,Companion,2025-01-31,1,1.669133
6145,Green and Gold,2025-01-31,1,1.561074
6146,Den of Thieves: Pantera,2025-01-10,1,1.839845
6147,Presence,2025-01-24,1,1.518704


In [4]:
# read in review and movie data into data frames
dfs = pd.read_csv('./cleandata/rotten_tomatoes_movie_reviews_clean_scale.csv', index_col=False)
dfm = pd.read_csv('./cleandata/rotten_tomatoes_movies_clean.csv', index_col=False)

rottenMovies = dfm.filter(['id', 'title']) # One line per movie with id, title, and "Release Date"
rottenReviews = dfs.filter(['id', 'creationDate', 'isTopCritic', 'originalScore']) # One line per review with normalized score, etc.
rottenReviews['binaryScore'] = [0 if sS == "NEGATIVE" else 1 for sS in dfs['scoreSentiment']] # Convert sentiment to numerical value
movie_ids_winfo = pd.unique(rottenMovies['id']).tolist() # unique list of movie ids from movie list

rottenReviewsWithTitles = rottenReviews.merge(rottenMovies, how='inner', on='id')
rottenReviewsWithMultipliers = rottenReviewsWithTitles.merge(multiplierDataFrame, how='inner', on='title')
rottenReviewsWithMultipliers['creationDate'] = pd.to_datetime(rottenReviewsWithMultipliers['creationDate'])
rottenReviewsWithMultipliers

Unnamed: 0,id,creationDate,isTopCritic,originalScore,binaryScore,title,Opening Week,Opening Week Number,Multiplier
0,addicted_2014,2019-09-06,False,0.200,0,Addicted,2014-10-10,1,1.855915
1,addicted_2014,2015-01-09,False,0.000,0,Addicted,2014-10-10,1,1.855915
2,addicted_2014,2014-10-16,True,0.425,0,Addicted,2014-10-10,1,1.855915
3,addicted_2014,2014-10-11,True,0.375,0,Addicted,2014-10-10,1,1.855915
4,addicted_2014,2014-10-10,True,0.300,0,Addicted,2014-10-10,1,1.855915
...,...,...,...,...,...,...,...,...,...
469812,thor_love_and_thunder,2022-07-05,False,0.700,1,Thor: Love and Thunder,2022-07-08,1,1.832940
469813,thor_love_and_thunder,2022-07-05,False,0.800,1,Thor: Love and Thunder,2022-07-08,1,1.832940
469814,thor_love_and_thunder,2022-07-05,False,0.800,1,Thor: Love and Thunder,2022-07-08,1,1.832940
469815,thor_love_and_thunder,2022-07-05,True,0.250,0,Thor: Love and Thunder,2022-07-08,1,1.832940


In [43]:
earlyReviewsWithMultipliers = rottenReviewsWithMultipliers.loc[rottenReviewsWithMultipliers['creationDate']<=rottenReviewsWithMultipliers['Opening Week']]
earlyTopReviewsWithMultipliers = rottenReviewsWithMultipliers.loc[(rottenReviewsWithMultipliers['creationDate']<=rottenReviewsWithMultipliers['Opening Week']) \
                                                                   & (rottenReviewsWithMultipliers.isTopCritic)]

np.float64(0.911504424778761)

In [50]:
earlyReviewsWithMultipliers.filter(['title', 'originalScore', 'binaryScore']).groupby(['title']).mean()

Unnamed: 0_level_0,originalScore,binaryScore
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71,0.779367,0.974684
10 Cloverfield Lane,0.752895,0.905263
102 Dalmatians,0.493200,0.320000
102 Not Out,0.560000,0.600000
12 Mighty Orphans,0.584677,0.580645
...,...,...
Zookeeper,0.358036,0.142857
Zoolander,0.619643,0.571429
Zoom,0.380000,0.000000
Zootopia,0.798129,0.967742


In [None]:
# earlyReviewsWithMultipliers[earlyReviewsWithMultipliers.title=='King Richard'].originalScore.mean()
earlyTopReviewsWithMultipliers[earlyTopReviewsWithMultipliers.title=='King Richard']

Unnamed: 0,id,creationDate,isTopCritic,originalScore,binaryScore,title,Opening Week,Opening Week Number,Multiplier
43736,king_richard,2021-11-19,True,0.875,1,King Richard,2021-11-19,1,1.873934
43748,king_richard,2021-11-19,True,0.875,1,King Richard,2021-11-19,1,1.873934
43749,king_richard,2021-11-19,True,0.8,1,King Richard,2021-11-19,1,1.873934
43751,king_richard,2021-11-19,True,0.8,1,King Richard,2021-11-19,1,1.873934
43756,king_richard,2021-11-19,True,0.875,1,King Richard,2021-11-19,1,1.873934
43758,king_richard,2021-11-19,True,0.625,1,King Richard,2021-11-19,1,1.873934
43759,king_richard,2021-11-19,True,0.825,1,King Richard,2021-11-19,1,1.873934
43761,king_richard,2021-11-19,True,0.875,1,King Richard,2021-11-19,1,1.873934
43766,king_richard,2021-11-18,True,0.75,1,King Richard,2021-11-19,1,1.873934
43767,king_richard,2021-11-18,True,0.75,1,King Richard,2021-11-19,1,1.873934


In [None]:
avgEarlyReviews = pd.DataFrame(columns=('title', 'avgScore', 'avgBinary'))
i=0
for title in pd.unique(earlyReviewsWithMultipliers.title).tolist:
    earlyReviewsWithMultipliers.iloc[i,earlyReviewsWithMultipliers[earlyReviewsWithMultipliers.title=='King Richard'].originalScore.mean()]

array(['addicted_2014', 'the_duff', 'falling_down', ..., 'nomadland',
       'charlies_angels_2019', 'thor_love_and_thunder'],
      shape=(4025,), dtype=object)