In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Relative file paths of the two input CSV files used by this notebook
director_data_path = 'Resources/director_data_avg.csv'
movies_path = 'Resources/movies_people2numbers.csv'

In [3]:
# Load the movies table and the director data table into DataFrames
df = pd.read_csv(filepath_or_buffer=movies_path)
director_data_df = pd.read_csv(filepath_or_buffer=director_data_path)

In [4]:
# Preview the first five rows of the director data (already loaded above)
director_data_df.head(n=5)

Unnamed: 0,Director,Instances,Total_Box_Office,Total_Production_Budget,Total_Metascore,Total_IMDB_Rating,Average_Box_Office,Average_IMDB_Rating,Success_Metric
0,James Cameron,8,2673746000.0,1190400000,593.0,63.3,334218300.0,7.9125,14.596865
1,Anthony Russo,8,,1426000000,491.0,57.4,,7.175,
2,Joe Russo,8,,1426000000,491.0,57.4,,7.175,
3,Rob Marshall,6,788887600.0,760200000,364.0,39.5,131481300.0,6.583333,9.212959
4,Joss Whedon,5,1756751000.0,714000000,352.0,38.9,351350100.0,7.78,14.807003


In [5]:
# List the distinct values found in the OMDB_Rated column of the movies table
rating_groups = df['OMDB_Rated'].unique()
print(rating_groups)


['teens' 'unknown' 'kids' 'adults']


# Movies Rated for Kids

In [6]:
# Keep only the rows whose OMDB_Rated value is 'kids', then preview them
is_kids = df['OMDB_Rated'].eq('kids')
kids = df[is_kids]
kids.head()

Unnamed: 0.1,Unnamed: 0,Movie,ProductionBudget,DomesticGross,WorldwideGross,OMDB_Title,OMDB_Rated,OMDB_Runtime,OMDB_Genre,OMDB_Director,...,ReleaseMonth,Director_Avg_Box,Director_Avg_Rating,Director_Score,Writer_Avg_Box,Writer_Avg_Rating,Writer_Score,Actor_Avg_Box,Actor_Avg_Rating,Actor_Score
15,15,The Lion King,260000000,543638043,1646106779,The Lion King,kids,88.0,"Animation, Adventure, Drama","Roger Allers, Rob Minkoff",...,6.0,198162300.0,7.06,11.02,266728900.0,7.43,12.77,125553000.0,6.95,9.44
16,16,Tangled,260000000,200821936,582440151,Tangled,kids,100.0,"Animation, Adventure, Comedy","Nathan Greno, Byron Howard",...,11.0,194440600.0,7.56,11.45,105362100.0,6.75,8.86,132757000.0,6.7,9.36
22,22,Harry Potter and the Half-Blood Prince,250000000,302089278,929411069,Harry Potter and the Half-Blood Prince,kids,153.0,"Action, Adventure, Family",David Yates,...,7.0,201800700.0,6.87,10.9,235177400.0,7.3,12.0,178589800.0,7.17,10.74
23,23,The Little Mermaid,250000000,298172056,568345048,The Little Mermaid,kids,83.0,"Animation, Adventure, Family","Ron Clements, John Musker",...,11.0,141975400.0,7.51,10.35,204635800.0,7.43,11.52,115910900.0,7.4,9.72
37,37,The Chronicles of Narnia: Prince Caspian,225000000,141621490,417341288,The Chronicles of Narnia: Prince Caspian,kids,150.0,"Action, Adventure, Family",Andrew Adamson,...,5.0,215880900.0,7.1,11.42,229520100.0,7.01,11.24,88756510.0,6.33,8.11


In [7]:
# Show the column labels available in the kids subset
kids.columns

Index(['Unnamed: 0', 'Movie', 'ProductionBudget', 'DomesticGross',
       'WorldwideGross', 'OMDB_Title', 'OMDB_Rated', 'OMDB_Runtime',
       'OMDB_Genre', 'OMDB_Director', 'OMDB_Writer', 'OMDB_Actors',
       'OMDB_Language', 'OMDB_Country', 'OMDB_Metascore', 'OMDB_imdbRating',
       'OMDB_imdbVotes', 'OMDB_BoxOffice', 'OMDB_Production', 'ReleaseYear',
       'ReleaseMonth', 'Director_Avg_Box', 'Director_Avg_Rating',
       'Director_Score', 'Writer_Avg_Box', 'Writer_Avg_Rating', 'Writer_Score',
       'Actor_Avg_Box', 'Actor_Avg_Rating', 'Actor_Score'],
      dtype='object')

In [242]:
# Feature-set experiments for the kids model; every run predicts OMDB_imdbRating.
# Mean squared error on the test split recorded for each candidate set:
#
#   0.16813060090473145  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating
#   0.13352428378378384  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating
#   0.13417093918918938  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating, ReleaseMonth
#   0.13155986486486507  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating, ReleaseMonth, ReleaseYear
#                        (active set below)
#   0.13257979729729735  ProductionBudget, Director_Avg_Rating, Writer_Avg_Rating,
#                        Writer_Score, Actor_Avg_Rating, ReleaseMonth
#   0.1329218243243243   ProductionBudget, Director_Avg_Rating, Director_Score,
#                        Writer_Avg_Rating, Writer_Score, Actor_Avg_Rating,
#                        Actor_Score

# Active feature columns (the set with the lowest recorded MSE above)
features = [
    'ProductionBudget',
    'Director_Avg_Box',
    'Director_Avg_Rating',
    'Writer_Avg_Box',
    'Writer_Avg_Rating',
    'Actor_Avg_Box',
    'Actor_Avg_Rating',
    'ReleaseMonth',
    'ReleaseYear',
]

# Column the model is trained to predict
target = 'OMDB_imdbRating'

In [243]:
# Build the modelling frame for the kids subset: keep only rows that have a
# value for every selected feature and for the target.
# The frame is rebuilt from `df` on every run instead of overwriting the
# previous `kids`, so re-running this cell after changing `features` starts
# from all kids rows again rather than from the rows that survived an earlier,
# different feature list. On a fresh top-to-bottom run the result is the same
# as dropping the rows from the `kids` frame created earlier.
kids = df[df['OMDB_Rated'] == 'kids'].dropna(subset=features + [target])

In [244]:
# Disabled label-encoding step; nothing in this cell runs.
# NOTE(review): `pg_and_g_movies` is not defined in any of the cells shown
# here, so these lines would raise NameError if re-enabled as written.
# le = LabelEncoder()
# pg_and_g_movies['OMDB_Rated'] = le.fit_transform(pg_and_g_movies['OMDB_Rated'])

In [245]:
# Feature matrix (the selected columns) and target vector for the kids model
X = kids.loc[:, features]
y = kids.loc[:, target]

In [246]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [247]:
# Fit a 100-tree random forest regressor on the training rows (seeded for repeatability)
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [248]:
# Predict the target for the held-out test rows using the fitted model
predictions = model.predict(X_test)

In [249]:
# Score the predictions against the true test targets with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.13155986486486507


# Movies Rated for Teens

In [59]:
# Keep only the rows whose OMDB_Rated value is 'teens', then preview them
is_teens = df['OMDB_Rated'].eq('teens')
teens = df[is_teens]
teens.head()

Unnamed: 0.1,Unnamed: 0,Movie,ProductionBudget,DomesticGross,WorldwideGross,OMDB_Title,OMDB_Rated,OMDB_Runtime,OMDB_Genre,OMDB_Director,...,ReleaseMonth,Director_Avg_Box,Director_Avg_Rating,Director_Score,Writer_Avg_Box,Writer_Avg_Rating,Writer_Score,Actor_Avg_Box,Actor_Avg_Rating,Actor_Score
0,0,Avatar: The Way of Water,460000000,684075767,2319591720,Avatar: The Way of Water,teens,192.0,"Action, Adventure, Fantasy",James Cameron,...,12.0,334218300.0,7.91,14.6,285075000.0,7.33,13.03,161177700.0,6.56,9.78
1,1,Avengers: Endgame,400000000,858373000,2788912285,Avengers: Endgame,teens,181.0,"Action, Adventure, Drama","Anthony Russo, Joe Russo",...,4.0,,7.18,,421891400.0,7.16,15.53,168838900.0,6.97,10.38
2,2,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,teens,136.0,"Action, Adventure, Fantasy",Rob Marshall,...,5.0,131481300.0,6.58,9.21,176200900.0,6.81,10.33,83911340.0,6.5,8.44
3,3,Avengers: Age of Ultron,365000000,459005868,1395316979,Avengers: Age of Ultron,teens,141.0,"Action, Adventure, Sci-Fi",Joss Whedon,...,5.0,351350100.0,7.78,14.81,327398500.0,6.71,13.71,168838900.0,6.97,10.38
4,4,Fast X,340000000,145960660,714414576,Fast X,teens,141.0,"Action, Adventure, Crime",Louis Leterrier,...,5.0,108156100.0,6.31,8.48,97760420.0,5.97,7.81,97319840.0,6.17,8.09


In [60]:
# Show the column labels available in the teens subset
teens.columns

Index(['Unnamed: 0', 'Movie', 'ProductionBudget', 'DomesticGross',
       'WorldwideGross', 'OMDB_Title', 'OMDB_Rated', 'OMDB_Runtime',
       'OMDB_Genre', 'OMDB_Director', 'OMDB_Writer', 'OMDB_Actors',
       'OMDB_Language', 'OMDB_Country', 'OMDB_Metascore', 'OMDB_imdbRating',
       'OMDB_imdbVotes', 'OMDB_BoxOffice', 'OMDB_Production', 'ReleaseYear',
       'ReleaseMonth', 'Director_Avg_Box', 'Director_Avg_Rating',
       'Director_Score', 'Writer_Avg_Box', 'Writer_Avg_Rating', 'Writer_Score',
       'Actor_Avg_Box', 'Actor_Avg_Rating', 'Actor_Score'],
      dtype='object')

In [228]:
# Feature-set experiments for the teens model; every run predicts OMDB_imdbRating.
# Mean squared error on the test split recorded for each candidate set:
#
#   0.2396432166992439   ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating
#   0.19846093397042086  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating
#   0.19631951817216522  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating, ReleaseMonth
#                        (active set below)
#   0.20314064505119453  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating, ReleaseMonth, ReleaseYear
#   0.1986653175123248   ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Writer_Score,
#                        Actor_Avg_Box, Actor_Avg_Rating, ReleaseMonth
#   0.2021205239666289   ProductionBudget, Director_Avg_Rating, Writer_Avg_Rating,
#                        Writer_Score, Actor_Avg_Rating, ReleaseMonth
#   0.1982368520932879   ProductionBudget, Director_Avg_Rating, Director_Score,
#                        Writer_Avg_Rating, Writer_Score, Actor_Avg_Rating,
#                        ReleaseMonth
#   0.19663169275692077  ProductionBudget, Director_Avg_Rating, Director_Score,
#                        Writer_Avg_Rating, Writer_Score, Actor_Avg_Rating,
#                        Actor_Score, ReleaseMonth
#   0.1974802224345848   ProductionBudget, Director_Avg_Rating, Director_Score,
#                        Writer_Avg_Rating, Writer_Score, Actor_Avg_Rating,
#                        Actor_Score
#
# NOTE(review): the saved output of this section's evaluation cell shows
# 0.20314064505119453, which is the value logged above for the set that also
# includes ReleaseYear, not for the active set. The saved output looks stale;
# re-run the section to confirm.

# Active feature columns (the set with the lowest recorded MSE above)
features = [
    'ProductionBudget',
    'Director_Avg_Box',
    'Director_Avg_Rating',
    'Writer_Avg_Box',
    'Writer_Avg_Rating',
    'Actor_Avg_Box',
    'Actor_Avg_Rating',
    'ReleaseMonth',
]

# Column the model is trained to predict
target = 'OMDB_imdbRating'

In [229]:
# Build the modelling frame for the teens subset: keep only rows that have a
# value for every selected feature and for the target.
# The frame is rebuilt from `df` on every run instead of overwriting the
# previous `teens`, so re-running this cell after changing `features` starts
# from all teens rows again rather than from the rows that survived an earlier,
# different feature list. On a fresh top-to-bottom run the result is the same
# as dropping the rows from the `teens` frame created earlier.
teens = df[df['OMDB_Rated'] == 'teens'].dropna(subset=features + [target])

In [230]:
# Feature matrix (the selected columns) and target vector for the teens model
X = teens.loc[:, features]
y = teens.loc[:, target]

In [231]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [232]:
# Fit a 100-tree random forest regressor on the training rows (seeded for repeatability)
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [233]:
# Predict the target for the held-out test rows using the fitted model
predictions = model.predict(X_test)

In [234]:
# Score the predictions against the true test targets with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.20314064505119453


# Movies Rated for Adults

In [83]:
# Keep only the rows whose OMDB_Rated value is 'adults', then preview them
is_adults = df['OMDB_Rated'].eq('adults')
adults = df[is_adults]
adults.head()

Unnamed: 0.1,Unnamed: 0,Movie,ProductionBudget,DomesticGross,WorldwideGross,OMDB_Title,OMDB_Rated,OMDB_Runtime,OMDB_Genre,OMDB_Director,...,ReleaseMonth,Director_Avg_Box,Director_Avg_Rating,Director_Score,Writer_Avg_Box,Writer_Avg_Rating,Writer_Score,Actor_Avg_Box,Actor_Avg_Rating,Actor_Score
90,90,Killers of the Flower Moon,200000000,62180032,139359510,Killers of the Flower Moon,adults,206.0,"Crime, Drama, History",Martin Scorsese,...,10.0,,7.6,,52245750.0,7.55,8.59,80141236.67,7.46,9.42
106,106,The Matrix Resurrections,190000000,40463197,159197755,The Matrix Resurrections,adults,148.0,"Action, Sci-Fi",Lana Wachowski,...,12.0,94112270.0,6.8,8.68,51818520.0,6.38,7.42,48259669.29,6.35,7.1
115,115,Blade Runner 2049,185000000,92054159,257767797,Blade Runner 2049,adults,164.0,"Action, Drama, Mystery",Denis Villeneuve,...,10.0,,7.9,,80261360.0,7.37,8.97,98936636.61,6.83,8.75
116,116,Terminator: Dark Fate,185000000,62253077,250367666,Terminator: Dark Fate,adults,128.0,"Action, Adventure, Sci-Fi",Tim Miller,...,11.0,212661900.0,7.1,11.35,99936730.0,6.51,8.51,52338500.18,6.01,7.06
117,117,The Suicide Squad,185000000,55817425,167097737,The Suicide Squad,adults,132.0,"Action, Adventure, Comedy",James Gunn,...,8.0,191079200.0,7.32,11.14,137476800.0,6.83,9.58,81127471.42,6.5,8.12


In [84]:
# Show the column labels available in the adults subset
adults.columns

Index(['Unnamed: 0', 'Movie', 'ProductionBudget', 'DomesticGross',
       'WorldwideGross', 'OMDB_Title', 'OMDB_Rated', 'OMDB_Runtime',
       'OMDB_Genre', 'OMDB_Director', 'OMDB_Writer', 'OMDB_Actors',
       'OMDB_Language', 'OMDB_Country', 'OMDB_Metascore', 'OMDB_imdbRating',
       'OMDB_imdbVotes', 'OMDB_BoxOffice', 'OMDB_Production', 'ReleaseYear',
       'ReleaseMonth', 'Director_Avg_Box', 'Director_Avg_Rating',
       'Director_Score', 'Writer_Avg_Box', 'Writer_Avg_Rating', 'Writer_Score',
       'Actor_Avg_Box', 'Actor_Avg_Rating', 'Actor_Score'],
      dtype='object')

In [235]:
# Feature-set experiments for the adults model; every run predicts OMDB_imdbRating.
# Mean squared error on the test split recorded for each candidate set:
#
#   0.1894942956050434   ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating
#   0.16529893277850685  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating
#   0.16511039998323118  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating, ReleaseMonth
#   0.16024384514435697  ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Actor_Avg_Box,
#                        Actor_Avg_Rating, ReleaseMonth, ReleaseYear
#                        (active set below)
#   0.164431870709197    ProductionBudget, Director_Avg_Box, Director_Avg_Rating,
#                        Writer_Avg_Box, Writer_Avg_Rating, Writer_Score,
#                        Actor_Avg_Box, Actor_Avg_Rating, ReleaseMonth
#   0.16557388960921546  ProductionBudget, Director_Avg_Rating, Writer_Avg_Rating,
#                        Writer_Score, Actor_Avg_Rating, ReleaseMonth

# Active feature columns (the set with the lowest recorded MSE above)
features = [
    'ProductionBudget',
    'Director_Avg_Box',
    'Director_Avg_Rating',
    'Writer_Avg_Box',
    'Writer_Avg_Rating',
    'Actor_Avg_Box',
    'Actor_Avg_Rating',
    'ReleaseMonth',
    'ReleaseYear',
]

# Column the model is trained to predict
target = 'OMDB_imdbRating'

In [236]:
# Build the modelling frame for the adults subset: keep only rows that have a
# value for every selected feature and for the target.
# The frame is rebuilt from `df` on every run instead of overwriting the
# previous `adults`, so re-running this cell after changing `features` starts
# from all adults rows again rather than from the rows that survived an
# earlier, different feature list. On a fresh top-to-bottom run the result is
# the same as dropping the rows from the `adults` frame created earlier.
adults = df[df['OMDB_Rated'] == 'adults'].dropna(subset=features + [target])

In [237]:
# Feature matrix (the selected columns) and target vector for the adults model
X = adults.loc[:, features]
y = adults.loc[:, target]

In [238]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [239]:
# Fit a 100-tree random forest regressor on the training rows (seeded for repeatability)
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [240]:
# Predict the target for the held-out test rows using the fitted model
predictions = model.predict(X_test)

In [241]:
# Score the predictions against the true test targets with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.16024384514435697
