## Load Data

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

#  Import and read the data
application_df = pd.read_csv("./Resources/movies_people2numbers.csv")
application_df.head()

Using TensorFlow backend


Unnamed: 0.1,Unnamed: 0,Movie,ProductionBudget,DomesticGross,WorldwideGross,OMDB_Title,OMDB_Rated,OMDB_Runtime,OMDB_Genre,OMDB_Director,...,ReleaseMonth,Director_Avg_Box,Director_Avg_Rating,Director_Score,Writer_Avg_Box,Writer_Avg_Rating,Writer_Score,Actor_Avg_Box,Actor_Avg_Rating,Actor_Score
0,0,Avatar: The Way of Water,460000000,684075767,2319591720,Avatar: The Way of Water,teens,192.0,"Action, Adventure, Fantasy",James Cameron,...,12.0,334218300.0,7.91,14.6,285075000.0,7.33,13.03,161177700.0,6.56,9.78
1,1,Avengers: Endgame,400000000,858373000,2788912285,Avengers: Endgame,teens,181.0,"Action, Adventure, Drama","Anthony Russo, Joe Russo",...,4.0,,7.18,,421891400.0,7.16,15.53,168838900.0,6.97,10.38
2,2,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,teens,136.0,"Action, Adventure, Fantasy",Rob Marshall,...,5.0,131481300.0,6.58,9.21,176200900.0,6.81,10.33,83911340.0,6.5,8.44
3,3,Avengers: Age of Ultron,365000000,459005868,1395316979,Avengers: Age of Ultron,teens,141.0,"Action, Adventure, Sci-Fi",Joss Whedon,...,5.0,351350100.0,7.78,14.81,327398500.0,6.71,13.71,168838900.0,6.97,10.38
4,4,Fast X,340000000,145960660,714414576,Fast X,teens,141.0,"Action, Adventure, Crime",Louis Leterrier,...,5.0,108156100.0,6.31,8.48,97760420.0,5.97,7.81,97319840.0,6.17,8.09


In [2]:
application_df.columns

Index(['Unnamed: 0', 'Movie', 'ProductionBudget', 'DomesticGross',
       'WorldwideGross', 'OMDB_Title', 'OMDB_Rated', 'OMDB_Runtime',
       'OMDB_Genre', 'OMDB_Director', 'OMDB_Writer', 'OMDB_Actors',
       'OMDB_Language', 'OMDB_Country', 'OMDB_Metascore', 'OMDB_imdbRating',
       'OMDB_imdbVotes', 'OMDB_BoxOffice', 'OMDB_Production', 'ReleaseYear',
       'ReleaseMonth', 'Director_Avg_Box', 'Director_Avg_Rating',
       'Director_Score', 'Writer_Avg_Box', 'Writer_Avg_Rating', 'Writer_Score',
       'Actor_Avg_Box', 'Actor_Avg_Rating', 'Actor_Score'],
      dtype='object')

# Model for IMDB Rating for a Movie Maker with a plan

In [3]:
df_rating = application_df[['ProductionBudget', 'OMDB_Rated', 'Director_Avg_Rating', 'OMDB_Runtime', 'ReleaseMonth', 
                            'Writer_Avg_Rating', 'Actor_Avg_Rating', 'OMDB_imdbRating']]
df_rating = df_rating.dropna()
print(len(df_rating))
df_rating.head()

5532


Unnamed: 0,ProductionBudget,OMDB_Rated,Director_Avg_Rating,OMDB_Runtime,ReleaseMonth,Writer_Avg_Rating,Actor_Avg_Rating,OMDB_imdbRating
0,460000000,teens,7.91,192.0,12.0,7.33,6.56,7.6
1,400000000,teens,7.18,181.0,4.0,7.16,6.97,8.4
2,379000000,teens,6.58,136.0,5.0,6.81,6.5,6.6
3,365000000,teens,7.78,141.0,5.0,6.71,6.97,7.3
4,340000000,teens,6.31,141.0,5.0,5.97,6.17,5.8


## Set the non-numerical data to dummy columns and set features vs. target

In [4]:
df_rating = pd.get_dummies(data=df_rating, columns=['OMDB_Rated'])
df_rating.head()

Unnamed: 0,ProductionBudget,Director_Avg_Rating,OMDB_Runtime,ReleaseMonth,Writer_Avg_Rating,Actor_Avg_Rating,OMDB_imdbRating,OMDB_Rated_adults,OMDB_Rated_kids,OMDB_Rated_teens,OMDB_Rated_unknown
0,460000000,7.91,192.0,12.0,7.33,6.56,7.6,0,0,1,0
1,400000000,7.18,181.0,4.0,7.16,6.97,8.4,0,0,1,0
2,379000000,6.58,136.0,5.0,6.81,6.5,6.6,0,0,1,0
3,365000000,7.78,141.0,5.0,6.71,6.97,7.3,0,0,1,0
4,340000000,6.31,141.0,5.0,5.97,6.17,5.8,0,0,1,0


In [5]:
y = df_rating['OMDB_imdbRating']
X = df_rating.drop(columns='OMDB_imdbRating')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Initialize the Random Forest Regressor
rf_model_rating = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model_rating.fit(X_train, y_train)

# Make predictions on the test set
rating_predictions = rf_model_rating.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, rating_predictions)
print(f'Mean Squared Error: {mse}')

# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model_rating.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model_rating.feature_importances_, X.columns), reverse=True)

Mean Squared Error: 0.14343423938572725


[(0.775962504089004, 'Writer_Avg_Rating'),
 (0.08416091027943252, 'Actor_Avg_Rating'),
 (0.07750321349719089, 'Director_Avg_Rating'),
 (0.024536243031825198, 'ProductionBudget'),
 (0.01961432744416958, 'OMDB_Runtime'),
 (0.011969548930016472, 'ReleaseMonth'),
 (0.002429811626970836, 'OMDB_Rated_teens'),
 (0.002042909616134324, 'OMDB_Rated_adults'),
 (0.0014000102702094257, 'OMDB_Rated_kids'),
 (0.00038052121504678044, 'OMDB_Rated_unknown')]

# Applying Keras Tuner

In [7]:
# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

# Skipping the scaling:
X_train_scaled = X_train
X_test_scaled = X_test

In [8]:
def create_model(hp):
    movie_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide which optimizer
    optimizer = hp.Choice('opt', values=['adam', 'sgd', 'rmsprop'])
    
    # Allow kerastuner to decide number of neurons in first layer
    movie_model.add(tf.keras.layers.Dense(units=hp.Int('first_units', min_value=1, max_value=10, step=2), 
                                          activation=activation, input_dim=10))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        movie_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    movie_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    
    # Compile the model
    movie_model.compile(loss="mse", optimizer=optimizer, metrics=[tf.keras.metrics.MeanSquaredError()])

    return movie_model

In [9]:
tuner = kt.Hyperband(
    create_model,
    objective="val_loss",
    max_epochs=20,
    hyperband_iterations=2, 
    project_name='movie_viewers')

In [10]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 06s]
val_loss: 29.748559951782227

Best val_loss So Far: 29.711294174194336
Total elapsed time: 00h 02m 43s


In [11]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'relu',
 'opt': 'sgd',
 'first_units': 7,
 'num_layers': 2,
 'units_0': 9,
 'units_1': 7,
 'units_2': 1,
 'units_3': 5,
 'units_4': 1,
 'units_5': 7,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0}

In [12]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, MSE: {model_accuracy}")

35/35 - 0s - loss: 29.7113 - mean_squared_error: 29.7113 - 100ms/epoch - 3ms/step
Loss: 29.711294174194336, MSE: 29.711294174194336


In [13]:
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 7)                 77        
                                                                 
 dense_1 (Dense)             (None, 9)                 72        
                                                                 
 dense_2 (Dense)             (None, 7)                 70        
                                                                 
 dense_3 (Dense)             (None, 1)                 8         
                                                                 
Total params: 227 (908.00 Byte)
Trainable params: 227 (908.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Model for Viewer using Metascore as a critic review available before movie release

In [14]:
df_rating_v = application_df[['ProductionBudget', 'OMDB_Rated', 'Director_Avg_Rating', 'OMDB_Runtime', 'ReleaseMonth', 
                            'Writer_Avg_Rating', 'Actor_Avg_Rating', 'OMDB_imdbRating', 'OMDB_Metascore']]
df_rating_v = df_rating_v.dropna()
print(len(df_rating_v))
df_rating_v.head()

4915


Unnamed: 0,ProductionBudget,OMDB_Rated,Director_Avg_Rating,OMDB_Runtime,ReleaseMonth,Writer_Avg_Rating,Actor_Avg_Rating,OMDB_imdbRating,OMDB_Metascore
0,460000000,teens,7.91,192.0,12.0,7.33,6.56,7.6,67.0
1,400000000,teens,7.18,181.0,4.0,7.16,6.97,8.4,78.0
2,379000000,teens,6.58,136.0,5.0,6.81,6.5,6.6,45.0
3,365000000,teens,7.78,141.0,5.0,6.71,6.97,7.3,66.0
4,340000000,teens,6.31,141.0,5.0,5.97,6.17,5.8,56.0


In [15]:
df_rating_v = pd.get_dummies(data=df_rating_v, columns=['OMDB_Rated'])
df_rating_v.head()

Unnamed: 0,ProductionBudget,Director_Avg_Rating,OMDB_Runtime,ReleaseMonth,Writer_Avg_Rating,Actor_Avg_Rating,OMDB_imdbRating,OMDB_Metascore,OMDB_Rated_adults,OMDB_Rated_kids,OMDB_Rated_teens,OMDB_Rated_unknown
0,460000000,7.91,192.0,12.0,7.33,6.56,7.6,67.0,0,0,1,0
1,400000000,7.18,181.0,4.0,7.16,6.97,8.4,78.0,0,0,1,0
2,379000000,6.58,136.0,5.0,6.81,6.5,6.6,45.0,0,0,1,0
3,365000000,7.78,141.0,5.0,6.71,6.97,7.3,66.0,0,0,1,0
4,340000000,6.31,141.0,5.0,5.97,6.17,5.8,56.0,0,0,1,0


In [16]:
y_v = df_rating_v['OMDB_imdbRating']
X_v = df_rating_v.drop(columns='OMDB_imdbRating')

# Split the data into training and testing sets
X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(X_v, y_v, test_size=0.2, random_state=42)

In [17]:
# Initialize the Random Forest Regressor
rf_model_rating_v = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model_rating_v.fit(X_train_v, y_train_v)

# Make predictions on the test set
rating_predictions_v = rf_model_rating_v.predict(X_test_v)

# Evaluate the model
mse_v = mean_squared_error(y_test_v, rating_predictions_v)
print(f'Mean Squared Error: {mse_v}')

# Random Forests in sklearn will automatically calculate feature importance
importances_v = rf_model_rating_v.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model_rating_v.feature_importances_, X_v.columns), reverse=True)

Mean Squared Error: 0.14351435808748733


[(0.7444759511126883, 'Writer_Avg_Rating'),
 (0.07199632693365078, 'Actor_Avg_Rating'),
 (0.06634227391786834, 'Director_Avg_Rating'),
 (0.057195323399641784, 'OMDB_Metascore'),
 (0.02264209080913981, 'ProductionBudget'),
 (0.019797193791530734, 'OMDB_Runtime'),
 (0.011693556884447107, 'ReleaseMonth'),
 (0.002185500858769274, 'OMDB_Rated_teens'),
 (0.0019713422677732195, 'OMDB_Rated_adults'),
 (0.0014500544966128733, 'OMDB_Rated_kids'),
 (0.00025038552787777073, 'OMDB_Rated_unknown')]

## Applying Keras Tuner

In [18]:
# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler_v = scaler.fit(X_train_v)

# # Scale the data
# X_train_scaled_v = X_scaler_v.transform(X_train_v)
# X_test_scaled_v = X_scaler_v.transform(X_test_v)

# Skipping the scaling:
X_train_scaled_v = X_train_v
X_test_scaled_v = X_test_v

In [19]:
def create_model_v(hp):
    movie_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide which optimizer
    optimizer = hp.Choice('opt', values=['adam', 'sgd', 'rmsprop'])
    
    # Allow kerastuner to decide number of neurons in first layer
    movie_model.add(tf.keras.layers.Dense(units=hp.Int('first_units', min_value=1, max_value=10, step=2), 
                                          activation=activation, input_dim=11))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        movie_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    movie_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    
    # Compile the model
    movie_model.compile(loss="mse", optimizer='sgd', metrics=[tf.keras.metrics.MeanSquaredError()])

    return movie_model

In [20]:
tuner_v = kt.Hyperband(
    create_model_v,
    objective="val_loss",
    max_epochs=20,
    hyperband_iterations=2, 
    project_name='movie_makers')

In [21]:
# Run the kerastuner search for best hyperparameters
tuner_v.search(X_train_scaled_v,y_train_v,epochs=20,validation_data=(X_test_scaled_v,y_test_v))

Trial 60 Complete [00h 00m 07s]
val_loss: 30.799524307250977

Best val_loss So Far: 30.79459571838379
Total elapsed time: 00h 02m 33s


In [22]:
# Get best model hyperparameters
best_hyper_v = tuner_v.get_best_hyperparameters(1)[0]
best_hyper_v.values

{'activation': 'relu',
 'opt': 'rmsprop',
 'first_units': 3,
 'num_layers': 2,
 'units_0': 7,
 'units_1': 3,
 'units_2': 7,
 'units_3': 9,
 'units_4': 3,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0}

In [23]:
# Evaluate best model against full test data
best_model_v = tuner_v.get_best_models(1)[0]
model_loss_v, model_accuracy_v = best_model_v.evaluate(X_test_scaled_v,y_test_v,verbose=2)
print(f"Loss: {model_loss_v}, MSE: {model_accuracy_v}")

31/31 - 0s - loss: 30.7946 - mean_squared_error: 30.7946 - 110ms/epoch - 4ms/step
Loss: 30.79459571838379, MSE: 30.79459571838379


In [24]:
best_model_v.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3)                 36        
                                                                 
 dense_1 (Dense)             (None, 7)                 28        
                                                                 
 dense_2 (Dense)             (None, 3)                 24        
                                                                 
 dense_3 (Dense)             (None, 1)                 4         
                                                                 
Total params: 92 (368.00 Byte)
Trainable params: 92 (368.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
