In [10]:
import pandas as pd

# Load IMDB dataset
imdb_df = pd.read_csv('IMDB Dataset.csv')

# Preprocess the IMDB dataset
def preprocess_imdb_data(df):
    df['review'] = df['review'].str.replace('<br />', ' ')
    df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
    return df

imdb_df = preprocess_imdb_data(imdb_df)
print(imdb_df.head(10))


                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production.   The filming t...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1
5  Probably my all-time favorite movie, a story o...          1
6  I sure would like to see a resurrection of a u...          1
7  This show was an amazing, fresh & innovative i...          0
8  Encouraged by the positive comments about this...          0
9  If you like original gut wrenching laughter yo...          1


## Data processing

In [2]:
import json

# Load TMDB dataset
tmdb_df = pd.read_csv('tmdb_5000_movies.csv')

# Preprocess the TMDB dataset
def preprocess_tmdb_data(df):
    df['genres'] = df['genres'].apply(lambda x: ', '.join([genre['name'] for genre in json.loads(x)]))
    df['keywords'] = df['keywords'].apply(lambda x: ', '.join([keyword['name'] for keyword in json.loads(x)]))
    df['production_companies'] = df['production_companies'].apply(lambda x: ', '.join([company['name'] for company in json.loads(x)]))
    df['production_countries'] = df['production_countries'].apply(lambda x: ', '.join([country['name'] for country in json.loads(x)]))
    df['spoken_languages'] = df['spoken_languages'].apply(lambda x: ', '.join([language['name'] for language in json.loads(x)]))
    df['release_date'] = pd.to_datetime(df['release_date'])
    df['release_year'] = df['release_date'].dt.year
    return df

tmdb_df = preprocess_tmdb_data(tmdb_df)
print(tmdb_df.head())


      budget                                       genres  \
0  237000000  Action, Adventure, Fantasy, Science Fiction   
1  300000000                   Adventure, Fantasy, Action   
2  245000000                     Action, Adventure, Crime   
3  250000000               Action, Crime, Drama, Thriller   
4  260000000           Action, Adventure, Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash, future, space war, space colony...                en   
1  ocean, drug abuse, exotic island, east india t...                en   
2  spy, based on novel, secret agent, sequel, mi6...    

In [3]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Combine relevant features from TMDB dataset for modeling
selected_features = ['budget', 'popularity', 'release_year', 'runtime', 'vote_average', 'vote_count', 'revenue']
tmdb_selected_df = tmdb_df[selected_features]

# Drop rows with missing values
tmdb_selected_df = tmdb_selected_df.dropna()

# Split data into features and target variables
X = tmdb_selected_df.drop(columns=['vote_average', 'revenue'])
y_rating = tmdb_selected_df['vote_average']
y_revenue = tmdb_selected_df['revenue']

# Split the data into training and testing sets
X_train, X_test, y_train_rating, y_test_rating = train_test_split(X, y_rating, test_size=0.2, random_state=42)
X_train, X_test, y_train_revenue, y_test_revenue = train_test_split(X, y_revenue, test_size=0.2, random_state=42)

## Random Forest Implementation

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Train Random Forest models
rating_model = RandomForestRegressor(n_estimators=100, random_state=42)
rating_model.fit(X_train, y_train_rating)
revenue_model = RandomForestRegressor(n_estimators=100, random_state=42)
revenue_model.fit(X_train, y_train_revenue)

# Evaluate the models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return mae, rmse, r2

mae, rmse, r2 = evaluate_model(rating_model, X_test, y_test_rating)
print(f'Rating Model - MAE: {mae}, RMSE: {rmse}, R2: {r2}')

mae, rmse, r2 = evaluate_model(revenue_model, X_test, y_test_revenue)
print(f'Revenue Model - MAE: {mae}, RMSE: {rmse}, R2: {r2}')

Rating Model - MAE: 0.5765614583333333, RMSE: 0.7784416076099306, R2: 0.5103187690547113
Revenue Model - MAE: 41451530.35896875, RMSE: 87075235.38772306, R2: 0.7149126915465905


In [5]:
X_test

Unnamed: 0,budget,popularity,release_year,runtime,vote_count
596,70000000,13.267631,2002.0,97.0,269
3371,0,2.881239,2007.0,93.0,18
3049,0,5.842299,2004.0,89.0,101
2909,11500000,4.759190,2009.0,101.0,29
8,250000000,98.885637,2009.0,153.0,5293
...,...,...,...,...,...
2969,0,1.985552,2004.0,97.0,6
198,130000000,39.448066,2013.0,96.0,1260
2422,17000000,32.746486,2012.0,112.0,2247
1485,33000000,7.339908,2000.0,96.0,110


In [6]:
X_new = X_test[:10]  # Take the first 10 rows of X_test for predictions
movies_subset = tmdb_df.iloc[:10]  # Subset of movie details corresponding to X_new

# Predict ratings and revenues
predicted_ratings = rating_model.predict(X_new)
predicted_revenues = revenue_model.predict(X_new)

# Print predictions with movie details
print("Detailed Predictions:")
for i, (rating, revenue) in enumerate(zip(predicted_ratings, predicted_revenues)):
    movie_name = movies_subset.iloc[i]['original_title']
    running_time = movies_subset.iloc[i]['runtime']
    popularity = movies_subset.iloc[i]['popularity']
    release_year = movies_subset.iloc[i]['release_year']
    vote_count = movies_subset.iloc[i]['vote_count']
    
    print(f"Movie: {movie_name}")
    print(f"  Predicted Rating: {rating:.2f}")
    print(f"  Predicted Revenue: ${revenue:.2f}")
    print(f"  Running Time: {running_time} minutes")
    print(f"  Popularity: {popularity}")
    print(f"  Release Year: {release_year}")
    print(f"  Vote Count: {vote_count}")
    print("---------------------------")

Detailed Predictions:
Movie: Avatar
  Predicted Rating: 5.64
  Predicted Revenue: $54871449.38
  Running Time: 162.0 minutes
  Popularity: 150.437577
  Release Year: 2009.0
  Vote Count: 11800
---------------------------
Movie: Pirates of the Caribbean: At World's End
  Predicted Rating: 5.90
  Predicted Revenue: $7.36
  Running Time: 169.0 minutes
  Popularity: 139.082615
  Release Year: 2007.0
  Vote Count: 4500
---------------------------
Movie: Spectre
  Predicted Rating: 6.29
  Predicted Revenue: $203077.50
  Running Time: 148.0 minutes
  Popularity: 107.376788
  Release Year: 2015.0
  Vote Count: 4466
---------------------------
Movie: The Dark Knight Rises
  Predicted Rating: 5.83
  Predicted Revenue: $7450670.66
  Running Time: 165.0 minutes
  Popularity: 112.31295
  Release Year: 2012.0
  Vote Count: 9106
---------------------------
Movie: John Carter
  Predicted Rating: 6.91
  Predicted Revenue: $858307816.63
  Running Time: 132.0 minutes
  Popularity: 43.926995
  Release Yea

## XGBoost Implementation

In [7]:
import xgboost as xgb
import numpy as np
# XGBoost model for predicting ratings
xgb_rating_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_rating_model.fit(X_train, y_train_rating)

# XGBoost model for predicting revenues
xgb_revenue_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_revenue_model.fit(X_train, y_train_revenue)

# Evaluate XGBoost models
def evaluate_xgb_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)  # RMSE
    r2 = r2_score(y_test, y_pred)
    return mae, rmse, r2

# Evaluation for ratings model
xgb_rating_mae, xgb_rating_rmse, xgb_rating_r2 = evaluate_xgb_model(xgb_rating_model, X_test, y_test_rating)

# Evaluation for revenue model
xgb_revenue_mae, xgb_revenue_rmse, xgb_revenue_r2 = evaluate_xgb_model(xgb_revenue_model, X_test, y_test_revenue)

# Print evaluation results
print("XGBoost Model Evaluation - Ratings:")
print(f"MAE: {xgb_rating_mae}")
print(f"RMSE: {xgb_rating_rmse}")
print(f"R2 Score: {xgb_rating_r2}")
print("---------------------------")
print("XGBoost Model Evaluation - Revenues:")
print(f"MAE: {xgb_revenue_mae}")
print(f"RMSE: {xgb_revenue_rmse}")
print(f"R2 Score: {xgb_revenue_r2}")

# Predict using XGBoost models
predicted_ratings_xgb = xgb_rating_model.predict(X_new)
predicted_revenues_xgb = xgb_revenue_model.predict(X_new)

print("XGBoost Predictions:")
for i, (rating, revenue) in enumerate(zip(predicted_ratings_xgb, predicted_revenues_xgb)):
    movie_name = movies_subset.iloc[i]['original_title']
    running_time = movies_subset.iloc[i]['runtime']
    popularity = movies_subset.iloc[i]['popularity']
    release_year = movies_subset.iloc[i]['release_year']
    vote_count = movies_subset.iloc[i]['vote_count']
    
    print(f"Movie: {movie_name}")
    print(f"  Predicted Rating (XGBoost): {rating:.2f}")
    print(f"  Predicted Revenue (XGBoost): ${revenue:.2f}")
    print(f"  Running Time: {running_time} minutes")
    print(f"  Popularity: {popularity}")
    print(f"  Release Year: {release_year}")
    print(f"  Vote Count: {vote_count}")
    print("---------------------------")

XGBoost Model Evaluation - Ratings:
MAE: 0.6059364631089073
RMSE: 0.8213797302930068
R2 Score: 0.45480817002122453
---------------------------
XGBoost Model Evaluation - Revenues:
MAE: 43537141.604473874
RMSE: 102737035.7365305
R2 Score: 0.6031351538452152
XGBoost Predictions:
Movie: Avatar
  Predicted Rating (XGBoost): 5.37
  Predicted Revenue (XGBoost): $55959508.00
  Running Time: 162.0 minutes
  Popularity: 150.437577
  Release Year: 2009.0
  Vote Count: 11800
---------------------------
Movie: Pirates of the Caribbean: At World's End
  Predicted Rating (XGBoost): 5.64
  Predicted Revenue (XGBoost): $489736.91
  Running Time: 169.0 minutes
  Popularity: 139.082615
  Release Year: 2007.0
  Vote Count: 4500
---------------------------
Movie: Spectre
  Predicted Rating (XGBoost): 5.95
  Predicted Revenue (XGBoost): $3804094.25
  Running Time: 148.0 minutes
  Popularity: 107.376788
  Release Year: 2015.0
  Vote Count: 4466
---------------------------
Movie: The Dark Knight Rises
  Pred

# Neural Networks Implementation

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Split the data into training and testing sets
X_train, X_test, y_train_rating, y_test_rating = train_test_split(X, y_rating, test_size=0.2, random_state=42)
X_train, X_test, y_train_revenue, y_test_revenue = train_test_split(X, y_revenue, test_size=0.2, random_state=42)

# Neural Network model for predicting ratings
nn_rating_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])
nn_rating_model.compile(optimizer='adam', loss='mean_squared_error')
nn_rating_model.fit(X_train, y_train_rating, epochs=50, batch_size=32, verbose=0)

# Neural Network model for predicting revenues
nn_revenue_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])
nn_revenue_model.compile(optimizer='adam', loss='mean_squared_error')
nn_revenue_model.fit(X_train, y_train_revenue, epochs=50, batch_size=32, verbose=0)

# Evaluate Neural Network models
def evaluate_nn_model(model, X_test, y_test):
    y_pred = model.predict(X_test).flatten()
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)  # RMSE
    r2 = r2_score(y_test, y_pred)
    return mae, rmse, r2

# Evaluation for ratings model
nn_rating_mae, nn_rating_rmse, nn_rating_r2 = evaluate_nn_model(nn_rating_model, X_test, y_test_rating)

# Evaluation for revenue model
nn_revenue_mae, nn_revenue_rmse, nn_revenue_r2 = evaluate_nn_model(nn_revenue_model, X_test, y_test_revenue)

# Print evaluation results
print("Neural Network Model Evaluation - Ratings:")
print(f"MAE: {nn_rating_mae}")
print(f"RMSE: {nn_rating_rmse}")
print(f"R2 Score: {nn_rating_r2}")
print("---------------------------")
print("Neural Network Model Evaluation - Revenues:")
print(f"MAE: {nn_revenue_mae}")
print(f"RMSE: {nn_revenue_rmse}")
print(f"R2 Score: {nn_revenue_r2}")

# Predict using Neural Network models
predicted_ratings_nn = nn_rating_model.predict(X_new).flatten()
predicted_revenues_nn = nn_revenue_model.predict(X_new).flatten()
print("Neural Network Predictions:")
for i, (rating, revenue) in enumerate(zip(predicted_ratings_nn, predicted_revenues_nn)):
    movie_name = movies_subset.iloc[i]['original_title']
    running_time = movies_subset.iloc[i]['runtime']
    popularity = movies_subset.iloc[i]['popularity']
    release_year = movies_subset.iloc[i]['release_year']
    vote_count = movies_subset.iloc[i]['vote_count']
    
    print(f"Movie: {movie_name}")
    print(f"  Predicted Rating (Neural Network): {rating:.2f}")
    print(f"  Predicted Revenue (Neural Network): ${revenue:.2f}")
    print(f"  Running Time: {running_time} minutes")
    print(f"  Popularity: {popularity}")
    print(f"  Release Year: {release_year}")
    print(f"  Vote Count: {vote_count}")
    print("---------------------------")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Neural Network Model Evaluation - Ratings:
MAE: 396270.5339105188
RMSE: 676133.9823990503
R2 Score: -369425529608.6993
---------------------------
Neural Network Model Evaluation - Revenues:
MAE: 56717634.53476562
RMSE: 101658689.37849668
R2 Score: 0.6114225605766006
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Neural Network Predictions:
Movie: Avatar
  Predicted Rating (Neural Network): 871191.50
  Predicted Revenue (Neural Network): $200086048.00
  Running Time: 162.0 minutes
  Popularity: 150.437577
  Release Year: 2009.0
  Vote Count: 11800
---------------------------
Movie: Pirates of the Caribbean: At World's End
  Predicted Rating (Neural Network): 10.54
  Predicted Revenue (Neural Network): $2206765.50
  Running Time: 169.0 minutes
 

In [9]:
import pickle

# Save the models to disk
with open('rating_model.pkl', 'wb') as f:
    pickle.dump(rating_model, f)

with open('revenue_model.pkl', 'wb') as f:
    pickle.dump(revenue_model, f)
