# Purpose:
- This notebook builds a decision tree model that will be used to predict movie box office revenue
- __[Original dataset on Kaggle](https://www.kaggle.com/rounakbanik/the-movies-dataset/data)__

In [14]:
#loading necessary packages
import csv
import math
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import random
from sklearn.metrics import r2_score

%matplotlib inline

Loading in the clean dataset generated from the EDA/Data Cleansing notebook

In [2]:
movies_features = pd.read_csv('movies_clean.csv')
movies_features.head()

Unnamed: 0,budget,revenue,runtime,vote_average,release_timespan,popularity_scaled,10402,10749,10751,10752,...,27,28,35,36,37,53,80,878,9648,99
0,30000000.0,373554033.0,81.0,7.7,23.0,3.088628,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,65000000.0,262797249.0,104.0,6.9,23.0,2.834127,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,16000000.0,81452156.0,127.0,6.1,23.0,1.350536,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,60000000.0,187436818.0,170.0,7.7,23.0,2.886192,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,35000000.0,64350171.0,106.0,5.5,23.0,1.654713,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### <font color = "DarkMagenta">Creating the train/test dataset</font>

In [3]:
movies_train, movies_test = train_test_split(movies_features, test_size=0.2, random_state=33)

In [4]:
movies_train.head()

Unnamed: 0,budget,revenue,runtime,vote_average,release_timespan,popularity_scaled,10402,10749,10751,10752,...,27,28,35,36,37,53,80,878,9648,99
3816,3200000.0,5332926.0,105.0,6.8,8.0,1.906039,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3377,15000000.0,35097815.0,92.0,6.8,9.0,2.189455,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3120,85000000.0,222231186.0,88.0,5.9,10.0,2.401432,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
635,70000000.0,164000000.0,98.0,5.6,20.0,2.908224,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1949,30000000.0,33013805.0,113.0,6.6,16.0,2.427549,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
X = movies_train[['budget','runtime','vote_average','release_timespan','popularity_scaled',
 ' 10402',' 10749',' 10751',' 10752',' 10769',' 10770',' 12',' 14',' 16',' 18',' 27',' 28',' 35',' 36',' 37',
 ' 53',' 80',' 878',' 9648',' 99']]
y = movies_train['revenue']

In [6]:
X_test = movies_test[['budget','runtime','vote_average','release_timespan','popularity_scaled',' 10402',' 10749',
 ' 10751',' 10752',' 10769',' 10770',' 12',' 14',' 16',' 18',' 27',' 28',' 35',' 36',' 37',' 53',' 80',' 878',' 9648',' 99']]
y_test = movies_test['revenue']

### <font color = "DarkMagenta">Fitting a decision tree</font>

In [12]:
movies_treefit = DecisionTreeRegressor(max_depth = 8,min_samples_leaf = 7)
movies_treefit = movies_treefit.fit(X,y)

In [15]:
#training R-squared
y_true = movies_train['revenue']
y_pred = movies_treefit.predict(X)
train_rsquared = r2_score(y_true, y_pred) 
train_rsquared

0.78461298707190541

In [16]:
#test R-squared
y_true_test = movies_test['revenue']
y_pred_test = movies_treefit.predict(X_test)
test_rsquared = r2_score(y_true_test, y_pred_test)
test_rsquared

0.56579218187532365

### Saving Decision Tree Model

In [120]:
import pickle

In [123]:
movie_model_path = 'movie_tree.pkl'
 
# Create an variable to pickle and open it in write mode
model_pickle = open(movie_model_path, 'wb')
pickle.dump(movies_treefit, model_pickle)
model_pickle.close()

### <font color='MediumOrchid'>Other related notebooks:</font>
- **Movies Metadata - EDA & Data Cleansing:** this notebook contains the exploratory data analysis and data cleansing/manipulation on the movies metadata dataset  
- **Movie genre mapping:** function that returns the dummy variable code based on the genre name
- **Build prediction frame:** function that builds a dataframe which can be used as input to make new movie revenue predictions