## Stacked Model

Consists of: CatBoost, Random Forests, Linear Regressor, K-Nearest-Neighbour. These models are layered twice.
Uses: Cross Validation, Ensemble Learning

In [1]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load data
train = pd.read_csv('final_train.csv')
test = pd.read_csv('final_test.csv')

# Check data
print("Training set ", train.shape)
print("Test set ", test.shape)

Training set  (3000, 3737)
Test set  (4398, 5047)


In [3]:
# Fill remaining NA's with 0
train.fillna(0, inplace = True)
test.fillna(0, inplace = True)

In [4]:
train.iloc[:, :45].describe()

Unnamed: 0,id,budget,popularity,runtime,revenue,name_collection,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,...,prodc_Paramount Pictures,prodc_Twentieth Century Fox Film Corporation,prodc_Columbia Pictures,prodc_Metro-Goldwyn-Mayer (MGM),prodc_New Line Cinema,prodc_Touchstone Pictures,prodc_Walt Disney Pictures,prodc_Columbia Pictures Corporation,prodc_TriStar Pictures,prodc_Relativity Media
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1500.5,22661350.0,8.463274,107.854,66681760.0,44.415,0.510333,0.342667,0.263,0.247,...,0.053667,0.046,0.051333,0.028,0.025,0.021,0.020667,0.020333,0.017667,0.016
std,866.169729,37026620.0,12.104,22.079293,137514900.0,101.776485,0.499977,0.47468,0.440336,0.431339,...,0.225396,0.20952,0.220714,0.165,0.156151,0.143408,0.14229,0.141161,0.131759,0.125496
min,1.0,0.0,1e-06,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,750.75,0.0,4.018053,94.0,2401550.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1500.5,8000000.0,7.374861,104.0,16807070.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2250.25,30000000.0,10.890983,118.0,68775990.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3000.0,380000000.0,294.337037,338.0,1519558000.0,422.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
test.iloc[:, :45].describe()

Unnamed: 0,id,budget,popularity,runtime,name_collection,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,...,prodc_Columbia Pictures,prodc_Metro-Goldwyn-Mayer (MGM),prodc_New Line Cinema,prodc_Touchstone Pictures,prodc_Walt Disney Pictures,prodc_Canal+,prodc_Columbia Pictures Corporation,prodc_TriStar Pictures,prodc_Relativity Media,prodc_Miramax Films
count,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,...,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0
mean,5199.5,22628240.0,8.55023,107.524329,56.3995,0.487722,0.358572,0.245566,0.226012,0.196453,...,0.050932,0.027967,0.027967,0.021601,0.019327,0.022738,0.017963,0.015462,0.015234,0.014552
std,1269.737571,36903760.0,12.209014,21.297309,132.994321,0.499906,0.479636,0.430471,0.418294,0.39736,...,0.219884,0.164898,0.164898,0.145392,0.137687,0.149083,0.132831,0.123394,0.122497,0.119765
min,3001.0,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4100.25,0.0,3.895186,94.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5199.5,7400000.0,7.482241,104.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6298.75,28000000.0,10.938524,118.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7398.0,260000000.0,547.488298,320.0,556.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
y = train.revenue
X = train.drop('revenue', axis=1)

In [7]:
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=.2, random_state=13)

In [8]:
print('Training: ', X_tr.shape)
print('Validation: ', X_val.shape)
print('Test: ', test.shape)

Training:  (2400, 3736)
Validation:  (600, 3736)
Test:  (4398, 5047)


### Layer 1 Of Stack

In [9]:
layer1_train = pd.DataFrame(np.zeros((X.shape[0], 4)),
                            columns=['catboost', 'random_forest', 'KNN', 'linear_regression'])
layer1_test = pd.DataFrame(np.zeros((test.shape[0], 4)),
                            columns=['catboost', 'random_forest', 'KNN', 'linear_regression'])

### CatBoost

In [10]:
cat_boost = CatBoostRegressor(depth = 9, early_stopping_rounds = 10, iterations = 100, learning_rate = .01)
cat_boost.fit(X_tr, y_tr)

0:	learn: 134623564.1790154	total: 114ms	remaining: 11.3s
1:	learn: 133775319.1677480	total: 180ms	remaining: 8.81s
2:	learn: 132801969.2033287	total: 256ms	remaining: 8.27s
3:	learn: 131999349.5277055	total: 306ms	remaining: 7.34s
4:	learn: 131136014.0843286	total: 369ms	remaining: 7.02s
5:	learn: 130239067.6728986	total: 422ms	remaining: 6.61s
6:	learn: 129368249.0323100	total: 475ms	remaining: 6.31s
7:	learn: 128510576.8163266	total: 531ms	remaining: 6.11s
8:	learn: 127645838.8731744	total: 592ms	remaining: 5.99s
9:	learn: 126814426.5027736	total: 678ms	remaining: 6.1s
10:	learn: 125896865.6101202	total: 730ms	remaining: 5.91s
11:	learn: 125052539.3080050	total: 785ms	remaining: 5.75s
12:	learn: 124172325.5000217	total: 906ms	remaining: 6.06s
13:	learn: 123439545.9047323	total: 962ms	remaining: 5.91s
14:	learn: 122710479.7575530	total: 1.02s	remaining: 5.8s
15:	learn: 121908144.5638812	total: 1.08s	remaining: 5.66s
16:	learn: 121031801.1649321	total: 1.15s	remaining: 5.63s
17:	learn

<catboost.core.CatBoostRegressor at 0x7fb2c11c0750>

In [11]:
cat_boost_pred = cat_boost.predict(X_val)
np.sqrt(mean_squared_error(y_val, cat_boost_pred))

87916223.24379937

### Random Forest

In [14]:
random_forest = RandomForestRegressor(n_estimators = 1000, max_depth = 9)
random_forest.fit(X_tr, y_tr)

RandomForestRegressor(max_depth=9, n_estimators=1000)

In [15]:
random_forest_pred = random_forest.predict(X_val)
np.sqrt(mean_squared_error(y_val, random_forest_pred))

50532268.94795798

### Linear Regression

In [18]:
linear_reg = LinearRegression(normalize = True).fit(X_tr, y_tr)
linear_reg.score(X_tr, y_tr) # Score of 0.913 ??? Fake news ???

0.9128384964598992

In [17]:
linear_reg.predict(X_val)

array([-4.64247639e+21,  5.30997316e+21,  2.18626175e+07, -2.15911573e+07,
       -2.91559072e+07,  3.77184885e+08, -7.21757611e+21, -9.33539046e+21,
       -1.05057845e+22,  1.06651212e+22,  6.32204627e+21,  5.04524932e+07,
        5.79176566e+07, -4.83623304e+21,  2.18972025e+22,  5.41389128e+07,
       -2.94280493e+21, -6.69830163e+06,  1.17088473e+08,  3.86125885e+22,
        2.75025328e+21, -1.90753711e+22,  2.48086747e+22, -1.87923469e+22,
        6.01103831e+07,  2.96319077e+06,  9.12838371e+20,  1.13033112e+07,
        6.17799467e+07,  7.18609901e+22,  4.25449680e+07,  1.53388336e+07,
        5.87070641e+21, -2.41751806e+21, -4.80568922e+06,  4.36063147e+06,
        2.57382168e+22,  1.26590306e+22, -5.03873191e+22,  5.51593095e+07,
        4.46606597e+07, -9.00627260e+20,  3.14470541e+07, -2.30802619e+22,
        1.72978760e+22, -3.67883767e+21,  2.84637377e+07,  3.47824015e+22,
        6.45045682e+21,  5.08800171e+07,  1.38112013e+22,  3.29549938e+21,
       -3.19527625e+20, -

### K-NN

In [20]:
knn = KNeighborsRegressor(n_neighbors=20, weights='distance', p=5).fit(X_tr, y_tr)

array([2.78109539e+07, 1.00871654e+07, 7.39217630e+06, 3.62205244e+04,
       5.98076435e+05, 3.91813962e+08, 1.26244451e+07, 4.82188945e+05,
       2.06352610e+05, 1.97573549e+05, 4.28158652e+07, 1.47850866e+07,
       1.55390019e+07, 9.26280828e+05, 2.85820007e+07, 1.57866659e+07,
       2.69222933e+05, 1.77640792e+07, 9.90328707e+07, 1.25764687e+08,
       3.50694169e+07, 2.01377533e+08, 1.56611586e+07, 7.49659251e+07,
       2.50919197e+07, 1.97449374e+07, 1.17231950e+08, 7.13811751e+06,
       1.20939754e+07, 4.69644973e+06, 1.25077669e+07, 9.11739468e+06,
       3.22315406e+07, 9.93111983e+07, 4.31113467e+06, 6.91573446e+06,
       5.65299895e+07, 2.00607773e+08, 1.15833096e+07, 1.24927798e+06,
       2.43413587e+07, 6.96907530e+06, 2.50000748e+06, 3.57567557e+07,
       3.97171764e+07, 7.58590292e+07, 2.73832061e+07, 2.65251531e+07,
       8.14961589e+06, 7.69803697e+07, 8.66529397e+07, 9.59805596e+06,
       1.35404310e+07, 7.95178620e+07, 2.98255986e+07, 9.29933054e+06,
      

In [22]:
knn.kneighbors_graph()

<2400x2400 sparse matrix of type '<class 'numpy.float64'>'
	with 48000 stored elements in Compressed Sparse Row format>