## Tim's Models

Consists of: CatBoost, Random Forests, Linear Regressor, K-Nearest-Neighbour.

In [1]:
from catboost import CatBoostRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# Load data
train = pd.read_csv('final_train.csv')
test = pd.read_csv('final_test.csv')

# Check data
print("Training set ", train.shape)
print("Test set ", test.shape)

Training set  (3000, 3738)
Test set  (4398, 5048)


In [4]:
# Fill remaining NA's with 0 and negatives with 0
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
train[train < 0] = 0
test[test < 0] = 0

In [5]:
# Drop ID Column
train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

In [6]:
train.describe()

Unnamed: 0,budget,popularity,runtime,revenue,name_collection,budget_log,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,...,prodc_Quick Six Entertainment,production_countries_count,release_month,release_day,release_year,spoken_languages_count,is_released,keywords_count,cast_score,crew_score
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,11.959606,1.793891,4.648978,15.970738,1.02636,2.037452,0.510333,0.342667,0.263,0.247,...,0.000333,0.203871,1.72257,2.440693,7.600729,0.26115,0.998667,1.621991,20.144832,17.074726
std,7.405678,0.863576,0.347045,3.045649,2.088409,1.241217,0.499977,0.47468,0.440336,0.431339,...,0.018257,0.385399,0.701168,0.875534,0.007766,0.437214,0.036497,0.923586,2.661406,2.503348
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.560601,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.390797,4.543295,14.691625,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.386294,2.079442,7.597396,0.0,1.0,1.098612,19.608674,16.530195
50%,15.894952,1.998076,4.644391,16.63731,0.0,2.766002,1.0,0.0,0.0,0.0,...,0.0,0.0,1.94591,2.70805,7.6029,0.0,1.0,1.791759,20.56788,17.739403
75%,17.216708,2.387935,4.770685,18.046365,0.0,2.84588,1.0,1.0,1.0,0.0,...,0.0,0.0,2.302585,3.091042,7.606387,0.693147,1.0,2.302585,21.361155,18.43905
max,19.755682,5.684725,5.823046,21.141685,6.045005,2.983441,1.0,1.0,1.0,1.0,...,1.0,2.079442,2.484907,3.433987,7.609367,2.197225,1.0,5.003946,25.925794,20.645642


In [8]:
y = train.revenue
X = train.drop('revenue', axis=1)

In [9]:
z = train.sort_values('revenue', ascending=False)
z['revenue']

1126    21.141685
1761    21.132889
2770    21.063590
684     20.956666
2322    20.839934
          ...    
695      0.693147
1917     0.000000
1874     0.000000
1754     0.000000
347      0.000000
Name: revenue, Length: 3000, dtype: float64

In [12]:
print(train['budget'].corr(train['revenue']))

0.5345104574056031


In [13]:
# Select Top 50 Best Features
number_of_features = 50
best_features = SelectKBest(score_func=chi2, k=number_of_features)
y = y.astype('int')
fit = best_features.fit(X, y)
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)
feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ['Specs', 'Score']
print(feature_scores.nlargest(number_of_features, 'Score'))

                                                  Specs        Score
0                                                budget  4390.389619
3                                       name_collection  1631.334064
2644        prodc_Québec Production Services Tax Credit   999.000000
2645                    prodc_Abu Dhabi Film Commission   999.000000
2646  prodc_Colorado Office of Film, Television & Media   999.000000
1160                         prodc_Chongoing Film Group   749.000000
1161                       prodc_Bon Voyage Film Studio   749.000000
1162                         prodc_Shanghai Media Group   749.000000
1163      prodc_Zhejiang Films & TV(Group) Company Ltd.   749.000000
1164                    prodc_Hunan Broadcasting System   749.000000
1165                     prodc_Anhui Broadcasting Corp.   749.000000
1166                           prodc_Beijing TV Station   749.000000
1858                      prodc_Centerstage Productions   749.000000
1859  prodc_Film Development Counc

In [14]:
selected_features = feature_scores.nlargest(number_of_features, 'Score')['Specs'].tolist()

In [15]:
X = X[selected_features]
X.describe()

Unnamed: 0,budget,name_collection,prodc_Québec Production Services Tax Credit,prodc_Abu Dhabi Film Commission,"prodc_Colorado Office of Film, Television & Media",prodc_Chongoing Film Group,prodc_Bon Voyage Film Studio,prodc_Shanghai Media Group,prodc_Zhejiang Films & TV(Group) Company Ltd.,prodc_Hunan Broadcasting System,...,prodc_Pandemonium,prodc_Lightstream Entertainment,prodc_iFeatures,prodc_Sixty Six Pictures,prodc_Oldgarth Media,prodc_Кинокомпания «Lunapark»,prodc_Инвада фильм,prodc_ABS-CBN Film Productions,prodc_Unitel Classics,prodc_Adel Productions
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,11.959606,1.02636,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333,...,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333,0.000333
std,7.405678,2.088409,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257,...,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257,0.018257
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15.894952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17.216708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,19.755682,6.045005,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=.2, random_state=13)

In [17]:
print('Training: ', X_tr.shape)
print('Validation: ', X_val.shape)
print('Test: ', test.shape)

Training:  (2400, 50)
Validation:  (600, 50)
Test:  (4398, 5047)


In [18]:
def kfold_validate(X, y, te, model, col):
    scores = np.zeros(k)
    for i, (tr_idx, val_idx) in enumerate(cv.split(X)):
        X_tr, y_tr = X.reindex(tr_idx), y.reindex(tr_idx)
        X_val, y_val = X.reindex(val_idx), y.reindex(val_idx)
        
        X_tr.fillna(-1, inplace=True)
        y_tr.fillna(-1, inplace=True)
        X_val.fillna(-1, inplace=True)
        y_val.fillna(-1, inplace=True)
        

        model.fit(X_tr, y_tr)
        pred_val = model.predict(X_val)
        scores[i] = get_loss(pred_val, y_val)
        print("========={}-th Fold Score: {}".format(i, scores[i]))
        
        meta_tr[col][val_idx] = pred_val
        meta_tr['val'][val_idx] = y_val

    print("=========Total Score: ", np.mean(scores))
    model.fit(X, y)
    meta_te[col] = model.predict(te)

In [19]:
def get_loss(pred, actual):
  loss = mean_squared_error(pred, actual)
  result = np.sqrt(loss)
  return result

In [20]:
k = 5
seed = 10
cv = KFold(n_splits = k, shuffle = True, random_state = seed)

In [21]:
meta_tr = pd.DataFrame(np.zeros((X.shape[0], 4)),
                            columns=['catboost', 'random_forest', 'KNN', 'linear_regression'])
meta_tr['val'] = 0
meta_te = pd.DataFrame(np.zeros((test.shape[0], 4)),
                            columns=['catboost', 'random_forest', 'KNN', 'linear_regression'])

### CatBoost

In [22]:
cat_boost = CatBoostRegressor(bagging_temperature = 0.3, loss_function='RMSE', logging_level='Silent',
                             colsample_bylevel = 0.7, depth = 9, early_stopping_rounds = 200, iterations = 1000, eval_metric='RMSE', learning_rate = 0.01)
kfold_validate(X_tr, y_tr, test, cat_boost, 'catboost')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:79: At position 1 should be feature with name name_collection (found popularity).

In [23]:
cat_boost_pred = cat_boost.predict(X_val)
np.sqrt(mean_squared_error(y_val, cat_boost_pred))

1.949315376306365

### Random Forest

In [24]:
random_forest = RandomForestRegressor(n_estimators = 3000, max_depth = 9, criterion='mse')
kfold_validate(X_tr, y_tr, test, random_forest, 'random_forest')



ValueError: Number of features of the model must match the input. Model n_features is 50 and input n_features is 5047 

In [25]:
random_forest_pred = random_forest.predict(X_val)
np.sqrt(mean_squared_error(y_val, random_forest_pred))

1.9610065667610548

### Linear Regression -- WORST PERFORMER

In [29]:
linear_reg = LinearRegression()
kfold_validate(X_tr, y_tr, test, linear_reg, 'linear_regression')



ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 50 is different from 5047)

In [30]:
lin_res = linear_reg.predict(X_val)
np.sqrt(mean_squared_error(y_val, lin_res))

68046279061.02303

### K-NN

In [31]:
knn = KNeighborsRegressor(n_neighbors=10, weights='distance', p=5)
kfold_validate(X_tr, y_tr, test, knn, 'KNN')



ValueError: query data dimension must match training data dimension

In [32]:
knn_pred = knn.predict(X_val)
np.sqrt(mean_squared_error(y_val, knn_pred))

2.134488765827412

## Ridge Regression

In [None]:
# To be explored later on