## Tim's Models

Consists of: CatBoost, Random Forests, Linear Regressor, K-Nearest-Neighbour.

In [1]:
from catboost import CatBoostRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
def normalize_data(df):
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)

In [3]:
# Load data
train = pd.read_csv('final_train.csv')
test = pd.read_csv('final_test.csv')

# Check data
print("Training set ", train.shape)
print("Test set ", test.shape)

Training set  (3000, 3737)
Test set  (4398, 5047)


In [4]:
# Fill remaining NA's with 0 and negatives with 0
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
train[train < 0] = 0
test[test < 0] = 0

In [5]:
# Drop ID Column
train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

In [6]:
train.describe()

Unnamed: 0,budget,popularity,runtime,revenue,name_collection,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,...,prodc_Quick Six Entertainment,production_countries_count,release_month,release_day,release_year,spoken_languages_count,is_released,keywords_count,cast_score,crew_score
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,0.059635,0.02875368,0.319095,0.04388234,0.105249,0.510333,0.342667,0.263,0.247,0.190333,...,0.000333,0.165792,0.564611,0.485366,0.991429,0.16137,0.998667,0.048501,0.01179,0.08247
std,0.097438,0.04112293,0.065323,0.09049665,0.241177,0.499977,0.47468,0.440336,0.431339,0.39263,...,0.018257,0.094044,0.284093,0.278483,0.007647,0.098632,0.036497,0.044738,0.034671,0.10074
min,0.0,3.397466e-09,0.0,6.580861e-10,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.083333,0.032258,0.952405,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0136512,0.278107,0.001580427,0.00237,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125,0.333333,0.258065,0.988101,0.111111,1.0,0.020134,0.001805,0.016319
50%,0.021053,0.02505584,0.307692,0.0110605,0.00237,1.0,0.0,0.0,0.0,0.0,...,0.0,0.125,0.583333,0.483871,0.993555,0.111111,1.0,0.040268,0.004711,0.054681
75%,0.078947,0.03700174,0.349112,0.04526053,0.00237,1.0,1.0,1.0,0.0,0.0,...,0.0,0.125,0.833333,0.709677,0.997025,0.222222,1.0,0.067114,0.010414,0.110075
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
test.describe()

Unnamed: 0,budget,popularity,runtime,name_collection,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,genre_Adventure,...,prodc_Wonderful Films PLC,prodc_Native Pictures Productions,prodc_Malofilm,prodc_The Image Organization,prodc_Filmtech,prodc_Smokewood Entertainment Group,prodc_UGC DA International,prodc_Pathé Consortium Cinéma,prodc_Les Films de la Pléiade,cast_score
count,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,...,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0,4398.0
mean,0.087032,0.01561719,0.336014,0.101438,0.487722,0.358572,0.245566,0.226012,0.196453,0.153934,...,0.000227,0.000227,0.000227,0.000227,0.000227,0.000227,0.000227,0.000227,0.000227,0.038009
std,0.141938,0.02230005,0.066554,0.239198,0.499906,0.479636,0.430471,0.418294,0.39736,0.360926,...,0.015079,0.015079,0.015079,0.015079,0.015079,0.015079,0.015079,0.015079,0.015079,0.052093
min,0.0,1.826523e-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.007114646,0.29375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010365
50%,0.028462,0.01366649,0.325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026013
75%,0.107692,0.01997947,0.36875,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04813
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
y = train.revenue
X = train.drop('revenue', axis=1)

In [11]:
print(train['crew_score'].corr(train['revenue']))

0.8868306222141772


In [14]:
# Select Top 50 Best Features
number_of_features = 50
best_features = SelectKBest(score_func=chi2, k=number_of_features)
y = y.astype('int')
fit = best_features.fit(X, y)
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)
feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ['Specs', 'Score']
print(feature_scores.nlargest(number_of_features, 'Score'))

                                             Specs       Score
139                           prodc_Marvel Studios  373.127042
33                        prodc_Paramount Pictures   16.692771
2496                               prodc_Paramount   15.813084
12                           genre_Science Fiction    8.444309
10                                 genre_Adventure    4.981707
7                                     genre_Action    2.296348
24                                    len_homepage    1.487076
4                                      genre_Drama    0.510504
5                                     genre_Comedy    0.342781
6                                   genre_Thriller    0.263088
8                                    genre_Romance    0.190397
9                                      genre_Crime    0.156385
26                                   ortitle_equal    0.117039
11                                    genre_Horror    0.100367
3727                                 release_month    0

In [15]:
selected_features = feature_scores.nlargest(number_of_features, 'Score')['Specs'].tolist()

In [14]:
X = X[selected_features]
X.describe()

NameError: name 'selected_features' is not defined

In [16]:
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=.2, random_state=13)

In [17]:
print('Training: ', X_tr.shape)
print('Validation: ', X_val.shape)
print('Test: ', test.shape)

Training:  (2400, 3735)
Validation:  (600, 3735)
Test:  (4398, 5046)


In [18]:
def kfold_validate(X, y, te, model, col):
    scores = np.zeros(k)
    for i, (tr_idx, val_idx) in enumerate(cv.split(X)):
        X_tr, y_tr = X.reindex(tr_idx), y.reindex(tr_idx)
        X_val, y_val = X.reindex(val_idx), y.reindex(val_idx)
        
        X_tr.fillna(-1, inplace=True)
        y_tr.fillna(-1, inplace=True)
        X_val.fillna(-1, inplace=True)
        y_val.fillna(-1, inplace=True)
        

        model.fit(X_tr, y_tr)
        pred_val = model.predict(X_val)
        scores[i] = get_loss(pred_val, y_val)
        print("========={}-th Fold Score: {}".format(i, scores[i]))
        
        meta_tr[col][val_idx] = pred_val
        meta_tr['val'][val_idx] = y_val

    print("=========Total Score: ", np.mean(scores))
    model.fit(X, y)
    meta_te[col] = model.predict(te)

In [19]:
def get_loss(pred, actual):
  loss = mean_squared_error(pred, actual)
  result = np.sqrt(loss)
  return result

In [21]:
k = 5
seed = 10
cv = KFold(n_splits = k, shuffle = True, random_state = seed)

In [22]:
meta_tr = pd.DataFrame(np.zeros((X.shape[0], 4)),
                            columns=['catboost', 'random_forest', 'KNN', 'linear_regression'])
meta_tr['val'] = 0
meta_te = pd.DataFrame(np.zeros((test.shape[0], 4)),
                            columns=['catboost', 'random_forest', 'KNN', 'linear_regression'])

### CatBoost

In [23]:
cat_boost = CatBoostRegressor(bagging_temperature = 0.3, loss_function='RMSE', logging_level='Silent',
                             colsample_bylevel = 0.7, depth = 9, early_stopping_rounds = 200, iterations = 1000, eval_metric='RMSE', learning_rate = 0.01)
kfold_validate(X_tr, y_tr, test, cat_boost, 'catboost')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:79: At position 9 should be feature with name genre_Crime (found genre_Adventure).

In [24]:
cat_boost_pred = cat_boost.predict(X_val)
np.sqrt(mean_squared_error(y_val, cat_boost_pred))

0.00011397590254891602

### Random Forest

In [27]:
random_forest = RandomForestRegressor(n_estimators = 3000, max_depth = 9, criterion='mse')
kfold_validate(X_tr, y_tr, test, random_forest, 'random_forest')



KeyboardInterrupt: 

In [None]:
random_forest_pred = random_forest.predict(X_val)
np.sqrt(mean_squared_error(y_val, random_forest_pred))

### Linear Regression

In [28]:
linear_reg = LinearRegression()
kfold_validate(X_tr, y_tr, test, linear_reg, 'linear_regression')



ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 3735 is different from 5046)

In [29]:
lin_res = linear_reg.predict(X_val)
np.sqrt(mean_squared_error(y_val, lin_res))

392874980659.62286

### K-NN

In [30]:
knn = KNeighborsRegressor(n_neighbors=10, weights='distance', p=5)
kfold_validate(X_tr, y_tr, test, knn, 'KNN')



ValueError: query data dimension must match training data dimension

In [None]:
knn.kneighbors_graph()

## Ridge Regression