## Models

The following is a stacked model. It consists of:
LEVEL 0: CatBoost, Random Forest, Linear Regression, KNN, Ridge Regression, SVM, ElasticNet, XGBoost
LEVEL 1: Linear Regression (as the meta-learning model), which the StackingRegressor uses to combine the predictions of the level-0 models.

In [1]:
from catboost import CatBoostRegressor
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn import preprocessing
from numpy import mean
import xgboost as xgb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import copy

In [2]:
# Suppress Warnings
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


# Route every ``warnings.warn(...)`` lookup to the no-op defined above.
warnings.warn = warn

In [None]:
# Load data
# Read the train and test CSVs, in that order.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('final_train.csv', 'final_test.csv')
)

In [None]:
# Fix differing features
# One-hot encode train and test together so both end up with the same set of
# columns, then cut the combined frame apart again at the original train length.
train_objs_num = len(train)
dataset = pd.get_dummies(pd.concat(objs=[train, test], axis=0))
train = dataset.iloc[:train_objs_num].copy()
test = dataset.iloc[train_objs_num:].copy()

In [None]:
# Check data
# Print the (rows, columns) shape of each frame after encoding.
for label, frame in (("Training set ", train), ("Test set ", test)):
    print(label, frame.shape)

In [None]:
# Fill remaining NA's with 0 and negatives with 0
for frame in (train, test):
    frame.fillna(0, inplace=True)  # missing values become 0
    frame[frame < 0] = 0           # negative values become 0

In [None]:
# Drop ID Column
# Remove the 'id' column from both frames in place.
for frame in (train, test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
# train.describe()

In [None]:
# Separate the target column (revenue) from the feature matrix.
y = train['revenue']
X = train.drop(columns='revenue')

In [None]:
# Show the revenue column ordered from largest to smallest.
z = train.sort_values(by='revenue', ascending=False)
z['revenue']

In [None]:
# Select Top 50 Best Features
number_of_features = 50
best_features = SelectKBest(score_func=chi2, k=number_of_features)
# chi2 scores features against class labels, so it is given an integer-cast
# COPY of the target. `y` itself is deliberately not rebound: every later
# split / fit reads `y`, and overwriting it here would make all models train
# on the truncated target instead of the real one.
fit = best_features.fit(X, y.astype('int'))
# Pair every column name with its chi2 score in a two-column frame.
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)
feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ['Specs', 'Score']
print(feature_scores.nlargest(number_of_features, 'Score'))

In [None]:
# Column names of the `number_of_features` highest-scoring features, as a plain list.
selected_features = feature_scores.nlargest(number_of_features, 'Score')['Specs'].tolist()

In [None]:
# Keep only the selected feature columns, then summarise them.
X = X.loc[:, selected_features]
X.describe()

In [None]:
# Give the test frame the same selected feature columns as X.
test = test.loc[:, selected_features]

In [None]:
# Hold out 20% of the rows as a validation split; random_state fixes the shuffle.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=.2, random_state=13)

In [None]:
# Number of cross-validation folds (used by KFold and by both StackingRegressor levels).
k = 5

In [None]:
# Report the (rows, columns) shape of each data split.
for label, frame in (('Training: ', X_tr), ('Validation: ', X_val), ('Test: ', test)):
    print(label, frame.shape)

##### Prep to determine best alpha for some models

In [None]:
# Find the alpha with best value (0.001 is the value used for ridge regression in get_models)
# Each candidate is fitted on the training split and scored on the held-out
# validation split. Scoring on the same rows that were used for fitting
# measures memorisation and tends to favour the weakest regularisation.
alphas = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1]
for a in alphas:
    model = Ridge(alpha=a, normalize=True).fit(X_tr, y_tr)
    score = model.score(X_val, y_val)  # R^2 on unseen rows
    pred_y = model.predict(X_val)
    mse = mean_squared_error(y_val, pred_y)
    print("Alpha:{0:.6f}, R2:{1:.3f}, MSE:{2:.2f}, RMSE:{3:.2f}".format(a, score, mse, np.sqrt(mse)))

In [None]:
# Alpha search for ElasticNet (0.001 is the value used for ElasticNet in get_models)
# Each candidate is fitted on the training split and scored on the held-out
# validation split. Scoring on the same rows that were used for fitting
# measures memorisation and tends to favour the weakest regularisation.
for a in alphas:
    model = ElasticNet(alpha=a, normalize=True).fit(X_tr, y_tr)
    score = model.score(X_val, y_val)  # R^2 on unseen rows
    pred_y = model.predict(X_val)
    mse = mean_squared_error(y_val, pred_y)
    print("Alpha:{0:.6f}, R2:{1:.3f}, MSE:{2:.2f}, RMSE:{3:.2f}".format(a, score, mse, np.sqrt(mse)))

In [None]:
# Prepare the stack
def get_stack():
    """Build the (unfitted) stacked regressor.

    Level 0 holds eight base regressors. The final estimator of the outer
    ``StackingRegressor`` is itself a ``StackingRegressor`` made of a random
    forest and a decision tree whose outputs are blended by
    ``LinearRegression``. Both stacking levels use the module-level ``k`` as
    their ``cv`` argument.

    Returns:
        The outer, unfitted ``StackingRegressor``.
    """
    # Regularisation strengths are pinned to 0.001, the value named in the
    # alpha-search comments and used by get_models(). Reading the module-level
    # loop variable ``a`` instead would silently pick up whatever value the
    # last executed loop left behind (1 after the alpha searches).
    ridge_alpha = 0.001
    elastic_net_alpha = 0.001

    base_estimators = [
        ('cat_boost', CatBoostRegressor(loss_function='RMSE', logging_level='Silent', depth=9, early_stopping_rounds=200, iterations=1000, eval_metric='RMSE', learning_rate=0.01)),
        ('random_forests', RandomForestRegressor(n_estimators=3000, max_depth=9, criterion='mse')),
        ('linear_reg', LinearRegression()),
        ('knn', KNeighborsRegressor(n_neighbors=10, weights='distance', p=5)),
        ('ridge_reg', Ridge(alpha=ridge_alpha, normalize=True)),
        ('svr', SVR(kernel='rbf', C=2.0, epsilon=0.2, gamma='auto')),
        ('elastic_net', ElasticNet(alpha=elastic_net_alpha, normalize=True)),
        ('xgm', xgb.XGBRegressor()),
    ]

    # Second level: a small stack of tree models blended by linear regression.
    meta_estimators = [
        ('random_forests2', RandomForestRegressor(n_estimators=3000, max_depth=9, criterion='mse')),
        ('decision_tree', DecisionTreeRegressor(min_samples_leaf=5, criterion='mse', max_depth=9)),
    ]
    meta_learner = StackingRegressor(estimators=meta_estimators, final_estimator=LinearRegression(), cv=k)

    return StackingRegressor(estimators=base_estimators, final_estimator=meta_learner, cv=k)

In [None]:
# Compare with just models themselves without stacking
def get_models():
    """Return a dict mapping a short name to each unfitted stand-alone regressor.

    Insertion order is the order in which the models are later evaluated.
    """
    return {
        'cat_boost': CatBoostRegressor(loss_function='RMSE', logging_level='Silent', depth=9, early_stopping_rounds=200, iterations=1000, eval_metric='RMSE', learning_rate=0.01),
        'random_forests': RandomForestRegressor(n_estimators=3000, max_depth=9, criterion='mse'),
        'linear_reg': LinearRegression(),
        'knn': KNeighborsRegressor(n_neighbors=10, weights='distance', p=5),
        'ridge_reg': Ridge(alpha=0.001, normalize=True),
        'svr': SVR(kernel='rbf', C=2.0, epsilon=0.2, gamma='auto'),
        'elastic_net': ElasticNet(alpha=0.001, normalize=True),
        'xgm': xgb.XGBRegressor(),
        # The stacked model is intentionally left out of this comparison:
        # 'stacked': get_stack(),
    }

In [None]:
# Cross Validation
def evaluate_model(model):
    """Cross-validate ``model`` on the module-level ``X`` / ``y``.

    Uses a shuffled ``KFold`` with ``k`` splits and a fixed seed, scored with
    'neg_root_mean_squared_error'; a failing fit raises instead of being
    recorded as a score.

    Returns:
        The per-fold scores produced by ``cross_val_score``.
    """
    folds = KFold(n_splits=k, shuffle=True, random_state=10)
    return cross_val_score(
        model,
        X,
        y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        n_jobs=1,
        error_score='raise',
    )

In [None]:
# Workaround to bug in CatBoostRegressor not having attribute n_features_in_
class CatBoostRegressor(CatBoostRegressor):
    """Subclass that shadows the imported ``CatBoostRegressor`` and adds ``n_features_in_``.

    Code that looks up the name ``CatBoostRegressor`` after this cell has run
    gets this subclass instead of the imported class.
    """

    # NOTE(review): this is a plain method, so ``estimator.n_features_in_``
    # evaluates to a bound method rather than a number unless it is called.
    # Confirm whether ``@property`` was intended, and that the base class
    # really provides ``get_feature_count``.
    def n_features_in_(self):
        """Return ``self.get_feature_count()``."""
        return self.get_feature_count()

In [None]:
# Workaround to bug in StackingRegressor not having attribute final_estimator_
class StackingRegressor(StackingRegressor):
    """Subclass that shadows the imported ``StackingRegressor`` and defines ``final_estimator_`` on the class.

    Code that looks up the name ``StackingRegressor`` after this cell has run
    gets this subclass instead of the imported class.
    """

    # NOTE(review): as a plain method this makes ``final_estimator_`` exist on
    # every instance; until an instance attribute of the same name is assigned,
    # the lookup yields this bound method. Presumably fitting assigns the real
    # estimator to the instance and thereby hides this method; confirm against
    # the installed scikit-learn version.
    def final_estimator_(self):
        """Return the ``final_estimator_`` attribute looked up on ``self``."""
        return self.final_estimator_

In [None]:
# Show which model is better
# Cross-validate every stand-alone model; scores come back negated, so flip
# the sign before storing and reporting them.
models = get_models()
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model)
    names.append(name)
    results.append(-scores)
    print("SCORE OF {} === {}".format(name, -mean(scores)))

In [None]:
# Plots of all models
# One box per model, built from its per-fold results; means are marked.
plt.figure(figsize=(15, 15))
plt.boxplot(results, labels=names, showmeans=True)
plt.suptitle('Performance Of Different Models', fontsize=14)
plt.xlabel('Models', fontsize=12)
plt.ylabel('RMSE', fontsize=12)
plt.show()

In [None]:
# Train and predict stacked model
# Fit the stack on the full training data, then predict for the test rows.
stack = get_stack().fit(X, y)
predictions = stack.predict(test)

In [None]:
# Submission
# Put expm1 of the predictions into the sample file's 'revenue' column and
# write the result without the index.
submission = pd.read_csv('Data/sample_submission.csv').assign(
    revenue=np.expm1(predictions)
)
submission.to_csv('submission.csv', index=False)