In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate

In [3]:
df_ers_ccqb = pd.read_excel('data/ers_ccqb.xlsx')
df_ers_scores1 = pd.read_excel('data/ers_rating_scores.xlsx')
df_ers_scores2 = pd.read_excel('data/ers_rating_scores.xlsx', sheet_name=1, header=1)

In [119]:
from DataCleaning import CleanErs

In [120]:
data_transformer = CleanErs()

In [121]:
X, y = data_transformer.fit_transform_train(df_ers_ccqb, df_ers_scores1, df_ers_scores2)

### split the data to do testing

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

### make a basic random forest model

In [54]:
forest_model = RandomForestRegressor(1000)

In [50]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Let's see how it does

In [51]:
y_pred = model.predict(X_test)
print(mean_squared_error(y_test, y_pred))
print(model.score(X_test, y_test))

0.4000272541443217
0.0721392319971097


The scoring is ok, but I need to do some testing to see how well it's really doing

In [77]:
forest_model = RandomForestRegressor(n_estimators=1000, max_depth=None, max_features="sqrt")
validated_dict = cross_validate(forest_model, X, y, scoring='neg_mean_squared_error', cv=10, return_train_score=False, n_jobs=-1)

In [80]:
n_estimators = [1000, 2000, 3000]
max_depth = [None, 1000, 750, 460, 100]
max_features = ["auto", "sqrt", "log2"]
parameter_string = "n_estimators: {0} max_depth: {1} max_features: {2} MSE average: {3}"

for parameter1 in n_estimators:
    for parameter2 in max_depth:
        for parameter3 in max_features:
            forest_model = RandomForestRegressor(n_estimators=parameter1,max_depth=parameter2,
                                                 max_features=parameter3)
            validate_dict = cross_validate(forest_model, X, y, scoring='neg_mean_squared_error',
                                           cv=10, return_train_score=False, n_jobs=-1)
            print(parameter_string.format(parameter1, parameter2, parameter3,
                                         -validate_dict['test_score'].mean()))

n_estimators: 1000 max_depth: None max_features: auto MSE average: 0.3937968914975195
n_estimators: 1000 max_depth: None max_features: sqrt MSE average: 0.3876791353908898
n_estimators: 1000 max_depth: None max_features: log2 MSE average: 0.3860071221631916
n_estimators: 1000 max_depth: 1000 max_features: auto MSE average: 0.3937968914975195
n_estimators: 1000 max_depth: 1000 max_features: sqrt MSE average: 0.3876791353908898
n_estimators: 1000 max_depth: 1000 max_features: log2 MSE average: 0.3860071221631916
n_estimators: 1000 max_depth: 750 max_features: auto MSE average: 0.3937968914975195
n_estimators: 1000 max_depth: 750 max_features: sqrt MSE average: 0.3876791353908898
n_estimators: 1000 max_depth: 750 max_features: log2 MSE average: 0.3860071221631916
n_estimators: 1000 max_depth: 460 max_features: auto MSE average: 0.3937968914975195
n_estimators: 1000 max_depth: 460 max_features: sqrt MSE average: 0.3876791353908898
n_estimators: 1000 max_depth: 460 max_features: log2 MSE av

In [84]:
forest_model = RandomForestRegressor(n_estimators=1500, max_features="log2")
validated_dict = cross_validate(forest_model, X, y, scoring='neg_mean_squared_error', cv=10, return_train_score=False, n_jobs=-1)

In [86]:
-validated_dict['test_score'].mean()

0.3849977414055831

looks like it might be getting close to good parameters. I want to try some values between 1000 and 2000 trees and then try some different sizes for max_features

In [87]:
n_estimators = [1200, 1400, 1500, 1600, 1800]
max_features = ["log2", 0.5, 0.3, 0.2, 0.15]
parameter_string = "n_estimators: {0} max_features: {1} MSE average: {2}"

for parameter1 in n_estimators:
    for parameter2 in max_features:
        forest_model = RandomForestRegressor(n_estimators=parameter1,
                                             max_features=parameter2)
        validate_dict = cross_validate(forest_model, X, y, scoring='neg_mean_squared_error',
                                       cv=10, return_train_score=False, n_jobs=-1)
        print(parameter_string.format(parameter1, parameter2,
                                     -validate_dict['test_score'].mean()))

n_estimators: 1200 max_features: log2 MSE average: 0.385529492730194
n_estimators: 1200 max_features: 0.5 MSE average: 0.3933335224639375
n_estimators: 1200 max_features: 0.3 MSE average: 0.39130497319638285
n_estimators: 1200 max_features: 0.2 MSE average: 0.38998661925882333
n_estimators: 1200 max_features: 0.15 MSE average: 0.3887110163707049
n_estimators: 1400 max_features: log2 MSE average: 0.3847549891528976
n_estimators: 1400 max_features: 0.5 MSE average: 0.393126831090831
n_estimators: 1400 max_features: 0.3 MSE average: 0.3912087999654847
n_estimators: 1400 max_features: 0.2 MSE average: 0.38986257661559504
n_estimators: 1400 max_features: 0.15 MSE average: 0.38837889432857015
n_estimators: 1500 max_features: log2 MSE average: 0.3849977414055831
n_estimators: 1500 max_features: 0.5 MSE average: 0.3931938542618878
n_estimators: 1500 max_features: 0.3 MSE average: 0.39109716856766225
n_estimators: 1500 max_features: 0.2 MSE average: 0.3897377939233824
n_estimators: 1500 max_fea

I'll go with 1400 estimators and log2 max features.

## Time to try gradient boosting

In [89]:
boosting_model = GradientBoostingRegressor()
boosting_model.fit(X_train, y_train)
y_pred = boosting_model.predict(X_test)
mean_squared_error(y_test, y_pred)

0.4200478218611781

In [96]:
loss = ["ls", "lad", "huber"]
learning_rate = 0.01 
n_estimators = [100, 250, 500, 750, 1000]
max_depth = [2, 3, 4, 5]
max_features = "log2"

In [94]:
parameter_string = "loss: {0} n_estimators: {1} MSE average: {2}"
for parameter1 in loss:
    for parameter2 in n_estimators:
        boosting_model = GradientBoostingRegressor(learning_rate=learning_rate,
                                                   loss=parameter1, n_estimators=parameter2)
        validate_dict = cross_validate(forest_model, X, y, scoring='neg_mean_squared_error',
                                       cv=10, return_train_score=False)
        print(parameter_string.format(parameter1, parameter2,
                                     -validate_dict['test_score'].mean()))

loss: ls n_estimators: 1000 MSE average: 0.38791894813782335
loss: ls n_estimators: 2000 MSE average: 0.38948479066100794
loss: ls n_estimators: 3000 MSE average: 0.38863983747721637
loss: lad n_estimators: 1000 MSE average: 0.3891073312150219
loss: lad n_estimators: 2000 MSE average: 0.38896020158203115
loss: lad n_estimators: 3000 MSE average: 0.38903723457857187
loss: huber n_estimators: 1000 MSE average: 0.3900348989995601
loss: huber n_estimators: 2000 MSE average: 0.3880024467666072
loss: huber n_estimators: 3000 MSE average: 0.3890386910737357


ls and lad seemed to be about the same, I'll stick with ls for now. can't tell if n_estimators is having much of an affect at this level.

In [95]:
parameter_string = "max_depth: {0} n_estimators: {1} MSE average: {2}"
for parameter1 in max_depth:
    for parameter2 in n_estimators:
        boosting_model = GradientBoostingRegressor(learning_rate=learning_rate,
                                                   max_depth=parameter1, n_estimators=parameter2)
        validate_dict = cross_validate(forest_model, X, y, scoring='neg_mean_squared_error',
                                       cv=10, return_train_score=False)
        print(parameter_string.format(parameter1, parameter2,
                                     -validate_dict['test_score'].mean()))

max_depth: 2 n_estimators: 1000 MSE average: 0.3894534829875524
max_depth: 2 n_estimators: 2000 MSE average: 0.38938071143190506
max_depth: 2 n_estimators: 3000 MSE average: 0.38885260461333415
max_depth: 3 n_estimators: 1000 MSE average: 0.38822283156938264
max_depth: 3 n_estimators: 2000 MSE average: 0.38808865813534643
max_depth: 3 n_estimators: 3000 MSE average: 0.3893165792573016
max_depth: 4 n_estimators: 1000 MSE average: 0.38937872256921413
max_depth: 4 n_estimators: 2000 MSE average: 0.38849149733651483
max_depth: 4 n_estimators: 3000 MSE average: 0.38918310938380585
max_depth: 5 n_estimators: 1000 MSE average: 0.38794836646414277
max_depth: 5 n_estimators: 2000 MSE average: 0.3892198376859769
max_depth: 5 n_estimators: 3000 MSE average: 0.3894034395987023


doesn't look very conclusive of anything. Try less trees and maybe try log2 max_features instead of the default.

In [97]:
parameter_string = "max_depth: {0} n_estimators: {1} MSE average: {2}"
for parameter1 in max_depth:
    for parameter2 in n_estimators:
        boosting_model = GradientBoostingRegressor(learning_rate=learning_rate,
                                                   max_depth=parameter1, n_estimators=parameter2,
                                                   max_features="log2")
        validate_dict = cross_validate(forest_model, X, y, scoring='neg_mean_squared_error',
                                       cv=10, return_train_score=False)
        print(parameter_string.format(parameter1, parameter2,
                                     -validate_dict['test_score'].mean()))

max_depth: 2 n_estimators: 100 MSE average: 0.3878849579598998
max_depth: 2 n_estimators: 250 MSE average: 0.38973504001131165
max_depth: 2 n_estimators: 500 MSE average: 0.3899019957906008
max_depth: 2 n_estimators: 750 MSE average: 0.3890610550796522
max_depth: 2 n_estimators: 1000 MSE average: 0.3890948966165174
max_depth: 3 n_estimators: 100 MSE average: 0.38883206871280607
max_depth: 3 n_estimators: 250 MSE average: 0.3884782459029975
max_depth: 3 n_estimators: 500 MSE average: 0.38872551846572934
max_depth: 3 n_estimators: 750 MSE average: 0.3884772171133755
max_depth: 3 n_estimators: 1000 MSE average: 0.3893136762475028
max_depth: 4 n_estimators: 100 MSE average: 0.3884498030660323
max_depth: 4 n_estimators: 250 MSE average: 0.388854174086819
max_depth: 4 n_estimators: 500 MSE average: 0.3894516883002986
max_depth: 4 n_estimators: 750 MSE average: 0.3887176658725605
max_depth: 4 n_estimators: 1000 MSE average: 0.387692589533175
max_depth: 5 n_estimators: 100 MSE average: 0.38755