In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from math import sqrt
import numpy as np
import pandas as pd

In [157]:
reduced = pd.read_csv("../reduced_noot.csv")

In [158]:
reduced = reduced.drop(columns = reduced.columns[0])

In [11]:
# reduced.scores = reduced.scores ** .5
# reduced.tmhalfsc = reduced.tmhalfsc ** .5
# reduced.predscores = reduced.predscores ** .5

In [159]:
base_X = reduced.drop(columns='scores')
# base_X = reduced[['predscores','tmhalfsc','opphalfsc','airya','yaca','psd']]
base_y = reduced["scores"]

In [94]:
def rmse(pred, y):
    n = len(y)
    RMSE = sqrt(sum((pred - y)**2) / n)
    return RMSE

In [148]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [140]:
sklearn.metrics.SCORERS

{'explained_variance': make_scorer(explained_variance_score),
 'r2': make_scorer(r2_score),
 'neg_median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
 'neg_mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
 'neg_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
 'neg_mean_squared_log_error': make_scorer(mean_squared_log_error, greater_is_better=False),
 'accuracy': make_scorer(accuracy_score),
 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True),
 'balanced_accuracy': make_scorer(balanced_accuracy_score),
 'average_precision': make_scorer(average_precision_score, needs_threshold=True),
 'neg_log_loss': make_scorer(log_loss, greater_is_better=False, needs_proba=True),
 'brier_score_loss': make_scorer(brier_score_loss, greater_is_better=False, needs_proba=True),
 'adjusted_rand_score': make_scorer(adjusted_rand_score),
 'homogeneity_score': make_scorer(homogeneity_score),
 'completeness_s

In [146]:
grid_params = dict(min_samples_leaf=np.linspace(.005,.015,num=4), n_estimators=range(90,101,5))
gs = GridSearchCV(estimator=RandomForestRegressor(), param_grid=grid_params, cv=8, scoring='r2')
gs.fit(base_X, base_y)
gs.best_params_

{'min_samples_leaf': 0.011666666666666665, 'n_estimators': 95}

In [135]:
grid_params = dict(min_samples_leaf=np.linspace(.01,.1,num=10))
gs = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=grid_params, cv=10, scoring='neg_mean_squared_error')
gs.fit(base_X, base_y)
gs.best_params_

{'min_samples_leaf': 0.05000000000000001}

In [154]:
def fit_Random_Cactus(X, y, best_params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    base_rf = RandomForestRegressor(**best_params)
    base_rf.fit(X_train, y_train)
    base_rf_pred = base_rf.predict(X_test)
    base_rf_RMSE = rmse(base_rf_pred, y_test)
    print(f"Decision Tree Regressor RMSE: {base_rf_RMSE:.3f}")
    print(f"Random Forest Regressor R^2: {base_rf.score(X_test, y_test):.3f}")
    
    leaf_id_trees = base_rf.apply(X_train)
    regressors = dict()
    for i, leaves in enumerate(leaf_id_trees.transpose()):
        regressors_tree = dict()
        leaf_nodes = set(leaves)
        for leaf in leaf_nodes:
            leaf_sample_X = X_train[leaves==leaf]
            leaf_sample_y = y_train[leaves==leaf]
            leaf_regression = Lasso()
            leaf_regression.fit(leaf_sample_X, leaf_sample_y)
            regressors_tree[leaf] = leaf_regression
        regressors[i] = regressors_tree
    
    leaf_id_test_trees = base_rf.apply(X_test)
    y_pred = [[] for _ in range(leaf_id_test_trees.shape[0])]
    for i, leaves in enumerate(leaf_id_test_trees.transpose()):
        for j, leaf in enumerate(leaves):
            leaf_regression = regressors[i][leaf]
            pred_j = leaf_regression.predict(pd.DataFrame(X_test.iloc[j,:]).transpose())
            y_pred[j].append(pred_j[0])
    y_pred = [sum(row)/len(row) for row in y_pred]
        
    print(f"Random Cactus RMSE: {rmse(y_pred, y_test):.3f}")
    print(f"Random Cactus R^2: {r2_score(y_test, y_pred):.3f}")

In [150]:
def fit_Lasso_tree(X, y, best_params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    base_dt = DecisionTreeRegressor(**best_params)
    base_dt.fit(X_train, y_train)
    base_dt_pred = base_dt.predict(X_test)
    base_dt_RMSE = rmse(base_dt_pred, y_test)
    print(f"Decision Tree Regressor RMSE: {base_dt_RMSE:.3f}")
    
    leaf_id = base_dt.apply(X_train)
    regressors = dict()
    leaf_nodes = set(leaf_id)
    for leaf in leaf_nodes:
        leaf_sample_X = X_train[leaf_id==leaf]
        leaf_sample_y = y_train[leaf_id==leaf]
        leaf_regression = Lasso()
        leaf_regression.fit(leaf_sample_X, leaf_sample_y)
        regressors[leaf] = leaf_regression
    
    leaf_id_test = base_dt.apply(X_test)
    y_pred = []
    for i, leaf in enumerate(leaf_id_test):
        leaf_regression = regressors[leaf]
        pred_i = leaf_regression.predict(pd.DataFrame(X_test.iloc[i,:]).transpose())
        y_pred.append(pred_i[0])
        
    print(f"Lasso Tree RMSE: {rmse(y_pred, y_test):.3f}")
    print(f"Lasso Tree R^2: {r2_score(y_test, y_pred):.3f}")

In [74]:
fit_Lasso_tree(base_X, base_y, {'min_samples_leaf':100})
fit_Lasso_tree(base_X, base_y, gs.best_params_)

Decision Tree Regressor RMSE: 0.828
Lasso Tree RMSE: 0.824
Decision Tree Regressor RMSE: 0.830
Lasso Tree RMSE: 0.827


In [160]:
# fit_Random_Cactus(base_X, base_y, {'min_samples_leaf':75, 'n_estimators':40})
fit_Random_Cactus(base_X, base_y, gs.best_params_)

Decision Tree Regressor RMSE: 6.868
Random Forest Regressor R^2: 0.549
Random Cactus RMSE: 6.837
Random Cactus R^2: 0.553


In [155]:
X_train, X_test, y_train, y_test = train_test_split(base_X, base_y, test_size=0.25)
lasso = LinearRegression()
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
rmse(y_pred, y_test)

6.935778304814304

In [156]:
lasso.coef_

array([ 0.55611743,  0.92786832,  0.04041201,  0.11809028,  0.0062149 ,
       -0.04541861])

In [134]:
X_test.shape

(735, 15)

In [21]:
"""Hand Picked Variables from R:
predscores+tmhalfsc+patt+ypa+ratt+ypr+sackyds"""
manual = reduced[["scores", "predscores", "tmhalfsc", "patt", "ypa", "ratt", "ypr", "sackyds"]]

In [22]:
man_X = manual.drop(columns="scores")

In [23]:
man_y = manual["scores"]

In [24]:
man_X_train, man_X_test, man_y_train, man_y_test = train_test_split(man_X, man_y, test_size=0.25)

In [25]:
man_rf = RandomForestRegressor(n_estimators=100)
man_rf.fit(man_X_train, man_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [26]:
man_rf_pred = man_rf.predict(man_X_test)

In [28]:
man_rf_RMSE = rmse(man_rf_pred, man_y_test)
man_rf_r2 = man_rf.score(man_X_test, man_y_test)
print(f"Manual Random Forest Regressor RMSE: {man_rf_RMSE:.3f}")
print(f"Manual Random Forest Regressor R^2: {man_rf_r2:.3f}")

Manual Random Forest Regressor RMSE: 0.821
Manual Random Forest Regressor R^2: 0.534


In [29]:
man_dt = DecisionTreeRegressor()
man_dt.fit(man_X_train, man_y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [30]:
man_dt_pred = man_dt.predict(man_X_test)

In [31]:
man_dt_RMSE = rmse(man_dt_pred, man_y_test)
man_dt_r2 = man_dt.score(man_X_test, man_y_test)
print(f"Manual Decision Tree Regressor RMSE: {man_dt_RMSE:.3f}")
print(f"Manual Decision Tree Regressor R^2: {man_dt_r2:.3f}")

Manual Decision Tree Regressor RMSE: 1.128
Manual Decision Tree Regressor R^2: 0.119


## Grid Search for Manual Random Forest Regressor

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold

In [33]:
rf = RandomForestRegressor()
search = {"criterion":["mse"], "n_estimators":list(range(95, 105, 1)), 
         "max_depth":[5], "max_features":["auto"]}

In [34]:
rf_gs = GridSearchCV(rf, search, cv=10)

In [35]:
rf_gs.fit(man_X_train, man_y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['mse'], 'n_estimators': [95, 96, 97, 98, 99, 100, 101, 102, 103, 104], 'max_depth': [5], 'max_features': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
best_prms = rf_gs.best_params_
for param in best_prms:
    print(param, " -> ", best_prms[param])

criterion  ->  mse
max_depth  ->  5
max_features  ->  auto
n_estimators  ->  96


In [37]:
best_rf = RandomForestRegressor(criterion="mse", max_depth=5, max_features="auto", n_estimators=97)
best_rf.fit(man_X_train, man_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=97, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [38]:
best_rf_pred = best_rf.predict(man_X_test)

In [39]:
best_rf_RMSE = rmse(best_rf_pred, man_y_test)
best_rf_r2 = best_rf.score(man_X_test, man_y_test)
print(f"Best Random Forest Regressor RMSE: {best_rf_RMSE:.3f}")
print(f"Best Random Forest Regressor R^2: {best_rf_r2:.3f}")

Best Random Forest Regressor RMSE: 0.811
Best Random Forest Regressor R^2: 0.545


## Transformation of tmhalfsc

In [5]:
reset -fs

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from math import sqrt
import numpy as np
import pandas as pd

In [7]:
reduced = pd.read_csv("../data/reduced.csv")
reduced = reduced.drop(columns = reduced.columns[0])

In [8]:
X = reduced.drop(columns="scores")
y = reduced["scores"]

In [9]:
tmhalf = np.array(X.tmhalfsc)

In [10]:
sqrt_tmhalf = np.sqrt(tmhalf)
log_tmhalf = []
for i in range(len(tmhalf)):
    if tmhalf[i] != 0:
        log_tmhalf.append(np.log(tmhalf[i]))
    else:
        log_tmhalf.append(tmhalf[i])

In [37]:
X = X.drop(columns="tmhalfsc")
X["tmhalfsc"] = sqrt_tmhalf
# X["tmhalfsc"] = log_tmhalf

In [38]:
def rmse(pred, y):
    n = len(y)
    RMSE = np.sqrt(sum((pred - y)**2) / n)
    return RMSE

In [39]:
rmse_scores = []
r2_scores = []
rand_state = range(1,100)
for i in rand_state:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)
    rf = RandomForestRegressor(n_estimators=97, criterion="mse", max_depth=5, max_features="auto")
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    r2_scores.append(rf.score(X_test, y_test))
    rmse_scores.append(rmse(pred, y_test))

In [40]:
scores = list(zip(rand_state, rmse_scores, r2_scores))

In [41]:
best_state = sorted(scores, key=lambda x: x[1])[0]
print("Models sorted on RMSE ascending:")
print(f"Random State: {best_state[0]}")
print(f"RMSE: {best_state[1]:.4f}")
print(f"R^2: {best_state[2]:.4f}")

Models sorted on RMSE ascending:
Random State: 69
RMSE: 0.7571
R^2: 0.5941


In [42]:
print(f"Mean RMSE: {np.mean(rmse_scores)}")
print(f"Median RMSE: {np.median(rmse_scores)}")
print(f"Mean R^2: {np.mean(r2_scores)}")
print(f"Median R^2: {np.median(r2_scores)}")

Mean RMSE: 0.8253399573556494
Median RMSE: 0.824496993601382
Mean R^2: 0.5485405368747742
Median R^2: 0.5521310309815644


In [43]:
man_X = X[["predscores", "tmhalfsc", "patt", "ypa", "ratt", "ypr", "sackyds"]]
man_y = y

In [44]:
man_rmse_scores = []
man_r2_scores = []
for i in rand_state:
    man_X_train, man_X_test, man_y_train, man_y_test = train_test_split(man_X, man_y, test_size=0.25, random_state=i)
    man_rf = RandomForestRegressor(n_estimators=97, criterion="mse", max_depth=5, max_features="auto")
    man_rf.fit(man_X_train, man_y_train)
    man_pred = man_rf.predict(man_X_test)
    man_r2_scores.append(man_rf.score(man_X_test, man_y_test))
    man_rmse_scores.append(rmse(man_pred, man_y_test))

In [45]:
man_scores = list(zip(rand_state, man_rmse_scores, man_r2_scores))

In [46]:
best_state = sorted(man_scores, key=lambda x: x[1])[0]
print("Manual models sorted on RMSE ascending:")
print(f"Random State: {best_state[0]}")
print(f"Manual RMSE: {best_state[1]:.4f}")
print(f"Manual R^2: {best_state[2]:.4f}")

Manual models sorted on RMSE ascending:
Random State: 69
Manual RMSE: 0.7546
Manual R^2: 0.5968


In [47]:
print(f"Manual mean RMSE: {np.mean(man_rmse_scores)}")
print(f"Manual median RMSE: {np.median(man_rmse_scores)}")
print(f"Manual mean R^2: {np.mean(man_r2_scores)}")
print(f"Manual median R^2: {np.median(man_r2_scores)}")

Manual mean RMSE: 0.8267525649229137
Manual median RMSE: 0.8281975914892602
Manual mean R^2: 0.5469124796970818
Manual median R^2: 0.5498250473093915
