In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import sklearn
from sklearn import model_selection as ms
sklearn.set_config(print_changed_only=False)

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model as lm

# import cupy as cp
# import cudf
# import cuml
# from cuml.preprocessing import OneHotEncoder
# from cuml.preprocessing import MinMaxScaler
# from cuml import train_test_split as cutts

# import dask_ml
# import dask_ml.model_selection as dcv
# from dask_ml.preprocessing import DummyEncoder
# from dask_ml.preprocessing import Categorizer
# from dask_ml.preprocessing import OneHotEncoder
# from dask_ml.preprocessing import MinMaxScaler

import xgboost

import copy

import itertools

import re

import import_ipynb

from _Self_Written_Functions_Sheet import (conditions_edit, nbrhd_rank, drop_nbrhd,
totallivsf_add, drop_grlivarea, bsmtfin_add, drop_bsmtfinsf12, totalporchsf_add,
drop_porches, yrsremodtosold_add, drop_yrsoldremod, fireplace_yes, drop_fireplaces,
pool_yes, drop_poolarea, totalbaths_add, drop_baths, totalbsmtbaths_add,
drop_bsmtbaths)

from _Self_Written_Functions_Sheet import (mlr_model, lasso_model, ridge_model, enet_model, 
                                           rfr_model, gbm_model, xgb_model)

from _Self_Written_Functions_Sheet import (dum_scale, use_og_data, comb_list_generator,
                                           apply_comb_list, rmse)

importing Jupyter notebook from _Self_Written_Functions_Sheet.ipynb


In [2]:
# importing the datasets
# All CSVs are read from DATA_DIR. Forward slashes are used as the separator
# because they resolve on Windows as well as on macOS/Linux, whereas a
# backslash separator is Windows-only and forms an invalid escape sequence
# in a normal string literal.
# Set DATA_DIR to 'Proj3_Datasets' if the files live in that folder instead.
DATA_DIR = '..'

hp_train = pd.read_csv(f'{DATA_DIR}/hp_2a_ranked_edited_train.csv', index_col=0)
hp_test = pd.read_csv(f'{DATA_DIR}/hp_2a_ranked_edited_test.csv', index_col=0)
hp_logsp = pd.read_csv(f'{DATA_DIR}/hp_1a_no_imputation_logsaleprice.csv', index_col=0)
hp_saleprice = pd.read_csv(f'{DATA_DIR}/hp_1a_no_imputation_saleprice.csv', index_col=0)

# saving train index (used later to slice the train rows back out of the
# combined frame)
hp_index = hp_train.index

# stack train on top of test so both go through the same feature engineering
combo_nonedit = pd.concat([hp_train, hp_test])
# re-assigning the column labels also fails loudly if the concat produced a
# different number of columns than the train frame has
combo_nonedit.columns = hp_train.columns

# convert all possible numeric types to numeric; columns that cannot be cast
# to float are left unchanged (errors='ignore')
for col in combo_nonedit.columns:
    combo_nonedit[col] = combo_nonedit[col].astype('float', errors='ignore')

combo_cols = list(combo_nonedit.columns)

# independent working copy; combo_nonedit stays as the pristine reference
combo = copy.deepcopy(combo_nonedit)

In [3]:
def mlr_model(x_trn, y_trn, x_tst, y_tst):
    """Grid-search a plain linear regression and record its hold-out score.

    Parameters
    ----------
    x_trn, y_trn : training features and target handed to ``GridSearchCV.fit``.
    x_tst, y_tst : hold-out features and target handed to ``rmse``.

    Returns
    -------
    None. The value returned by ``rmse(search, y_tst, x_tst)`` is stored on
    the function itself as ``mlr_model.test_rmse``.

    Notes
    -----
    Reads the module-level ``n_folds`` splitter, so that name must exist
    before the first call. ``rmse`` is imported from the helper notebook;
    presumably it scores the refitted search on the hold-out data - confirm
    against its definition.
    """
    # only hyper-parameter explored: with or without an intercept term
    search = ms.GridSearchCV(
        estimator=LinearRegression(),
        param_grid={'fit_intercept': [True, False]},
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    mlr_model.test_rmse = rmse(search, y_tst, x_tst)
    
def lasso_model(x_trn, y_trn, x_tst, y_tst):
    """Grid-search a Lasso regression and record its hold-out score.

    Parameters
    ----------
    x_trn, y_trn : training features and target handed to ``GridSearchCV.fit``.
    x_tst, y_tst : hold-out features and target handed to ``rmse``.

    Returns
    -------
    None. The value returned by ``rmse(search, y_tst, x_tst)`` is stored on
    the function itself as ``lasso_model.test_rmse``.

    Notes
    -----
    Reads the module-level ``n_folds`` (CV splitter) and ``state`` (seed), so
    both must be defined before the first call.
    """
    # random_state makes the 'random' coordinate selection reproducible; it
    # has no effect on the 'cyclic' candidates.
    lasso = Lasso(max_iter=1000000, random_state=state)
    # parameters to use
    alphas = [0.0001, 0.000112, 0.000124, 0.000136, 0.000148]
    fitintercept = [True, False]
    selec = ['cyclic', 'random']
    gparam_lasso_1 = {'alpha': alphas,
                      'fit_intercept': fitintercept,
                      'selection': selec}
    # setting parameters
    gs_lasso_1 = ms.GridSearchCV(lasso, gparam_lasso_1, cv=n_folds, refit=True, n_jobs=-1,
                                 scoring='neg_root_mean_squared_error',
                                 return_train_score=True)
    gs_lasso_1.fit(x_trn, y_trn)
    lasso_model.test_rmse = rmse(gs_lasso_1, y_tst, x_tst)

# def ridge_model(x_trn, y_trn, x_tst, y_tst):
#     ridge = Ridge(random_state=state, max_iter=10000, fit_intercept=True)
#     # parameters to use
#     solvers = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
#     fitintercept = [True, False]
#     gparam_ridge_1 = {'solver': solvers,
#                       'fit_intercept': fitintercept, 
#                       'random_state': [state]}
#     # setting parameters
#     gs_ridge_1 = ms.GridSearchCV(ridge, gparam_ridge_1, cv=n_folds, refit=True, n_jobs=-1,
#                                  scoring='neg_root_mean_squared_error',
#                                  return_train_score=True)
#     gs_ridge_1.fit(x_trn, y_trn)
#     ridge_model.test_rmse = rmse(gs_ridge_1, y_tst, x_tst)
    
def enet_model(x_trn, y_trn, x_tst, y_tst):
    """Grid-search an ElasticNet regression and record its hold-out score.

    Parameters
    ----------
    x_trn, y_trn : training features and target handed to ``GridSearchCV.fit``.
    x_tst, y_tst : hold-out features and target handed to ``rmse``.

    Returns
    -------
    None. The value returned by ``rmse(search, y_tst, x_tst)`` is stored on
    the function itself as ``enet_model.test_rmse``.

    Notes
    -----
    Reads the module-level ``n_folds`` (CV splitter) and ``state`` (seed), so
    both must be defined before the first call.
    """
    # selection='random' picks coordinates at random, so the estimator is
    # seeded to keep scores comparable between runs.
    enet = ElasticNet(max_iter=10000000, selection='random', random_state=state)
    # parameters to use
    alphas = [1e-4, 2.5e-4, 5e-4]
    l1ratio = [0.5, 0.7, 1]
    precomputes = [True, False]
    fitintercept = [True, False]
    # warm_start is deliberately not searched: GridSearchCV fits a fresh clone
    # for every candidate/fold, and warm_start only matters when an already
    # fitted estimator is fitted again, so it would just double the fits.
    gparam_enet_1 = {'alpha': alphas,
                     'l1_ratio': l1ratio,
                     'precompute': precomputes,
                     'fit_intercept': fitintercept}
    # setting parameters
    gs_enet_1 = ms.GridSearchCV(enet, gparam_enet_1, cv=n_folds, refit=True, n_jobs=-1,
                                scoring='neg_root_mean_squared_error',
                                return_train_score=True)
    gs_enet_1.fit(x_trn, y_trn)
    enet_model.test_rmse = rmse(gs_enet_1, y_tst, x_tst)
    
def rfr_model(x_trn, y_trn, x_tst, y_tst):
    """Cross-validate a default random forest and record its hold-out score.

    Parameters
    ----------
    x_trn, y_trn : training features and target handed to ``GridSearchCV.fit``.
    x_tst, y_tst : hold-out features and target handed to ``rmse``.

    Returns
    -------
    None. The value returned by ``rmse(search, y_tst, x_tst)`` is stored on
    the function itself as ``rfr_model.test_rmse``.

    Notes
    -----
    Reads the module-level ``n_folds`` (CV splitter) and ``state`` (seed), so
    both must be defined before the first call.
    """
    # Seeded so that differences between rounds come from the feature set
    # being evaluated, not from the forest's bootstrap/feature sampling.
    rfr = RandomForestRegressor(random_state=state)
    # empty grid: a single candidate using the estimator's default settings
    gparam_rfr = {}
    # setting parameters
    gs_rfr = ms.GridSearchCV(rfr, gparam_rfr, cv=n_folds, refit=True, n_jobs=-1,
                             scoring='neg_root_mean_squared_error', return_train_score=True)
    gs_rfr.fit(x_trn, y_trn)
    rfr_model.test_rmse = rmse(gs_rfr, y_tst, x_tst)
    
def gbm_model(x_trn, y_trn, x_tst, y_tst):
    """Cross-validate a default gradient boosting regressor and record its hold-out score.

    Parameters
    ----------
    x_trn, y_trn : training features and target handed to ``GridSearchCV.fit``.
    x_tst, y_tst : hold-out features and target handed to ``rmse``.

    Returns
    -------
    None. The value returned by ``rmse(search, y_tst, x_tst)`` is stored on
    the function itself as ``gbm_model.test_rmse``.

    Notes
    -----
    Reads the module-level ``n_folds`` (CV splitter) and ``state`` (seed), so
    both must be defined before the first call.
    """
    # Seeded because the underlying trees permute features at every split,
    # which can change tie-breaking between otherwise identical runs.
    gbm = GradientBoostingRegressor(random_state=state)
    # empty grid: a single candidate using the estimator's default settings
    gparam_gbm = {}
    gs_gbm = ms.GridSearchCV(gbm, gparam_gbm, cv=n_folds, refit=True, n_jobs=-1,
                             scoring='neg_root_mean_squared_error', return_train_score=True)
    gs_gbm.fit(x_trn, y_trn)
    gbm_model.test_rmse = rmse(gs_gbm, y_tst, x_tst)
    
def xgb_model(x_trn, y_trn, x_tst, y_tst):
    """Cross-validate a default XGBoost regressor and record its hold-out score.

    Parameters
    ----------
    x_trn, y_trn : training features and target handed to ``GridSearchCV.fit``.
    x_tst, y_tst : hold-out features and target handed to ``rmse``.

    Returns
    -------
    None. The value returned by ``rmse(search, y_tst, x_tst)`` is stored on
    the function itself as ``xgb_model.test_rmse``.

    Notes
    -----
    Reads the module-level ``n_folds`` splitter, so that name must exist
    before the first call.
    """
    # The empty grid means exactly one candidate: the estimator's defaults.
    search = ms.GridSearchCV(
        estimator=xgboost.XGBRegressor(),
        param_grid={},
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    xgb_model.test_rmse = rmse(search, y_tst, x_tst)

In [4]:
# independent copy of the combined, unedited frame
hp = copy.deepcopy(combo_nonedit)

# seed used by the CV splitter below, plus a round counter
state, counter = 0, 0

# using standard kfold split (5 shuffled folds, seeded)
n_folds = ms.KFold(n_splits=5, shuffle=True, random_state=state)

# functions that change or add something
stuff_add = [
    nbrhd_rank,
    totallivsf_add,
    bsmtfin_add,
    totalporchsf_add,
    yrsremodtosold_add,
    fireplace_yes,
    pool_yes,
    totalbaths_add,
    totalbsmtbaths_add,
    conditions_edit,
]

# functions that drop columns
stuff_drop = [
    drop_nbrhd,
    drop_grlivarea,
    drop_bsmtfinsf12,
    drop_porches,
    drop_yrsoldremod,
    drop_fireplaces,
    drop_poolarea,
    drop_baths,
    drop_bsmtbaths,
]

# models to evaluate; the position in this list is the model's number later on
models = [
    mlr_model,
    lasso_model,
    enet_model,
    rfr_model,
    gbm_model,
    xgb_model,
]

# number of combinations the self-written generator produces from the two
# lists above (left as the last expression so the notebook displays it)
len(comb_list_generator(stuff_add, stuff_drop))

92378

In [5]:
# will not be using a function for ease of access for variables

state = 0
counter = 0

# Lookup table from function name to function for every feature-engineering
# step, so the names produced by comb_list_generator can be dispatched
# directly instead of being passed through eval().
steps_by_name = {func.__name__: func for func in stuff_add + stuff_drop}

# one score dictionary per model, keyed by the model's 1-based position in
# `models`; each inner dictionary maps "functions used" -> test RMSE
funcs_scores = {model_no: {} for model_no in range(1, len(models) + 1)}

for cb_list in comb_list_generator(stuff_add, stuff_drop):

    # combo_nonedit was already coerced to numeric dtypes when it was built,
    # so each round only needs a fresh, independent copy of it
    df = copy.deepcopy(combo_nonedit)

    # converting the single string within the list to a list of strings
    cb_split = re.sub("['']", "", str(cb_list)).strip('][').split(', ')
    # converting the list of strings to a single string
    funcs_used = ', '.join(cb_split)

    # calling all of the functions within the list; their return values are
    # ignored, so they are expected to modify df in place
    for cb in cb_split:
        if not cb:
            # an empty combination yields a single empty name: nothing to apply
            continue
        if cb not in steps_by_name:
            raise NameError(f'unknown feature-engineering step: {cb!r}')
        steps_by_name[cb](df)

    # getting the train dataset for modeling after dummification
    hp_touse = dum_scale(df).loc[hp_index]

    # setting up train and test sets
    xtrain, xtest, ytrain, ytest = ms.train_test_split(hp_touse, hp_logsp,
                                                       test_size=0.2,
                                                       random_state=state)
    # Must flatten to fit
    ytrain = ytrain.values.flatten()

    # run the models; each model has a designated number (its 1-based position
    # in the list: models)
    for model_no, model in enumerate(models, start=1):
        model(xtrain, ytrain, xtest, ytest)
        # add scores to the dictionary of dictionaries
        funcs_scores[model_no][funcs_used] = model.test_rmse

    # keeping track of rounds
    counter += 1
    if counter % 5 == 0:
        print(f'Completed round {counter}')

# return funcs_scores

Completed round 5
Completed round 10
Completed round 15
Completed round 20
Completed round 25
Completed round 30
Completed round 35
Completed round 40
Completed round 45
Completed round 50
Completed round 55
Completed round 60
Completed round 65
Completed round 70
Completed round 75
Completed round 80
Completed round 85
Completed round 90
Completed round 95
Completed round 100
Completed round 105
Completed round 110
Completed round 115
Completed round 120
Completed round 125
Completed round 130
Completed round 135
Completed round 140
Completed round 145
Completed round 150
Completed round 155
Completed round 160
Completed round 165
Completed round 170
Completed round 175
Completed round 180
Completed round 185
Completed round 190
Completed round 195
Completed round 200
Completed round 205
Completed round 210
Completed round 215
Completed round 220
Completed round 225
Completed round 230
Completed round 235
Completed round 240
Completed round 245
Completed round 250
Completed round 255


Completed round 2010
Completed round 2015
Completed round 2020
Completed round 2025
Completed round 2030
Completed round 2035
Completed round 2040
Completed round 2045
Completed round 2050
Completed round 2055
Completed round 2060
Completed round 2065
Completed round 2070
Completed round 2075
Completed round 2080
Completed round 2085
Completed round 2090
Completed round 2095
Completed round 2100
Completed round 2105
Completed round 2110
Completed round 2115
Completed round 2120
Completed round 2125
Completed round 2130
Completed round 2135
Completed round 2140
Completed round 2145
Completed round 2150
Completed round 2155
Completed round 2160
Completed round 2165
Completed round 2170
Completed round 2175
Completed round 2180
Completed round 2185
Completed round 2190
Completed round 2195
Completed round 2200
Completed round 2205
Completed round 2210
Completed round 2215
Completed round 2220
Completed round 2225
Completed round 2230
Completed round 2235
Completed round 2240
Completed rou

Completed round 3965
Completed round 3970
Completed round 3975
Completed round 3980
Completed round 3985
Completed round 3990
Completed round 3995
Completed round 4000
Completed round 4005
Completed round 4010
Completed round 4015
Completed round 4020
Completed round 4025
Completed round 4030
Completed round 4035
Completed round 4040
Completed round 4045
Completed round 4050
Completed round 4055
Completed round 4060
Completed round 4065
Completed round 4070
Completed round 4075
Completed round 4080
Completed round 4085
Completed round 4090
Completed round 4095
Completed round 4100
Completed round 4105
Completed round 4110
Completed round 4115
Completed round 4120
Completed round 4125
Completed round 4130
Completed round 4135
Completed round 4140
Completed round 4145
Completed round 4150
Completed round 4155
Completed round 4160
Completed round 4165
Completed round 4170
Completed round 4175
Completed round 4180
Completed round 4185
Completed round 4190
Completed round 4195
Completed rou

Completed round 5920
Completed round 5925
Completed round 5930
Completed round 5935
Completed round 5940
Completed round 5945
Completed round 5950
Completed round 5955
Completed round 5960
Completed round 5965
Completed round 5970
Completed round 5975
Completed round 5980
Completed round 5985
Completed round 5990
Completed round 5995
Completed round 6000
Completed round 6005
Completed round 6010
Completed round 6015
Completed round 6020
Completed round 6025
Completed round 6030
Completed round 6035
Completed round 6040
Completed round 6045
Completed round 6050
Completed round 6055
Completed round 6060
Completed round 6065
Completed round 6070
Completed round 6075
Completed round 6080
Completed round 6085
Completed round 6090
Completed round 6095
Completed round 6100
Completed round 6105
Completed round 6110
Completed round 6115
Completed round 6120
Completed round 6125
Completed round 6130
Completed round 6135
Completed round 6140
Completed round 6145
Completed round 6150
Completed rou

Completed round 7875
Completed round 7880
Completed round 7885
Completed round 7890
Completed round 7895
Completed round 7900
Completed round 7905
Completed round 7910
Completed round 7915
Completed round 7920
Completed round 7925
Completed round 7930
Completed round 7935
Completed round 7940
Completed round 7945
Completed round 7950
Completed round 7955
Completed round 7960
Completed round 7965
Completed round 7970
Completed round 7975
Completed round 7980
Completed round 7985
Completed round 7990
Completed round 7995
Completed round 8000
Completed round 8005
Completed round 8010
Completed round 8015
Completed round 8020
Completed round 8025
Completed round 8030
Completed round 8035
Completed round 8040
Completed round 8045
Completed round 8050
Completed round 8055
Completed round 8060
Completed round 8065
Completed round 8070
Completed round 8075
Completed round 8080
Completed round 8085
Completed round 8090
Completed round 8095
Completed round 8100
Completed round 8105
Completed rou

KeyboardInterrupt: 

In [None]:
# For every model number, print the combination of functions with the lowest
# recorded score, followed by that score.
for k_1, scores in funcs_scores.items():
    print(k_1)
    if scores:
        k_min = min(scores, key=scores.get)
        v_min = scores[k_min]
        print(k_min)
        print(v_min)
    else:
        # An interrupted search can leave a model without any score yet;
        # min() on an empty dictionary would raise ValueError.
        print('no scores recorded')
    print('#' * 50)