In [31]:
import numpy as np
import pandas as pd
# pd.set_option('display.max_columns', None)

import sklearn
from sklearn import model_selection as ms
# sklearn.set_config(print_changed_only=False)

# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn import linear_model as lm

import cupy as cp
import cudf
import cuml
# from cuml.preprocessing import OneHotEncoder
# from cuml.preprocessing import MinMaxScaler
# from cuml import train_test_split as cutts

import dask_ml
import dask_ml.model_selection as dcv
# from dask_ml.preprocessing import DummyEncoder
# from dask_ml.preprocessing import Categorizer
# from dask_ml.preprocessing import OneHotEncoder
# from dask_ml.preprocessing import MinMaxScaler

import xgboost

import copy

import itertools

import import_ipynb
from _Self_Written_Functions_Sheet import rmse

In [2]:
# importing the datasets
# Train / test feature tables; the first CSV column is used as the index.
hp_train = pd.read_csv('Proj3_Datasets/hp_2a_ranked_edited_train.csv', index_col=0)
hp_test = pd.read_csv('Proj3_Datasets/hp_2a_ranked_edited_test.csv', index_col=0)
# Modelling target (log sale price, going by the file name); first column is the index.
hp_logsp = pd.read_csv('Proj3_Datasets/hp_1a_no_imputation_logsaleprice.csv',
                                index_col=0)
# NOTE(review): unlike the three reads above this one does not pass index_col=0,
# so the frame gets a default RangeIndex and the CSV's first column stays a data
# column. nbrhd_rank() selects rows of this frame with hp_train's index labels and
# uses the median of the FIRST column -- confirm that column really is the price.
hp_saleprice = pd.read_csv('Proj3_Datasets/hp_1a_no_imputation_saleprice.csv')

# saving train index
# (used later to slice the train rows back out of the combined frame)
hp_index = hp_train.index

# Stack train on top of test so feature engineering is applied to both at once.
combo_nonedit = pd.concat([hp_train, hp_test])
# Relabel with the train column names (assumes hp_test has the same columns in
# the same order -- TODO confirm).
combo_nonedit.columns = hp_train.columns

# convert all possible numeric types to numeric
# errors='ignore' leaves a column untouched when it cannot be cast to float.
for col in combo_nonedit.columns: 
    combo_nonedit[col] = combo_nonedit[col].astype('float', errors='ignore')

combo_cols = list(combo_nonedit.columns)

# Working copy; combo_nonedit is kept as the pristine baseline.
combo = copy.deepcopy(combo_nonedit)

In [3]:
# functions:

def conditions_edit(df):
    """Collapse the railroad codes in 'Condition1' and 'Condition2' in place.

    Any value containing 'RRN' becomes exactly 'RRN' and any value containing
    'RRA' becomes exactly 'RRA'; missing values and all other values are left
    as they are. Returns None.
    """
    for column in ('Condition1', 'Condition2'):
        for code in ('RRN', 'RRA'):
            # na=False keeps missing entries out of the match
            hits = df[column].str.contains(code, na=False)
            df.loc[hits, column] = code

def nbrhd_rank(df):
    """Add an 'NbMedianRank' column ranking each neighborhood by median price.

    For every distinct value of df['Neighborhood'] the rows of the module-level
    ``hp_train`` with that neighborhood are located, the matching rows of the
    module-level ``hp_saleprice`` are selected by index label, and the median of
    that frame's first column is taken. Neighborhoods are then ranked 1, 2, ...
    from the highest median to the lowest (ties keep first-seen order) and the
    rank is written to df['NbMedianRank'] in place. Returns None.

    NOTE(review): the median is read from the *first* column of hp_saleprice;
    confirm that column holds the sale price.
    """
    # median price per neighborhood, computed from the training rows only
    median_by_nbrhd = {}
    for nbrhd in df['Neighborhood'].unique():
        train_rows = hp_train[hp_train['Neighborhood'] == nbrhd].index
        # .iloc[0] is explicit positional access; plain [0] on a label-indexed
        # Series relies on a deprecated positional fallback.
        median_by_nbrhd[nbrhd] = hp_saleprice.loc[train_rows].median().iloc[0]

    # highest median first; sorted() is stable, so ties keep insertion order
    ordered = sorted(median_by_nbrhd, key=median_by_nbrhd.get, reverse=True)
    rank_by_nbrhd = {nbrhd: rank for rank, nbrhd in enumerate(ordered, start=1)}

    # creating a column replacing the neighborhood name with rank
    df['NbMedianRank'] = df['Neighborhood'].replace(rank_by_nbrhd)

def drop_nbrhd(df):
    """Remove the 'Neighborhood' column from df in place when it is present."""
    if 'Neighborhood' not in df.columns:
        return
    df.drop(columns=['Neighborhood'], inplace=True)

def totallivsf_add(df):
    """Insert 'TotalLivSF' as the first column of df, in place.

    The value is GrLivArea + BsmtFinSF1 + BsmtFinSF2, added left to right.
    """
    total_sf = df['GrLivArea']
    for part in ('BsmtFinSF1', 'BsmtFinSF2'):
        total_sf = total_sf + df[part]
    df.insert(loc=0, column='TotalLivSF', value=total_sf)

def drop_grlivarea(df):
    """Remove the 'GrLivArea' column from df in place when it is present."""
    if 'GrLivArea' not in df.columns:
        return
    df.drop(columns=['GrLivArea'], inplace=True)

def bsmtfin_add(df):
    """Insert a 'BsmtFin' indicator as the first column of df, in place.

    The column starts as BsmtFinSF1 + BsmtFinSF2; every entry greater than 0
    is replaced by 1, all other entries keep the summed value.
    """
    finished_sf = df['BsmtFinSF1'] + df['BsmtFinSF2']
    # mask() swaps in 1 only where the condition holds
    indicator = finished_sf.mask(finished_sf > 0, 1)
    df.insert(0, 'BsmtFin', indicator)

def drop_bsmtfinsf12(df):
    """Remove 'BsmtFinSF1' and 'BsmtFinSF2' from df in place.

    Nothing is dropped unless both columns are present.
    """
    targets = ['BsmtFinSF1', 'BsmtFinSF2']
    if not all(name in df.columns for name in targets):
        return
    df.drop(columns=targets, inplace=True)

def totalporchsf_add(df):
    """Insert 'TotalPorchSF' as the first column of df, in place.

    The value is OpenPorchSF + EnclosedPorch + 3SsnPorch + ScreenPorch, added
    left to right.
    """
    porch_total = df['OpenPorchSF']
    for porch in ('EnclosedPorch', '3SsnPorch', 'ScreenPorch'):
        porch_total = porch_total + df[porch]
    df.insert(loc=0, column='TotalPorchSF', value=porch_total)

def drop_porches(df):
    """Remove the four porch-area columns from df in place.

    Nothing is dropped unless all of 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch' and 'ScreenPorch' are present.
    """
    porch_cols = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    if not all(name in df.columns for name in porch_cols):
        return
    df.drop(columns=porch_cols, inplace=True)

def yrsremodtosold_add(df):
    """Insert 'YrsRemodToSold' (YrSold - YearRemodAdd) as the first column of df.

    Both year columns are coerced with ``pd.to_numeric`` before subtracting,
    because apply_comb_list casts 'YrSold' to str before the feature functions
    run and a str column cannot be subtracted from. Columns that are already
    numeric pass through unchanged. The frame is modified in place; returns None.

    Raises:
        ValueError: if a year value cannot be parsed as a number.
    """
    # calculating the time between remodeling and selling the house
    year_sold = pd.to_numeric(df['YrSold'])
    year_remod = pd.to_numeric(df['YearRemodAdd'])
    df.insert(0, 'YrsRemodToSold', year_sold - year_remod)

def drop_yrsoldremod(df):
    """Remove 'YrSold' and 'YearRemodAdd' from df in place.

    Nothing is dropped unless both columns are present.
    """
    year_cols = ['YrSold', 'YearRemodAdd']
    if not all(name in df.columns for name in year_cols):
        return
    df.drop(columns=year_cols, inplace=True)

def fireplace_yes(df):
    """Insert a 0/1 'Fireplace' flag as the first column of df, in place.

    The flag is 1 where 'Fireplaces' is greater than 0 and 0 everywhere else
    (including missing values, which never compare greater than 0).
    """
    has_fireplace = df['Fireplaces'].gt(0).astype(int).tolist()
    df.insert(0, 'Fireplace', has_fireplace)

def drop_fireplaces(df):
    """Remove the 'Fireplaces' column from df in place when it is present."""
    if 'Fireplaces' not in df.columns:
        return
    df.drop(columns=['Fireplaces'], inplace=True)

def pool_yes(df):
    """Insert a 0/1 'Pool' flag as the first column of df, in place.

    The flag is 1 where 'PoolArea' is greater than 0 and 0 everywhere else
    (including missing values, which never compare greater than 0).
    """
    has_pool = df['PoolArea'].gt(0).astype(int).tolist()
    df.insert(0, 'Pool', has_pool)

def drop_poolarea(df):
    """Remove the 'PoolArea' column from df in place when it is present."""
    if 'PoolArea' not in df.columns:
        return
    df.drop(columns=['PoolArea'], inplace=True)

def totalbaths_add(df):
    """Insert 'TotalBaths' as the first column of df, in place.

    A half bath counts as 0.5: TotalBaths = FullBath + 0.5 * HalfBath.
    """
    half_bath_share = df['HalfBath'].mul(0.5)
    df.insert(0, 'TotalBaths', df['FullBath'].add(half_bath_share))

def drop_baths(df):
    """Remove 'FullBath' and 'HalfBath' from df in place.

    Nothing is dropped unless both columns are present.
    """
    bath_cols = ['FullBath', 'HalfBath']
    if not all(name in df.columns for name in bath_cols):
        return
    df.drop(columns=bath_cols, inplace=True)
    
def totalbsmtbaths_add(df):
    """Insert 'TotalBsmtBaths' as the first column of df, in place.

    A basement half bath counts as 0.5:
    TotalBsmtBaths = BsmtFullBath + 0.5 * BsmtHalfBath.
    """
    half_bath_share = df['BsmtHalfBath'].mul(0.5)
    df.insert(0, 'TotalBsmtBaths', df['BsmtFullBath'].add(half_bath_share))
    
def drop_bsmtbaths(df):
    """Remove 'BsmtFullBath' and 'BsmtHalfBath' from df in place.

    Nothing is dropped unless both columns are present.
    """
    bsmt_bath_cols = ['BsmtFullBath', 'BsmtHalfBath']
    if not all(name in df.columns for name in bsmt_bath_cols):
        return
    df.drop(columns=bsmt_bath_cols, inplace=True)

In [35]:
def mlr_model(x_trn, y_trn, x_tst, y_tst):
    """Grid-search a LinearRegression and record its held-out score.

    The search tries fit_intercept True/False, cross-validates with the
    module-level ``n_folds`` splitter, and refits the best setting on
    (x_trn, y_trn). The value returned by ``rmse(search, y_tst, x_tst)`` is
    stored on ``mlr_model.test_rmse``; the function itself returns None.

    NOTE(review): ``rmse`` comes from _Self_Written_Functions_Sheet and is not
    visible here -- presumably it predicts on x_tst and scores against y_tst.
    """
    search = ms.GridSearchCV(
        estimator=LinearRegression(),
        param_grid={'fit_intercept': [True, False]},
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    mlr_model.test_rmse = rmse(search, y_tst, x_tst)
    
def lasso_model(x_trn, y_trn, x_tst, y_tst):
    """Grid-search a Lasso regressor and record its held-out score.

    The grid covers five alpha values, fit_intercept True/False and both
    coordinate-selection modes; cross-validation uses the module-level
    ``n_folds`` splitter and the best setting is refit on (x_trn, y_trn).
    The value returned by ``rmse(search, y_tst, x_tst)`` is stored on
    ``lasso_model.test_rmse``; the function itself returns None.

    NOTE(review): ``rmse`` comes from _Self_Written_Functions_Sheet and is not
    visible here -- presumably it predicts on x_tst and scores against y_tst.
    """
    param_grid = {
        'alpha': [0.0001, 0.000112, 0.000124, 0.000136, 0.000148],
        'fit_intercept': [True, False],
        'selection': ['cyclic', 'random'],
    }
    search = ms.GridSearchCV(
        estimator=Lasso(max_iter=1000000),
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    lasso_model.test_rmse = rmse(search, y_tst, x_tst)

# def ridge_model(x_trn, y_trn, x_tst, y_tst):
#     ridge = Ridge(random_state=state, max_iter=10000, fit_intercept=True)
#     # parameters to use
#     solvers = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
#     fitintercept = [True, False]
#     gparam_ridge_1 = {'solver': solvers,
#                       'fit_intercept': fitintercept, 
#                       'random_state': [state]}
#     # setting parameters
#     gs_ridge_1 = ms.GridSearchCV(ridge, gparam_ridge_1, cv=n_folds, refit=True, n_jobs=-1,
#                                  scoring='neg_root_mean_squared_error',
#                                  return_train_score=True)
#     gs_ridge_1.fit(x_trn, y_trn)
#     ridge_model.test_rmse = rmse(gs_ridge_1, y_tst, x_tst)
    
def enet_model(x_trn, y_trn, x_tst, y_tst):
    """Grid-search an ElasticNet regressor and record its held-out score.

    The base estimator uses random coordinate selection; the grid varies
    alpha, l1_ratio, precompute, warm_start and fit_intercept.
    Cross-validation uses the module-level ``n_folds`` splitter and the best
    setting is refit on (x_trn, y_trn). The value returned by
    ``rmse(search, y_tst, x_tst)`` is stored on ``enet_model.test_rmse``; the
    function itself returns None.

    NOTE(review): ``rmse`` comes from _Self_Written_Functions_Sheet and is not
    visible here -- presumably it predicts on x_tst and scores against y_tst.
    """
    on_off = [True, False]
    param_grid = {
        'alpha': [1e-4, 2.5e-4, 5e-4],
        'l1_ratio': [0.5, 0.7, 1],
        'precompute': list(on_off),
        'warm_start': list(on_off),
        'fit_intercept': list(on_off),
    }
    search = ms.GridSearchCV(
        estimator=ElasticNet(max_iter=10000000, selection='random'),
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    enet_model.test_rmse = rmse(search, y_tst, x_tst)
    
def rfr_model(x_trn, y_trn, x_tst, y_tst):
    """Cross-validate a default RandomForestRegressor and record its held-out score.

    The parameter grid is empty, so only the estimator's default settings are
    evaluated. Cross-validation uses the module-level ``n_folds`` splitter and
    the model is refit on (x_trn, y_trn). The value returned by
    ``rmse(search, y_tst, x_tst)`` is stored on ``rfr_model.test_rmse``; the
    function itself returns None.

    NOTE(review): no random_state is passed to the forest, so scores may vary
    between runs. ``rmse`` is defined in _Self_Written_Functions_Sheet and is
    not visible here.
    """
    search = ms.GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={},
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    rfr_model.test_rmse = rmse(search, y_tst, x_tst)
    
def gbm_model(x_trn, y_trn, x_tst, y_tst):
    """Cross-validate a default GradientBoostingRegressor and record its held-out score.

    The parameter grid is empty, so only the estimator's default settings are
    evaluated. Cross-validation uses the module-level ``n_folds`` splitter and
    the model is refit on (x_trn, y_trn). The value returned by
    ``rmse(search, y_tst, x_tst)`` is stored on ``gbm_model.test_rmse``; the
    function itself returns None.

    NOTE(review): ``rmse`` is defined in _Self_Written_Functions_Sheet and is
    not visible here.
    """
    search = ms.GridSearchCV(
        estimator=GradientBoostingRegressor(),
        param_grid={},
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    gbm_model.test_rmse = rmse(search, y_tst, x_tst)
    
def xgb_model(x_trn, y_trn, x_tst, y_tst):
    """Cross-validate a default XGBRegressor and record its held-out score.

    The parameter grid is empty, so only the estimator's default settings are
    evaluated. Cross-validation uses the module-level ``n_folds`` splitter and
    the model is refit on (x_trn, y_trn). The value returned by
    ``rmse(search, y_tst, x_tst)`` is stored on ``xgb_model.test_rmse``; the
    function itself returns None.

    NOTE(review): ``rmse`` is defined in _Self_Written_Functions_Sheet and is
    not visible here.
    """
    search = ms.GridSearchCV(
        estimator=xgboost.XGBRegressor(),
        param_grid={},
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
        refit=True,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(x_trn, y_trn)
    xgb_model.test_rmse = rmse(search, y_tst, x_tst)

In [5]:
# def dum_scale():
#     # dummify the variables
#     hp_dum = DummyEncoder().fit_transform(combo, drop_first=True)
#     enc = OneHotEncoder()
#     for feat in ['MoSold', 'YrSold']:
#         if feat in hp_dum.columns:
#             # encode the feature
#             enc.fit(hp_dum[[feat]])
#             # create array and then dataframe of the array
#             oh_labels = enc.transform(hp_dum[[feat]]).toarray()
#             mssubcl_dum = cudf.DataFrame(oh_labels)
#             mssubcl_dum.columns = enc.get_feature_names_out([feat])
#             # Concatenate the dataframes and drop Id and original MSSubClass
#             hp_dum = cudf.concat([hp_dum, mssubcl_dum], axis=1, sort=False)
#             hp_dum = hp_dum.drop([feat], axis=1)
#     hp_dum_cols = list(hp_dum.columns)
#     for col in hp_dum_cols:
#         hp_dum[col] = MinMaxScaler().fit_transform(cudf.array(hp_dum[col]).reshape(-1,1))
#         hp_dum.columns = hp_dum_cols
#     dum_scale.hp_dum = copy.deepcopy(hp_dum)

In [22]:
def dum_scale(df):
    """Dummy-encode df and min-max scale every resulting column.

    ``pd.get_dummies(..., drop_first=True)`` builds the encoded frame, then
    each column is rescaled on its own with a freshly fitted MinMaxScaler.
    A deep copy of the scaled frame is returned.

    NOTE(review): each scaler is fit on every row of df; when df holds train
    and test rows together the test rows influence the scaling -- confirm
    this is acceptable.
    """
    encoded = pd.get_dummies(df, drop_first=True)
    # snapshot the names first so the loop is independent of the assignments
    for name in list(encoded.columns):
        # the scaler expects a 2-D array, hence the single-column reshape
        as_column = np.array(encoded[name]).reshape(-1, 1)
        encoded[name] = MinMaxScaler().fit_transform(as_column)
    return encoded.copy(deep=True)

In [7]:
def use_og_data(df):
    """Return df unchanged.

    No-op counterpart to the add/drop functions; comb_list_generator emits the
    label 'use_og_data' for the combination that applies no changes.
    """
    return df

def comb_list_generator(list_add, list_drop):
    '''
    Build a label for every pairing of an "add" subset with a "drop" subset
    of the same size.

    For each size r = 0, 1, 2, ... every r-element combination of list_add is
    paired with every r-element combination of list_drop. Subsets of different
    sizes are never paired, so this is not the full product of the two
    powersets.

    Parameters
    ----------
    list_add : sequence of functions that add or edit columns
    list_drop : sequence of functions that drop columns

    Returns
    -------
    list of single-element lists. Each inner list holds one string: the
    function names of the pairing joined by ', ' (add functions first), or
    'use_og_data' for the empty pairing.
    '''
    #https://stackoverflow.com/questions/464864/how-to-get-all-possible-combinations-of
    #-a-list-s-elements
    #https://stackoverflow.com/questions/61313027/python-executing-all-permutations-of
    #-list-of-functions

    def _names(funcs):
        # __name__ is stable; parsing repr() output breaks if the repr format changes
        return [getattr(func, '__name__', repr(func)) for func in funcs]

    comb_list = []
    # A size larger than the shorter list has no equal-sized partner subset.
    largest_size = min(len(list_add), len(list_drop))
    for size in range(largest_size + 1):
        add_subsets = itertools.combinations(list_add, size)
        drop_subsets = list(itertools.combinations(list_drop, size))

        for add_subset, drop_subset in itertools.product(add_subsets, drop_subsets):
            names = _names(add_subset) + _names(drop_subset)
            # the empty pairing means "leave the dataset as it is"
            label = ', '.join(names) if names else 'use_og_data'
            comb_list.append([label])

    return comb_list

In [37]:
# NOTE(review): `hp` does not appear to be read by any code shown in this notebook.
hp = copy.deepcopy(combo_nonedit)
# seed reused for the KFold shuffle below and for train_test_split in apply_comb_list
state = 0
# round counter referenced by apply_comb_list
counter = 0

# using standard kfold split
# Despite its name, n_folds is the KFold splitter object (5 shuffled splits), not
# an integer; the model functions pass it as cv= to GridSearchCV.
n_folds = ms.KFold(n_splits=5, random_state=state, shuffle=True)

# lists of the functions separated by whether something is changed or added
stuff_add = [nbrhd_rank, totallivsf_add, bsmtfin_add, totalporchsf_add, 
             yrsremodtosold_add, fireplace_yes, pool_yes, totalbaths_add,
             totalbsmtbaths_add, conditions_edit]

# list of functions that will drop columns
stuff_drop = [drop_nbrhd, drop_grlivarea, drop_bsmtfinsf12,  drop_porches,
              drop_yrsoldremod, drop_fireplaces, drop_poolarea,
              drop_baths, drop_bsmtbaths]

# list of models to use
# Each entry stores its score on its own `test_rmse` attribute when called.
models = [mlr_model, lasso_model,  enet_model, rfr_model, gbm_model, xgb_model]

# using self-written function to generate permutations of 2 powersets
# Last expression of the cell: the full list of combination labels is displayed.
comb_list_generator(stuff_add, stuff_drop)

[['use_og_data'],
 ['nbrhd_rank, drop_nbrhd'],
 ['nbrhd_rank, drop_grlivarea'],
 ['nbrhd_rank, drop_bsmtfinsf12'],
 ['nbrhd_rank, drop_porches'],
 ['nbrhd_rank, drop_yrsoldremod'],
 ['nbrhd_rank, drop_fireplaces'],
 ['nbrhd_rank, drop_poolarea'],
 ['nbrhd_rank, drop_baths'],
 ['nbrhd_rank, drop_bsmtbaths'],
 ['totallivsf_add, drop_nbrhd'],
 ['totallivsf_add, drop_grlivarea'],
 ['totallivsf_add, drop_bsmtfinsf12'],
 ['totallivsf_add, drop_porches'],
 ['totallivsf_add, drop_yrsoldremod'],
 ['totallivsf_add, drop_fireplaces'],
 ['totallivsf_add, drop_poolarea'],
 ['totallivsf_add, drop_baths'],
 ['totallivsf_add, drop_bsmtbaths'],
 ['bsmtfin_add, drop_nbrhd'],
 ['bsmtfin_add, drop_grlivarea'],
 ['bsmtfin_add, drop_bsmtfinsf12'],
 ['bsmtfin_add, drop_porches'],
 ['bsmtfin_add, drop_yrsoldremod'],
 ['bsmtfin_add, drop_fireplaces'],
 ['bsmtfin_add, drop_poolarea'],
 ['bsmtfin_add, drop_baths'],
 ['bsmtfin_add, drop_bsmtbaths'],
 ['totalporchsf_add, drop_nbrhd'],
 ['totalporchsf_add, drop_grl

In [29]:
# Outer keys are 1-based positions in `models`; each inner dict maps a
# combination label to the test RMSE that model scored on it.
funcs_scores = {1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}}

def apply_comb_list(com_list):
    """Run every feature-engineering combination through every model.

    For each entry of com_list a fresh copy of ``combo_nonedit`` is taken, the
    functions named in the entry are applied to it in place, the frame is
    dummy-encoded and scaled with ``dum_scale``, the train rows (``hp_index``)
    are split 80/20 against ``hp_logsp``, and each function in ``models`` is
    fitted and scored.

    Parameters
    ----------
    com_list : list of lists of str, as produced by comb_list_generator. Each
        string is one function name or several joined by ', '; every name must
        be defined at module level.

    Returns
    -------
    dict : the module-level ``funcs_scores``, updated in place as
        ``funcs_scores[model_position][label] = model.test_rmse``.

    Raises
    ------
    KeyError : if a label names a function that is not defined at module level.
    """
    for round_no, cb_l in enumerate(com_list, start=1):

        # Start every round from the untouched data; reusing one frame across
        # rounds would carry earlier inserts/drops into later combinations.
        combo = copy.deepcopy(combo_nonedit)

        for cb in cb_l:
            # A label may name several functions ('nbrhd_rank, drop_nbrhd').
            # Looking each name up avoids eval(), which turns such a label
            # into a tuple that cannot be called.
            for func_name in cb.split(','):
                globals()[func_name.strip()](combo)

        # Cast to str only after the feature functions have run, because
        # yrsremodtosold_add does arithmetic on YrSold. As strings these
        # columns get dummy-encoded by pd.get_dummies inside dum_scale.
        for my in ('MoSold', 'YrSold'):
            if my in combo.columns:
                combo[my] = combo[my].astype(str)

        # getting the train dataset for modeling
        hp_touse = dum_scale(combo).loc[hp_index]

        # setting up train and test sets
        xtrain, xtest, ytrain, ytest = ms.train_test_split(hp_touse, hp_logsp,
                                                           test_size=0.2,
                                                           random_state=state)
        # Must flatten to fit
        ytrain = ytrain.values.flatten()

        # run the models; positions are 1-based to match funcs_scores' keys
        for i, model in enumerate(models, start=1):
            model(xtrain, ytrain, xtest, ytest)
            # setdefault keeps this working if `models` grows beyond six entries
            funcs_scores.setdefault(i, {})[cb] = model.test_rmse

        print(f'Completed round {round_no}')
    return funcs_scores

In [38]:
# Score every generated add/drop combination with every function in `models`;
# the returned funcs_scores dict is the cell's displayed output.
apply_comb_list(comb_list_generator(stuff_add, stuff_drop))

ValueError: source code string cannot contain null bytes