# Important Feature Exploration

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from matplotlib import pyplot as plt
import seaborn as sns

import copy

In [2]:
# Load the per-model important-feature lists.
# NOTE: paths use forward slashes. The previous '..\hp_...' literals contained
# the invalid escape sequence '\h' (a warning today, an error in future Python
# versions) and only resolved to the parent directory on Windows; '../' points
# at the same files on Windows and also works on POSIX systems.
lasso_list = pd.read_csv('../hp_1c_important_feat_lasso.csv', index_col=0)
enet_list = pd.read_csv('../hp_1c_important_feat_enet.csv', index_col=0)
rfr_list = pd.read_csv('../hp_1c_important_feat_rfr.csv', index_col=0)
gbm_list = pd.read_csv('../hp_1c_important_feat_gbr.csv', index_col=0)
xgb_list = pd.read_csv('../hp_1c_important_feat_xgb.csv', index_col=0)
# full_list = pd.read_csv('../hp_1c_important_feat_full.csv', index_col=0)

# Load the train and test datasets
hp_train = pd.read_csv('../hp_1b_null_impute_train.csv', index_col=0)
hp_test = pd.read_csv('../hp_1b_null_impute_test.csv', index_col=0)

# Keep the train index labels; later cells use them to split `combo` again
hp_index = list(hp_train.index)

# Stack the train rows on top of the test rows. np.concatenate drops the
# original index (the result gets a fresh 0..n-1 index) and the column names,
# so the names are restored from the train frame.
combo = pd.DataFrame(np.concatenate([hp_train, hp_test]))
combo.columns = hp_train.columns

# Cast each column to float where possible; columns that cannot be cast
# (errors='ignore') are left unchanged.
for col in combo.columns:
    combo[col] = combo[col].astype('float', errors='ignore')

combo_cols = list(combo.columns)

In [3]:
# Collect the feature-list frames so they can all be processed the same way
df_list = [lasso_list, enet_list, rfr_list, gbm_list, xgb_list]
# https://stackoverflow.com/questions/14745022/how-to-split-a-dataframe-string-column-into-two-columns
for df in df_list:
    df.columns = ['FullFeat']
    # Split each name at the FIRST underscore into a main part and a sub part.
    # `n` is passed by keyword: pandas >= 2.0 rejects it as a positional
    # argument (str.split('_', 1, ...) raises TypeError there).
    df[['MainFeat', 'SubFeat']] = df['FullFeat'].str.split('_', n=1, expand=True)
    # Names without an underscore have no sub part; store '' instead of None
    df['SubFeat'] = df['SubFeat'].fillna('')

In [4]:
# Unique 'MainFeat' values of each model's feature list, as sets
lasso_tokeep, enet_tokeep, rfr_tokeep, gbm_tokeep, xgb_tokeep = (
    set(feat_df['MainFeat'])
    for feat_df in (lasso_list, enet_list, rfr_list, gbm_list, xgb_list)
)
# full_tokeep = set(full_list['MainFeat'])

In [5]:
# https://www.pythonpool.com/python-list-intersection/
# Main features present in all five model lists, sorted alphabetically
intx_tokeep = sorted(
    set.intersection(lasso_tokeep, enet_tokeep, rfr_tokeep, gbm_tokeep, xgb_tokeep)
)
intx_tokeep

['BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'Condition1',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Foundation',
 'Functional',
 'GarageCond',
 'GarageQual',
 'GarageType',
 'HeatingQC',
 'KitchenQual',
 'LotShape',
 'MSSubClass',
 'MSZoning',
 'MasVnrType',
 'MoSold',
 'Neighborhood',
 'PavedDrive',
 'RoofStyle',
 'SaleCondition',
 'SaleType']

# Transforming Data to Numerical

# Street

In [6]:
# Replace the Street labels with numbers: 'Pave' -> 1, 'Grvl' -> 0
for label, code in (('Pave', 1), ('Grvl', 0)):
    combo.loc[combo['Street'] == label, 'Street'] = code

combo[['Street']].head()

Unnamed: 0,Street
0,1
1,1
2,1
3,1
4,1


# Alley

In [7]:
# Replace the Alley labels with numbers: 'Pave' -> 2, 'Grvl' -> 1, 'None' -> 0
alley_codes = {'Pave': 2, 'Grvl': 1, 'None': 0}
for label, code in alley_codes.items():
    combo.loc[combo['Alley'] == label, 'Alley'] = code

combo[['Alley']].head()

Unnamed: 0,Alley
0,0
1,0
2,0
3,0
4,0


# Utilities

In [8]:
# Replace the Utilities labels with numbers; the position in this list is the
# code: 'ELO' -> 0, 'NoSeWa' -> 1, 'NoSewr' -> 2, 'AllPub' -> 3
for code, label in enumerate(['ELO', 'NoSeWa', 'NoSewr', 'AllPub']):
    combo.loc[combo['Utilities'] == label, 'Utilities'] = code

combo[['Utilities']].head()

Unnamed: 0,Utilities
0,3
1,3
2,3
3,3
4,3


# Central Air

In [9]:
# Replace the CentralAir flags with numbers: 'N' -> 0, 'Y' -> 1
for code, flag in enumerate(('N', 'Y')):
    combo.loc[combo['CentralAir'] == flag, 'CentralAir'] = code

combo[['CentralAir']].head()

Unnamed: 0,CentralAir
0,1
1,1
2,1
3,1
4,1


# PavedDrive

In [10]:
# Turn PavedDrive into a ranked feature: the first label in `rank` receives
# len(rank), the last one receives 1.
rank = ['Y', 'P', 'N']

# Walking the list backwards while counting from 1 yields exactly those scores
for score, level in enumerate(reversed(rank), start=1):
    combo.loc[combo['PavedDrive'] == level, 'PavedDrive'] = score

combo[['PavedDrive']].head()

Unnamed: 0,PavedDrive
0,3
1,3
2,3
3,3
4,3


# PoolQC

In [11]:
# Turn PoolQC into a ranked feature: the first label in `rank` receives
# len(rank) - 1 and the last one ('None') receives 0.
rank = ['Ex', 'Gd', 'TA', 'Fa', 'None']

pool_scores = dict(zip(rank, range(len(rank) - 1, -1, -1)))
for level, score in pool_scores.items():
    combo.loc[combo['PoolQC'] == level, 'PoolQC'] = score

combo[['PoolQC']].head()

Unnamed: 0,PoolQC
0,0
1,0
2,0
3,0
4,0


# Fence

In [12]:
# Turn Fence into a ranked feature: 'None' -> 0 up to 'GdPrv' -> len(rank) - 1
rank = ['GdPrv', 'GdWo', 'MnPrv', 'MnWw', 'None']

# Enumerating the reversed list gives the last label 0 and the first the top score
for score, level in enumerate(reversed(rank)):
    combo.loc[combo['Fence'] == level, 'Fence'] = score

combo[['Fence']].head()

Unnamed: 0,Fence
0,0
1,0
2,0
3,0
4,0


# 'ExterQual',  'ExterCond', 'HeatingQC',  'KitchenQual'

In [13]:
# Turn the four quality columns into ranked features on a shared scale:
# the first label in `rank` receives len(rank), the last one receives 1.
rank = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
quality_cols = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual']

for level, score in zip(rank, range(len(rank), 0, -1)):
    for quality_col in quality_cols:
        combo.loc[combo[quality_col] == level, quality_col] = score

combo[quality_cols].head()

Unnamed: 0,ExterQual,ExterCond,HeatingQC,KitchenQual
0,4,3,5,4
1,3,3,5,3
2,4,3,5,4
3,3,3,4,4
4,4,3,5,4


# 'BsmtQual',  'BsmtCond',  'GarageQual',  'GarageCond'

In [14]:
# Turn the basement/garage quality columns into ranked features on a shared
# scale: the first label in `rank` receives len(rank) - 1, 'None' receives 0.
rank = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None']
bsmt_garage_cols = ['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']

for level, score in zip(rank, range(len(rank) - 1, -1, -1)):
    for quality_col in bsmt_garage_cols:
        combo.loc[combo[quality_col] == level, quality_col] = score

combo[bsmt_garage_cols].head()

Unnamed: 0,BsmtQual,BsmtCond,GarageQual,GarageCond
0,4,3,3,3
1,4,3,3,3
2,4,3,3,3
3,3,4,3,3
4,4,3,3,3


In [15]:
# NOTE(review): this cell repeats the BsmtQual/BsmtCond/GarageQual/GarageCond
# ranking of the cell directly above. Once those columns hold numbers, none of
# the string comparisons below match, so re-running it changes nothing.
rank = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None']

lowest = len(rank) - 1
for position, level in enumerate(rank):
    for ranked_col in ('BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond'):
        combo.loc[combo[ranked_col] == level, ranked_col] = lowest - position

combo[['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']].head()

Unnamed: 0,BsmtQual,BsmtCond,GarageQual,GarageCond
0,4,3,3,3
1,4,3,3,3
2,4,3,3,3
3,3,4,3,3
4,4,3,3,3


# 'BsmtExposure'

In [16]:
# Turn BsmtExposure into a ranked feature: 'None' -> 0 up to 'Gd' -> len(rank) - 1
rank = ['Gd', 'Av', 'Mn', 'No', 'None']

for score, level in enumerate(rank[::-1]):
    combo.loc[combo['BsmtExposure'] == level, 'BsmtExposure'] = score

combo[['BsmtExposure']].head()

Unnamed: 0,BsmtExposure
0,1
1,4
2,2
3,1
4,3


# 'BsmtFinType1',  'BsmtFinType2'

In [17]:
# Turn both BsmtFinType columns into ranked features on a shared scale:
# 'None' -> 0 up to 'GLQ' -> len(rank) - 1
rank = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'None']
fin_type_cols = ['BsmtFinType1', 'BsmtFinType2']

for score, level in enumerate(reversed(rank)):
    for fin_col in fin_type_cols:
        combo.loc[combo[fin_col] == level, fin_col] = score

combo[fin_type_cols].head()

Unnamed: 0,BsmtFinType1,BsmtFinType2
0,6,1
1,5,1
2,6,1
3,5,1
4,6,1


# 'Electrical'

In [18]:
# Neighborhood of every row whose Electrical value is 'Mix'
combo.loc[combo['Electrical'] == 'Mix', 'Neighborhood']

398    IDOTRR
Name: Neighborhood, dtype: object

In [19]:
# Count the Electrical types among the rows of the 'IDOTRR' neighborhood
in_idotrr = combo['Neighborhood'] == 'IDOTRR'
combo_elec = combo.loc[in_idotrr, 'Electrical'].value_counts().reset_index()
combo_elec.columns = ['Type', 'Count']
combo_elec

Unnamed: 0,Type,Count
0,SBrkr,60
1,FuseA,22
2,FuseF,8
3,FuseP,2
4,Mix,1


In [20]:
# Turn Electrical into a ranked feature: the first label in `rank` receives
# len(rank), the last one ('Mix') receives 1.
rank = ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix']

for level, score in zip(rank, range(len(rank), 0, -1)):
    combo.loc[combo['Electrical'] == level, 'Electrical'] = score

combo['Electrical'].unique()

array([5, 3, 4, 2, 1], dtype=object)

# 'Functional'

In [21]:
# Turn Functional into a ranked feature: 'Typ' -> len(rank) - 1 down to 'Sal' -> 0
rank = ['Typ', 'Min1', 'Min2','Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']

top_score = len(rank) - 1
for position, level in enumerate(rank):
    combo.loc[combo['Functional'] == level, 'Functional'] = top_score - position

combo[['Functional']].head()

Unnamed: 0,Functional
0,7
1,7
2,7
3,7
4,7


# 'GarageFinish'

In [22]:
# Turn GarageFinish into a ranked feature: 'None' -> 0 up to 'Fin' -> len(rank) - 1
rank = ['Fin', 'RFn', 'Unf', 'None']

finish_scores = {level: len(rank) - 1 - position for position, level in enumerate(rank)}
for level, score in finish_scores.items():
    combo.loc[combo['GarageFinish'] == level, 'GarageFinish'] = score

combo[['GarageFinish']].head()

Unnamed: 0,GarageFinish
0,2
1,2
2,2
3,1
4,2


# Editing Values

In [23]:
# Swap the numeric MSSubClass codes for their text descriptions
# (easier to read, and usable for feature engineering with other columns)
mssc_dict = {
    20: '1-STORY 1946 & NEWER ALL STYLES',
    30: '1-STORY 1945 & OLDER',
    40: '1-STORY W/FINISHED ATTIC ALL AGES',
    45: '1-1/2 STORY - UNFINISHED ALL AGES',
    50: '1-1/2 STORY FINISHED ALL AGES',
    60: '2-STORY 1946 & NEWER',
    70: '2-STORY 1945 & OLDER',
    75: '2-1/2 STORY ALL AGES',
    80: 'SPLIT OR MULTI-LEVEL',
    85: 'SPLIT FOYER',
    90: 'DUPLEX - ALL STYLES AND AGES',
    120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
    150: '1-1/2 STORY PUD - ALL AGES',
    160: '2-STORY PUD - 1946 & NEWER',
    180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
    190: '2 FAMILY CONVERSION - ALL STYLES AND AGES',
}

# Codes that are not listed in the dictionary are left as they are
for class_code, description in mssc_dict.items():
    combo.loc[combo['MSSubClass'] == class_code, 'MSSubClass'] = description

combo[['MSSubClass']].head()

Unnamed: 0,MSSubClass
0,2-STORY 1946 & NEWER
1,1-STORY 1946 & NEWER ALL STYLES
2,2-STORY 1946 & NEWER
3,2-STORY 1945 & OLDER
4,2-STORY 1946 & NEWER


In [24]:
# Normalise the MSZoning label 'C (all)' to plain 'C'
is_c_all = combo['MSZoning'] == 'C (all)'
combo.loc[is_c_all, 'MSZoning'] = 'C'

combo['MSZoning'].unique()

array(['RL', 'RM', 'C', 'FV', 'RH'], dtype=object)

In [25]:
# Rename the BldgType label 'Twnhs' to 'TwnhsI' (spelling correction)
is_twnhs = combo['BldgType'] == 'Twnhs'
combo.loc[is_twnhs, 'BldgType'] = 'TwnhsI'

combo['BldgType'].unique()

array(['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'TwnhsI'], dtype=object)

# Separating and saving data

In [26]:
# Cast every column to float where that is possible; a column whose values
# cannot be cast is returned unchanged because of errors='ignore'.
for col_name in list(combo.columns):
    combo[col_name] = combo[col_name].astype('float', errors='ignore')

combo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2917 entries, 0 to 2916
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     2917 non-null   object 
 1   MSZoning       2917 non-null   object 
 2   LotFrontage    2917 non-null   float64
 3   LotArea        2917 non-null   float64
 4   Street         2917 non-null   float64
 5   Alley          2917 non-null   float64
 6   LotShape       2917 non-null   object 
 7   LandContour    2917 non-null   object 
 8   Utilities      2917 non-null   float64
 9   LotConfig      2917 non-null   object 
 10  LandSlope      2917 non-null   object 
 11  Neighborhood   2917 non-null   object 
 12  Condition1     2917 non-null   object 
 13  Condition2     2917 non-null   object 
 14  BldgType       2917 non-null   object 
 15  HouseStyle     2917 non-null   object 
 16  OverallQual    2917 non-null   float64
 17  OverallCond    2917 non-null   float64
 18  YearBuil

In [27]:
# Reduced dataset: only the main features that every model list contains
# (DataFrame.copy() is a deep copy by default, so no `copy` module is needed)
combo_red = combo[intx_tokeep].copy()

# `combo` was built by stacking the train rows on top of the test rows and has
# a fresh 0..n-1 index, so split by POSITION at the number of train rows.
# This replaces the hard-coded 1458 and the label lookup with `hp_index`,
# which only selected the right rows if the train index was exactly 0..n-1
# (in that case the result is identical).
n_train = len(hp_index)
hp_train, hp_train_red = combo.iloc[:n_train], combo_red.iloc[:n_train]
hp_test, hp_test_red = combo.iloc[n_train:], combo_red.iloc[n_train:]

# Save the ranked/edited datasets.
# Forward slashes: '..\h' is an invalid escape sequence and is not portable.
hp_train.to_csv('../hp_2a_ranked_edited_train.csv')
hp_test.to_csv('../hp_2a_ranked_edited_test.csv')

hp_train_red.to_csv('../hp_2a_ranked_edited_train_red.csv')
hp_test_red.to_csv('../hp_2a_ranked_edited_test_red.csv')

In [28]:
# dummify the categorical variables
hp_dum = pd.get_dummies(combo, drop_first=True)
hp_dum_red = pd.get_dummies(combo_red, drop_first=True)

hp_list = [hp_dum, hp_dum_red]

# MoSold and YrSold are expected to be numeric at this point (TODO confirm),
# so they are one-hot encoded explicitly here.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

# `combo` stacks the train rows above the test rows with a 0..n-1 index, so
# the frames are split by position at the number of train rows.
n_train = len(hp_index)

for i, df in enumerate(hp_list):
    # Encode ALL features on the same frame before splitting. Previously the
    # feature loop was the outer loop and restarted from the un-encoded frame
    # for every feature, so the YrSold pass overwrote the MoSold result and
    # MoSold was never one-hot encoded in the saved data.
    for feat in ['MoSold', 'YrSold']:
        if feat not in df.columns:
            continue
        # fit the encoder on this feature and get the dense one-hot matrix
        oh_labels = enc.fit_transform(df[[feat]]).toarray()
        # get_feature_names was removed in scikit-learn 1.2; fall back to it
        # only on old versions that lack get_feature_names_out
        if hasattr(enc, 'get_feature_names_out'):
            oh_names = enc.get_feature_names_out([feat])
        else:
            oh_names = enc.get_feature_names([feat])
        feat_dum = pd.DataFrame(oh_labels, columns=oh_names, index=df.index)
        # replace the original column by its one-hot columns (appended at the end)
        df = pd.concat([df.drop([feat], axis=1), feat_dum], axis=1, sort=False)

    # keep the encoded frame in the list as well
    hp_list[i] = df

    if i == 0:
        hp_train_dum = df.iloc[:n_train]
        hp_test_dum = df.iloc[n_train:]
    else:
        hp_train_dum_red = df.iloc[:n_train]
        hp_test_dum_red = df.iloc[n_train:]

# saving the imputed AND dummified datasets
# (forward slashes: '..\h' is an invalid escape sequence and is not portable)
hp_train_dum.to_csv('../hp_2a_ranked_edited_dum_train.csv')
hp_test_dum.to_csv('../hp_2a_ranked_edited_dum_test.csv')

# saving the imputed AND dummified reduced datasets
hp_train_dum_red.to_csv('../hp_2a_ranked_edited_dum_train_red.csv')
hp_test_dum_red.to_csv('../hp_2a_ranked_edited_dum_test_red.csv')

# Re-stack train and test into single frames for the scaling step
combo_dum = pd.DataFrame(np.concatenate([hp_train_dum, hp_test_dum]))
combo_dum.columns = hp_train_dum.columns

combo_dum_red = pd.DataFrame(np.concatenate([hp_train_dum_red, hp_test_dum_red]))
combo_dum_red.columns = hp_train_dum_red.columns

In [29]:
# scaling the dataset
from sklearn.preprocessing import MinMaxScaler

red_list = [combo_dum, combo_dum_red]

# The combined frames hold the train rows first, then the test rows, on a
# 0..n-1 index; split by position instead of the hard-coded 1458.
n_train = len(hp_index)

for i, df in enumerate(red_list):
    # MinMaxScaler rescales every column independently, so one fit on the whole
    # frame gives the same values as fitting a separate scaler per column.
    # The scaler is fitted on train and test rows together.
    df_full = pd.DataFrame(
        MinMaxScaler().fit_transform(df.to_numpy(dtype=float)),
        columns=df.columns,
        index=df.index,
    )

    if i == 0:
        hp_train_full = df_full.iloc[:n_train]
        hp_test_full = df_full.iloc[n_train:]
    else:
        hp_train_full_red = df_full.iloc[:n_train]
        hp_test_full_red = df_full.iloc[n_train:]

# saving the imputed AND dummified AND scaled datasets
# (forward slashes: '..\h' is an invalid escape sequence and is not portable)
hp_train_full.to_csv('../hp_2a_ranked_edited_dum_full_train.csv')
hp_test_full.to_csv('../hp_2a_ranked_edited_dum_full_test.csv')

hp_train_full_red.to_csv('../hp_2a_ranked_edited_dum_full_train_red.csv')
hp_test_full_red.to_csv('../hp_2a_ranked_edited_dum_full_test_red.csv')