In [1]:
%matplotlib inline
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels.api as sm
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training data, remove rows with TotalBsmtSF above 6000 and then rows
# with GrLivArea above 4600, and separate the SalePrice target from the
# predictors so that only the predictors go through preprocessing.
rawdf = pd.read_csv('train.csv')
# rawdf = pd.read_csv('test.csv')
# raw_features = rawdf.copy()
rawdf2 = rawdf.loc[~rawdf['TotalBsmtSF'].gt(6000)].reset_index(drop=True)
rawdf3 = rawdf2.loc[~rawdf2['GrLivArea'].gt(4600)].reset_index(drop=True)
prices = rawdf3['SalePrice']
raw_features = rawdf3.drop(columns=['SalePrice']).reset_index(drop=True)

In [103]:
# Column labels that DropCols removes near the end of preprocessing.
# NOTE: a later cell in this notebook assigns `dropcols` again; whichever
# assignment ran most recently is the one DropCols sees.
dropcols = [
    'Neighborhood_SawyerW', 'LotFrontage', 'Condition1', 'BldgType', 'BedroomAbvGr',
    'TotRmsAbvGrd', 'GrLivArea', 'TotalBsmtSF', 'MSSubClass', 'MasVnrArea',
    'Id', 'Street', 'Alley', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Condition2', 'OverallCond', 'YearRemodAdd',
    'RoofMatl', 'Exterior2nd', 'ExterCond', 'BsmtCond', 'BsmtFinType1',
    'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating',
    'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'KitchenAbvGr',
    'Functional', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish', 'GarageArea',
    'GarageQual', 'GarageCond', 'PavedDrive', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature',
    'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
    'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Veenker',
    'Neighborhood_ClearCr', 'Neighborhood_Gilbert', 'Neighborhood_NWAmes',
    'Foundation_Wood', 'Foundation_Stone', 'Neighborhood_Blueste',
    'GarageType_Basment', 'Neighborhood_NPkVill', 'Neighborhood_Mitchel',
    'Neighborhood_SWISU', 'GarageType_CarPort', 'Neighborhood_BrDale',
]

In [4]:
# Shared one-hot encoder: categories are inferred from the data, the first
# level of the feature is dropped, and a dense array is returned.
# The dense-output keyword is spelled `sparse_output` in current scikit-learn
# and `sparse` in older releases, so try the new spelling and fall back.
try:
    Dummymaker = OneHotEncoder(categories='auto', drop='first', sparse_output=False)
except TypeError:
    Dummymaker = OneHotEncoder(categories='auto', drop='first', sparse=False)


def DummyMake(dfx, colname):
    """Replace column ``colname`` of ``dfx`` with one-hot dummy columns.

    The module-level ``Dummymaker`` encoder is re-fitted on the column on every
    call. The returned frame has a fresh 0..n-1 index, the remaining original
    columns first, and the dummy columns (named ``<colname>_<level>``) appended.
    ``dfx`` itself is not modified.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Frame containing ``colname``.
    colname : str
        Label of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
    """
    encoded = Dummymaker.fit_transform(dfx[[colname]])
    # Older scikit-learn only has `get_feature_names`; newer releases only
    # have `get_feature_names_out`. Both yield the same `<colname>_<level>` labels.
    name_getter = getattr(Dummymaker, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = Dummymaker.get_feature_names
    dummies = pd.DataFrame(encoded, columns=name_getter([colname]))
    remaining = dfx.drop(columns=[colname]).reset_index(drop=True)
    return pd.concat((remaining, dummies), axis=1)

In [5]:
# Reusable imputers for missing values:
#   imp0    - fills with the constant 0
#   impNone - fills with the constant string 'none'
#   impTyp  - fills with the column's most frequent value
#   impMean - fills with the column mean
imp0 = SimpleImputer(strategy='constant', fill_value=0)
impNone = SimpleImputer(strategy='constant', fill_value='none')
impTyp = SimpleImputer(strategy='most_frequent')
impMean = SimpleImputer(strategy='mean')

In [6]:
def AdjustedSF(row):
    """Return the row's 'TotalBsmtSF', or its 'GrLivArea' when 'TotalBsmtSF' equals 0."""
    basement_sf = row['TotalBsmtSF']
    return row['GrLivArea'] if basement_sf == 0 else basement_sf

In [7]:
def GarageType(row):
    """Return the row's 'GarageType', with 'BuiltIn' mapped to 'Attchd'."""
    garage_kind = row['GarageType']
    return 'Attchd' if garage_kind == 'BuiltIn' else garage_kind

In [8]:
def KitchenQual(row):
    """Return the row's 'KitchenQual', with 'Ex' mapped to 'Gd'."""
    quality = row['KitchenQual']
    return 'Gd' if quality == 'Ex' else quality

In [9]:
def BsmtQual(row):
    """Return the row's 'BsmtQual', with 'Ex' mapped to 'Gd'."""
    quality = row['BsmtQual']
    return 'Gd' if quality == 'Ex' else quality

In [10]:
def TotalBsmtSFImpute(dfx):
    """Return a copy of ``dfx`` with missing 'TotalBsmtSF' values filled by ``imp0``.

    NOTE: a function with this same name and body is defined again further down
    in this notebook; the later definition replaces this one once it runs.
    """
    filled_sf = imp0.fit_transform(dfx[['TotalBsmtSF']])
    return dfx.assign(TotalBsmtSF=filled_sf)

In [11]:
def MSZoningBool(dfx):
    """Return a copy of ``dfx`` with 'MSZoning' recoded to 1 for 'RL' and 0 otherwise."""
    is_rl = dfx['MSZoning'].eq('RL')
    return dfx.assign(MSZoning=is_rl.astype('int64'))

In [12]:
def LotFrontageImpute(dfx):
    """Return a copy of ``dfx`` with missing 'LotFrontage' values filled by ``imp0``."""
    filled_frontage = imp0.fit_transform(dfx[['LotFrontage']])
    return dfx.assign(LotFrontage=filled_frontage)
    

In [13]:
def LotShapeBool(dfx):
    """Return a copy of ``dfx`` with 'LotShape' recoded to 1 for 'Reg' and 0 otherwise."""
    is_regular = dfx['LotShape'].eq('Reg')
    return dfx.assign(LotShape=is_regular.astype('int64'))

In [14]:
def NeighborhoodDums(dfx):
    """One-hot encode the 'Neighborhood' column of ``dfx`` via ``DummyMake``."""
    return DummyMake(dfx, colname='Neighborhood')

In [15]:
def Condition1Bool(dfx):
    """Return a copy of ``dfx`` with 'Condition1' recoded to 1 for 'Norm' and 0 otherwise."""
    is_normal = dfx['Condition1'].eq('Norm')
    return dfx.assign(Condition1=is_normal.astype('int64'))

In [16]:
def BldgTypeBool(dfx):
    """Return a copy of ``dfx`` with 'BldgType' recoded to 1 for '1Fam' and 0 otherwise."""
    is_single_family = dfx['BldgType'].eq('1Fam')
    return dfx.assign(BldgType=is_single_family.astype('int64'))

In [17]:
def HouseStyleBool(dfx):
    """Return a copy of ``dfx`` with 'HouseStyle' recoded to 1 for '2Story' and 0 otherwise."""
    is_two_story = dfx['HouseStyle'].eq('2Story')
    return dfx.assign(HouseStyle=is_two_story.astype('int64'))

In [18]:
def RoofStyleBool(dfx):
    """Return a copy of ``dfx`` with 'RoofStyle' recoded to 1 for 'Gable' and 0 otherwise."""
    is_gable = dfx['RoofStyle'].eq('Gable')
    return dfx.assign(RoofStyle=is_gable.astype('int64'))

In [19]:
def Exterior1stBool(dfx):
    """Return a copy of ``dfx`` with 'Exterior1st' recoded to 1 for 'VinylSd' and 0 otherwise."""
    is_vinyl = dfx['Exterior1st'].eq('VinylSd')
    return dfx.assign(Exterior1st=is_vinyl.astype('int64'))

In [20]:
def MasVnrTypeBool(dfx):
    """Return a copy of ``dfx`` with 'MasVnrType' recoded to 0 for 'None' and 1 otherwise.

    Missing values are first filled with the most frequent value via ``impTyp``.

    NOTE(review): this relies on the literal string 'None' surviving the CSV
    load; recent pandas releases appear to parse 'None' as missing by default
    in ``read_csv`` -- confirm against the installed version.
    """
    filled = dfx.assign(MasVnrType=impTyp.fit_transform(dfx[['MasVnrType']]))
    has_veneer = filled['MasVnrType'].ne('None')
    return filled.assign(MasVnrType=has_veneer.astype('int64'))

In [21]:
def ExterQualBool(dfx):
    """Return a copy of ``dfx`` with 'ExterQual' recoded to 1 for 'TA' and 0 otherwise."""
    is_typical = dfx['ExterQual'].eq('TA')
    return dfx.assign(ExterQual=is_typical.astype('int64'))

In [22]:
def FoundationDums(dfx):
    """One-hot encode the 'Foundation' column of ``dfx`` via ``DummyMake``."""
    return DummyMake(dfx, colname='Foundation')

In [23]:
def BsmtQualDums(dfx):
    """Fill missing 'BsmtQual' with its most frequent value, then one-hot encode it."""
    imputed = dfx.assign(BsmtQual=impTyp.fit_transform(dfx[['BsmtQual']]))
    return DummyMake(imputed, 'BsmtQual')

In [24]:
def BsmtExposureBool(dfx):
    """Return a copy of ``dfx`` with 'BsmtExposure' recoded to 0 for 'No' and 1 otherwise.

    Missing values are first filled with the most frequent value via ``impTyp``.
    """
    filled = dfx.assign(BsmtExposure=impTyp.fit_transform(dfx[['BsmtExposure']]))
    has_exposure = filled['BsmtExposure'].ne('No')
    return filled.assign(BsmtExposure=has_exposure.astype('int64'))

In [25]:
def TotalBsmtSFImpute(dfx):
    """Return a copy of ``dfx`` with missing 'TotalBsmtSF' values filled by ``imp0``.

    NOTE: this duplicates an earlier definition of the same name in this
    notebook; being later, this is the one in effect after a top-to-bottom run.
    """
    filled_sf = imp0.fit_transform(dfx[['TotalBsmtSF']])
    return dfx.assign(TotalBsmtSF=filled_sf)

In [26]:
def HeatingQCBool(dfx):
    """Return a copy of ``dfx`` with 'HeatingQC' recoded to 1 for 'Ex' and 0 otherwise."""
    is_excellent = dfx['HeatingQC'].eq('Ex')
    return dfx.assign(HeatingQC=is_excellent.astype('int64'))

In [27]:
def CentralAirBool(dfx):
    """Return a copy of ``dfx`` with 'CentralAir' recoded to 1 for 'Y' and 0 otherwise."""
    has_central_air = dfx['CentralAir'].eq('Y')
    return dfx.assign(CentralAir=has_central_air.astype('int64'))

In [28]:
def KitchenQualDums(dfx):
    """Fill missing 'KitchenQual' with its most frequent value, then one-hot encode it."""
    imputed = dfx.assign(KitchenQual=impTyp.fit_transform(dfx[['KitchenQual']]))
    return DummyMake(imputed, 'KitchenQual')

In [29]:
def FireplacesBool(dfx):
    """Return a copy of ``dfx`` with 'Fireplaces' recoded to 1 where the count is above 0, else 0."""
    has_fireplace = dfx['Fireplaces'].gt(0)
    return dfx.assign(Fireplaces=has_fireplace.astype('int64'))

In [30]:
def GarageTypeDums(dfx):
    """Fill missing 'GarageType' with the string 'none' via ``impNone``, then one-hot encode it."""
    imputed = dfx.assign(GarageType=impNone.fit_transform(dfx[['GarageType']]))
    return DummyMake(imputed, 'GarageType')

In [31]:
def GarageCarsImpute(dfx):
    """Return a copy of ``dfx`` with missing 'GarageCars' values filled by ``imp0``."""
    filled_cars = imp0.fit_transform(dfx[['GarageCars']])
    return dfx.assign(GarageCars=filled_cars)

In [32]:
def BathroomsCalc(dfx):
    """Return a copy of ``dfx`` with a 'Bathrooms' column added.

    'BsmtFullBath' is first filled with 0 where missing (via ``imp0``);
    'Bathrooms' is then BsmtFullBath + FullBath + HalfBath, each counted as 1.
    """
    filled = dfx.assign(BsmtFullBath=imp0.fit_transform(dfx[['BsmtFullBath']]))
    return filled.assign(
        Bathrooms=lambda d: d['BsmtFullBath'] + d['FullBath'] + d['HalfBath']
    )

In [33]:
def DropCols(dfx):
    """Return a copy of ``dfx`` without the columns listed in the global ``dropcols``.

    All columns are removed in a single ``drop`` call rather than one full-frame
    copy and re-index per column. The result has a fresh 0..n-1 index and
    ``dfx`` itself is left untouched.

    Raises
    ------
    KeyError
        If any label in ``dropcols`` is not a column of ``dfx``.
    """
    # list(...) snapshots the global so any iterable of labels is accepted.
    return dfx.drop(columns=list(dropcols)).reset_index(drop=True)

In [34]:
def SSfeatures(dfx):
    """Standardise every column of ``dfx`` with a freshly fitted ``StandardScaler``.

    Returns a new DataFrame with the same column labels as ``dfx`` and a
    default integer index; ``dfx`` is not modified.
    """
    scaler = StandardScaler()
    frame = dfx.copy()
    scaled_values = scaler.fit_transform(frame)
    return pd.DataFrame(scaled_values, columns=frame.columns)

## Run this cell to reload the raw data frame after each CV run

In [407]:
# Reload the training data from disk, remove rows with TotalBsmtSF above 6000
# and then rows with GrLivArea above 4600, and separate the SalePrice target
# from the predictors so that only the predictors go through preprocessing.
rawdf = pd.read_csv('train.csv')
rawdf2 = rawdf.loc[~rawdf['TotalBsmtSF'].gt(6000)].reset_index(drop=True)
rawdf3 = rawdf2.loc[~rawdf2['GrLivArea'].gt(4600)].reset_index(drop=True)
prices = rawdf3['SalePrice']
raw_features = rawdf3.drop(columns=['SalePrice']).reset_index(drop=True)

## Set `Kdrop` to the variable you want to drop, then rerun the `dropcols` cell

In [401]:
# Name of the feature currently being trialled for removal.
# NOTE(review): no other code visible in this notebook reads `Kdrop`;
# presumably it was meant to be included in `dropcols` -- confirm.
Kdrop = 'Bathrooms'

In [408]:
# Column labels that DropCols removes near the end of preprocessing.
# This assignment replaces any earlier value of `dropcols`.
dropcols = [
    'ExterQual', 'YearBuilt', 'GarageType', 'Exterior1st',
    'BsmtQual', 'RoofStyle', 'MSZoning', 'MasVnrType',
    'LotFrontage', 'Condition1', 'BldgType', 'BedroomAbvGr', 'TotRmsAbvGrd',
    'GrLivArea', 'TotalBsmtSF', 'MSSubClass', 'MasVnrArea', 'Id',
    'Street', 'Alley', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Condition2', 'OverallCond', 'YearRemodAdd', 'RoofMatl',
    'Exterior2nd', 'ExterCond', 'BsmtCond', 'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'Electrical',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'KitchenAbvGr', 'Functional',
    'FireplaceQu', 'GarageYrBlt', 'GarageFinish', 'GarageArea', 'GarageQual',
    'GarageCond', 'PavedDrive', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'FullBath',
    'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
]

## Review this function and comment out the step that handles the `Kdrop` variable, if applicable

In [409]:
def TABSPreProcess(dfx):
    """Run the full feature-preprocessing pipeline on a copy of ``dfx``.

    Two row-wise recodes are applied first ('AdjustedSF' is added and
    'KitchenQual' is collapsed), then each frame-level step below runs in
    order, ending with ``DropCols`` and ``SSfeatures``. Steps that are
    commented out are currently disabled; their raw columns are expected to be
    removed by ``DropCols`` instead.
    """
    df = dfx.copy()

    # Row-wise recodes.
    df['AdjustedSF'] = df.apply(AdjustedSF, axis=1)
    # df['GarageType'] = df.apply(GarageType, axis=1)
    df['KitchenQual'] = df.apply(KitchenQual, axis=1)
    # df['BsmtQual'] = df.apply(BsmtQual, axis=1)

    # Frame-level steps; each takes a DataFrame and returns a new one.
    steps = [
        # MSZoningBool,
        LotFrontageImpute,
        LotShapeBool,
        NeighborhoodDums,
        Condition1Bool,
        BldgTypeBool,
        HouseStyleBool,
        # RoofStyleBool,
        # Exterior1stBool,
        # MasVnrTypeBool,
        ExterQualBool,
        FoundationDums,
        # BsmtQualDums,
        BsmtExposureBool,
        TotalBsmtSFImpute,
        HeatingQCBool,
        CentralAirBool,
        KitchenQualDums,
        FireplacesBool,
        # GarageTypeDums,
        GarageCarsImpute,
        BathroomsCalc,
        DropCols,
        SSfeatures,
    ]
    for step in steps:
        df = step(df)
    return df

## RUN THESE AFTER EACH PREPROCESS:

In [410]:
# Run the whole preprocessing pipeline on the raw predictors.
processed_features = TABSPreProcess(raw_features)

In [411]:
# Work on a copy so later edits do not alter processed_features.
processed_df = processed_features.copy()

In [412]:
# Fill any missing AdjustedSF values with the column mean. This runs on the
# column as it comes out of TABSPreProcess (i.e. after SSfeatures) and changes
# processed_df only; processed_features keeps its original values.
processed_df['AdjustedSF'] = impMean.fit_transform(processed_df[['AdjustedSF']])

In [413]:
# Attach the target as the natural log of SalePrice, matched to the feature rows by index.
processed_df['SalePrice'] = rawdf3['SalePrice'].map(math.log)

## CV Process is Below:

In [414]:
# 5-fold cross-validation of a default Ridge regression on log(SalePrice).
# The design matrix is taken from processed_df (minus the target) so that the
# mean-imputed AdjustedSF column is what the model sees; processed_features
# does not carry that imputation and would pass any remaining NaN to Ridge.
X = processed_df.drop(columns=['SalePrice'])
y = processed_df['SalePrice']
ridge = Ridge()
# With no `scoring` argument, cross_val_score uses the estimator's own score method.
scores = cross_val_score(ridge, X, y, cv=5)
print(scores)
print(np.mean(scores))

[0.87178322 0.85896312 0.86015127 0.87392542 0.88242833]
0.8694502726441964


86945