In [1]:
import os
import pandas as pd
#import matplotlib.pyplot as plt
#%matplotlib inline
#import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
DATA_PATH = os.path.join('data')

In [95]:
def load_training_data(training_path=DATA_PATH):
    cvs_path = os.path.join(training_path, 'train.csv')
    return pd.read_csv(cvs_path)

In [96]:
def load_predicting_data(predicting_path=DATA_PATH):
    cvs_path = os.path.join(predicting_path, 'test.csv')
    return pd.read_csv(cvs_path)

In [132]:
training = load_training_data()

In [6]:
#pred = load_predicting_data()

In [6]:
training.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [133]:
train_set, val_set = train_test_split(training, test_size=0.3, random_state=42)

### Baseline model with few features

In [8]:
train_base = train_set.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'OverallQual', 'OverallCond']]
val_base = val_set.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'OverallQual', 'OverallCond']]

In [9]:
y_base = train_set.SalePrice
y_val = val_set.SalePrice

In [10]:
train_base.head()

Unnamed: 0,LotFrontage,LotArea,GrLivArea,OverallQual,OverallCond
135,80.0,10400,1682,7,6
1452,35.0,3675,1072,5,5
762,72.0,8640,1547,7,5
932,84.0,11670,1905,9,5
435,43.0,10667,1661,7,6


In [22]:
y_base.shape

(1022,)

### Build a pipeline

In [12]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
#from sklearn.compose import ColumnTransformer

#### numerical attributes preprocessor

In [13]:
num_transformer = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value=None),
                    StandardScaler()
)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lin = LinearRegression()

In [15]:
pipe_lin = make_pipeline(num_transformer, lin)

In [16]:
pipe_lin.fit(train_base, y_base);

In [17]:
predictions = pipe_lin.predict(val_base)
lin_mse = mean_squared_error(y_val, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

41446.14026812556

### Add a few more features

In [134]:
train_set.loc[:,'HouseAge'] = train_set.YrSold - train_set.YearBuilt
val_set.loc[:,'HouseAge'] = val_set.YrSold - val_set.YearBuilt

train_set.loc[:,'TotalPorchSF'] = train_set.OpenPorchSF + train_set.EnclosedPorch + train_set.ScreenPorch + train_set['3SsnPorch']
val_set.loc[:,'TotalPorchSF'] = val_set.OpenPorchSF + val_set.EnclosedPorch + val_set.ScreenPorch + val_set['3SsnPorch']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [135]:
train_base = train_set.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'OverallQual', 'OverallCond', 
                               'Neighborhood', 'BldgType', 'HouseAge', 'TotalPorchSF', 'BsmtExposure']]
val_base = val_set.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'OverallQual', 'OverallCond', 
                           'Neighborhood', 'BldgType', 'HouseAge', 'TotalPorchSF', 'BsmtExposure']]

In [136]:
y_base = train_set.SalePrice
y_val = val_set.SalePrice

In [137]:
train_base.head()

Unnamed: 0,LotFrontage,LotArea,GrLivArea,OverallQual,OverallCond,Neighborhood,BldgType,HouseAge,TotalPorchSF,BsmtExposure
135,80.0,10400,1682,7,6,NWAmes,1Fam,38,0,No
1452,35.0,3675,1072,5,5,Edwards,TwnhsE,1,28,Gd
762,72.0,8640,1547,7,5,Somerst,1Fam,1,45,Mn
932,84.0,11670,1905,9,5,Somerst,1Fam,1,191,No
435,43.0,10667,1661,7,6,CollgCr,1Fam,13,61,Av


In [103]:
num_transformer = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value=None),
                    StandardScaler()
)

cat_transformer = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='missing'),
                    OneHotEncoder(handle_unknown='ignore')
)

In [138]:
col_transformer = make_column_transformer(
    (cat_transformer, ['Neighborhood', 'BldgType', 'BsmtExposure']),
    (num_transformer, ['LotFrontage', 'LotArea', 'GrLivArea', 'OverallQual', 'OverallCond', 
                       'HouseAge', 'TotalPorchSF']),
    remainder='passthrough'
)

In [139]:
pipe_lin = make_pipeline(col_transformer, lin)

In [140]:
pipe_lin.fit(train_base, y_base);

In [141]:
predictions = pipe_lin.predict(val_base)
lin_mse = mean_squared_error(y_val, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

31742.795912240825

In [69]:
train_base.head()

Unnamed: 0,LotFrontage,LotArea,GrLivArea,OverallQual,OverallCond,Neighborhood,BldgType
135,80.0,10400,1682,7,6,NWAmes,1Fam
1452,35.0,3675,1072,5,5,Edwards,TwnhsE
762,72.0,8640,1547,7,5,Somerst,1Fam
932,84.0,11670,1905,9,5,Somerst,1Fam
435,43.0,10667,1661,7,6,CollgCr,1Fam


In [6]:
def EDA_df(df):
    lotcon = {'CulDSac': 0, 'Inside': 1, 'Corner': 2, 'FR2': 2, 'FR3': 3}
    df.loc[:,'LotConfig'] = df.loc[:,'LotConfig'].map(lotcon)
    
    bdtype = {'1Fam': 0, 'Duplex': 1, 'TwnhsE': 1, '2fmCon': 2, 'Twnhs': 3}
    df.loc[:,'BldgType'] = df.BldgType.map(bdtype)
    
    Roof = {'Gable': 1,'Flat': 0, 'Gambrel': 0, 'Hip': 0, 'Mansard': 0, 'Shed': 0}
    df.loc[:,'RoofStyle'] = df.RoofStyle.map(Roof)
    
    df.loc[:, 'LotFrontage'].fillna(0, inplace=True)
    #df.loc[:, 'LotFrontage'].fillna(0)
    df.loc[:, 'MasVnrArea'].fillna(0, inplace=True)
    df.loc[:, 'MasVnrType'].fillna('None', inplace=True)
    
    Masonry = {'None': 0, 'BrkFace': 1, 'Stone': 1, 'BrkCmn': 1} 
    df.loc[:,'MasVnrType'] = df.MasVnrType.map(Masonry)
    
    df.loc[:, 'Fence'].fillna('None', inplace=True)
    fencing = {'None': 0, 'MnPrv': 1, 'GdPrv': 1, 'GdWo': 1, 'MnWw': 1} 
    df.loc[:,'Fence'] = df.Fence.map(fencing)
    
    AC = {'Y': 1, 'N': 0} 
    df.loc[:,'CentralAir'] = df.CentralAir.map(AC)
    
    df.loc[:, 'Alley'].fillna('None', inplace=True)
    Ay = {'None': 0, 'Grvl': 1, 'Pave': 1} 
    df.loc[:,'Alley'] = df.Alley.map(Ay)
    
    land = {'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0} 
    df.loc[:,'LandContour'] = df.LandContour.map(land)
    
    fd = {'PConc': 1, 'CBlock': 0, 'BrkTil': 0, 'Slab': 0, 'Stone': 0, 'Wood': 0} 
    df.loc[:,'Foundation'] = df.Foundation.map(fd)
    
    df.loc[(df.Fireplaces>0), 'Fireplaces'] = 1
    
    lot = {'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0} 
    df.loc[:,'LotShape'] = df.LotShape.map(lot)
    
    drive = {'Y': 1, 'N': 0, 'P': 0} 
    df.loc[:,'PavedDrive'] = df.PavedDrive.map(drive)
    
    rank = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4, 'None': 0} 
    df.loc[:,'GarageCond'].fillna('None', inplace=True)
    df.loc[:,'GarageCond'] = df.GarageCond.map(rank)
    
    df.loc[:,'GarageQual'].fillna('None', inplace=True)
    df.loc[:,'GarageQual'] = df.GarageQual.map(rank)
    
    df.loc[:,'FireplaceQu'].fillna('None', inplace=True)
    df.loc[:,'FireplaceQu'] = df.FireplaceQu.map(rank)
    
    df.loc[:,'BsmtQual'].fillna('None', inplace=True)
    df.loc[:,'BsmtQual'] = df.BsmtQual.map(rank)
    
    df.loc[:,'BsmtCond'].fillna('None', inplace=True)
    df.loc[:,'BsmtCond'] = df.BsmtCond.map(rank)
    
    df.loc[:,'ExterQual'] = df.ExterQual.map(rank)
    df.loc[:,'KitchenQual'] = df.KitchenQual.map(rank)
    
    rank2 = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4} 
    df.loc[:,'HeatingQC'] = df.HeatingQC.map(rank2)
    df.loc[:,'ExterCond'] = df.ExterCond.map(rank2)
    
    condition = {'RRNe': 0, 'RRNn': 0, 'RRAe': 0, 'RRAn': 0,
            'Artery': 1, 'Feedr': 1, 'Norm': 2, 'PosN': 2, 'PosA': 2} 
    #0, railroad; 1, high traffic; 2, low traffic
    df.loc[:,'Condition1'] = df.Condition1.map(condition)
    
    df.loc[:, 'BsmtExposure'].fillna('None', inplace=True)
    bsmtExp = {'None': 0, 'No': 1, 'Mn': 1, 'Av': 2, 'Gd': 3} 
    df.loc[:,'BsmtExposure'] = df.BsmtExposure.map(bsmtExp)
    
    #neighbor = {'Blmngtn': 0, 'Blueste': 0, 'BrDale': 0, 'BrkSide': 3, 'ClearCr': 0,
    #       'CollgCr': 2, 'Crawfor': 2, 'Edwards': 1, 'Gilbert': 0, 'IDOTRR': 3,
    #       'MeadowV': 2, 'Mitchel': 2, 'NAmes': 0, 'NoRidge': 0, 'NPkVill': 0, 
    #        'NridgHt': 0, 'NWAmes': 0, 'OldTown': 3, 'SWISU': 2, 'Sawyer': 1,
    #        'SawyerW': 1, 'Somerst': 0, 'StoneBr': 0, 'Timber': 2, 'Veenker': 0
     #      } 
    #df.loc[:,'Neighborhood'] = df.Neighborhood.map(neighbor)

    #df.loc[:, 'GarageType'].fillna('None', inplace=True)
    #gartype = {'None': 0, '2Types': 1, 'CarPort': 1, 'Basment': 1,
    #        'BuiltIn': 1, 'Detchd': 2, 'Attchd': 3} 
    #df.loc[:,'GarageType'] = df.GarageType.map(gartype)
    
    df.loc[(df.OverallQual<=2), 'OverallQual'] = 0
    df.loc[(df.OverallQual>2) & (df.OverallQual<=4), 'OverallQual'] = 1
    df.loc[(df.OverallQual>4) & (df.OverallQual<=6), 'OverallQual'] = 2
    df.loc[(df.OverallQual>6) & (df.OverallQual<=8), 'OverallQual'] = 3
    df.loc[(df.OverallQual>=9), 'OverallQual'] = 4
    
    df.loc[(df.OverallCond<=2), 'OverallCond'] = 0
    df.loc[(df.OverallCond>2) & (df.OverallCond<=4), 'OverallCond'] = 1
    df.loc[(df.OverallCond>4) & (df.OverallCond<=6), 'OverallCond'] = 2
    df.loc[(df.OverallCond>6) & (df.OverallCond<=8), 'OverallCond'] = 3
    df.loc[(df.OverallCond>=9), 'OverallCond'] = 4
    
    ## new features
    #df.loc[:,'HouseAge'] = df.YrSold - df.YearBuilt [improved]
    df.loc[:,'GarageAge'] = df.YrSold - df.GarageYrBlt
    df.loc[:,'RecentRemodel'] = df.YrSold - df.YearRemodAdd
    #df.loc[:,'TotalPorchSF'] = df.OpenPorchSF + df.EnclosedPorch + df.ScreenPorch + df['3SsnPorch'] improved
    df.loc[:,'TotalBathAbGr'] = df.FullBath + df.HalfBath / 2
    df.loc[:,'TotalFinSF'] = df.BsmtFinSF1 + df.BsmtFinSF2 + df['1stFlrSF'] + df['2ndFlrSF']
    df.loc[:,'BsmtBath'] = df.BsmtFullBath + df.BsmtHalfBath

    df.loc[(df.BsmtBath>0), 'BsmtBath'] = 1
    
    #drop_attribs = ['Utilities', 'Heating', 'RoofMatl', 'SaleCondition', 'Electrical', 'SaleType',
    #            'PoolQC', 'MiscFeature', 'Street', 'Functional', 'MSSubClass', 'LandSlope', 
    #            'HouseStyle', 'YearBuilt', 'GarageYrBlt', 'MoSold', 'MiscVal', 'FullBath', 
    #            'HalfBath', 'BsmtFullBath', 'BsmtHalfBath', 'LowQualFinSF', 'PoolArea', 'YrSold',
    #            'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    #            'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', '3SsnPorch', 'GarageCars', 'KitchenAbvGr',
    #            'BedroomAbvGr', 'Exterior1st', 'Exterior2nd', 'Condition2', 'BsmtFinType1', 'BsmtFinType2',
    #           'GarageFinish', 'MSZoning', 'OverallCond', 'ExterCond', 'LandContour', 'GarageAge', 'LotConfig']
   # df_out = df.loc[:, ~df.columns.isin(drop_attribs)]
    return df_out

In [30]:
#from sklearn.preprocessing import OrdinalEncoder
#ordinal_encoder = OrdinalEncoder()

In [37]:
#train_lotconf = train_set[['LotConfig']]
#lotconfigureation = ordinal_encoder.fit_transform(train_lotconf)
#lotconfigureation

In [38]:
ColumnTransformer(transformers=[('standardscaler', StandardScaler(...),
                                 ['numerical_column']),
                                ('onehotencoder', OneHotEncoder(...),
                                 ['categorical_column'])])

In [46]:
num_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'HouseAge',
           'RecentRemodel', 'TotalPorchSF', 'TotalBathAbGr', 'TotalFinSF']

In [53]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

preprocessor = make_column_transformer(
        ( StandardScaler(), num_cols ), remainder="passthrough"
        #( OneHotEncoder(), ['categorical_column'] )
)


#scaling = StandardScaler()

In [7]:
train_cleaned = EDA_df(train_set)

In [60]:
train_cleaned_transformed = train_cleaned.copy()

In [61]:
train_cleaned_transformed[num_cols] = preprocessor.fit_transform(train_cleaned[num_cols])

In [63]:
train_cleaned_transformed.shape

(1022, 40)

In [9]:
test_cleaned = EDA_df(test_set)

In [17]:
pred_cleaned = EDA_df(pred)

In [18]:
pred_cleaned.shape

(1459, 39)

In [19]:
pred_cleaned.head()

Unnamed: 0,Id,LotFrontage,LotArea,Alley,LotShape,LotConfig,Neighborhood,Condition1,BldgType,OverallQual,...,GarageCond,PavedDrive,WoodDeckSF,Fence,HouseAge,RecentRemodel,TotalPorchSF,TotalBathAbGr,TotalFinSF,BsmtBath
0,1461,80.0,11622,0,1,1,0,1,0,2,...,2,1,140,1,49,49,120,1.0,1508.0,0.0
1,1462,81.0,14267,0,0,2,0,2,0,2,...,2,1,393,0,52,52,36,1.5,2252.0,0.0
2,1463,74.0,13830,0,0,1,0,2,0,2,...,2,1,212,1,13,12,34,2.5,2420.0,0.0
3,1464,78.0,9978,0,0,1,0,2,0,2,...,2,1,360,0,12,12,36,2.5,2206.0,0.0
4,1465,43.0,5005,0,0,1,0,2,1,3,...,2,1,0,0,18,18,226,2.0,1543.0,0.0


In [13]:
train_cleaned.to_csv('./data/train_cleaned.csv', index=False)
test_cleaned.to_csv('./data/test_cleaned.csv', index=False)

In [29]:
corr_matrix = train_cleaned.corr()
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.733066
GrLivArea        0.689238
TotalFinSF       0.678082
ExterQual        0.658612
KitchenQual      0.655688
GarageArea       0.621937
BsmtQual         0.619358
TotalBsmtSF      0.590017
TotalBathAbGr    0.584861
FireplaceQu      0.524848
TotRmsAbvGrd     0.519634
Foundation       0.481575
Fireplaces       0.476331
MasVnrArea       0.453250
HeatingQC        0.440328
BsmtExposure     0.355017
MasVnrType       0.345093
WoodDeckSF       0.331151
GarageQual       0.281610
CentralAir       0.269116
GarageType       0.268161
GarageCond       0.266425
LotArea          0.262896
BsmtCond         0.225136
PavedDrive       0.223190
LotFrontage      0.205541
BsmtBath         0.192954
TotalPorchSF     0.172693
Condition1       0.117960
Id              -0.034812
LotConfig       -0.089459
Alley           -0.134945
Fence           -0.156784
BldgType        -0.167336
RoofStyle       -0.224277
LotShape        -0.250549
Neighborhood    -0.288874
HouseAge    

# Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

In [None]:
StandardScaler().fit_transform(x) 

# Transformation Pipelines

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer