# Housing Prices Regression Analysis

### Kaggle Competition: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview
### I used logistic regressions for my analysis with this dataset as I still need work with my more advanced machine learning techniques

## Data Handling

In [47]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
%matplotlib inline

In [48]:
X = pd.read_csv('house_train.csv')
X_test = pd.read_csv('house_test.csv')

y = X['SalePrice'].reset_index(drop=True)
train_features = X.drop(['SalePrice'], axis=1)
features = pd.concat([train_features, X_test]).reset_index(drop=True)

features.isnull().sum().sort_values(ascending = False).head(20)

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageCond       159
GarageQual       159
GarageYrBlt      159
GarageFinish     159
GarageType       157
BsmtCond          82
BsmtExposure      82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
dtype: int64

In [49]:
features.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
                  ...   
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object


In [50]:
for column in features:

    # populating with 0
    if column in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                  'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'TotalBsmtSF',
                  'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
                  'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea']:
        features[column] = features[column].fillna(0)

    # populate with 'None'
    if column in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', "PoolQC", 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                  'BsmtFinType2', 'Neighborhood', 'BldgType', 'HouseStyle', 'MasVnrType', 'FireplaceQu', 'Fence', 'MiscFeature']:
        features[column] = features[column].fillna('None')

    # populate with most frequent value for categorical data
    if column in ['Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'RoofStyle',
                  'Electrical', 'Functional', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'RoofMatl', 'ExterQual', 'ExterCond',
                  'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']:
        features[column] = features[column].fillna(features[column].mode()[0])

In [51]:
features.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [52]:
features['total_yrs'] = features['YearRemodAdd'] - features['YearBuilt']  
features['total_sqrft'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

features['total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])

features['total_bath'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                               features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))

features['total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                              features['EnclosedPorch'] + features['ScreenPorch'] +
                              features['WoodDeckSF'])

In [53]:
features['pool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features['2nd_flr'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features['garage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features['bsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features['fireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

#handling the Nulls not taken care of by the feature engineering
features['MSSubClass'] = features['MSSubClass'].apply(str)
features["MSSubClass"] = features["MSSubClass"].fillna("Unknown")

features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

features['LotArea'] = features['LotArea'].astype(np.int64)

features['Alley'] = features['Alley'].fillna('Pave')

features['MasVnrArea'] = features['MasVnrArea'].astype(np.int64)

In [54]:
features.shape
features.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,total_yrs,total_sqrft,total_sqr_footage,total_bath,total_porch_sf,pool,2nd_flr,garage,bsmt,fireplace
0,1,60,RL,65.0,8450,Pave,Pave,Reg,Lvl,AllPub,...,0,2566.0,2416.0,3.5,61,0,1,1,1,0
1,2,20,RL,80.0,9600,Pave,Pave,Reg,Lvl,AllPub,...,0,2524.0,2240.0,2.5,298,0,0,1,1,1
2,3,60,RL,68.0,11250,Pave,Pave,IR1,Lvl,AllPub,...,1,2706.0,2272.0,3.5,42,0,1,1,1,1
3,4,70,RL,60.0,9550,Pave,Pave,IR1,Lvl,AllPub,...,55,2473.0,1933.0,2.0,307,0,1,1,1,1
4,5,60,RL,84.0,14260,Pave,Pave,IR1,Lvl,AllPub,...,0,3343.0,2853.0,3.5,276,0,1,1,1,1


In [55]:
features.isnull().sum().sort_values(ascending = False).head(20)

fireplace       0
RoofMatl        0
Exterior2nd     0
MasVnrType      0
MasVnrArea      0
ExterQual       0
ExterCond       0
Foundation      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinSF1      0
BsmtFinType2    0
BsmtFinSF2      0
BsmtUnfSF       0
TotalBsmtSF     0
Heating         0
HeatingQC       0
CentralAir      0
dtype: int64

In [56]:
#one hot encoding to make it easier and faster for the model
features_2 = pd.get_dummies(features).reset_index(drop=True)

#go back to X and X_test so we have our train and test split, use length of y to help seperate the two back
X = features_2.iloc[:len(y), :]
X_test = features_2.iloc[len(X):, :]
print('Dimensions for each df')
print('X', X.shape, 'y', y.shape, 'X_test', X_test.shape)

Dimensions for each df
X (1460, 327) y (1460,) X_test (1459, 327)


In [66]:
X.dtypes

Id                         int64
LotFrontage              float64
LotArea                    int64
OverallQual                int64
OverallCond                int64
YearBuilt                  int64
YearRemodAdd               int64
MasVnrArea                 int64
BsmtFinSF1               float64
BsmtFinSF2               float64
BsmtUnfSF                float64
TotalBsmtSF              float64
1stFlrSF                   int64
2ndFlrSF                   int64
LowQualFinSF               int64
GrLivArea                  int64
BsmtFullBath             float64
BsmtHalfBath             float64
FullBath                   int64
HalfBath                   int64
BedroomAbvGr               int64
KitchenAbvGr               int64
TotRmsAbvGrd               int64
Fireplaces                 int64
GarageYrBlt              float64
GarageCars               float64
GarageArea               float64
WoodDeckSF                 int64
OpenPorchSF                int64
EnclosedPorch              int64
          

## Time for Model Building and Fitting

In [57]:
#import models
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

def rmsle(y_actual, y_pred):
    return np.sqrt(mean_squared_error(y_actual, y_pred))

In [58]:
kfolds = KFold(n_splits = 10, random_state = 42, shuffle = True)
alphas_no = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_no, cv=kfolds))

In [75]:
ridge_model = ridge.fit(X,y)
x = ridge_model.predict(X)

In [78]:
print('RMSLE score on train data:')
print(rmsle(y, x))

RMSLE score on train data:
25396.652163486902
