# House Prices-Advanced Regression Techniques (1st attempt)

![](photo1.png)

**Goal: predict the sales price for each house**

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold
from scipy.stats import norm, skew
from scipy import stats
from scipy.special import boxcox1p


In [2]:
data_name = 'test'
data = pd.read_csv('data/' + data_name + '.csv')

In [3]:
if data_name == 'test':
    data['SalePrice'] = 1

In [4]:
data.shape

(1459, 81)

## Features engineering

In [5]:
data.columns
# data.index

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
data["SalePrice"] = np.log1p(data["SalePrice"])

### Relationships between variables

Before we analyze the relationships between SalePrice and variables, let's take a look at our variables.
Is there a missing value?

**Delete Missing Data**

In [7]:
# total = data.isnull().sum().sort_values(ascending=False)
# percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
# missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


- **Lot Frontage:** In this case, missing data doesn't mean that the property doesn't have a LotFrontage. Since the area of each street connected to the house property, most likely it has a similar area to other houses in its neighborhood. Therefore, we can fill in missing values by the median LotFrontage of the neighborhood

In [8]:
data["LotFrontage"] = data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# Adding total sqfootage feature 
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

In [9]:
# cols = list((missing_data[missing_data['Total'] <= 0.9]).index)
# data = data[cols].copy()
# print(data.isnull().sum().max())

In [10]:
cols = ['RoofMatl', 'Exterior1st', 'RoofStyle', 'ExterQual',
        'Exterior2nd', 'YearBuilt', 'ExterCond', 'Foundation',
        'YearRemodAdd', 'SalePrice', 'OverallCond', 'OverallQual',
        'HouseStyle', 'BldgType', 'Condition2', 'Condition1', 
        'Neighborhood', 'LandSlope', 'LotConfig', 'Utilities', 
        'LandContour', 'LotShape', 'Street', 'LotArea', 'MSZoning', 
        'BsmtFinSF1', 'HeatingQC', 'BsmtFinSF2', 'EnclosedPorch', 
        'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive', 'WoodDeckSF', 
        'OpenPorchSF', '3SsnPorch', 'BsmtUnfSF', 'ScreenPorch', 'PoolArea', 
        'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'Functional', 'TotRmsAbvGrd',
        'KitchenQual', 'KitchenAbvGr', 'BedroomAbvGr', 'HalfBath', 'FullBath', 
        'BsmtHalfBath', 'BsmtFullBath', 'GrLivArea', 'LowQualFinSF', '2ndFlrSF', 
        '1stFlrSF', 'CentralAir', 'SaleCondition', 'Heating', 'TotalBsmtSF', 'MSSubClass']
data = data[cols].copy()


In [11]:
# # dealing with missing data
# train = train.drop((missing_data[missing_data['Percent'] > 0.9]).index,axis = 1)
# # train.loc[:,(missing_data[missing_data['Total'] <= 0.9]).index]
# train.isnull().sum().max() #Checking that there's no missing data missing

### Data Cleaning

**Outliers**

In [12]:
#Deleting outliers
condition1 = data['GrLivArea']>4000
condition2 = data['SalePrice']<200000

data = data.drop(data[condition1 & condition2].index)


In [13]:
#Deleting outliers
# 위 처럼 읽기 좋게 코드를 작성하면 좋음.

data = data.drop(data[(data['TotalBsmtSF'] > 4000) & (data['TotalBsmtSF']<300000)].index)


### Skewness

In [14]:
numeric_feats = data.dtypes[data.dtypes != "object"].index

In [15]:
numeric_feats

Index(['YearBuilt', 'YearRemodAdd', 'SalePrice', 'OverallCond', 'OverallQual',
       'LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'EnclosedPorch', 'Fireplaces',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', '3SsnPorch',
       'BsmtUnfSF', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
       'TotRmsAbvGrd', 'KitchenAbvGr', 'BedroomAbvGr', 'HalfBath', 'FullBath',
       'BsmtHalfBath', 'BsmtFullBath', 'GrLivArea', 'LowQualFinSF', '2ndFlrSF',
       '1stFlrSF', 'TotalBsmtSF', 'MSSubClass'],
      dtype='object')

In [16]:
# Check the skew of all numerical features
skewed_feats = data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})

In [17]:
skewness = skewness[abs(skewness) > 0.75]
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    data[feat] = boxcox1p(data[feat], lam)

In [18]:
data = pd.get_dummies(data)
print(data.shape)

(1458, 202)


In [19]:
data.to_csv('data/mart_for_' + data_name + '.csv', index = False)

**Reference**

- [Stacked Regressions : Top 4% on LeaderBoard](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard) by Serigne