In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import scipy.stats as stats


## Load Data

In [2]:
# Read train_clean.csv into a DataFrame, using its 'Id' column as the row index.
X = pd.read_csv("train_clean.csv", index_col="Id")

# Preview the first rows to sanity-check the load.
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,log_SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,NoAlley,Reg,Lvl,AllPub,Inside,...,NoPool,NoFence,,0,2,2008,WD,Normal,208500,12.247694
2,20,RL,80.0,9600,Pave,NoAlley,Reg,Lvl,AllPub,FR2,...,NoPool,NoFence,,0,5,2007,WD,Normal,181500,12.109011
3,60,RL,68.0,11250,Pave,NoAlley,IR1,Lvl,AllPub,Inside,...,NoPool,NoFence,,0,9,2008,WD,Normal,223500,12.317167
4,70,RL,60.0,9550,Pave,NoAlley,IR1,Lvl,AllPub,Corner,...,NoPool,NoFence,,0,2,2006,WD,Abnorml,140000,11.849398
5,60,RL,84.0,14260,Pave,NoAlley,IR1,Lvl,AllPub,FR2,...,NoPool,NoFence,,0,12,2008,WD,Normal,250000,12.429216


In [3]:
# Modeling target: the log-transformed sale price (the 'log_SalePrice' column
# loaded with the data), matching the target named in the Modeling section.
# Errors computed against y are therefore on the log scale, not in dollars.
y = X['log_SalePrice']

## Feature Engineering



### Creating New Variables

totalSqFeet = TotalBsmtSF + 1stFlrSF + 2ndFlrSF

totalBathroom = FullBath + BsmtFullBath + 0.5 × (HalfBath + BsmtHalfBath)


In [4]:
# Total square footage: basement plus first and second floors.
X["totalSqFeet"] = X["TotalBsmtSF"].add(X["1stFlrSF"]).add(X["2ndFlrSF"])

# Total bathrooms: full baths count as 1, half baths as 0.5
# (above-ground and basement alike).
X["totalBathroom"] = (
    X["FullBath"]
    .add(X["BsmtFullBath"])
    .add(X["HalfBath"].add(X["BsmtHalfBath"]).mul(0.5))
)


#### Label Encoding 

Ordinal variables are label-encoded as integers.



In [5]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer codes (everything else categorical is
# left for one-hot encoding).
label_encoding_cols = [
    "Alley",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "BsmtQual",
    "ExterCond",
    "ExterQual",
    "FireplaceQu",
    "Functional",
    "GarageCond",
    "GarageQual",
    "HeatingQC",
    "KitchenQual",
    "LandSlope",
    "LotShape",
    "PavedDrive",
    "PoolQC",
    "Street",
    "Utilities",
]

# NOTE(review): LabelEncoder numbers the classes in sorted order, which may
# not match the quality ranking these ordinal columns imply — confirm that the
# resulting codes are acceptable, or switch to an explicit mapping.
label_encoder = LabelEncoder()

# The single encoder is refit on each column in turn, so once the loop ends it
# only holds the classes of the last column processed.
for col in label_encoding_cols:
    X[col] = label_encoder.fit_transform(X[col])


#### One-hot Encoding

The remaining nominal variables are one-hot encoded into dummy variables, dropping the first level of each.

In [6]:
# One-hot encode the categorical columns that are still non-numeric.
# drop_first=True omits the first level of each variable, so k categories
# become k-1 dummy columns.
X = pd.get_dummies(data=X, drop_first=True)

#### Outliers

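Models like Lasso and Elastic Net are very sensitive to outliers. We use [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html) to center and scale the numerical data using the median and interquartile range, which are far less affected by extreme values than the mean and standard deviation.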

In [7]:
from sklearn.preprocessing import RobustScaler

# The sale-price columns are targets, not predictors: keep them in their
# original units and keep them out of the fitted scaler, so the same scaler
# can later be applied to data that has no price columns.
target_cols = ['SalePrice', 'log_SalePrice']

# After one-hot encoding no object columns should remain, so this dtype filter
# matches essentially every column; the targets must be removed explicitly.
numerical_cols = [
    name
    for name in X.select_dtypes(exclude=['object']).columns
    if name not in target_cols
]

# RobustScaler works column by column, so dropping the targets from the fit
# does not change how any other column is scaled.
scaler = RobustScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

## Modeling

We are looking for a model that minimizes the root-mean-squared error (RMSE).

Our target variable will be `log_SalePrice`, the log-transformed sale price. Skewed targets often lead to biased predictions at certain values, and taking the log reduces that skew.

Furthermore, we will use cross-validation (CV) as a means to avoid overfitting our training data.

#### Splitting Data

#### RMSE Function

In [8]:
from sklearn.model_selection import KFold, cross_val_score
n_folds = 5

# Model matrix used by getRSME: every column of X except the sale-price
# columns, so the target cannot leak into the predictors.
# errors="ignore" keeps this cell re-runnable if those columns are already gone.
X_train = X.drop(columns=["SalePrice", "log_SalePrice"], errors="ignore")


def getRSME(model, features=None, target=None, folds=None):
    """Return the mean cross-validated RMSE of ``model``.

    (The function name keeps its original spelling so existing calls still work.)

    Parameters
    ----------
    model : estimator
        Unfitted estimator passed to ``cross_val_score``.
    features : optional
        Predictor matrix. Defaults to the notebook-level ``X_train``.
    target : optional
        Target values. Defaults to the notebook-level ``y``.
    folds : int, optional
        Number of CV folds. Defaults to the notebook-level ``n_folds``.

    Returns
    -------
    float
        Average over the folds of the per-fold root-mean-squared error.
    """
    # Resolve defaults at call time so later changes to the globals are seen.
    features = X_train if features is None else features
    target = y if target is None else target
    folds = n_folds if folds is None else folds

    # Shuffled folds with a fixed seed, so repeated calls use the same splits.
    kf = KFold(folds, shuffle=True, random_state=0)

    # The scorer returns negated MSE per fold; flip the sign before the root.
    neg_mse = cross_val_score(
        model, features, target, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.sqrt(-neg_mse)

    return rmse.mean()

### Regularized Regression

In [9]:
from sklearn.linear_model import Ridge, Lasso

#### Ridge Regression