In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os.path import join as pjoin

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, make_scorer, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

import warnings
# Silence every warning category for the whole session. Note that this also
# hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

%matplotlib inline

## Analysis of data

In [2]:
# Name of the column used as the regression target (`y`) in the cells below.
TARGET_VARIABLE = 'SalePrice'

In [3]:
# Read both CSV files from the local `data` directory.
train_path = pjoin('data', 'train.csv')
test_path = pjoin('data', 'test.csv')
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Count missing values per column and show only the columns that have any.
nan_number = df_train.isnull().sum()
nan_number.loc[nan_number.gt(0)]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
# Pearson correlation of every numeric column with the target.
# The numeric columns are selected explicitly: on pandas >= 2.0 `DataFrame.corr()`
# no longer drops string columns silently and raises ValueError on them instead.
df_train.select_dtypes(include='number').corr()[TARGET_VARIABLE]

Id              -0.021917
MSSubClass      -0.084284
LotFrontage      0.351799
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.477493
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageYrBlt      0.486362
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePr

In [6]:
# Frequency of each category in the 'Functional' column.
functional_counts = df_train['Functional'].value_counts()
functional_counts

Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Name: Functional, dtype: int64

## Feature selection and augmentation

In [7]:
# Numeric columns fed to the model (median-imputed and scaled by the pipeline).
NUMERIC_FEATURES = [
    'LotArea',
    '1stFlrSF',
    '2ndFlrSF',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'GarageCars',
    'GarageArea',
    'OpenPorchSF',
    'EnclosedPorch',
]

# Categorical columns fed to the model (one-hot encoded by the pipeline).
CATEGORICAL_FEATURES = [
    'SaleCondition',
    'GarageType',
    'Functional',
]

In [8]:
# Keep only the selected feature columns and hold out 20% of the rows.
feature_columns = NUMERIC_FEATURES + CATEGORICAL_FEATURES
X = df_train[feature_columns]
y = df_train[TARGET_VARIABLE]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

((1168, 12), (292, 12))

In [9]:
def get_model(alpha=1.0):
    """
    Build an unfitted Ridge regression pipeline together with its preprocessing:
    - numeric features: median imputation, then standard scaling
    - categorical features: most-frequent imputation, then one-hot encoding
      (categories not seen during fit are ignored instead of raising)

    Parameters
    ----------
    alpha : float, default=1.0
        Regularisation strength passed to ``Ridge``. The default matches
        ``Ridge``'s own default, so ``get_model()`` builds the same regressor
        as before this parameter existed.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'preprocessor'`` (a ``ColumnTransformer`` over
        ``CATEGORICAL_FEATURES`` and ``NUMERIC_FEATURES``) and ``'regressor'``.
    """
    numeric_preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # No `sparse=` / `sparse_output=` keyword here: `sparse` was removed in
        # scikit-learn 1.4 and `sparse_output` does not exist before 1.2.
        # Dense output is enforced by `sparse_threshold=0` below instead.
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_preprocessor, CATEGORICAL_FEATURES),
            ('num', numeric_preprocessor, NUMERIC_FEATURES),
        ],
        # 0 => always stack the transformer outputs into a dense array.
        sparse_threshold=0,
    )

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=alpha)),
    ])
    return model

## Algorithm selection

In [10]:
# Candidate values for the Ridge regularisation strength explored by the grid search.
# The `regressor__` prefix routes the parameter to the pipeline step named 'regressor'.
param_grid = dict(
    regressor__alpha=[0.01, 0.1, 1.0, 5.0, 10.0],
)

In [11]:
# 10-fold cross-validated search over `param_grid`, scored by R2, using all cores.
r2_scorer = make_scorer(r2_score)
model = get_model()
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring=r2_scorer,
    cv=10,
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         ['SaleCondition',
                                                                          'GarageType',
                                                                          'Functiona

In [12]:
# Mean cross-validated R2 of the best parameter combination.
best_cv_r2 = grid_search.best_score_
print(f'Best cross-validation R2-score: {best_cv_r2}')

Best cross-validation R2-score: 0.7057910563477536


In [13]:
# Retrain on the full training split with the hyper-parameters selected by the
# grid search. get_model() on its own builds Ridge with its default alpha, so
# the tuned values have to be applied explicitly before fitting.
model = get_model()
model.set_params(**grid_search.best_params_)
model.fit(X_train, y_train);

## Results evaluation

In [14]:
def evaluate(model, X, y, dataset_name='train'):
    """
    Print regression metrics for `model` on the given data.

    Predicts on `X`, compares against `y`, and prints MSE, MAE, RMSE and the
    R2-score, preceded by a header line containing `dataset_name`.
    Nothing is returned.
    """
    print(f'Evaluating on {dataset_name} dataset:')
    predictions = model.predict(X)

    squared_error = mean_squared_error(y, predictions)
    absolute_error = mean_absolute_error(y, predictions)
    determination = r2_score(y, predictions)
    # RMSE is derived from the MSE rather than computed separately.
    root_squared_error = np.sqrt(squared_error)

    print(f'MSE: {squared_error}')
    print(f'MAE: {absolute_error}')
    print(f'RMSE: {root_squared_error}')
    print(f'R2-score: {determination}')

In [15]:
# In-sample metrics: scored on the same split the model was fitted on.
evaluate(model, X_train, y_train, dataset_name='train')

Evaluating on train dataset:
MSE: 1568187112.606322
MAE: 24753.186819684524
RMSE: 39600.34232940824
R2-score: 0.7370820912293707


In [16]:
# Hold-out metrics: scored on the 20% split produced by train_test_split above
# (rows from train.csv, not the separately loaded test.csv).
evaluate(model, X_test, y_test, dataset_name='test')

Evaluating on test dataset:
MSE: 1711776080.789996
MAE: 26861.752549979465
RMSE: 41373.61575678389
R2-score: 0.7768313844089114
