In [1]:
import pandas as pd
import numpy as np

# import classes for imputation
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# import classes for modelling
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global random number generator with a fixed value so that
# NumPy-based randomness is repeatable between runs.
np.random.seed(0)

In [12]:
# Read the complete table from disk; all columns are kept at this point.

data = pd.read_csv(filepath_or_buffer="train.csv")

# Preview the first three rows.
data.head(n=3)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [13]:
# Split the columns by dtype so that each group can get its own preprocessing.
#
# The split uses pandas' numeric-dtype predicate instead of comparing the dtype
# against "O": when text columns are stored with the dedicated pandas string
# dtype (the default in pandas 3) they no longer report an object dtype, so an
# `== "O"` test would leave the categorical list empty and silently send the
# text columns to the numerical list.

# categorical variables: every column that is not numeric
features_categorical = [
    c for c in data.columns if not pd.api.types.is_numeric_dtype(data[c].dtype)
]

# numerical variables: numeric columns, excluding the target SalePrice
# NOTE(review): the row identifier column Id is numeric and therefore ends up
# in this list — confirm that it is really meant to be a model input.
features_numerical = [
    c
    for c in data.columns
    if pd.api.types.is_numeric_dtype(data[c].dtype) and c != "SalePrice"
]


In [8]:
# Preview the first rows of the categorical columns.
# NOTE(review): the output saved under this cell appears to come from an
# earlier run (its execution count is lower than the loading cell's and the
# column names look like a different dataset) — re-run the cell to refresh it.

data.loc[:, features_categorical].head(n=5)

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,yes,no,no,no,yes,yes,furnished
1,yes,no,no,no,yes,no,furnished
2,yes,no,yes,no,no,yes,semi-furnished
3,yes,no,yes,no,yes,yes,furnished
4,yes,yes,yes,no,yes,no,furnished


In [14]:
# Preview the first rows of the numerical columns.
data.loc[:, features_numerical].head(n=5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


In [15]:
# split the data into a train set and a test set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),  # predictors: every column but the target
    data["SalePrice"],  # the target
    test_size=0.3,  # fraction of the observations held out for the test set
    random_state=0,  # fixed seed, for reproducibility
)

# (rows, columns) of the train and test predictors
X_train.shape, X_test.shape

((1022, 80), (438, 80))

In [16]:
# Preprocessing pipelines: one for the numerical columns and one for the
# categorical columns, combined with a ColumnTransformer.

# adapted from Scikit-learn code available here under BSD3 license:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

# numerical columns: impute missing values, then standardise
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# categorical columns: impute missing values, then one-hot encode
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# route each list of columns to its own pipeline
preprocessor = ColumnTransformer(
    [
        ("numerical", numeric_transformer, features_numerical),
        ("categorical", categorical_transformer, features_categorical),
    ]
)

# Note: the imputation strategies passed above are only initial values needed
# to instantiate the imputers; the grid search below overrides them.

In [17]:
# Chain the preprocessing with the Lasso regressor.
# The result is a single, full prediction pipeline.

pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", Lasso(max_iter=2000)),
    ]
)

In [18]:
# now we create the grid with all the parameters that we would like to test;
# each key addresses a parameter of a pipeline step, with the step names
# joined by double underscores

param_grid = {
    "preprocessor__numerical__imputer__strategy": ["mean", "median"],
    "preprocessor__categorical__imputer__strategy": ["most_frequent", "constant"],
    "regressor__alpha": [10, 100, 200],
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring="r2")

# cv=5 is the number of cross-validation folds
# n_jobs=-1 indicates to use all available cpus
# scoring='r2' indicates to evaluate using the r squared

# for more details on the grid search parameters visit:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html


In [19]:
# fit the grid search: every combination of the parameters above is evaluated
grid_search.fit(X_train, y_train)

# report the score of the fitted search on the train set, to three decimals
print(
    "best linear regression from grid search: %.3f"
    % grid_search.score(X_train, y_train)
)

best linear regression from grid search: 0.933


In [20]:
# we can display the best estimator found by the search like this
grid_search.best_estimator_

In [21]:
# and find the parameter combination selected by the search like this
grid_search.best_params_

{'preprocessor__categorical__imputer__strategy': 'constant',
 'preprocessor__numerical__imputer__strategy': 'mean',
 'regressor__alpha': 100}

In [22]:
# here we can see all the parameter combinations evaluated during the grid search
grid_search.cv_results_["params"]


[{'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 10},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 100},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 200},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 10},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 100},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 200},
 {'preprocessor__categorical__imputer__strategy': 'constant',
  'preprocessor__numerical__

In [23]:
# and here the mean cross-validation score for each one of the above combinations
grid_search.cv_results_["mean_test_score"]


array([0.84705347, 0.86572577, 0.86538734, 0.84700725, 0.86569462,
       0.86535737, 0.84857915, 0.86673792, 0.86536049, 0.84852923,
       0.86670218, 0.86530628])

In [24]:
# finally, report the score of the fitted search on the held-out test set
print(
    "best linear regression from grid search: %.3f"
    % grid_search.score(X_test, y_test)
)

best linear regression from grid search: 0.738
