# Housing Prices Competition

By: Tony Zheng

## Imports/Packages

In [2]:
import pandas as pd
import sklearn as skl
import xgboost

## Reading/Pre-processing

In [3]:
data = pd.read_csv("train.csv", index_col="Id")
y = data["SalePrice"]
X = data.drop("SalePrice", axis=1)
n_train = X.shape[0]

### Filling NA

Let's see the amount of NA entries in each col

In [4]:
missing_cols = []
per_na = X.isna().sum(axis=0) / n_train * 100

for col in per_na.index:
    if per_na[col] != 0: 
        missing_cols.append(col)
        print(f"{col:<15} {X[col].dtype} {"":<15} {round(per_na[col], 1)}%")

LotFrontage     float64                 17.7%
Alley           object                 93.8%
MasVnrType      object                 59.7%
MasVnrArea      float64                 0.5%
BsmtQual        object                 2.5%
BsmtCond        object                 2.5%
BsmtExposure    object                 2.6%
BsmtFinType1    object                 2.5%
BsmtFinType2    object                 2.6%
Electrical      object                 0.1%
FireplaceQu     object                 47.3%
GarageType      object                 5.5%
GarageYrBlt     float64                 5.5%
GarageFinish    object                 5.5%
GarageQual      object                 5.5%
GarageCond      object                 5.5%
PoolQC          object                 99.5%
Fence           object                 80.8%
MiscFeature     object                 96.3%


Let's start with the float columns missing stuff.

LotFrontage and MasVnrArea both are NA presumably if the house does not have those things. In those cases, we'll just impute 0.

For GarageYrBlt, however, this is a bit tougher; we can't really encode in a float the garage's year if it doesn't exist. We'll just impute the knn for the column instead. Since there's a column already indicating if this value was missing, that's all we'll to do.


In [5]:
zero_cols = ["LotFrontage", "MasVnrArea"]
knn_cols = ["GarageYrBlt"]

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

fill_na = ColumnTransformer([
    ("Mean", KNNImputer(n_neighbors=7), knn_cols),
    ("Zero", SimpleImputer(strategy="constant", fill_value=0), zero_cols)
], remainder="passthrough", verbose_feature_names_out=False)

### Feature Engineering / Standardizing

#### Object Columns
We'll one hot where there are few enough categories, and ordinal encode when there are more.

In [7]:
ordinal_cols = []
oh_cols = []
max_unique = 15

for col in X.columns:
    values = X[col]

    if values.dtype == object:
        if len(values.unique()) > max_unique:
            ordinal_cols.append(col)
        
        else:
            oh_cols.append(col)

print(ordinal_cols)
print(oh_cols)

['Neighborhood', 'Exterior2nd']
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


#### Numerical Columns

Some of the numerical categories don't actually have any numerical meaning; we'll distinguish these categories by the number of unique values among them. Let's see the number of unique values for each column.

In [8]:
numerical_cols = []

for col in X.columns:
    values = X[col]
    if values.dtype != object:
        numerical_cols.append(col)

print(numerical_cols)

for column in numerical_cols:
    print(f"{column.ljust(10)} \t {len(X[column].unique())}")

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
MSSubClass 	 15
LotFrontage 	 111
LotArea    	 1073
OverallQual 	 10
OverallCond 	 9
YearBuilt  	 112
YearRemodAdd 	 61
MasVnrArea 	 328
BsmtFinSF1 	 637
BsmtFinSF2 	 144
BsmtUnfSF  	 780
TotalBsmtSF 	 721
1stFlrSF   	 753
2ndFlrSF   	 417
LowQualFinSF 	 24
GrLivArea  	 861
BsmtFullBath 	 4
BsmtHalfBath 	 3
FullBath   	 4
HalfBath   	 3
BedroomAbvGr 	 8
KitchenAbvGr 	 4
TotRmsAbvGrd 	 12
Fireplaces 	 4
GarageYrBlt 	 98
GarageCars 	 5
GarageArea 	 441
WoodDeckSF 	 274
OpenPorchSF 	 202
EnclosedP

By the looks of it, 10 seems to be a fairly reasonable threshold for categorical vs. numerical. Let's adjust for that...

In [9]:
oh_cols = oh_cols
standard_cols = []
thresh = 10

for col in numerical_cols:
    values = X[col]

    if len(values.unique()) > thresh: standard_cols.append(col)
    else: oh_cols.append(col)

print(standard_cols)
print(oh_cols)

['MSSubClass', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold']
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 

#### Creating the pipeline...

Since some columns overlap with each other when being transformed, we'll transform the column strings to be indices instead.

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

transform = ColumnTransformer([
    ("One Hot", OneHotEncoder(
            handle_unknown="infrequent_if_exist", sparse_output=False
        ), oh_cols),
    ("Standardize", StandardScaler(), standard_cols),
    ("Ordinal", OrdinalEncoder(
            handle_unknown="use_encoded_value", unknown_value=-1
        ), ordinal_cols)
], remainder="passthrough", verbose_feature_names_out=False)

fill_na.set_output(transform="pandas")
transform.set_output(transform="pandas")

process = Pipeline([
    ("Fill NA", fill_na),
    ("transform", transform)
])

process

In [36]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

for max_depth in range(6, 12):
    m = clone(process)
    m.append(
        ("transform", transform)
    )
    
    process.fit(X_train, y_train)
    error = mean_absolute_error(y_true = y_valid, y_pred = process.predict(X_valid))
    

17404.585348886987