# Housing Regression

## Prepare the Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# for project imports
import sys
from pathlib import Path

# Parent of the working directory; added to sys.path so that project modules
# (e.g. `src.data_utils`) can be imported from this notebook.
root = Path.cwd().parent
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if str(root) not in sys.path:
    sys.path.append(str(root))

### Read File

In [2]:
from src.data_utils import download_kaggle_competition

# Ask the project helper for the competition files, targeting <root>/data/raw,
# and show whatever it returns.
raw_data_dir = root / "data" / "raw"
files = download_kaggle_competition("home-data-for-ml-course", str(raw_data_dir))
print(files)

[PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/test.csv.gz'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/sample_submission.csv.gz'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/test.csv'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/data_description.txt'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/train.csv'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/train.csv.gz'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/sample_submission.csv')]


In [3]:
# Locations of the labelled training file and the unlabelled test file.
train_data_file = "../data/raw/home-data-for-ml-course/train.csv"
test_data_file = "../data/raw/home-data-for-ml-course/test.csv"

# Load both files, training first.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file)
)

train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Get Train Data

In [24]:
from sklearn.model_selection import train_test_split

# Rows without a target value cannot be used for supervised training.
train_data = train_data.dropna(subset=["SalePrice"], axis=0)

# Target and features; the Id column is dropped together with the target.
y_full = train_data["SalePrice"]
X_full = train_data.drop(columns=["Id", "SalePrice"])

# Fixed-seed 80/20 split into training and validation parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full,
    y_full,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Give the test set the same feature columns, in the same order.
X_test = test_data[X_train.columns]

# Names of the columns stored with the generic "object" dtype.
object_cols = [name for name, dtype in X_train.dtypes.items() if dtype == "object"]
print(object_cols)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [5]:
# Preview the first rows of the training features produced by the split.
X_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
254,20,70.0,8400,5,6,1957,1957,0.0,922,0,...,294,250,0,0,0,0,0,0,6,2010
1066,60,59.0,7837,6,7,1993,1994,0.0,0,0,...,380,0,40,0,0,0,0,0,5,2009
638,30,67.0,8777,5,7,1910,1950,0.0,0,0,...,0,328,0,164,0,0,0,0,5,2008
799,50,60.0,7200,5,7,1937,1950,252.0,569,0,...,240,0,0,264,0,0,0,0,6,2007
380,50,50.0,5000,5,6,1924,1950,0.0,218,0,...,308,0,0,242,0,0,0,0,5,2010


### Clean the Data

In [15]:
# (num_rows, num_columns) of the training features
print(X_train.shape)

# Per-column count of missing entries; only columns with at least one are printed
missing_val_count_by_column = X_train.isna().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

(1168, 79)
LotFrontage      217
Alley           1094
MasVnrType       683
MasVnrArea         6
BsmtQual          28
BsmtCond          28
BsmtExposure      28
BsmtFinType1      28
BsmtFinType2      28
Electrical         1
FireplaceQu      547
GarageType        64
GarageYrBlt       64
GarageFinish      64
GarageQual        64
GarageCond        64
PoolQC          1162
Fence            935
MiscFeature     1122
dtype: int64


In [None]:
# Columns missing in more than 40% of the training rows are removed from every split.
missing_value_threshold = X_train.shape[0] * 0.4
too_sparse = missing_val_count_by_column > missing_value_threshold
columns_to_drop = missing_val_count_by_column[too_sparse].index

reduced_X_train, reduced_X_valid, reduced_X_test, reduced_X_full = (
    frame.drop(columns=columns_to_drop)
    for frame in (X_train, X_valid, X_test, X_full)
)

# Remaining gaps in the training features, to be handled by imputation.
missing_val_after_reducing = reduced_X_train.isna().sum()
print(missing_val_after_reducing[missing_val_after_reducing > 0])

LotFrontage     217
MasVnrArea        6
BsmtQual         28
BsmtCond         28
BsmtExposure     28
BsmtFinType1     28
BsmtFinType2     28
Electrical        1
GarageType       64
GarageYrBlt      64
GarageFinish     64
GarageQual       64
GarageCond       64
dtype: int64


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1168.0,951.0,1168.0,1168.0,1168.0,1168.0,1168.0,1162.0,1168.0,1168.0,...,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,56.849315,70.343849,10689.642123,6.121575,5.58476,1970.965753,1984.89726,103.771945,446.023973,45.152397,...,476.273973,95.946918,49.578767,21.839041,3.8125,15.407534,2.955479,51.267123,6.356164,2007.818493
std,42.531862,24.897021,10759.366198,1.367619,1.116062,30.675495,20.733955,173.032238,459.070977,158.217499,...,211.095373,129.685939,69.43358,62.083227,31.519664,55.881148,41.648504,553.039684,2.670707,1.322639
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7587.25,5.0,5.0,1953.0,1966.0,0.0,0.0,0.0,...,341.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,70.0,9600.0,6.0,5.0,1972.0,1994.0,0.0,384.5,0.0,...,482.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11700.0,7.0,6.0,2001.0,2004.0,166.0,721.0,0.0,...,576.0,168.0,74.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1378.0,5644.0,1127.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Separate numeric and categorical columns
numeric_cols = [col for col in reduced_X_train.columns if reduced_X_train[col].dtype in ['int64', 'float64']]
categorical_cols = [col for col in reduced_X_train.columns if reduced_X_train[col].dtype == 'object']

# Mean for numeric gaps, most frequent value for categorical gaps
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Combine them; the output columns come out as the numeric group followed by
# the categorical group, which is the order used when rebuilding DataFrames below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_imputer, numeric_cols),
        ('cat', categorical_imputer, categorical_cols)
    ])

# Imputation statistics are learned from the training split only
preprocessor.fit(reduced_X_train)


def impute_frame(frame):
    """Apply the fitted ``preprocessor`` to ``frame`` and return a DataFrame.

    Column names are restored as ``numeric_cols + categorical_cols`` and the
    row index of ``frame`` is carried over, so the result stays label-aligned
    with the corresponding target Series (rebuilding without ``index=`` would
    silently reset it to 0..n-1). Numeric columns are converted back with
    ``pd.to_numeric`` because the combined transformer output is not relied
    upon to keep numeric dtypes.
    """
    imputed = pd.DataFrame(
        preprocessor.transform(frame),
        columns=numeric_cols + categorical_cols,
        index=frame.index,
    )
    for col in numeric_cols:
        imputed[col] = pd.to_numeric(imputed[col])
    return imputed


# Transform all datasets with the same fitted preprocessor
imputed_X_train = impute_frame(reduced_X_train)
imputed_X_valid = impute_frame(reduced_X_valid)
imputed_X_test = impute_frame(reduced_X_test)
imputed_X_full = impute_frame(reduced_X_full)

Numeric columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
Categorical columns: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCon

### Encode Categorical Features

In [62]:
# Categorical (object-dtype) columns that survived imputation
imputed_categorical_cols = [
    name for name in imputed_X_train.columns if imputed_X_train[name].dtype == 'object'
]

# A column is "bad" when the validation or test split contains a category
# that never occurs in the training split; otherwise it is "good".
good_categorical_cols, bad_categorical_cols = [], []

for name in imputed_categorical_cols:
    seen_in_train = set(imputed_X_train[name].values)
    unseen_in_valid = set(imputed_X_valid[name].values) - seen_in_train
    unseen_in_test = set(imputed_X_test[name].values) - seen_in_train

    if unseen_in_valid or unseen_in_test:
        bad_categorical_cols.append(name)
        print(f"BAD: {name} - unseen values in valid: {unseen_in_valid}, test: {unseen_in_test}")
        continue

    good_categorical_cols.append(name)
    print(f"GOOD: {name} - cardinality: {len(seen_in_train)}")

# Good columns with at most this many distinct training values count as low cardinality
low_cardinality_threshold = 5
low_cardinality_cols = [
    name for name in good_categorical_cols
    if imputed_X_train[name].nunique() <= low_cardinality_threshold
]
high_cardinality_cols = [
    name for name in good_categorical_cols
    if imputed_X_train[name].nunique() > low_cardinality_threshold
]

print("\nSummary:")
print(f"Bad categorical columns (dropped): {len(bad_categorical_cols)}")
print(f"Good low cardinality columns (one-hot): {len(low_cardinality_cols)}")
print(f"Good high cardinality columns (ordinal): {len(high_cardinality_cols)}")

GOOD: MSZoning - cardinality: 5
GOOD: Street - cardinality: 2
GOOD: LotShape - cardinality: 4
GOOD: LandContour - cardinality: 4
GOOD: Utilities - cardinality: 2
GOOD: LotConfig - cardinality: 5
GOOD: LandSlope - cardinality: 3
GOOD: Neighborhood - cardinality: 25
GOOD: Condition1 - cardinality: 9
GOOD: Condition2 - cardinality: 8
GOOD: BldgType - cardinality: 5
GOOD: HouseStyle - cardinality: 8
GOOD: RoofStyle - cardinality: 6
BAD: RoofMatl - unseen values in valid: {'Membran'}, test: set()
GOOD: Exterior1st - cardinality: 15
GOOD: Exterior2nd - cardinality: 16
GOOD: ExterQual - cardinality: 4
GOOD: ExterCond - cardinality: 5
GOOD: Foundation - cardinality: 6
GOOD: BsmtQual - cardinality: 4
GOOD: BsmtCond - cardinality: 4
GOOD: BsmtExposure - cardinality: 4
GOOD: BsmtFinType1 - cardinality: 6
GOOD: BsmtFinType2 - cardinality: 6
GOOD: Heating - cardinality: 6
GOOD: HeatingQC - cardinality: 5
GOOD: CentralAir - cardinality: 2
BAD: Electrical - unseen values in valid: {'Mix'}, test: set(

In [63]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Numeric columns of the imputed training frame are passed through unchanged
final_numeric_cols = [
    name for name, dtype in imputed_X_train.dtypes.items() if dtype in ['int64', 'float64']
]

print(f"Final numeric columns: {len(final_numeric_cols)}")
print(f"Good low cardinality columns (one-hot): {len(low_cardinality_cols)}")
print(f"Good high cardinality columns (ordinal): {len(high_cardinality_cols)}")

# One-hot for low-cardinality columns, integer codes for high-cardinality ones;
# categories unknown at transform time are ignored / encoded as -1 respectively.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
ordinal_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# Any column not listed here (the "bad" categorical ones) is dropped
encoding_steps = [
    ('num', 'passthrough', final_numeric_cols),
    ('onehot', one_hot_encoder, low_cardinality_cols),
    ('ordinal', ordinal_encoder, high_cardinality_cols),
]
final_preprocessor = ColumnTransformer(transformers=encoding_steps, remainder='drop')

# Learn the encodings from the training split only
final_preprocessor.fit(imputed_X_train)

# Apply the fitted encodings to every split
final_X_train, final_X_valid, final_X_test, final_X_full = (
    final_preprocessor.transform(frame)
    for frame in (imputed_X_train, imputed_X_valid, imputed_X_test, imputed_X_full)
)

print(f"Final feature shape: {final_X_train.shape}")
print(f"Original shape: {imputed_X_train.shape}")

Final numeric columns: 36
Good low cardinality columns (one-hot): 20
Good high cardinality columns (ordinal): 15
Final feature shape: (1168, 109)
Original shape: (1168, 73)


## Fit the Model

### Training the Model

In [64]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split and predict the validation split.
model = RandomForestRegressor(random_state=0, n_estimators=100)
model.fit(final_X_train, y_train)
y_pred = model.predict(final_X_valid)

# First five predictions next to the first five true prices.
print(f"Predictions: {y_pred[:5].round(0)}")
print(f"Actual:      {y_valid.values[:5].round(0)}")

Predictions: [140517. 314281. 115604. 156545. 330182.]
Actual:      [154500 325000 115000 159000 315500]


### Measure Model Performance

In [65]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the validation predictions.
validation_mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(validation_mse)
print(f"RMSE: {rmse.round(0)}")

RMSE: 28220.0


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation; the scorer reports negated MSE, so the sign is
# flipped before taking the square root to obtain one RMSE per fold.
neg_mse_per_fold = cross_val_score(
    model,
    final_X_full,
    y_full,
    cv=10,
    scoring="neg_mean_squared_error",
)
cross_rmse = np.sqrt(-neg_mse_per_fold)
pd.Series(cross_rmse).describe()

count       10.000000
mean     28801.654689
std       6359.436503
min      21936.122896
25%      25007.680936
50%      26451.868699
75%      31155.426300
max      41204.428291
dtype: float64

## Save Predictions

In [67]:
# Read test data again so the Id column is taken straight from the file
test_data_file = "../data/raw/home-data-for-ml-course/test.csv"
test_data = pd.read_csv(test_data_file)

# Predict on the encoded test features with the model fitted above
y_test_pred = model.predict(final_X_test)

# One row per test house: its Id and the predicted sale price
submission = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": y_test_pred
})

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a fresh checkout may not have data/processed yet).
submission_file = Path("../data/processed/house_prices_predictions.csv")
submission_file.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_file, index=False)