# Housing Regression

## Data

In [64]:
import numpy as np
import pandas as pd

# for project imports
import sys
from pathlib import Path

# The parent of the current working directory is treated as the project root
# and appended to the import path so that `src.*` modules can be imported.
root = Path.cwd().parents[0]
sys.path.append(f"{root}")

### Read Files

In [65]:
from src.data_utils import download_kaggle_competition

# Hand the project helper the competition slug and <root>/data/raw as the
# destination, then show the list of paths it returns.
raw_data_dir = root / "data" / "raw"
files = download_kaggle_competition("home-data-for-ml-course", str(raw_data_dir))
print(files)

[PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/test.csv.gz'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/sample_submission.csv.gz'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/test.csv'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/data_description.txt'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/train.csv'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/train.csv.gz'), PosixPath('/Users/boris.tsarev/MyProjects/ai-explore/data/raw/home-data-for-ml-course/sample_submission.csv')]


In [66]:
# CSV locations, relative to the notebook's working directory
train_data_file = "../data/raw/home-data-for-ml-course/train.csv"
test_data_file = "../data/raw/home-data-for-ml-course/test.csv"

# Load both files in one pass: training rows first, then test rows
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file)
)

# Preview the first training rows
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Get Train, Validation and Test Data

In [67]:
from sklearn.model_selection import train_test_split

# Rows without a SalePrice have no target to learn from, so discard them
train_data = train_data.dropna(subset=["SalePrice"], axis=0)

# Target vector, and feature matrix without the Id and target columns
y_full = train_data["SalePrice"]
X_full = train_data.drop(columns=["Id", "SalePrice"])

# 80/20 hold-out split with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full,
    y_full,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Test features: same columns, in the same order, as the training features
X_test = test_data[X_train.columns]

# Group the feature names by the dtype each column was loaded with
column_dtypes = X_train.dtypes
initial_numeric_cols = [name for name, kind in column_dtypes.items() if kind in ['int64', 'float64']]
initial_categorical_cols = [name for name, kind in column_dtypes.items() if kind == 'object']

In [68]:
# Preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1066,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,5,2009,WD,Normal
638,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,5,2008,WD,Normal
799,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,MnPrv,,0,6,2007,WD,Normal
380,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,5,2010,WD,Normal


### Remove Columns with High Missing Ratio

In [69]:
# Dimensions of the training features as (num_rows, num_columns)
print(X_train.shape)

# Count missing entries per column, then report only the columns that have any
missing_val_count_by_column = X_train.isna().sum()
has_missing = missing_val_count_by_column.gt(0)
print(missing_val_count_by_column.loc[has_missing])

(1168, 79)
LotFrontage      217
Alley           1094
MasVnrType       683
MasVnrArea         6
BsmtQual          28
BsmtCond          28
BsmtExposure      28
BsmtFinType1      28
BsmtFinType2      28
Electrical         1
FireplaceQu      547
GarageType        64
GarageYrBlt       64
GarageFinish      64
GarageQual        64
GarageCond        64
PoolQC          1162
Fence            935
MiscFeature     1122
dtype: int64


In [70]:
from src.transformers import HighMissingDropper

# Fit a throwaway dropper on the training features and read back the
# column names it recorded in `columns_to_drop_`.
temp_dropper = HighMissingDropper(threshold=0.4)
temp_dropper.fit(X_train)
columns_to_drop = temp_dropper.columns_to_drop_

# Keep only the columns that are not listed for removal
numeric_cols = list(filter(lambda name: name not in columns_to_drop, initial_numeric_cols))
categorical_cols = list(filter(lambda name: name not in columns_to_drop, initial_categorical_cols))

## Model

### Prepare Pipeline

In [71]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor

# Count distinct values once per remaining categorical column, then partition
# the columns at a cardinality of 5.
category_counts = {col: X_train[col].nunique() for col in categorical_cols}
low_cardinality_cols = [col for col, count in category_counts.items() if count <= 5]
high_cardinality_cols = [col for col, count in category_counts.items() if count > 5]

# Low cardinality categorical: fill gaps with the most frequent value, then one-hot encode.
# NOTE(review): with drop='first' and handle_unknown='ignore', an unseen category
# appears to encode to all zeros, the same as the dropped first category. Confirm
# this is intended.
low_cardinality_steps = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
])

# High cardinality categorical: same imputation, then integer codes (-1 for unseen values)
high_cardinality_steps = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each column group to its transformer; columns not listed are discarded
column_preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns: impute with the column mean
        ('numeric', SimpleImputer(strategy='mean'), numeric_cols),
        ('low_cardinality', low_cardinality_steps, low_cardinality_cols),
        ('high_cardinality', high_cardinality_steps, high_cardinality_cols),
    ],
    remainder='drop',
)

# Full pipeline: drop sparse columns -> impute/encode -> random forest
complete_pipeline = Pipeline(steps=[
    ('drop_missing', HighMissingDropper(threshold=0.4)),
    ('preprocess', column_preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0)),
])

### Fit and Predict

In [61]:
# Fit on the training split, then predict prices for the held-out validation rows
y_pred = complete_pipeline.fit(X_train, y_train).predict(X_valid)



### Measure Performance

In [62]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Hold-out error: square root of the mean squared error on the validation split
holdout_mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(holdout_mse)
print(f"RMSE: {rmse:.0f}")

# Compare the first five predictions with the true prices
print(f"Predictions: {y_pred[:5].round(0)}")
print(f"Actual:      {y_valid.values[:5].round(0)}")

# 10-fold cross-validation over all labelled rows. The scorer returns negated
# MSE, so flip the sign before taking the root.
fold_scores = cross_val_score(
    complete_pipeline,
    X_full,
    y_full,
    scoring="neg_mean_squared_error",
    cv=10,
)
cross_rmse = np.sqrt(-fold_scores)
pd.Series(cross_rmse).describe()

RMSE: 28252
Predictions: [141540. 314188. 116238. 157257. 324325.]
Actual:      [154500 325000 115000 159000 315500]




count       10.000000
mean     28792.131156
std       6579.568953
min      21898.062934
25%      24976.811956
50%      26536.695540
75%      31390.298978
max      40556.476467
dtype: float64

## Save Predictions

In [63]:
# Read test data
test_data_file = "../data/raw/home-data-for-ml-course/test.csv"
test_data = pd.read_csv(test_data_file)

# Rebuild the feature frame from the rows just read, so the predictions are
# row-aligned with the Ids below and do not rely on an X_test left over from
# an earlier cell.
X_test = test_data[X_train.columns]
y_test_pred = complete_pipeline.predict(X_test)

# One row per test Id with its predicted sale price
submission = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": y_test_pred
})

# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
submission_file = Path("../data/processed/house_prices_predictions.csv")
submission_file.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_file, index=False)