In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

train_raw = pd.read_csv("data/train_raw.csv").drop("Id", axis=1)
test_raw = pd.read_csv("data/test_raw.csv").drop("Id", axis=1)

# display(train_raw)
print(train_raw.info())
# print(train.head())
# print(train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

**Alley**  
(92 non null)  
Add new category "Missing" where Alley is not available. Add boolean feature with True where Alley is available and False where None, so that the model learns the availability.

In [3]:
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def fill_missing_values(df, non_complete_features):
    filtered_df = df
    for feature in non_complete_features:
        if is_numeric_dtype(df[feature]):
            filtered_df[feature] = filtered_df[feature].fillna(0)
        else:
            filtered_df[feature] = filtered_df[feature].fillna("missing")
        filtered_df[f"{feature}Available"] = filtered_df[feature].notna().astype(int)
    filtered_df = filtered_df.reset_index()
    return filtered_df
    
def onehotencode_categorical_features(train, test):
    train_y = train["SalePrice"]
    train_X = train.drop("SalePrice", axis=1)
    
    categorical_features = train_X.select_dtypes(include=['object']).columns
    numerical_features = train_X.select_dtypes(include=['int64', 'float64']).columns
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'  # Keeps numerical columns as-is
    )

    # Fit the preprocessor on the training data
    preprocessor.fit(train_X)

    # Transform both training and test data
    encoded_train = preprocessor.transform(train_X)
    encoded_test = preprocessor.transform(test)

    # Get feature names for the transformed data
    encoded_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

    # Convert back to DataFrame for easier interpretation
    final_columns = list(encoded_feature_names).extend(numerical_features)
    encoded_train_df = pd.DataFrame(encoded_train, columns=final_columns)
    encoded_test_df = pd.DataFrame(encoded_test, columns=final_columns)
    
    encoded_train_df["SalePrice"] = train_y

    return encoded_train_df, encoded_test_df

One hot encode the categorical features

In [4]:
non_complete_features = ["Electrical", "LotFrontage", "Alley", "MasVnrType", "MasVnrArea", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]

train = fill_missing_values(train_raw, non_complete_features)
test = fill_missing_values(test_raw, non_complete_features)

train, test = onehotencode_categorical_features(train, test)

print(train.shape, test.shape)

train.to_csv('data/train.csv', index=False)
test.to_csv('data/test.csv', index=False)


(1460, 324) (1459, 323)
