#### Step 1. Data Preprocessing & Cleaning
✔ Load the dataset\
✔ Identify and handle missing values (true missing vs. "NA" meaning absence)\
✔ Convert incorrect data types (e.g., numeric features stored as objects)\
✔ Handle outliers and anomalies

In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA


In [123]:
# Load the train/test CSVs from the local "home-data-for-ml-course" directory.
# Forward slashes are used because they resolve on Windows, macOS and Linux alike;
# backslash-separated relative paths only resolve on Windows.
train = pd.read_csv("home-data-for-ml-course/train.csv")
test = pd.read_csv("home-data-for-ml-course/test.csv")

# Strip leading/trailing whitespace from the header names so later lookups by
# column name match exactly.
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

### Categories

In [124]:
# Keep the identifiers and the target aside before the "Id" column is removed.
train_ID, test_ID = train["Id"], test["Id"]
y = train.SalePrice

# Remove the "Id" column from both frames (in place).
for frame in (train, test):
    frame.drop("Id", axis=1, inplace=True)

# Columns that must be numeric; any value that cannot be parsed becomes NaN.
coerce_to_numeric = ["LotFrontage", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
                     "TotalBsmtSF", "BsmtFullBath", "GarageCars", "BsmtHalfBath", "GarageYrBlt"]
test[coerce_to_numeric] = test[coerce_to_numeric].apply(pd.to_numeric, errors="coerce")
train[coerce_to_numeric] = train[coerce_to_numeric].apply(pd.to_numeric, errors="coerce")

# --- Group features for preprocessing purposes -------------------------------
# Categorical = every column currently stored with the "object" dtype.
train_categorical = [name for name, dtype in train.dtypes.items() if dtype == "object"]
test_categorical = [name for name, dtype in test.dtypes.items() if dtype == "object"]

# Nominal (unordered) categorical features; train and test get independent copies.
nominal_features = (
    "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
    "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating",
    "CentralAir", "Electrical", "GarageType", "MiscFeature", "SaleType", "SaleCondition",
)
train_nominal = list(nominal_features)
test_nominal = list(nominal_features)

# Ordinal (ranked) categorical features.
ordinal_features = (
    "LotShape", "Utilities", "LandSlope", "ExterQual", "ExterCond", "BsmtQual",
    "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC",
    "KitchenQual", "Functional", "FireplaceQu", "GarageFinish", "GarageQual",
    "GarageCond", "PavedDrive", "PoolQC", "Fence",
)
train_ordinal = list(ordinal_features)
test_ordinal = list(ordinal_features)

# Numerical = everything that is not categorical (and, for train, not the target).
excluded_from_train = set(train_categorical) | {"SalePrice"}
excluded_from_test = set(test_categorical)
train_numerical = [name for name in train.columns if name not in excluded_from_train]
test_numerical = [name for name in test.columns if name not in excluded_from_test]

# Numerical features that take a small set of integer-like values.
discrete_features = (
    "OverallQual", "OverallCond", "BsmtFullBath", "BsmtHalfBath", "FullBath",
    "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageCars", "MoSold", "MSSubClass",
)
discrete_numerical_train = list(discrete_features)
discrete_numerical_test = list(discrete_features)

# Remaining numerical features (areas, years, monetary values).
continuous_features = (
    "LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd", "MasVnrArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "LowQualFinSF", "GrLivArea", "GarageYrBlt", "GarageArea", "WoodDeckSF",
    "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea",
    "MiscVal", "YrSold",
)
continuous_numerical_train = list(continuous_features)
continuous_numerical_test = list(continuous_features)

### KNN Imputer

In [125]:
# from sklearn.impute import KNNImputer

# # Initialize KNN Imputer (default K=5)
# knn_imputer = KNNImputer(n_neighbors=5)

# # Apply to numerical features
# train[train_numerical] = knn_imputer.fit_transform(train[train_numerical])
# test[test_numerical] = knn_imputer.transform(test[test_numerical])

# print("✅ KNN Imputation applied successfully!")

### ML Imputer

In [126]:
# The `enable_iterative_imputer` import must come first: it switches on the
# experimental IterativeImputer so the second import succeeds.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Model-based imputation: missing values are predicted from the other numerical features.
iter_imputer = IterativeImputer(random_state=42, max_iter=10)

# Fit on the training features only, then reuse the fitted imputer unchanged for test.
train_imputed = iter_imputer.fit_transform(train[train_numerical])
test_imputed = iter_imputer.transform(test[test_numerical])

train[train_numerical] = train_imputed
test[test_numerical] = test_imputed

print("✅ Iterative Imputation applied successfully!")


✅ Iterative Imputation applied successfully!


### Constant Value Imputation

In [127]:
# # Fill Numerical Features with 0
# train[train_numerical] = train[train_numerical].fillna(0)
# test[test_numerical] = test[test_numerical].fillna(0)

# # Fill Categorical Features with "Do_not_have_this_feature"
# train[train_categorical] = train[train_categorical].fillna("Do_not_have_this_feature")
# test[test_categorical] = test[test_categorical].fillna("Do_not_have_this_feature")

# print("✅ Constant Value Imputation applied successfully!")


### Step 2. Exploratory Data Analysis (EDA)
✔ Visualize target variable (SalePrice)\
Identify correlations between features and SalePrice\
Analyze categorical vs. numerical features\
Detect and handle outliers

### Outliers

In [128]:
# Remove outliers based on observations on scatter plots against SalePrice.
# A row is discarded when ANY of the conditions below holds. All conditions are
# evaluated on the current `train` frame, so every mask shares its index.
outlier_rows = (
    (train["LotFrontage"] > 200)
    | (train["LotArea"] > 100000)
    | (train["BsmtFinSF1"] > 4000)
    | (train["TotalBsmtSF"] > 6000)
    | (train["1stFlrSF"] > 4000)
    # Very large living area sold at a low price. SalePrice is read from `train`
    # itself rather than from the earlier `y` snapshot, so the comparison does
    # not depend on index alignment between two differently sized Series.
    | ((train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000))
    | (train["LowQualFinSF"] > 550)
)

# Drop by label so the surviving rows keep their original index values.
train = train.drop(index=train.index[outlier_rows])

#### Numerical Features **MI**
 **No MI Features** : *3SsnPorch MoSold* \
 **Very Low MI Features** : *MiscVal PoolArea* \
 **Very High MI Features** : *OverallQual GrLivArea TotalBsmtSF GarageCars GarageArea YearBuilt 1stFlrSF MSSubClass FullBath GarageYrBlt*

In [129]:
# y = train.SalePrice
# train_numerical = train.select_dtypes(include=["int64", "float64"]).columns.drop("SalePrice", errors="ignore")
# mutual_df = train[train_numerical]
# mutual_infoN = mutual_info_regression(mutual_df.fillna(0), y, random_state=1)
# mutual_infoN = pd.Series(mutual_infoN)
# mutual_infoN.index = mutual_df.columns

#### Categorical Features **MI**
 **No MI Features** : *RoofMatl Utilities Street PoolQC* \
 **Very Low MI** : *Functional LandSlope Condition2 MiscFeature* \
 **Very High MI Features** : *Neighborhood ExterQual BsmtQual KitchenQual GarageFinish*

In [130]:
# mutual_df_categorical = train[train_categorical].copy()
# #categorical features must be encoded to get mutual information
# for colname in mutual_df_categorical:
#     mutual_df_categorical[colname], _ = mutual_df_categorical[colname].factorize()
# mutual_infoC = mutual_info_regression(mutual_df_categorical.fillna("Do_not_have_feature"), y, random_state=1)

# mutual_infoC = pd.Series(mutual_infoC)
# mutual_infoC.index = mutual_df_categorical.columns

### Dropping Corr and MI features

In [131]:
# # define the MI threshold
# mi_thresholdN = 0.01  # drop numerical features whose MI is below 0.01

# # identify low MI features
# low_mi_featuresN = mutual_infoN[mutual_infoN < mi_thresholdN].index.tolist()

# # drop them from the dataset
# train.drop(columns=low_mi_featuresN, inplace=True)
# test.drop(columns=low_mi_featuresN, inplace=True)



# print(f"Dropped {len(low_mi_featuresN)} numerical features with MI < {mi_thresholdN:.4f}")
# print(f"Remaining TRAIN features: {train.shape[1]}")

In [132]:
# print(f"Dropped {len(low_mi_featuresN)} numerical features with MI < {mi_thresholdN:.4f}")
# print(f"Remaining TEST features: {test.shape[1]}")

In [133]:
# #define threshold
# mi_thresholdC = 0.01

# # identify low MI features
# low_mi_featuresC = mutual_infoC[mutual_infoC < mi_thresholdC].index.tolist()

# # drop them from the dataset
# train.drop(columns=low_mi_featuresC, inplace=True)
# test.drop(columns=low_mi_featuresC, inplace=True)

# print(f"Dropped {len(low_mi_featuresC)} categorical features with MI < {mi_thresholdC:.4f}")
# print(f"Remaining TRAIN features: {train.shape[1]}")

In [134]:
# print(f"Dropped {len(low_mi_featuresC)} categorical features with MI < {mi_thresholdC:.4f}")
# print(f"Remaining TEST features: {test.shape[1]}")

In [135]:
# features_to_drop = ["GarageArea", "TotRmsAbvGrd", "1stFlrSF", "GarageYrBlt", "HeatingQC"]
# train.drop(columns=features_to_drop, inplace=True)

# print(f"Dropped {len(features_to_drop)} highly correlated features from TRAIN: {features_to_drop}")
# train.shape

In [136]:
# test.drop(columns=features_to_drop, inplace=True)

# print(f"Dropped {len(features_to_drop)} highly correlated features from TEST: {features_to_drop}")
# test.shape

### Feature Update

In [137]:
# Rebuild the feature groups from the current state of the frames.
# Every list is derived with an order-preserving comprehension (frame column
# order). Set arithmetic is avoided because set iteration order for strings can
# change between kernel sessions, which would make the downstream one-hot
# column order non-reproducible.

# Ranked categorical features (same list for train and test; independent copies).
ordinal_features = (
    'LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
)

# --- train -------------------------------------------------------------------
train_categorical = [feature for feature in train.columns
                     if train[feature].dtype == "object"]
train_ordinal = list(ordinal_features)
# Nominal = categorical but not ordinal.
train_nominal = [feature for feature in train_categorical if feature not in train_ordinal]
# Numerical = every non-object column (this still includes SalePrice, as before).
train_numerical = [feature for feature in train.columns if feature not in train_categorical]

# --- test --------------------------------------------------------------------
test_categorical = [feature for feature in test.columns
                    if test[feature].dtype == "object"]
test_ordinal = list(ordinal_features)
test_nominal = [feature for feature in test_categorical if feature not in test_ordinal]
test_numerical = [feature for feature in test.columns if feature not in test_categorical]


### Ordinal Encoding

In [138]:
# Per-feature ordinal scales: category label -> integer rank.
# Every scale starts at 0; 'NA' (where present) is the lowest rank.

# Six-step quality scales that include 'NA'.
GarageQual_map = dict(Ex=5, Gd=4, TA=3, Fa=2, Po=1, NA=0)
GarageCond_map = dict(Ex=5, Gd=4, TA=3, Fa=2, Po=1, NA=0)
BsmtCond_map = dict(Ex=5, Gd=4, TA=3, Fa=2, Po=1, NA=0)
BsmtQual_map = dict(Ex=5, Gd=4, TA=3, Fa=2, Po=1, NA=0)
FireplaceQu_map = dict(Ex=5, Gd=4, TA=3, Fa=2, Po=1, NA=0)
PoolQC_map = dict(Ex=5, Gd=4, TA=3, Fa=2, Po=1, NA=0)

# Five-step quality scales without an 'NA' level.
KitchenQual_map = dict(Ex=4, Gd=3, TA=2, Fa=1, Po=0)
HeatingQC_map = dict(Ex=4, Gd=3, TA=2, Fa=1, Po=0)
ExterQual_map = dict(Ex=4, Gd=3, TA=2, Fa=1, Po=0)
ExterCond_map = dict(Ex=4, Gd=3, TA=2, Fa=1, Po=0)

# Basement scales.
BsmtExposure_map = dict(Gd=4, Av=3, Mn=2, No=1, NA=0)
BsmtFinType1_map = dict(GLQ=6, ALQ=5, BLQ=4, Rec=3, LwQ=2, Unf=1, NA=0)
BsmtFinType2_map = dict(GLQ=6, ALQ=5, BLQ=4, Rec=3, LwQ=2, Unf=1, NA=0)

# Garage finish and fence.
GarageFinish_map = dict(Fin=3, RFn=2, Unf=1, NA=0)
Fence_map = dict(GdPrv=4, MnPrv=3, GdWo=2, MnWw=1, NA=0)

# Lot / site characteristics.
LandSlope_map = dict(Gtl=2, Mod=1, Sev=0)
LotShape_map = dict(Reg=3, IR1=2, IR2=1, IR3=0)
PavedDrive_map = dict(Y=2, P=1, N=0)
Utilities_map = dict(AllPub=3, NoSewr=2, NoSeWa=1, ELO=0)

# Home functionality rating.
Functional_map = dict(Typ=7, Min1=6, Min2=5, Mod=4, Maj1=3, Maj2=2, Sev=1, Sal=0)

In [139]:
# garage_finish_map = {
#     'Fin': 3,   # Finished
#     'RFn': 2,   # Rough Finished
#     'Unf': 1,   # Unfinished
# }

# # Map values, missing values (NaN) remain NaN
# train['GarageFinish'] = train['GarageFinish'].map(garage_finish_map)

# # Replace NaN with the encoded value for "NA"
# train['GarageFinish'].fillna(0, inplace=True)

# # Check the results
# train['GarageFinish']


In [140]:
# Ordinal scales: for every ranked feature, category label -> integer rank.
# Rank 0 is the lowest level ('NA' = feature absent, where the scale has it).
ordinal_mapping = {
    "GarageQual" : {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    "Fence" : {'GdPrv': 4,'MnPrv': 3,'GdWo': 2, 'MnWw': 1,'NA': 0},
    "GarageFinish" : {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0},
    "KitchenQual": {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    "GarageCond" : {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    "HeatingQC" : {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    "ExterQual" : {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    "BsmtCond": {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    "LandSlope": {'Gtl': 2, 'Mod': 1, 'Sev': 0},
    "ExterCond" : {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    "BsmtExposure" : {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
    "PavedDrive":  {'Y': 2, 'P': 1, 'N': 0},
    "BsmtQual" : {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    "LotShape" : {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    "BsmtFinType2" : {'GLQ': 6,'ALQ': 5,'BLQ': 4,'Rec': 3,'LwQ': 2,'Unf': 1, 'NA': 0},
    "BsmtFinType1" : {'GLQ': 6,'ALQ': 5,'BLQ': 4,'Rec': 3,'LwQ': 2,'Unf': 1, 'NA': 0},
    "FireplaceQu" : {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
    "Utilities" : {"AllPub":3, "NoSewr":2, "NoSeWa":1,  "ELO":0},
    "Functional" : {'Typ': 7,'Min1': 6,'Min2': 5,'Mod': 4,'Maj1': 3,'Maj2': 2, 'Sev': 1 , 'Sal': 0},
    "PoolQC" : {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
}

# Names of the ordinal features, in declaration order.
ordinal_cols = list(ordinal_mapping.keys())

# The encoder numbers categories by list position, so the encoded value equals
# the declared rank only if each scale uses the contiguous ranks 0..k.
assert all(
    sorted(scale.values()) == list(range(len(scale)))
    for scale in ordinal_mapping.values()
), "every ordinal scale must use the contiguous ranks 0..k"

# Missing values in ordinal features mean "feature absent": store them as the
# explicit "NA" label before encoding.
for col in ordinal_cols:
    if col in train.columns:
        train[col] = train[col].fillna("NA")
    if col in test.columns:
        test[col] = test[col].fillna("NA")

# Pass each scale's labels lowest-rank first so the integer produced by the
# encoder is exactly the rank declared in `ordinal_mapping` (e.g. 'NA' -> 0,
# 'Ex' -> 5). Labels that are not part of a scale are encoded as -1.
encoder = OrdinalEncoder(
    categories=[
        sorted(ordinal_mapping[col], key=ordinal_mapping[col].get)
        for col in ordinal_cols
    ],
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Fit on train, then apply the same encoding to test.
train[ordinal_cols] = encoder.fit_transform(train[ordinal_cols])
test[ordinal_cols] = encoder.transform(test[ordinal_cols])

# Labels outside a scale (e.g. "NA" in a feature whose scale has no 'NA' level)
# came out as -1; send them to the lowest rank, 0. This is applied to BOTH
# frames and restricted to the ordinal columns so no other feature is altered.
train[ordinal_cols] = train[ordinal_cols].replace(-1, 0)
test[ordinal_cols] = test[ordinal_cols].replace(-1, 0)

print("✅ Ordinal encoding applied successfully!")

✅ Ordinal encoding applied successfully!


### One Hot Encoding

In [141]:
# One-hot encode the nominal features.
# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# The encoder is fitted on train's nominal columns, so test must be transformed
# with exactly the same columns in the same order. Fail loudly if the two frames
# disagree on which features are nominal.
assert set(test_nominal) == set(train_nominal), (
    "train and test disagree on the nominal feature set: "
    f"{set(train_nominal) ^ set(test_nominal)}"
)

# Fit on train data, then transform both train & test with the fit-time column list.
train_encoded = encoder.fit_transform(train[train_nominal])
test_encoded = encoder.transform(test[train_nominal])

# Convert back to DataFrames; both share the column names produced by the fitted
# encoder and keep the row index of their source frame so the join below aligns.
ohe_columns = encoder.get_feature_names_out(train_nominal)
train_encoded = pd.DataFrame(train_encoded, columns=ohe_columns, index=train.index)
test_encoded = pd.DataFrame(test_encoded, columns=ohe_columns, index=test.index)

# Drop the original nominal features and replace them with their one-hot columns.
train = train.drop(columns=train_nominal).join(train_encoded)
test = test.drop(columns=train_nominal).join(test_encoded)

print("train & Test now have the same OHE features using `OneHotEncoder()`")
print("New shape of TEST dataset:", test.shape)
print("New shape of TRAIN dataset:", train.shape)

train & Test now have the same OHE features using `OneHotEncoder()`
New shape of TEST dataset: (1459, 226)
New shape of TRAIN dataset: (1452, 227)


In [142]:
# Sanity check: collect every column that exists in test but not in train.
extra_test_features = set(test.columns).difference(train.columns)
print("🛑 Features in test but NOT in train:", extra_test_features)

🛑 Features in test but NOT in train: set()


In [143]:
# Candidate columns: everything in train stored as int64/float64 after encoding.
train_numeric_dtype_cols = train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Keep only the original numerical features:
#   * skip the ordinal-encoded columns (numeric codes, but still categorical), and
#   * skip every column whose name contains "_", which removes the one-hot columns.
train_numerical = [
    col
    for col in train_numeric_dtype_cols
    if col not in ordinal_mapping and "_" not in col
]

print("✅ Updated numerical features:\n", train_numerical)


✅ Updated numerical features:
 ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']


In [144]:
# Candidate columns: everything in test stored as int64/float64 after encoding.
test_numeric_dtype_cols = test.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Keep only the original numerical features:
#   * skip the ordinal-encoded columns (numeric codes, but still categorical), and
#   * skip every column whose name contains "_", which removes the one-hot columns.
test_numerical = [
    col
    for col in test_numeric_dtype_cols
    if col not in ordinal_mapping and "_" not in col
]

print("✅ Updated numerical features:\n", test_numerical)

✅ Updated numerical features:
 ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


### Skewness

In [145]:
# # Compute skewness for each numerical feature
# skewness = train[train_numerical].apply(lambda x: x.skew()).sort_values(ascending=False)

# # Set threshold for skewness
# skew_threshold = 0.75

# # Get highly skewed features
# highly_skewed = skewness[(skewness > skew_threshold) & (skewness.index != "SalePrice")]
# print(f"🔍 Found {len(highly_skewed)} highly skewed features:\n", highly_skewed)

# # Apply Yeo-Johnson transformation
# pt = PowerTransformer(method="yeo-johnson")
# train[highly_skewed.index] = pt.fit_transform(train[highly_skewed.index])

# print("✅ Yeo-Johnson transformation applied successfully!")


In [146]:
# # Check skewness after transformation
# new_skewness = train[highly_skewed.index].apply(lambda x: x.skew())
# print("📉 Skewness after transformation:\n", new_skewness)


### Scaling

In [147]:
# Standardize the numerical features (zero mean, unit variance).
scaler = StandardScaler()

# The target is never scaled.
numerical_features = [col for col in train_numerical if col != "SalePrice"]

# Learn the mean/std on train only ...
train[numerical_features] = scaler.fit_transform(train[numerical_features])
# ... and apply those SAME statistics to test. Refitting on test would put the
# two frames on different scales, so a model trained on train would receive
# inconsistent inputs at prediction time.
test[numerical_features] = scaler.transform(test[numerical_features])

print("standardization applied.")

standardization applied.


In [148]:
# train_categorical = [col for col in train_categorical if col in train.columns]
# test_categorical = [col for col in test_categorical if col in test.columns]

# # Strip spaces from categorical columns
# train[train_categorical] = train[train_categorical].apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# test[test_categorical] = test[test_categorical].apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# # Convert categorical columns (One-Hot Encoded) to integer type
# train[train_categorical] = train[train_categorical].fillna(0).astype(int)
# test[test_categorical] = test[test_categorical].fillna(0).astype(int)

# print("fixed trailing spaces & converted categorical features to integers successfully!")


In [149]:
# Display the (rows, columns) shape of the test frame at this point in the notebook.
test.shape

(1459, 226)

In [150]:
# Display the (rows, columns) shape of the train frame at this point in the notebook.
train.shape

(1452, 227)

### PCA 

In [151]:
# Replace any remaining NaN with 0 before the decomposition.
train, test = train.fillna(0), test.fillna(0)

# Separate the target from the features: PCA is computed on the features only.
y_train = train["SalePrice"]
X_train = train.drop(columns="SalePrice")
X_test = test.copy()

# Keep as many principal components as needed to retain 98% of the variance.
# The projection is fitted on train and reused unchanged for test.
pca = PCA(n_components=0.98)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Wrap the projected arrays in DataFrames named PCA_1 ... PCA_k, keeping the
# row index of the frame each one came from.
n_kept_components = X_train_pca.shape[1]
pca_columns = ["PCA_" + str(position) for position in range(1, n_kept_components + 1)]
X_train_pca = pd.DataFrame(X_train_pca, index=X_train.index, columns=pca_columns)
X_test_pca = pd.DataFrame(X_test_pca, index=X_test.index, columns=pca_columns)

# Re-attach the target to the projected training frame (positional assignment).
X_train_pca["SalePrice"] = y_train.to_numpy()

# # save processed datasets
# X_train_pca.to_csv("train_pca.csv", index=False)
# X_test_pca.to_csv("test_pca.csv", index=False)

# From here on, `train` / `test` refer to the PCA-projected frames.
train, test = X_train_pca, X_test_pca
print(f"✅ PCA completed! New train shape: {X_train_pca.shape}, test shape: {X_test_pca.shape}")

✅ PCA completed! New train shape: (1452, 76), test shape: (1459, 75)


### Exporting 

In [152]:
# train.insert(0, "Id", train_ID)
# test.insert(0, "Id", test_ID)
# train.columns = train.columns.str.strip()
# test.columns = test.columns.str.strip()
# # #save processed datasets
# train.to_csv("train_simp_NODROP.csv", index=False)
# test.to_csv("test_simp_NODROP.csv", index=False)

In [153]:
# Display the column labels of the final train frame.
train.columns

Index(['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4', 'PCA_5', 'PCA_6', 'PCA_7', 'PCA_8',
       'PCA_9', 'PCA_10', 'PCA_11', 'PCA_12', 'PCA_13', 'PCA_14', 'PCA_15',
       'PCA_16', 'PCA_17', 'PCA_18', 'PCA_19', 'PCA_20', 'PCA_21', 'PCA_22',
       'PCA_23', 'PCA_24', 'PCA_25', 'PCA_26', 'PCA_27', 'PCA_28', 'PCA_29',
       'PCA_30', 'PCA_31', 'PCA_32', 'PCA_33', 'PCA_34', 'PCA_35', 'PCA_36',
       'PCA_37', 'PCA_38', 'PCA_39', 'PCA_40', 'PCA_41', 'PCA_42', 'PCA_43',
       'PCA_44', 'PCA_45', 'PCA_46', 'PCA_47', 'PCA_48', 'PCA_49', 'PCA_50',
       'PCA_51', 'PCA_52', 'PCA_53', 'PCA_54', 'PCA_55', 'PCA_56', 'PCA_57',
       'PCA_58', 'PCA_59', 'PCA_60', 'PCA_61', 'PCA_62', 'PCA_63', 'PCA_64',
       'PCA_65', 'PCA_66', 'PCA_67', 'PCA_68', 'PCA_69', 'PCA_70', 'PCA_71',
       'PCA_72', 'PCA_73', 'PCA_74', 'PCA_75', 'SalePrice'],
      dtype='object')

In [154]:
# Display the column labels of the final test frame.
test.columns

Index(['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4', 'PCA_5', 'PCA_6', 'PCA_7', 'PCA_8',
       'PCA_9', 'PCA_10', 'PCA_11', 'PCA_12', 'PCA_13', 'PCA_14', 'PCA_15',
       'PCA_16', 'PCA_17', 'PCA_18', 'PCA_19', 'PCA_20', 'PCA_21', 'PCA_22',
       'PCA_23', 'PCA_24', 'PCA_25', 'PCA_26', 'PCA_27', 'PCA_28', 'PCA_29',
       'PCA_30', 'PCA_31', 'PCA_32', 'PCA_33', 'PCA_34', 'PCA_35', 'PCA_36',
       'PCA_37', 'PCA_38', 'PCA_39', 'PCA_40', 'PCA_41', 'PCA_42', 'PCA_43',
       'PCA_44', 'PCA_45', 'PCA_46', 'PCA_47', 'PCA_48', 'PCA_49', 'PCA_50',
       'PCA_51', 'PCA_52', 'PCA_53', 'PCA_54', 'PCA_55', 'PCA_56', 'PCA_57',
       'PCA_58', 'PCA_59', 'PCA_60', 'PCA_61', 'PCA_62', 'PCA_63', 'PCA_64',
       'PCA_65', 'PCA_66', 'PCA_67', 'PCA_68', 'PCA_69', 'PCA_70', 'PCA_71',
       'PCA_72', 'PCA_73', 'PCA_74', 'PCA_75'],
      dtype='object')

In [155]:
# Total number of missing values left in train, summed over all columns.
train.isnull().sum().sum()

0