In [1]:
import json
from datetime import datetime
from random import randint

import pandas as pd

from prince import MCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

In [2]:
# Read the column -> dtype mapping from JSON, then load the train and test
# CSVs with those dtypes applied at parse time.
with open("../raw_data/dtypes.json") as dtypes_file:
    dtypes = json.load(dtypes_file)

df_train = pd.read_csv(
    "../raw_data/exp_train.csv",
    dtype=dtypes,
)
df_test = pd.read_csv(
    "../raw_data/exp_test.csv",
    dtype=dtypes,
)
# Tip: df_train.info() gives a quick check that the dtypes were applied.

In [3]:
# Draw a random seed, append it to a log file so the run can be reproduced
# later, then generate the k-fold splits with it.
# The seed is drawn first and written afterwards, so it never has to be parsed
# back out of the log, and the context manager closes the file even if the
# write fails.
rs = randint(0, 1000)
with open("rs_log.txt", "a") as rs_log:
    rs_log.write(f"\n{datetime.now()}, rs = {rs}")

# # For a specific random seed
# rs = 393

n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=rs)
kfold_splits = kfold.split(df_train)

# Name the splits: `dfs_tt[k]` is the training part of fold k and `dfs_ho[k]`
# the matching hold-out part. Each one is an explicit copy so that later
# column assignments modify the fold only and do not raise
# SettingWithCopyWarning.
dfs_tt, dfs_ho = [], []
for train_index, test_index in kfold_splits:
    dfs_tt.append(df_train.iloc[train_index].copy())
    dfs_ho.append(df_train.iloc[test_index].copy())

In [4]:
# Impute `GarageYrBlt`: missing values become the constant 0.
# The imputer is fitted on the training part of each fold and re-used on the
# matching hold-out part.
#
# The imputed values are assigned as a flat array, i.e. by position. Wrapping
# them in a new DataFrame first would give them a fresh 0..n-1 index and a
# column labelled 0; pandas aligns on labels when setting through `.loc`, and
# the fold frames still carry the row labels of `df_train`, so the values
# would not land on the rows/column they belong to.
# `assign` returns a new frame, so the stored fold is replaced rather than
# mutated in place.
for i in range(n_splits):
    yr_impute = SimpleImputer(strategy="constant", fill_value=0)
    tt_filled = yr_impute.fit_transform(dfs_tt[i][["GarageYrBlt"]])
    ho_filled = yr_impute.transform(dfs_ho[i][["GarageYrBlt"]])
    dfs_tt[i] = dfs_tt[i].assign(GarageYrBlt=tt_filled.ravel())
    dfs_ho[i] = dfs_ho[i].assign(GarageYrBlt=ho_filled.ravel())

In [5]:
# Column groups for the numerical preprocessing pipelines.
# The order of the names matters: later cells relabel the transformed
# columns positionally using these lists.

# Columns sent through `pipe_num_med`.
num_med_cols = (
    ["LotFrontage", "LotArea", "OverallQual", "OverallCond", "YearBuilt"]
    + ["TotalBsmtSF", "1stFlrSF", "GrLivArea"]
    + ["BsmtFullBath", "FullBath", "HalfBath", "BedroomAbvGr", "TotRmsAbvGrd"]
    + ["Fireplaces", "GarageCars", "GarageArea", "YrSold"]
)

# Columns sent through `pipe_num_mode`: first the ordinals, then the
# remaining numeric columns.
num_mode_cols = (
    # (ordinals)
    ["LandSlope", "LotShape", "ExterQual", "ExterCond", "BsmtQual"]
    + ["HeatingQC", "KitchenQual", "GarageFinish", "BsmtCond", "BsmtExposure"]
    + ["BsmtFinType1", "BsmtFinType2", "Functional", "FireplaceQu"]
    + ["GarageQual", "GarageCond", "PavedDrive", "Fence", "PoolQC"]
    # (remaining numeric columns)
    + ["YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF"]
    + ["2ndFlrSF", "LowQualFinSF", "BsmtHalfBath", "KitchenAbvGr", "GarageYrBlt"]
    + ["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch"]
    + ["ScreenPorch", "PoolArea", "MiscVal"]
)

# Total number of numerical columns across both groups.
len_num_cols = sum(len(group) for group in (num_med_cols, num_mode_cols))

# Numerical pipelines. Each currently has a single imputation step and no
# scaling step.
# Alternatives that were tried and are switched off: IterativeImputer with a
# BayesianRidge estimator (initial_strategy 'median' / 'most_frequent'),
# SimpleImputer(strategy='most_frequent'), and a trailing 'range' step using
# StandardScaler or MinMaxScaler.

# Median imputation.
median_imputer = SimpleImputer(strategy="median")
pipe_num_med = Pipeline(steps=[("impute", median_imputer)])

# K-nearest-neighbours imputation with the estimator's default settings.
knn_imputer = KNNImputer()
pipe_num_mode = Pipeline(steps=[("impute", knn_imputer)])

In [6]:
# Categorical features: every object-typed column of the training data except
# the `Id` column.
cat_cols = (
    df_train.select_dtypes(include="object").drop(columns=["Id"]).columns.to_list()
)

# OHE first to get the max number of components
ohe = OneHotEncoder(handle_unknown="ignore")


def _assemble_fold(frame, encoded, encoded_names, numeric_cols):
    """Rebuild one fold frame as Id + numeric columns + one-hot columns + SalePrice.

    Parameters
    ----------
    frame : pandas.DataFrame
        Fold data holding at least `Id`, `numeric_cols` and `SalePrice`.
    encoded : sparse matrix
        One-hot encoder output for the same rows as `frame`, in the same order.
    encoded_names : list of str
        Column labels to give the one-hot columns.
    numeric_cols : list of str
        Numeric columns to carry over, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. The pieces are joined by
        position, so the original row labels are discarded on purpose.
    """
    indicators = pd.DataFrame(encoded.toarray(), columns=encoded_names)
    return pd.concat(
        [
            frame[["Id"] + numeric_cols].reset_index(drop=True),
            indicators,
            frame[["SalePrice"]].reset_index(drop=True),
        ],
        axis=1,
    )


# Per fold: fit the encoder on the training part, apply it to the hold-out
# part, and remember the (stringified positional) names of the one-hot columns.
new_cat_cols = []
for i in range(n_splits):
    encoded_tt = ohe.fit_transform(dfs_tt[i][cat_cols])
    encoded_ho = ohe.transform(dfs_ho[i][cat_cols])
    ohe_names = [str(pos) for pos in range(encoded_tt.shape[1])]
    numeric_cols = num_med_cols + num_mode_cols

    dfs_tt[i] = _assemble_fold(dfs_tt[i], encoded_tt, ohe_names, numeric_cols)
    dfs_ho[i] = _assemble_fold(dfs_ho[i], encoded_ho, ohe_names, numeric_cols)
    new_cat_cols.append(ohe_names)

# Make the cat cols pipeline
# The number of one-hot columns can differ between folds (a category may be
# absent from one training part). The component count is therefore bounded by
# the smallest fold instead of by whichever fold the loop happened to end on,
# so the same MCA configuration is valid for every fold.
max_components = min(len(names) for names in new_cat_cols) - 1
pipe_cat = MCA(n_components=max_components, one_hot=False)

In [7]:
# Run the pipelines (tried ColumnTransformer and it was problematic)
# Every pipeline is fitted on the training part of a fold and then applied to
# the matching hold-out part.

# Threshold on the cumulative share of MCA inertia used to pick the number of
# components.
exp_var = 0.8

n_comps_list = []
for i in range(n_splits):
    fold_tt, fold_ho = dfs_tt[i], dfs_ho[i]
    tt_parts, ho_parts = [fold_tt["Id"]], [fold_ho["Id"]]
    for pipe, cols in (
        (pipe_num_med, num_med_cols),
        (pipe_num_mode, num_mode_cols),
        (pipe_cat, new_cat_cols[i]),
    ):
        tt_parts.append(pd.DataFrame(pipe.fit_transform(fold_tt[cols])))
        ho_parts.append(pd.DataFrame(pipe.transform(fold_ho[cols])))
    tt_parts.append(fold_tt["SalePrice"])
    ho_parts.append(fold_ho["SalePrice"])

    dfs_tt[i] = pd.concat(tt_parts, axis=1)
    dfs_ho[i] = pd.concat(ho_parts, axis=1)

    # Determine the number of components to keep from MCA: count the leading
    # components whose cumulative share of the (retained) inertia stays at or
    # below `exp_var`.
    # NOTE(review): the component that crosses the threshold is not counted,
    # so the kept components explain at most `exp_var` - confirm this is the
    # intended reading of the threshold.
    eigenvalues = pipe_cat.eigenvalues_
    evr_list = eigenvalues / eigenvalues.sum()
    n_comp = 0
    exp_var_sum = 0
    for ratio in evr_list:
        exp_var_sum += ratio
        if exp_var_sum > exp_var:
            break
        n_comp += 1
    n_comps_list.append(n_comp)

# Cut the extraneous features in the categorical dataframe
# Standardizing number of columns across folds: keep Id, all numerical
# columns, the first `n_keep` MCA components and the trailing SalePrice.
n_keep = max(n_comps_list)
n_cols = 1 + len_num_cols + n_keep
keep_positions = list(range(n_cols)) + [-1]
final_columns = (
    ["Id"] + num_med_cols + num_mode_cols + list(range(n_keep)) + ["SalePrice"]
)
for i in range(n_splits):
    for folds in (dfs_tt, dfs_ho):
        trimmed = folds[i].iloc[:, keep_positions]
        trimmed.columns = final_columns
        folds[i] = trimmed

In [8]:
# Write each fold's training and hold-out frames to CSV in the working
# directory, without the index column.
for fold in range(n_splits):
    tt_path = f"preproc_tt_fold_{fold}.csv"
    ho_path = f"preproc_ho_fold_{fold}.csv"
    dfs_tt[fold].to_csv(tt_path, index=False)
    dfs_ho[fold].to_csv(ho_path, index=False)

In [9]:
# Print the shape for input size in the neural network
# Iterates over `n_splits` rather than a literal 5 so the report stays in step
# with the number of folds configured when the splits were created.
for i in range(n_splits):
    print(dfs_tt[i].shape)
    print(dfs_ho[i].shape)

(1168, 142)
(292, 142)
(1168, 142)
(292, 142)
(1168, 142)
(292, 142)
(1168, 142)
(292, 142)
(1168, 142)
(292, 142)
