In [None]:
import numpy as np
import pandas as pd
import warnings
import pickle
import random

from missforest.missforest import MissForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Seed both global random number generators with the same value so that
# later draws made through `random` or `np.random` repeat on a fresh run.
SEED = 2025
random.seed(SEED)     # Python's random module
np.random.seed(SEED)  # NumPy's global generator

In [None]:
# Read data: load the train and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("CreditScore_train.csv", "CreditScore_test.csv")
)

In [None]:
# Stack the train rows on top of the test rows with a fresh 0..n-1 index.
# Columns 0..303 form Z; column 304 forms Y.
N_FEATURES = 304
frames = (train_data, test_data)
Z = pd.concat([frame.iloc[:, 0:N_FEATURES] for frame in frames], ignore_index=True)
Y = pd.concat([frame.iloc[:, N_FEATURES] for frame in frames], ignore_index=True)
# Observation mask: 1 where the entry of Z is present, 0 where it is NaN.
Omega = 1 - Z.isna()

In [None]:
# Indices for 72000 training data, 8000 validation data and 20000 testing data.
# The validation positions are drawn without replacement from 0..79999; the
# training positions are whatever remains, in ascending order; the testing
# positions are 80000..99999.
val_ind = np.random.choice(range(80000), size=8000, replace=False)
# A single set difference replaces the per-element `i not in val_ind` scan of
# the NumPy array. setdiff1d returns sorted unique values, and .tolist() keeps
# train_ind a plain list of Python ints exactly as before.
train_ind = np.setdiff1d(np.arange(80000), val_ind).tolist()
test_ind = range(80000, 100000)

In [None]:
# Split the response Y and the observation mask Omega into train / validation /
# test parts by row position.
# Y is indexed with .iloc, like Omega, so the integer indices are always
# treated as positions rather than as index labels. The two coincide only
# while Y carries a default 0..n-1 index.
Y_train = Y.iloc[train_ind]
Y_val = Y.iloc[val_ind]
Y_test = Y.iloc[test_ind]

Omega_train = Omega.iloc[train_ind, :]
Omega_val = Omega.iloc[val_ind, :]
Omega_test = Omega.iloc[test_ind, :]

In [None]:
# Standardise Z column by column, keeping the original column names.
# NOTE(review): the scaler is fitted on every row of Z, i.e. on whatever rows
# Z holds at this point, not only on the training positions. If validation and
# test rows must not influence the preprocessing, fit on the training rows
# only -- confirm the intended protocol.
# NOTE(review): this cell overwrites Z, so re-running it rescales the
# already-scaled data.
scaler = StandardScaler()
scaler.fit(Z)
Z = pd.DataFrame(scaler.transform(Z), columns=Z.columns)

In [None]:
# Zero imputation: replace every missing entry of Z with 0, then slice the
# result into the train / validation / test rows.
Z_ZI = Z.fillna(0)
Z_ZI_train, Z_ZI_val, Z_ZI_test = (
    Z_ZI.iloc[rows, :] for rows in (train_ind, val_ind, test_ind)
)

In [None]:
# Missforest imputation: fit the imputer on all rows of Z, then slice the
# imputed frame into the train / validation / test rows.
# NOTE(review): the random-forest regressor is passed positionally. In
# MissForest releases whose constructor begins (clf, rgr, ...), the first
# positional argument is the classifier, which would leave the regressor at
# the library default -- confirm against the installed version and consider
# passing the estimator by keyword.
rf_regressor = RandomForestRegressor(n_estimators=10, n_jobs=-1)
mf_imputer = MissForest(rf_regressor, early_stopping=True)
Z_MF = mf_imputer.fit_transform(Z)
Z_MF_train, Z_MF_val, Z_MF_test = (
    Z_MF.iloc[rows, :] for rows in (train_ind, val_ind, test_ind)
)

In [None]:
# Mice imputation: run scikit-learn's IterativeImputer (at most 5 rounds) on
# all rows of Z, wrap the returned array in a DataFrame with Z's column names,
# then slice it into the train / validation / test rows.
mice_imputer = IterativeImputer(max_iter=5)
Z_MICE_values = mice_imputer.fit_transform(Z)
Z_MICE = pd.DataFrame(Z_MICE_values, columns=Z.columns)
Z_MICE_train, Z_MICE_val, Z_MICE_test = (
    Z_MICE.iloc[rows, :] for rows in (train_ind, val_ind, test_ind)
)

In [None]:
# Store the data in credit_score_data.pkl
# The dictionary bundles, for each of train / val / test: the three imputed
# versions of Z (zero, MissForest, MICE), the observation mask Omega and the
# response Y.
credit_score_data = dict(
    Z_ZI_train=Z_ZI_train,
    Z_ZI_val=Z_ZI_val,
    Z_ZI_test=Z_ZI_test,
    Z_MF_train=Z_MF_train,
    Z_MF_val=Z_MF_val,
    Z_MF_test=Z_MF_test,
    Z_MICE_train=Z_MICE_train,
    Z_MICE_val=Z_MICE_val,
    Z_MICE_test=Z_MICE_test,
    Omega_train=Omega_train,
    Omega_val=Omega_val,
    Omega_test=Omega_test,
    Y_train=Y_train,
    Y_val=Y_val,
    Y_test=Y_test,
)

# Binary write; an existing file of the same name is overwritten.
with open("credit_score_data.pkl", "wb") as f:
    pickle.dump(credit_score_data, f)