In [1]:
import numpy as np
import pandas as pd
import warnings
import pickle
import arff
import random

from missforest.missforest import MissForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Fix the global random state once so every stochastic step below starts from
# the same point on a fresh "Restart & Run All".
SEED = 42
random.seed(SEED)     # Python's built-in `random` module
np.random.seed(SEED)  # NumPy's legacy global generator

In [3]:
# Read the raw ARFF export and turn it into a DataFrame whose column names are
# taken from the first element of each ARFF attribute entry.
arff_path = "C:/Users/marti/Desktop/DNN_missing_data/public_procurement/dataset.arff"
with open(arff_path, mode='r', encoding="utf-8") as arff_file:
    dataset = arff.load(arff_file)

column_names = [attribute[0] for attribute in dataset['attributes']]
df = pd.DataFrame(dataset['data'], columns=column_names)

In [4]:
# Target: shift the award values so the smallest one becomes 1, then take the
# natural log (the minimum therefore maps to log(1) = 0).
award_value = df['award_value_euro']
Y = np.log(award_value - np.min(award_value) + 1)

# Features: every numeric column except the target itself.
df_numerical = df.select_dtypes(include=['number'])
Z = df_numerical.drop(columns='award_value_euro')

# Observation mask: True where a feature value is present, False where missing.
Omega = ~Z.isna()

In [5]:
# Mean imputation: replace each missing entry with its column mean, then
# standardize the filled columns with StandardScaler.
column_means = Z.mean()
Z_mean_filled = Z.fillna(column_means)
scaler = StandardScaler().fit(Z_mean_filled)
Z_MI = pd.DataFrame(scaler.transform(Z_mean_filled), columns=Z_mean_filled.columns)

In [6]:
# MissForest imputation
#
# MissForest's leading positional parameters are (clf, rgr): the first slot is
# the *classifier* used for categorical columns. Passing the regressor
# positionally therefore put it in the classifier slot, leaving the library's
# default regressor to impute the (all-numeric) columns of Z. Pass it by
# keyword so the random forest is what actually imputes the data.
# NOTE(review): signature taken from the MissForest README — confirm against
# the installed missforest version.
#
# random_state is pinned so re-running this cell alone gives the same result
# instead of depending on how much of the global NumPy RNG was consumed before.
MF_imputer = MissForest(
    rgr=RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=42),
    early_stopping=True,
)
Z_MF = MF_imputer.fit_transform(Z)

# Standardize the imputed columns, keeping the column labels returned by MissForest.
scaler = StandardScaler()
Z_MF = pd.DataFrame(scaler.fit_transform(Z_MF), columns=Z_MF.columns)

 40%|████      | 2/5 [03:03<04:34, 91.59s/it]
100%|██████████| 2/2 [00:13<00:00,  6.86s/it]


In [7]:
# Iterative imputation (capped at 5 rounds), followed by standardization.
# The imputer and scaler both return plain arrays, so the column labels of Z
# are re-attached when rebuilding the DataFrame.
II_imputer = IterativeImputer(max_iter=5)
Z_II_values = II_imputer.fit_transform(Z)
scaler = StandardScaler()
Z_II = pd.DataFrame(scaler.fit_transform(Z_II_values), columns=Z.columns)



In [8]:
# Bundle the three imputed feature sets, the observation mask and the target
# into one dict and pickle it to public_procurement_data.pkl.
output_path = "C:/Users/marti/Desktop/DNN_missing_data/public_procurement/public_procurement_data.pkl"
public_procurement_data = dict(
    Z_MI=Z_MI,
    Z_MF=Z_MF,
    Z_II=Z_II,
    Omega=Omega,
    Y=Y,
)

with open(output_path, "wb") as pkl_file:
    pickle.dump(public_procurement_data, pkl_file)