In [None]:
import numpy as np
import pandas as pd
import warnings
import pickle
import arff    # this is liac-arff package, use pip install liac-arff; make sure that the arff package is not installed (use pip uninstall arff)
import random

from missforest.missforest import MissForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Seed both global random number generators with the same value so that
# stochastic steps executed after this cell start from a fixed state.
SEED = 42
np.random.seed(SEED)   # NumPy's legacy global generator
random.seed(SEED)      # Python's standard-library generator

In [None]:
# Parse the ARFF file; the loaded object is indexed below by its 'attributes'
# and 'data' entries.
with open("dataset.arff", 'r', encoding="utf-8") as arff_file:
    dataset = arff.load(arff_file)

# The first element of every attribute entry is used as the column name.
column_names = [attribute[0] for attribute in dataset['attributes']]
df = pd.DataFrame(dataset['data'], columns=column_names)

In [None]:
# Target: shift the award values so that the smallest one maps to 1, then take
# the natural logarithm (the smallest award therefore becomes 0).
award_values = df['award_value_euro']
lowest_award = np.min(award_values)
Y = np.log(award_values - lowest_award + 1)

# Features: every numeric column except the target itself.
df_numerical = df.select_dtypes(include='number')
Z = df_numerical.drop(columns='award_value_euro')

# Observation mask: True where a feature value is present, False where missing.
Omega = ~Z.isna()

In [None]:
# Mean imputation: replace each missing entry with the mean of its column,
# then standardise the filled features.
column_means = Z.mean()
Z_mean_filled = Z.fillna(column_means)

scaler = StandardScaler().fit(Z_mean_filled)
Z_MI = pd.DataFrame(scaler.transform(Z_mean_filled), columns=Z_mean_filled.columns)

In [None]:
# Missforest imputation
# The regressor is passed by keyword on purpose.
# NOTE(review): per the missforest README the constructor order is
# MissForest(clf, rgr, ...), so a positional RandomForestRegressor would land
# in the classifier slot and the numeric columns would be imputed with the
# package's default regressor instead - confirm against the installed version.
# random_state is fixed on the forest itself so this cell gives the same result
# regardless of how much of the global NumPy random stream earlier cells used.
MF_regressor = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=42)
MF_imputer = MissForest(rgr=MF_regressor, early_stopping=True)

# Re-select the columns in Z's order so the imputed features stay aligned with
# the Omega mask even if the imputer returns them in a different order.
Z_MF = MF_imputer.fit_transform(Z)[Z.columns]

# Standardise the imputed features.
scaler = StandardScaler()
Z_MF = pd.DataFrame(scaler.fit_transform(Z_MF), columns=Z_MF.columns)

In [None]:
# Iterative imputation: fill the missing entries with scikit-learn's
# IterativeImputer (at most 5 rounds), then standardise the result.
II_imputer = IterativeImputer(max_iter=5)
Z_iter_filled = II_imputer.fit_transform(Z)

scaler = StandardScaler().fit(Z_iter_filled)
# Z's column labels are re-attached to the scaled values.
Z_II = pd.DataFrame(scaler.transform(Z_iter_filled), columns=Z.columns)

In [None]:
# Store the data in public_procurement_data.pkl
# Bundle the three imputed feature sets, the observation mask and the
# transformed target into one dictionary and pickle it.
public_procurement_data = {
    'Z_MI': Z_MI,    # mean-imputed features
    'Z_MF': Z_MF,    # MissForest-imputed features
    'Z_II': Z_II,    # IterativeImputer-imputed features
    'Omega': Omega,  # observation mask of the features
    'Y': Y,          # transformed target
}

# The output is written relative to the working directory, the same way
# "dataset.arff" is read, instead of to an absolute path inside one user's
# desktop folder that does not exist on any other machine.
OUTPUT_PATH = "public_procurement_data.pkl"

with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(public_procurement_data, f)