In [1]:
# set working directory
import os
os.chdir("C:/Users/yvjennig/PycharmProjects/phd_repos/clustering/")
print(os.getcwd())

C:\Users\yvjennig\PycharmProjects\phd_repos\clustering


In [2]:
import pandas as pd
import pickle
import time
from sklearn.preprocessing import RobustScaler, MinMaxScaler
import itertools as it
import sys

# Training on complete training dataset

Now that we know the optimal combination of hyperparameters (for each setting of missing_value_proportion, imputer, predicting), we can train a model for each combination. This is now done on the complete training set.

We need experiments 0 and 1 for training. 

In [3]:
# load result files
scaler = "minmaxscaler"
output_path = f"output/{scaler}/"
model_path = f"{output_path}/models/"

df_config = pd.read_csv(f"{output_path}/train_results/config.csv")
df = pd.read_csv(f"{output_path}/train_results/tuning_results.csv")
dfres = pd.read_csv(f"{output_path}/train_results/optimal_hyperparameters.csv")

train_defaults = True

In [4]:
# create dir for models
if not os.path.exists(model_path):
    os.makedirs(model_path)

In [5]:
# load data
df_test = pd.read_csv("data/test_table_0.8.csv")
df_train = pd.read_csv("data/train_table_0.8.csv")

# define, fit and store scaler
if scaler == "robustscaler": 
    scaler = RobustScaler().fit(df_train)
elif scaler == "minmaxscaler": 
    scaler = MinMaxScaler().fit(df_train)
else: 
    print("Unknown scaler!")

# store scaler model
scaler_filename = f"{model_path}scaler.pickle"
pickle.dump(scaler, open(scaler_filename, 'wb')) 

# scale data
df_test_scaled = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)
df_train_scaled = pd.DataFrame(scaler.transform(df_train), columns=df_train.columns)

# get parameters
parameters = list(filter(lambda x: x.startswith('P_'), list(df_test.columns)))

In [6]:
# get all hyperparameter combinations to train per imputer
hyperparameter_combinations = {}

for imputer in dfres["imputer"].unique():
    temp = dfres[dfres["imputer"] == imputer]

    hyps = list(set(it.chain(temp["hyperparameters"].unique())))  # make sure that each combo exists only once
    df_hyper = pd.DataFrame(hyps, columns=["hyperparameters"])  # put combinations into a dataframe (they are strings!)
    dicts = df_hyper["hyperparameters"].apply(eval)  # transform strings to dicts
    hyperparameter_combinations[imputer] = dicts

In [7]:
def run_missforest(hyps):
    import sklearn.neighbors._base
    sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
    from missingpy import MissForest

    return MissForest(**hyps)  

def run_gain(hyps):
#     # needed for GAIN
#     import tensorflow as tf
#     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
#     from GAIN import gain
#     x_hat = pd.DataFrame(gain.gain(df_train_scaled.to_numpy(), hyperparameter_combi), columns=df_train_scaled.columns)
#     return x_hat
    return 

def run_knn(hyps):
    from sklearn.impute import KNNImputer
    return KNNImputer(**hyps)

def run_mean(hyps):
    from sklearn.impute import SimpleImputer
    return SimpleImputer(**hyps)

def run_iterativeRidge(hyps):
    return IterativeImputer(**hyps)

In [12]:
# train one model per hyperparameter combination
for imputer_name, hyperparameter_combis in hyperparameter_combinations.items():
    print(imputer_name)
    for hyperparameter_combi in hyperparameter_combis:
        print(hyperparameter_combi)
        s_impute_time = time.time()

        if imputer_name == "gain_imputer":
            x_hat = run_gain(hyps)
        else:
            if imputer_name == "knn_imputer":
                imputer = run_knn(hyperparameter_combi)
            elif imputer_name == "missforest_imputer" or imputer_name == "missforest":
                imputer = run_missforest(hyperparameter_combi)
            elif imputer_name == "iterative_ridge":
                imputer = run_iterativeRidge(hyperparameter_combi)
            elif imputer_name == "mean_imputer": 
                imputer = run_mean(hyperparameter_combi)

            imputer.fit(df_train_scaled)

        print(f"    Training took: {time.time() - s_impute_time} s")

        filename = f"{model_path}/{imputer_name}_{'-'.join([str(k) + '=' + str(v) for k, v in hyperparameter_combi.items()])}.pickle"
        pickle.dump(imputer, open(filename, 'wb'))  # store model

knn_imputer
    Training took: 0.0019829273223876953 s
    Training took: 0.002008199691772461 s
    Training took: 0.0010120868682861328 s
    Training took: 0.001999378204345703 s
    Training took: 0.002008676528930664 s
    Training took: 0.0025267601013183594 s
    Training took: 0.0020067691802978516 s
mean_imputer
    Training took: 0.0020093917846679688 s
missforest
    Training took: 0.004007101058959961 s
    Training took: 0.003000020980834961 s
    Training took: 0.003996610641479492 s
    Training took: 0.003000020980834961 s
    Training took: 0.0019998550415039062 s
    Training took: 0.0030088424682617188 s
    Training took: 0.003007173538208008 s
    Training took: 0.004006862640380859 s
gain_imputer
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0 s
    Training took: 0.0

In [19]:
# train default models
for imputer_name in dfres["imputer"].unique():
    if train_defaults and imputer_name != "gain_imputer":
        if imputer_name == "knn_imputer":
            imputer = run_knn({})
        elif imputer_name == "missforest_imputer" or imputer_name=="missforest":
            imputer = run_missforest({})  
        elif imputer_name == "iterative_ridge":
            imputer = run_iterative_ridge({})
        elif imputer_name == "mean_imputer": 
            imputer = run_mean({"strategy": "mean"})

        imputer.fit(df_train_scaled)
        print(imputer)

        filename = f"{model_path}{imputer_name}_default.pickle"
        pickle.dump(imputer, open(filename, 'wb'))  # store model

KNNImputer()
SimpleImputer()
MissForest()
