# Simulating multiple imputation

You can simulate multiple imputation with HyperImpute by running the same imputer under several different random seeds.

### Setup

In [1]:
import sys
import warnings

import numpy as np
import pandas as pd

from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan


from IPython.display import HTML, display
import tabulate

# Silence all warnings for the rest of the session, unless the interpreter was
# started with explicit warning options (sys.warnoptions is non-empty), in
# which case the existing configuration is left untouched.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

### Loading the Imputation plugins

Make sure that you have installed HyperImpute in your workspace.

You can do that by running `pip install .` in the root of the repository.

In [2]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

# Plugin registry object; individual imputers are obtained from it by name via .get().
# NOTE(review): the evaluation loop further down constructs its own Imputers()
# per seed, so this instance appears unused in the visible cells — confirm.
imputers = Imputers()

### Load the dataset

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# stdlib
from pathlib import Path
def download_dataset() -> pd.DataFrame:
    """Return the UCI annealing dataset, caching a copy at ``data/anneal.csv``.

    If the cached CSV exists it is read and returned. Otherwise the raw file
    (which has no header row) is downloaded, written to the cache, and
    returned.

    Returns:
        The raw dataset. Column labels are the strings "0", "1", ... on both
        the download path and the cached path.
    """
    data_dir = Path("data")
    data_dir.mkdir(parents=True, exist_ok=True)
    bkp_file = data_dir / "anneal.csv"

    if bkp_file.exists():
        return pd.read_csv(bkp_file)

    df = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/annealing/anneal.data",
        header=None,
    )
    # header=None produces integer column labels, while re-reading the cached
    # CSV produces string labels. Normalise here so the caller sees the same
    # labels regardless of whether the cache already existed.
    df.columns = df.columns.astype(str)
    # Omit the row index from the cache file.
    df.to_csv(bkp_file, index=False)

    return df

def dataset(random_state: int = 0) -> list:
    """Load, encode and split the annealing dataset.

    '?' entries are replaced with NaN. Each column is label-encoded over its
    non-missing values only (missing entries stay NaN), columns with at most
    one distinct observed value are dropped, everything is cast to float, and
    the last remaining column is used as the target.

    Args:
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The ``train_test_split`` result ``[X_train, X_test, y_train, y_test]``
        for a split with ``test_size=0.2`` stratified on the target.
    """
    df = download_dataset()
    df = df.replace("?", np.nan)

    for col in df.columns:
        # Encode only the observed entries so that NaNs are preserved.
        observed = df[col].notna()
        df.loc[observed, col] = LabelEncoder().fit_transform(df.loc[observed, col])

    # nunique() ignores NaN, so this counts distinct observed values.
    constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
    df = df.drop(columns=constant_cols).astype(float)

    target_col = df.columns[-1]
    X = df.drop(columns=[target_col])
    y = df[target_col]

    X.columns = X.columns.astype(str)
    return train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )


def ampute(x, mechanism, p_miss):
    """Introduce missing values into ``x`` via ``simulate_nan``.

    Args:
        x: Complete data; its ``.columns`` are reused to label the outputs.
        mechanism: Forwarded to ``simulate_nan`` as the mechanism argument.
        p_miss: Forwarded to ``simulate_nan`` as the missingness argument.

    Returns:
        A 3-tuple of DataFrames: ``x`` itself, the ``"X_incomp"`` output of
        ``simulate_nan``, and its ``"mask"`` output.
    """
    simulated = simulate_nan(np.asarray(x), p_miss, mechanism)
    column_names = x.columns

    incomplete = pd.DataFrame(simulated["X_incomp"], columns=column_names)
    missing_mask = pd.DataFrame(simulated["mask"], columns=column_names)

    return pd.DataFrame(x), incomplete, missing_mask

In [8]:
# Mechanism name that ampute() forwards to simulate_nan.
ampute_mechanism = "MCAR"
# Value that ampute() forwards to simulate_nan as p_miss — presumably the
# proportion of entries to mask; confirm against the hyperimpute docs.
p_miss = 0.5

### Load the model and evaluate

In [12]:
from sklearn import metrics
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin
import xgboost as xgb


# Column headers for the results table.
metrics_headers = ["Seed", "AUROC"]
# Collects one [seed, auroc] row per evaluated seed.
test_score = []

def get_metrics(X_train, y_train, X_test, y_test):
    """Train an XGBoost classifier and score it on the test split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        The one-vs-rest AUROC computed from the predicted class probabilities.
    """
    classifier = xgb.XGBClassifier(verbosity=0).fit(X_train, y_train)
    class_probabilities = classifier.predict_proba(X_test)
    return metrics.roc_auc_score(y_test, class_probabilities, multi_class="ovr")

plugin = "ice"

# One run per seed: split the data, mask part of the training set, impute it,
# then score a classifier trained on the imputed data.
for seed in range(5):
    X_train, X_test, y_train, y_test = dataset(random_state=seed)
    x, x_miss, mask = ampute(X_train, ampute_mechanism, p_miss)

    model = Imputers().get(plugin, random_state=seed)
    X_train_imp = model.fit_transform(x_miss.copy()).astype(float)

    # Keep only the test columns that are also present after imputation.
    drop = [col for col in X_test.columns if col not in X_train_imp.columns]
    X_test_eval = X_test.drop(columns=drop)
    assert X_train_imp.shape[1] == X_test_eval.shape[1]

    auroc = get_metrics(X_train_imp, y_train, X_test_eval, y_test)
    test_score.append([seed, auroc])

In [None]:
# Render the collected per-seed scores as an HTML table in the cell output.
scores_table = tabulate.tabulate(test_score, headers=metrics_headers, tablefmt="html")
display(HTML(scores_table))