In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from shapiq_student.gaussion_copula_imputer import GaussianCopulaImputer


In [2]:
# Small demo dataset: five paired observations, stored as one list per column.
data = {
    "age": [
        48.935913,
        39.234323,
        55.659901,
        31.810637,
        65.342336,
    ],
    "income": [
        399.161393,
        364.225531,
        406.475105,
        341.276022,
        414.347815,
    ],
}

In [3]:
# Build a two-column table ("age", "income") from the `data` dict.
age_data = pd.DataFrame(data)

In [4]:
# Persist the table to "data.csv" (relative path); index=False leaves the row index out of the file.
age_data.to_csv("data.csv", index=False)

In [5]:
# Reload the table from "data.csv"; this rebinds `age_data` to the frame parsed from disk.
age_data = pd.read_csv("data.csv")

In [6]:
# Convert the table to a NumPy array and create an all-False boolean mask of matching shape.
X = age_data.to_numpy()
mask = np.full_like(X, False, dtype=bool)

In [7]:
# Column 0 (age) is the single feature, kept 2-D with shape (n, 1); column 1 (income) is the target.
X_train, y_train = X[:, :1], X[:, 1]

In [8]:
# Fit a linear regression that predicts the target (y_train, income) from the
# single feature (X_train, age). `LinearRegression` is imported in the
# notebook's top import cell so all dependencies are visible in one place.
model = LinearRegression()
model.fit(X_train, y_train)

In [9]:
def trained_model(X_input: np.ndarray) -> np.ndarray:
    """Predict income from the age column using the fitted linear regression.

    Only the first column of ``X_input`` (age) is passed to the module-level
    ``model``; any further columns are ignored.

    Args:
        X_input (np.ndarray): Samples with age in column 0. A 1-D array is
            treated as a single sample.

    Returns:
        np.ndarray: The result of ``model.predict`` on the age column.
    """
    # Promote a single 1-D sample to shape (1, n_features); 2-D input passes through unchanged.
    samples = np.atleast_2d(X_input)
    # The regression was fitted on a single feature, so feed it an (n, 1) column.
    age_feature = samples[:, 0].reshape(-1, 1)
    return model.predict(age_feature)

In [10]:
# Build the imputer around the prediction function and fit it on the complete data matrix.
imputer = GaussianCopulaImputer(model=trained_model, data=X)
imputer.fit(X)

# Simulate one missing entry: blank out row 1, column 1 (income) in a copy of X
# and flag exactly that cell in a boolean mask of the same shape.
X_missing = X.copy()
X_missing[1, 1] = np.nan
mask = np.zeros(X_missing.shape, dtype=bool)
mask[1, 1] = True

# Fill the flagged cell using the fitted imputer.
X_imputed = imputer.transform(X_missing, mask)



INFO:root:Original data with missing values:
INFO:root:[[ 48.935913 399.161393]
 [ 39.234323        nan]
 [ 55.659901 406.475105]
 [ 31.810637 341.276022]
 [ 65.342336 414.347815]]
INFO:root:Data after imputation:
INFO:root:[[ 48.935913   399.161393  ]
 [ 39.234323   414.34778351]
 [ 55.659901   406.475105  ]
 [ 31.810637   341.276022  ]
 [ 65.342336   414.347815  ]]
