## Analysing Missing Data Impact on well known Datasets

In [443]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the datasets to be used

In [444]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("wenruliu/adult-income-dataset") + "/adult.csv"
adult_df = pd.read_csv(path)
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [445]:
adult_df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [446]:
# Download latest version
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data") 
path = path + "/data.csv"
breast_cancer_df= pd.read_csv(path)
breast_cancer_df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [447]:
breast_cancer_df.isnull().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [448]:
breast_cancer_df.columns
breast_cancer_df = breast_cancer_df.iloc[:, :-1]

We can predefine a missing rate to be used across the tutorial:

In [449]:
# Try 0.1, 0.3, and 0.5.
MISSING_RATE = 30

## Defining the helpers that will be used in model evaluation along Imputers

Firstly it was implemented a function to introduce the Missingness Completely At Random (MCAR) missingness.

In [450]:
from mdatagen.univariate.uMCAR import uMCAR
from mdatagen.univariate.uMAR import uMAR
from mdatagen.univariate.uMNAR import uMNAR
import pandas as pd

def generate_univariate_missingness(X, y, mechanism, missing_rate, x_miss, x_obs=None, seed=None):
    """
    General method to generate univariate missingness using the mdatagen library.
    
    Parameters:
    - X (pd.DataFrame): The feature matrix (independent variables).
    - y (pd.Series or np.ndarray): The target variable.
    - mechanism (str): The missingness mechanism ('MCAR', 'MAR', or 'MNAR').
    - missing_rate (float): The proportion of values to replace with NaN (0.0 to 1.0).
    - x_miss (str): The name of the column to introduce missingness into.
    - x_obs (str, optional): The name of the column to condition on for MAR/MNAR mechanisms (if required).
    - seed (int, optional): Random seed for reproducibility.

    Returns:
    - pd.DataFrame: A DataFrame of features with missing values introduced.
    - pd.Series or np.ndarray: The unchanged target variable.
    """
    # Ensure the specified column exists
    if x_miss not in X.columns:
        raise ValueError(f"Column '{x_miss}' not found in the feature matrix.")

    # Set the random seed for reproducibility
    seed = seed or 42

    # Initialize the appropriate generator based on the mechanism
    if mechanism == 'MCAR':
        generator = uMCAR(X=X, y=y, missing_rate=missing_rate, x_miss=x_miss, seed=seed)
    elif mechanism == 'MAR':
        if not x_obs:
            raise ValueError("For MAR, you must specify the observed column 'x_obs'.")
        generator = uMAR(X=X, y=y, missing_rate=missing_rate, x_miss=x_miss, x_obs=x_obs)
    elif mechanism == 'MNAR':
        generator = uMNAR(X=X, y=y, missing_rate=missing_rate, x_miss=x_miss)
    else:
        raise ValueError(f"Invalid mechanism '{mechanism}'. Choose from 'MCAR', 'MAR', or 'MNAR'.")

    # Generate the missing data
    if mechanism == 'MCAR':
        X_missing = generator.random()
    if mechanism == 'MAR':
        X_missing = generator.rank()
    if mechanism == 'MNAR':
        X_missing = generator.run()
    # Display missingness details
    global_missing_rate = X_missing.isnull().sum().sum() / X_missing.size
    print(f"Global Missing Rate = {global_missing_rate * 100:.2f}%")
    print("Missing values per column:")
    print(X_missing.isnull().sum())

    return X_missing

Then, it is defined a helper `run_logistic_regression` function train, that evaluates a logistic regression model by imputing missing values in the training set and testing data using a specified imputer, fitting the model on the training data, predicting the labels for the testing data, and returning the predictions along with a classification report.

In [451]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def run_logistic_regression(X_train, X_test, y_train, y_test, imputer):
    """
    Runs a Logistic Regression model with preprocessing and a given imputer.

    Parameters:
    - X_train, X_test: Feature matrices for training and testing.
    - y_train, y_test: Target vectors for training and testing.
    - imputer: An imputer instance for handling missing values.

    Returns:
    - y_pred: Predicted labels on the test set.
    - clf_report: Classification report as a string.
    """
    # Separate columns by data type
    numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns

    # Numerical pipeline
    numerical_pipeline = Pipeline([
        ('imputer', imputer),  # Use the passed imputer (e.g., KNNImputer or SimpleImputer)
        ('scaler', StandardScaler())  # Standardize numerical features
    ])

    # Categorical pipeline (using OrdinalEncoder instead of OneHotEncoder)
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values
        ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Ordinal encoding for categorical features)  
    ])

    # Combine the numerical and categorical pipelines
    preprocessor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ])

    # Full pipeline with preprocessing and logistic regression
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=200))
    ])

    # Fit the model pipeline on the training data
    model_pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model_pipeline.predict(X_test)

    # Evaluate the model with a classification report
    clf_report = classification_report(y_test, y_pred, output_dict=False)
    print("\nClassification Report:")
    print(clf_report)

    return y_pred, clf_report

Correlations calculation heatmap to Adult Dataset (ignore the following cell):

In [452]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats as stat

# Function to check if a column is categorical
def is_categorical(df, column):
    """Check if a column in a DataFrame is categorical."""
    return pd.api.types.is_object_dtype(df[column]) or pd.api.types.is_categorical_dtype(df[column])

# Cramér's V function for categorical-categorical association, with missing value handling
def cramers_v(x, y):
    # Drop rows where either column has missing data
    valid = pd.notnull(x) & pd.notnull(y)
    x_clean, y_clean = x[valid], y[valid]

    confusion_matrix = pd.crosstab(x_clean, y_clean)
    chi2 = stat.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    return np.sqrt(chi2 / (n * (min(confusion_matrix.shape) - 1)))

# Spearman's rank correlation function, with missing value handling
def spearmans_rank(x, y):
    """Calculate Spearman's rank correlation for continuous or ordinal variables, handling missing values."""
    # Drop rows where either column has missing data
    valid = pd.notnull(x) & pd.notnull(y)
    x_clean, y_clean = x[valid], y[valid]
    
    return stat.spearmanr(x_clean, y_clean)[0]  # Returns the correlation coefficient

# Discretize numeric features
def discretize_numeric(series, method="equal_width", bins=4):
    if method == "equal_width":
        # Equal-width binning
        return pd.cut(series, bins=bins, labels=False, include_lowest=True)
    elif method == "quantile":
        # Equal-frequency binning (quantiles)
        return pd.qcut(series, q=bins, labels=False)
    else:
        raise ValueError(f"Unknown discretization method: {method}")

# Cramér's V with discretized numeric features
def cramers_v_discretized(x, y, method="equal_width", bins=4):
    # Determine which column is numeric and which is categorical
    if pd.api.types.is_numeric_dtype(x):
        x_discretized = discretize_numeric(x, method=method, bins=bins)
        return cramers_v(x_discretized, y)
    elif pd.api.types.is_numeric_dtype(y):
        try:
            y_discretized = discretize_numeric(y, method=method, bins=bins)
            return cramers_v(x, y_discretized)
        except Exception as e:
            print(e)
    else:
        raise ValueError("At least one of the inputs should be numeric for discretization.")

# Define the function to calculate the correlation based on data types
def calculate_correlations(df, col1, col2):
    """Calculate correlation between two variables depending on their types."""
    if is_categorical(df, col1) and is_categorical(df, col2):
        # Use Cramér's V for categorical-categorical variables
        return cramers_v(df[col1], df[col2])
    elif pd.api.types.is_numeric_dtype(df[col1]) and pd.api.types.is_numeric_dtype(df[col2]):
        # Use Spearman's rank correlation for continuous-continuous or ordinal variables
        return spearmans_rank(df[col1], df[col2])
    elif (pd.api.types.is_numeric_dtype(df[col1]) and is_categorical(df, col2)) or \
         (is_categorical(df, col1) and pd.api.types.is_numeric_dtype(df[col2])):
        # Use Cramér's V with discretized numeric variables
        return cramers_v_discretized(df[col1], df[col2])
    # Return NaN when there was an error on calculating the correlation
    return np.nan

# Correlation matrix calculation for the entire dataset
def correlations(df):
    df_columns = df.columns
    correlation_matrix = pd.DataFrame(index=df_columns, columns=df_columns)

    for i, col1 in enumerate(df_columns):
        for col2 in df_columns[i+1:]:  # Use i+1 to avoid duplicate pairs and self-correlation
            a_corr = calculate_correlations(df, col1, col2)
            correlation_matrix.loc[col1, col2] = a_corr
            correlation_matrix.loc[col2, col1] = a_corr 

    correlation_matrix = correlation_matrix.apply(pd.to_numeric, errors='coerce')

    # Plot heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation Matrix for Adult Dataset')
    plt.savefig("adult_correlations.png", dpi=300, bbox_inches='tight')
    plt.show()



## Configuring the target data for each dataset

In [453]:


X_adult = adult_df.drop(["income"], axis=1)
y_adult = adult_df["income"]

X_breast_cancer = breast_cancer_df.drop(["diagnosis"], axis=1)
y_breast_cancer = breast_cancer_df["diagnosis"]


## Imputing Missingness via Missign Completely At Random (MCAR) mechanism

In [454]:
X_missing = generate_univariate_missingness(X_breast_cancer, y_breast_cancer.values, mechanism='MCAR', missing_rate=30, x_miss='radius_mean', seed=42)

Global Missing Rate = 0.94%
Missing values per column:
id                           0
radius_mean                171
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimensi

We then split the dataset `X` and labels `y` into training and test sets (`X_train`, `X_test`, `y_train`, `y_test`) while preserving the original class distribution using stratified sampling.

In [455]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_missing, y_breast_cancer, stratify=y_breast_cancer)

In [456]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))



Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



## **Exploring kNN Imputation**

Rather than the mean, we can decide on a `KNNImputer` with `k=5` to fill missing values. Similarly, we train a logistic regression model using the imputed data, and print the classification report.

In [457]:

y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))



Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [458]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))




Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



## Breast Cancer Dataset

In [459]:
X_missing = generate_univariate_missingness(X_breast_cancer, y_breast_cancer.values , mechanism='MCAR',missing_rate=MISSING_RATE,x_miss='compactness_worst', seed=42)


Global Missing Rate = 0.01%
Missing values per column:
id                         0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          2
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
target                     0
dtype: int64


In [460]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y_breast_cancer.values, stratify=y_breast_cancer)

In [461]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [462]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [463]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



## Imputing Univariate Missingness via Missign At Random (MAR) mechanism

### Breast Cancer Dataset

In [464]:
X_breast_cancer_ = X_breast_cancer.copy()

In [465]:
'''
correlation_matrix = X_breast_cancer.corr()

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap for Breast Cancer Dataset")

# Save the heatmap as an image file
plt.savefig("breast_cancer_correlation_heatmap.png", dpi=300, bbox_inches='tight')

# Display the heatmap
plt.show()
'''

'\ncorrelation_matrix = X_breast_cancer.corr()\n\n# Plot heatmap\nplt.figure(figsize=(12, 8))\nsns.heatmap(correlation_matrix, annot=True, cmap=\'coolwarm\')\nplt.title("Feature Correlation Heatmap for Breast Cancer Dataset")\n\n# Save the heatmap as an image file\nplt.savefig("breast_cancer_correlation_heatmap.png", dpi=300, bbox_inches=\'tight\')\n\n# Display the heatmap\nplt.show()\n'

In [466]:
X_missing = generate_univariate_missingness(X=X_breast_cancer_,y=y_breast_cancer.values,mechanism='MAR',missing_rate=MISSING_RATE, x_miss='compactness_worst', x_obs='concavity_worst')

Global Missing Rate = 0.01%
Missing values per column:
id                         0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          2
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
target                     0
dtype: int64


In [467]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y_breast_cancer, stratify=y_breast_cancer)

In [468]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [469]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [470]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



### Adult Dataset

In [471]:
X_adult_ = X_adult.copy()

In [472]:
#correlations(adult_df)

In [473]:
X_missing = generate_univariate_missingness(X=X_adult_,y=y_adult.values,mechanism='MAR',missing_rate=MISSING_RATE, x_miss='relationship', x_obs='gender')

Global Missing Rate = 0.02%
Missing values per column:
age                  0
workclass            0
fnlwgt               0
education            0
educational-num      0
marital-status       0
occupation           0
relationship       147
race                 0
gender               0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country       0
target               0
dtype: int64


In [474]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y_adult, stratify=y_adult)

In [475]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))


Classification Report:
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      9289
        >50K       1.00      1.00      1.00      2922

    accuracy                           1.00     12211
   macro avg       1.00      1.00      1.00     12211
weighted avg       1.00      1.00      1.00     12211



In [476]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))


Classification Report:
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      9289
        >50K       1.00      1.00      1.00      2922

    accuracy                           1.00     12211
   macro avg       1.00      1.00      1.00     12211
weighted avg       1.00      1.00      1.00     12211



In [477]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))


Classification Report:
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      9289
        >50K       1.00      1.00      1.00      2922

    accuracy                           1.00     12211
   macro avg       1.00      1.00      1.00     12211
weighted avg       1.00      1.00      1.00     12211



## Imputing Univariate Missingness via Missign Not At Random (MNAR) mechanism

### Breast Cancer Dataset

In [478]:
X = X_breast_cancer.copy()
y = y_breast_cancer

X_missing = generate_univariate_missingness(X=X, y=y.values, missing_rate=MISSING_RATE, mechanism='MNAR', x_miss='concavity_worst')

Global Missing Rate = 0.94%
Missing values per column:
id                           0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst            171
concave points_worst         0
symmetry_worst               0
fractal_dimensi

In [479]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y, stratify=y)

In [480]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [481]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [482]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))


Classification Report:
              precision    recall  f1-score   support

           B       1.00      1.00      1.00        90
           M       1.00      1.00      1.00        53

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



### Adult Dataset

In [487]:
X = X_adult.copy()
y = y_adult

In [None]:
X_missing = generate_univariate_missingness(X=X, y=y.values, mechanism='MNAR',missing_rate=MISSING_RATE,  x_miss='relationship')

Global Missing Rate = 2.00%
Missing values per column:
age                    0
workclass              0
fnlwgt                 0
education              0
educational-num        0
marital-status         0
occupation             0
relationship       14653
race                   0
gender                 0
capital-gain           0
capital-loss           0
hours-per-week         0
native-country         0
target                 0
dtype: int64


In [496]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y, stratify=y)


In [497]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))



Classification Report:
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      9289
        >50K       1.00      1.00      1.00      2922

    accuracy                           1.00     12211
   macro avg       1.00      1.00      1.00     12211
weighted avg       1.00      1.00      1.00     12211



In [498]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))


Classification Report:
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      9289
        >50K       1.00      1.00      1.00      2922

    accuracy                           1.00     12211
   macro avg       1.00      1.00      1.00     12211
weighted avg       1.00      1.00      1.00     12211



In [499]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))


Classification Report:
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      9289
        >50K       1.00      1.00      1.00      2922

    accuracy                           1.00     12211
   macro avg       1.00      1.00      1.00     12211
weighted avg       1.00      1.00      1.00     12211

