## Analysing Missing Data Impact on well known Datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the datasets to be used

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("wenruliu/adult-income-dataset") + "/adult.csv"
adult_df = pd.read_csv(path)
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
adult_df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [4]:
# Download latest version
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data") 
path = path + "/data.csv"
breast_cancer_df= pd.read_csv(path)
breast_cancer_df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [5]:
breast_cancer_df.isnull().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

## Preprocessing the datasets

### Breast-Cancer Preprocessing

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Copy the dataset and remove the last column
breast_cancer_df = breast_cancer_df.iloc[:, :-1]

# Identify numeric and categorical columns
numeric_columns = breast_cancer_df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = breast_cancer_df.select_dtypes(include=['object', 'category']).columns
# Initialize StandardScaler
scaler = StandardScaler()

# Scale the numeric columns
breast_cancer_df[numeric_columns] = scaler.fit_transform(breast_cancer_df[numeric_columns])

# Initialize LabelEncoder for categorical columns
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    breast_cancer_df[col] = label_encoders[col].fit_transform(breast_cancer_df[col].astype(str))

# Display the processed DataFrame
breast_cancer_df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,-0.236405,1,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,-0.236403,1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119
2,0.431741,1,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,...,1.51187,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391
3,0.432121,1,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501
4,0.432201,1,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,...,1.298575,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971


### Adult Dataset Preprocessing

In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Identify column types
binary_columns = [col for col in adult_df.select_dtypes(include=['object', 'category']).columns 
                  if adult_df[col].nunique() == 2]
non_binary_columns = [col for col in adult_df.select_dtypes(include=['object', 'category']).columns 
                      if adult_df[col].nunique() > 2]
numeric_columns = adult_df.select_dtypes(include=['int64', 'float64']).columns

# Initialize LabelEncoder for binary columns
label_encoders = {}
for col in binary_columns:
    label_encoders[col] = LabelEncoder()
    adult_df[col] = label_encoders[col].fit_transform(adult_df[col].astype(str))

# Apply One-Hot Encoding for non-binary columns
adult_df = pd.get_dummies(adult_df, columns=non_binary_columns, drop_first=True)

adult_df.head()

Unnamed: 0,age,fnlwgt,educational-num,gender,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,1,0,0,40,0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,1,0,0,50,0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,28,336951,12,1,0,0,40,1,False,True,...,False,False,False,False,False,False,False,True,False,False
3,44,160323,10,1,7688,0,40,1,False,False,...,False,False,False,False,False,False,False,True,False,False
4,18,103497,10,0,0,0,30,0,False,False,...,False,False,False,False,False,False,False,True,False,False


We can predefine a missing rate to be used across the tutorial:

In [8]:
# Try 0.1, 0.3, and 0.5.
MISSING_RATE = 30

## Defining the helpers that will be used in model evaluation along Imputers

Firstly it was implemented a function to introduce the Missingness Completely At Random (MCAR) missingness.

In [9]:
from mdatagen.multivariate.mMCAR import mMCAR
from mdatagen.multivariate.mMAR import mMAR
from mdatagen.multivariate.mMNAR import mMNAR
import pandas as pd

def generate_multivariate_missingness(X, y, mechanism, missing_rate):
    """
    General method to generate multivariate missingness using the mdatagen library.
    
    Parameters:
    - X (pd.DataFrame): The feature matrix (independent variables).
    - y (pd.Series or np.ndarray): The target variable.
    - mechanism (str): The missingness mechanism ('MCAR', 'MAR', or 'MNAR').
    - missing_rate (float): The proportion of values to replace with NaN (0.0 to 1.0).
    - x_miss_list (list of str): The names of the columns to introduce missingness into.
    - x_obs_list (list of str, optional): The names of the columns to condition on for MAR/MNAR mechanisms (if required).
    - seed (int, optional): Random seed for reproducibility.

    Returns:
    - pd.DataFrame: A DataFrame of features with missing values introduced.
    - pd.Series or np.ndarray: The unchanged target variable.
    """

    # Initialize the appropriate generator based on the mechanism
    if mechanism == 'MCAR':
        generator = mMCAR(X=X, y=y,missing_rate=missing_rate)
    elif mechanism == 'MAR':
        generator = mMAR(X=X, y=y)
    elif mechanism == 'MNAR':
        generator = mMNAR(X=X, y=y)
    else:
        raise ValueError(f"Invalid mechanism '{mechanism}'. Choose from 'MCAR', 'MAR', or 'MNAR'.")

    # Generate the missing data
    if mechanism == 'MCAR':
        X_missing = generator.random()
    elif mechanism == 'MAR':
        X_missing = generator.correlated(missing_rate=missing_rate)
    elif mechanism == 'MNAR':
        X_missing = generator.correlated(missing_rate=missing_rate)
    
    # Display missingness details
    global_missing_rate = X_missing.isnull().sum().sum() / X_missing.size
    print(f"Global Missing Rate = {global_missing_rate * 100:.2f}%")
    print("Missing values per column:")
    print(X_missing.isnull().sum())

    return X_missing

Then, it is defined a helper `run_logistic_regression` function train, that evaluates a logistic regression model by imputing missing values in the training set and testing data using a specified imputer, fitting the model on the training data, predicting the labels for the testing data, and returning the predictions along with a classification report.

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

def run_logistic_regression(X_train_lr, X_test_lr, y_train_lr, y_test_lr, imputer):
    X_train_lr = imputer.fit_transform(X_train_lr)
    X_test_lr = imputer.fit_transform(X_test_lr)        # Maybe change to fit on training?

    clf = DecisionTreeClassifier()
    clf.fit(X_train_lr, y_train_lr)
    y_pred = clf.predict(X_test_lr)

    report = classification_report(y_test_lr, y_pred)

    print(report)
    return y_pred, report

Correlations calculation heatmap to Adult Dataset (ignore the following cell):

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats as stat

# Function to check if a column is categorical
def is_categorical(df, column):
    """Check if a column in a DataFrame is categorical."""
    return pd.api.types.is_object_dtype(df[column]) or pd.api.types.is_categorical_dtype(df[column])

# Cramér's V function for categorical-categorical association, with missing value handling
def cramers_v(x, y):
    # Drop rows where either column has missing data
    valid = pd.notnull(x) & pd.notnull(y)
    x_clean, y_clean = x[valid], y[valid]

    confusion_matrix = pd.crosstab(x_clean, y_clean)
    chi2 = stat.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    return np.sqrt(chi2 / (n * (min(confusion_matrix.shape) - 1)))

# Spearman's rank correlation function, with missing value handling
def spearmans_rank(x, y):
    """Calculate Spearman's rank correlation for continuous or ordinal variables, handling missing values."""
    # Drop rows where either column has missing data
    valid = pd.notnull(x) & pd.notnull(y)
    x_clean, y_clean = x[valid], y[valid]
    
    return stat.spearmanr(x_clean, y_clean)[0]  # Returns the correlation coefficient

# Discretize numeric features
def discretize_numeric(series, method="equal_width", bins=4):
    if method == "equal_width":
        # Equal-width binning
        return pd.cut(series, bins=bins, labels=False, include_lowest=True)
    elif method == "quantile":
        # Equal-frequency binning (quantiles)
        return pd.qcut(series, q=bins, labels=False)
    else:
        raise ValueError(f"Unknown discretization method: {method}")

# Cramér's V with discretized numeric features
def cramers_v_discretized(x, y, method="equal_width", bins=4):
    # Determine which column is numeric and which is categorical
    if pd.api.types.is_numeric_dtype(x):
        x_discretized = discretize_numeric(x, method=method, bins=bins)
        return cramers_v(x_discretized, y)
    elif pd.api.types.is_numeric_dtype(y):
        try:
            y_discretized = discretize_numeric(y, method=method, bins=bins)
            return cramers_v(x, y_discretized)
        except Exception as e:
            print(e)
    else:
        raise ValueError("At least one of the inputs should be numeric for discretization.")

# Define the function to calculate the correlation based on data types
def calculate_correlations(df, col1, col2):
    """Calculate correlation between two variables depending on their types."""
    if is_categorical(df, col1) and is_categorical(df, col2):
        # Use Cramér's V for categorical-categorical variables
        return cramers_v(df[col1], df[col2])
    elif pd.api.types.is_numeric_dtype(df[col1]) and pd.api.types.is_numeric_dtype(df[col2]):
        # Use Spearman's rank correlation for continuous-continuous or ordinal variables
        return spearmans_rank(df[col1], df[col2])
    elif (pd.api.types.is_numeric_dtype(df[col1]) and is_categorical(df, col2)) or \
         (is_categorical(df, col1) and pd.api.types.is_numeric_dtype(df[col2])):
        # Use Cramér's V with discretized numeric variables
        return cramers_v_discretized(df[col1], df[col2])
    # Return NaN when there was an error on calculating the correlation
    return np.nan

# Correlation matrix calculation for the entire dataset
def correlations(df):
    df_columns = df.columns
    correlation_matrix = pd.DataFrame(index=df_columns, columns=df_columns)

    for i, col1 in enumerate(df_columns):
        for col2 in df_columns[i+1:]:  # Use i+1 to avoid duplicate pairs and self-correlation
            a_corr = calculate_correlations(df, col1, col2)
            correlation_matrix.loc[col1, col2] = a_corr
            correlation_matrix.loc[col2, col1] = a_corr 

    correlation_matrix = correlation_matrix.apply(pd.to_numeric, errors='coerce')

    # Plot heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation Matrix for Adult Dataset')
    plt.savefig("adult_correlations.png", dpi=300, bbox_inches='tight')
    plt.show()



## Configuring the target data for each dataset

In [12]:


X_adult = adult_df.drop(["income"], axis=1)
y_adult = adult_df["income"]

X_breast_cancer = breast_cancer_df.drop(["diagnosis"], axis=1)
y_breast_cancer = breast_cancer_df["diagnosis"]


## Imputing Missingness via Missign Completely At Random (MCAR) mechanism

In [13]:
X_missing = generate_multivariate_missingness(X_breast_cancer, y_breast_cancer.values ,mechanism='MCAR', missing_rate=MISSING_RATE)


Global Missing Rate = 30.00%
Missing values per column:
id                         171
radius_mean                162
texture_mean               165
perimeter_mean             169
area_mean                  175
smoothness_mean            168
compactness_mean           171
concavity_mean             187
concave points_mean        180
symmetry_mean              158
fractal_dimension_mean     183
radius_se                  168
texture_se                 177
perimeter_se               152
area_se                    151
smoothness_se              186
compactness_se             177
concavity_se               158
concave points_se          182
symmetry_se                150
fractal_dimension_se       163
radius_worst               176
texture_worst              169
perimeter_worst            169
area_worst                 206
smoothness_worst           163
compactness_worst          186
concavity_worst            170
concave points_worst       178
symmetry_worst             158
fractal_dimens

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_missing, y_breast_cancer.values, stratify=y_breast_cancer)

NameError: name 'train_test_split' is not defined

We then split the dataset `X` and labels `y` into training and test sets (`X_train`, `X_test`, `y_train`, `y_test`) while preserving the original class distribution using stratified sampling.

## **Exploring Mean Imputation**

We start with a `SimpleImputer` to fill missing values with the mean, and then train a decision model using the imputed data, and printing the classification report.

In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))

## **Exploring kNN Imputation**

Rather than the mean, we can decide on a `KNNImputer` with `k=5` to fill missing values. Similarly, we train a logistic regression model using the imputed data, and print the classification report.

In [None]:

y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))


              precision    recall  f1-score   support

           0       0.94      0.99      0.96        90
           1       0.98      0.89      0.93        53

    accuracy                           0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))



              precision    recall  f1-score   support

           0       0.91      0.97      0.94        90
           1       0.94      0.83      0.88        53

    accuracy                           0.92       143
   macro avg       0.92      0.90      0.91       143
weighted avg       0.92      0.92      0.91       143



## Adult Dataset


In [None]:
X_missing = generate_multivariate_missingness(X_adult, y_adult.values ,mechanism='MCAR', missing_rate=MISSING_RATE)

In [20]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93        90
           1       0.86      0.91      0.88        53

    accuracy                           0.91       143
   macro avg       0.90      0.91      0.90       143
weighted avg       0.91      0.91      0.91       143



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95        90
           1       0.91      0.92      0.92        53

    accuracy                           0.94       143
   macro avg       0.93      0.93      0.93       143
weighted avg       0.94      0.94      0.94       143



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92        90
           1       0.84      0.91      0.87        53

    accuracy                           0.90       143
   macro avg       0.89      0.90      0.90       143
weighted avg       0.90      0.90      0.90       143



## Imputing Univariate Missingness via Missign At Random (MAR) mechanism

### Breast Cancer Dataset

In [23]:
X_breast_cancer_ = X_breast_cancer.copy()

In [24]:
'''
correlation_matrix = X_breast_cancer.corr()

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap for Breast Cancer Dataset")

# Save the heatmap as an image file
plt.savefig("breast_cancer_correlation_heatmap.png", dpi=300, bbox_inches='tight')

# Display the heatmap
plt.show()
'''

'\ncorrelation_matrix = X_breast_cancer.corr()\n\n# Plot heatmap\nplt.figure(figsize=(12, 8))\nsns.heatmap(correlation_matrix, annot=True, cmap=\'coolwarm\')\nplt.title("Feature Correlation Heatmap for Breast Cancer Dataset")\n\n# Save the heatmap as an image file\nplt.savefig("breast_cancer_correlation_heatmap.png", dpi=300, bbox_inches=\'tight\')\n\n# Display the heatmap\nplt.show()\n'

In [25]:
X_missing = generate_multivariate_missingness(X=X_breast_cancer_,y=y_breast_cancer.values,mechanism='MAR',missing_rate=MISSING_RATE)

Global Missing Rate = 30.00%
Missing values per column:
id                         174
radius_mean                187
texture_mean               175
perimeter_mean             165
area_mean                  168
smoothness_mean            156
compactness_mean           164
concavity_mean             167
concave points_mean        178
symmetry_mean              166
fractal_dimension_mean     176
radius_se                  175
texture_se                 178
perimeter_se               165
area_se                    157
smoothness_se              162
compactness_se             182
concavity_se               164
concave points_se          183
symmetry_se                182
fractal_dimension_se       172
radius_worst               177
texture_worst              160
perimeter_worst            154
area_worst                 180
smoothness_worst           179
compactness_worst          174
concavity_worst            170
concave points_worst       167
symmetry_worst             167
fractal_dimens

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y_breast_cancer, stratify=y_breast_cancer)

In [27]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))

              precision    recall  f1-score   support

           0       0.94      0.81      0.87        90
           1       0.74      0.91      0.81        53

    accuracy                           0.85       143
   macro avg       0.84      0.86      0.84       143
weighted avg       0.86      0.85      0.85       143



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91        90
           1       0.82      0.89      0.85        53

    accuracy                           0.89       143
   macro avg       0.88      0.89      0.88       143
weighted avg       0.89      0.89      0.89       143



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        90
           1       0.91      0.91      0.91        53

    accuracy                           0.93       143
   macro avg       0.93      0.93      0.93       143
weighted avg       0.93      0.93      0.93       143



### Adult Dataset

In [30]:
X_adult_ = X_adult.copy()

In [31]:
#correlations(adult_df)

In [None]:
X_missing = generate_multivariate_missingness(X=X_adult_, y=y_adult.values, mechanism='MAR',missing_rate=MISSING_RATE)

Global Missing Rate = 30.00%
Missing values per column:
age                               14728
fnlwgt                            14833
educational-num                   14667
gender                            14442
capital-gain                      14731
                                  ...  
native-country_Thailand           14710
native-country_Trinadad&Tobago    14679
native-country_United-States      14740
native-country_Vietnam            14590
native-country_Yugoslavia         14624
Length: 100, dtype: int64


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y_adult, stratify=y_adult)

In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      9289
           1       0.52      0.54      0.53      2922

    accuracy                           0.77     12211
   macro avg       0.69      0.69      0.69     12211
weighted avg       0.78      0.77      0.77     12211



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86      9289
           1       0.54      0.55      0.54      2922

    accuracy                           0.78     12211
   macro avg       0.70      0.70      0.70     12211
weighted avg       0.78      0.78      0.78     12211



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))

              precision    recall  f1-score   support

           0       0.86      0.83      0.84      9289
           1       0.51      0.56      0.54      2922

    accuracy                           0.77     12211
   macro avg       0.68      0.70      0.69     12211
weighted avg       0.77      0.77      0.77     12211



## Imputing Univariate Missingness via Missign Not At Random (MNAR) mechanism

### Breast Cancer Dataset

In [None]:
X = X_breast_cancer.copy()
y = y_breast_cancer

X_missing = generate_multivariate_missingness(X=X, y=y.values, mechanism='MNAR',missing_rate=MISSING_RATE)

Global Missing Rate = 29.97%
Missing values per column:
id                           0
radius_mean                256
texture_mean                 0
perimeter_mean             256
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean        341
symmetry_mean                0
fractal_dimension_mean       0
radius_se                  341
texture_se                   0
perimeter_se                 0
area_se                    341
smoothness_se              341
compactness_se             341
concavity_se                 0
concave points_se          341
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst              341
perimeter_worst            341
area_worst                 341
smoothness_worst           341
compactness_worst            0
concavity_worst            341
concave points_worst       341
symmetry_worst             341
fractal_dimens

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y, stratify=y)

In [39]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92        90
           1       0.84      0.91      0.87        53

    accuracy                           0.90       143
   macro avg       0.89      0.90      0.90       143
weighted avg       0.90      0.90      0.90       143



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90        90
           1       0.82      0.87      0.84        53

    accuracy                           0.88       143
   macro avg       0.87      0.88      0.87       143
weighted avg       0.88      0.88      0.88       143



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95        90
           1       0.92      0.91      0.91        53

    accuracy                           0.94       143
   macro avg       0.93      0.93      0.93       143
weighted avg       0.94      0.94      0.94       143



### Adult Dataset

In [42]:
X = X_adult.copy()
y = y_adult

In [None]:
X_missing = generate_multivariate_missingness(X=X, y=y.values, mechanism='MNAR',missing_rate=MISSING_RATE)

  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_chunk.loc[pos_xmiss, val] = np.nan
  dataset_

Global Missing Rate = 30.00%
Missing values per column:
age                               29305
fnlwgt                                0
educational-num                   29305
gender                            29305
capital-gain                      29305
                                  ...  
native-country_Thailand               0
native-country_Trinadad&Tobago        0
native-country_United-States          0
native-country_Vietnam                0
native-country_Yugoslavia             0
Length: 100, dtype: int64


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X_missing, y, stratify=y)


In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, SimpleImputer(missing_values=np.nan, strategy="mean"))


              precision    recall  f1-score   support

           0       0.87      0.84      0.86      9289
           1       0.55      0.61      0.58      2922

    accuracy                           0.79     12211
   macro avg       0.71      0.73      0.72     12211
weighted avg       0.80      0.79      0.79     12211



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, KNNImputer(n_neighbors=5))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      9289
           1       0.57      0.58      0.58      2922

    accuracy                           0.79     12211
   macro avg       0.72      0.72      0.72     12211
weighted avg       0.80      0.79      0.80     12211



In [None]:
y_pred, clf_report = run_logistic_regression(X_train, X_test, y_train, y_test, IterativeImputer(max_iter=100))

KeyboardInterrupt: 