# Missing dataset

**Objective**

- To test statistical methods for classifying missing data as MCAR (missing completely at random), MAR (missing at random), or MNAR (missing not at random).

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

def test_for_mar(df, target_col, alpha=0.05):
    """Screen the other columns for association with missingness in ``target_col``.

    A missingness indicator is derived from ``df[target_col].isna()`` and
    every other column is tested against it:

    * numeric columns: Welch's two-sample t-test (``equal_var=False``)
      comparing the column's non-null values in rows where the target is
      missing against rows where it is present;
    * all other columns: chi-square test of independence on the
      indicator-by-category contingency table.

    A column whose p-value is below ``alpha`` is reported as a significant
    predictor of missingness. No multiple-comparison correction is applied.

    Args:
        df: DataFrame containing ``target_col`` and the candidate predictor
            columns. It is not modified.
        target_col: Name of the column whose missingness is examined.
        alpha: Significance threshold applied to each individual p-value.

    Returns:
        A dict with keys ``"col_tested"`` (the target column name),
        ``"n_missing"`` (number of missing target values),
        ``"significant_predictors"`` (columns with ``p < alpha``) and
        ``"all_tests"`` (``{column: {"test": name, "p": p_value}}``).
        Columns that cannot be tested are left out of ``"all_tests"``:
        numeric columns with fewer than two non-null values in either group,
        and non-numeric columns with no non-null values.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    is_missing = df[target_col].isna()
    # Keep the indicator as a stand-alone Series instead of writing a helper
    # column into the frame, so a caller's own "is_missing" column is neither
    # overwritten nor dropped from the screen.
    indicator = is_missing.astype(int).rename("is_missing")

    results = {}
    for col in df.columns.drop(target_col):
        x = df[col]
        if pd.api.types.is_numeric_dtype(x):
            # Drop NaNs before the size guard so it counts real observations.
            when_missing = x[is_missing].dropna()
            when_present = x[~is_missing].dropna()
            if len(when_missing) > 1 and len(when_present) > 1:
                _, p = stats.ttest_ind(when_missing, when_present, equal_var=False)
                results[col] = {"test": "t-test", "p": float(p)}
        elif x.notna().any():
            # An all-missing predictor would give an empty contingency table,
            # which chi2_contingency rejects with ValueError.
            tbl = pd.crosstab(indicator, x)
            _, p, _, _ = stats.chi2_contingency(tbl)
            results[col] = {"test": "chi2", "p": float(p)}

    # A NaN p-value compares False here, so it is never flagged as significant.
    sig = [col for col, res in results.items() if res["p"] < alpha]
    return {"col_tested": target_col,
            "n_missing": int(is_missing.sum()),
            "significant_predictors": sig,
            "all_tests": results}


# The name matches pytest's ``test_*`` discovery pattern; opt out so that
# importing this helper into a test module does not get it collected as a test.
test_for_mar.__test__ = False

In [None]:
def mock_fake_mar(n=1000, seed=0):
    """Build a synthetic frame whose ``income`` is missing at random (MAR).

    Columns are ``age`` (integers drawn from 18..69), ``income`` (normal,
    mean 50000, std 15000) and ``country`` (one of ``"US"``, ``"UK"``,
    ``"CA"``). ``income`` is set to NaN for every row with ``age < 30``, so
    its missingness depends only on the observed ``age`` column.

    Args:
        n: Number of rows to generate.
        seed: Seed for the private random generator. The default of 0
            reproduces the values previously obtained via
            ``np.random.seed(0)``.

    Returns:
        A DataFrame with ``n`` rows and columns ``age``, ``income``, ``country``.
    """
    # A private legacy RandomState yields the same stream as seeding the
    # global generator, without resetting the caller's global RNG state.
    rng = np.random.RandomState(seed)
    df = pd.DataFrame({
        "age": rng.randint(18, 70, size=n),
        "income": rng.normal(50000, 15000, size=n),
        "country": rng.choice(["US", "UK", "CA"], size=n),
    })
    df.loc[df["age"] < 30, "income"] = np.nan  # induce MAR
    return df


def mock_fake_mcar(n=1000, missing_rate=0.2, seed=0):
    """Build a synthetic frame whose ``income`` is missing completely at random.

    Columns are ``age`` (integers drawn from 18..69), ``income`` (normal,
    mean 50000, std 15000) and ``country`` (one of ``"US"``, ``"UK"``,
    ``"CA"``). ``int(n * missing_rate)`` distinct rows are then picked
    uniformly at random and their ``income`` is set to NaN, independently of
    every column.

    Args:
        n: Number of rows to generate.
        missing_rate: Fraction of rows whose ``income`` is blanked.
        seed: Seed for the private random generator. The default of 0
            reproduces the values previously obtained via
            ``np.random.seed(0)``.

    Returns:
        A DataFrame with ``n`` rows and columns ``age``, ``income``, ``country``.
    """
    # A private legacy RandomState yields the same stream as seeding the
    # global generator, without resetting the caller's global RNG state.
    rng = np.random.RandomState(seed)
    df = pd.DataFrame({
        "age": rng.randint(18, 70, size=n),
        "income": rng.normal(50000, 15000, size=n),
        "country": rng.choice(["US", "UK", "CA"], size=n),
    })

    # Sample without replacement so exactly int(n * missing_rate) rows are hit.
    missing_indices = rng.choice(df.index, size=int(n * missing_rate), replace=False)
    df.loc[missing_indices, "income"] = np.nan  # induce MCAR
    return df

In [19]:
import numpy as np

# Synthetic data: mock_fake_mar() blanks income for every row with age < 30,
# mock_fake_mcar() blanks income for a random sample of rows.
np.random.seed(0)

mar_df = mock_fake_mar()
mcar_df = mock_fake_mcar()

# Screen the remaining columns for association with missing income.
true_mar_result = test_for_mar(mar_df, "income")
mcar_result = test_for_mar(mcar_df, "income")  # should show no significant predictors

print("MAR", true_mar_result)
print("MCAR", mcar_result)

MAR {'col_tested': 'income', 'n_missing': 244, 'significant_predictors': ['age'], 'all_tests': {'age': {'test': 't-test', 'p': 8.164484258141333e-309}, 'country': {'test': 'chi2', 'p': 0.8414869783893579}}}
MCAR {'col_tested': 'income', 'n_missing': 200, 'significant_predictors': [], 'all_tests': {'age': {'test': 't-test', 'p': 0.5159530611478425}, 'country': {'test': 'chi2', 'p': 0.08099854947030145}}}


In [17]:
# Bare expression: the notebook displays the returned result dict for the MCAR data.
test_for_mar(mcar_df, "income")  # should show no significant predictors

{'col_tested': 'income',
 'n_missing': 200,
 'significant_predictors': [],
 'all_tests': {'age': {'test': 't-test', 'p': 0.17011817260280057}}}