In [26]:
import os
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.preprocessing import MinMaxScaler

# -------------------- SCALING FUNCTION --------------------
def df_scaler(df):
    """Min-max scale the numeric feature columns of ``df``.

    The last column of ``df`` is treated as the label and is passed through
    unscaled. Every other column is a feature: numeric features are scaled
    with a default-configured ``MinMaxScaler``; non-numeric features are
    carried over unchanged.

    Args:
        df: DataFrame whose last column is the label.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index whose columns are the
        scaled numeric features, then the non-numeric features, then the
        label column under its original name.
    """
    label_name = df.columns[-1]
    features = df.iloc[:, :-1]

    numeric_cols = features.select_dtypes(include=[np.number]).columns
    non_numeric_cols = features.select_dtypes(exclude=[np.number]).columns

    if len(numeric_cols) > 0 and len(features) > 0:
        scaled = MinMaxScaler().fit_transform(features[numeric_cols])
        scaled_numeric = pd.DataFrame(scaled, columns=numeric_cols)
    else:
        # Nothing to fit the scaler on (no numeric features or no rows):
        # pass the (empty) numeric part through instead of raising.
        scaled_numeric = features[numeric_cols].reset_index(drop=True)

    df_scaled = pd.concat(
        [scaled_numeric, features[non_numeric_cols].reset_index(drop=True)],
        axis=1,
    )
    # Append the label as its own column. Assigning to the last existing
    # column here would overwrite the final feature with the label values
    # and lose the label's name.
    df_scaled[label_name] = df.iloc[:, -1].to_numpy()
    return df_scaled

# -------------------- MISSINGNESS FUNCTIONS --------------------
def induce_mcar(df, missing_fraction, seed=None):
    """Blank out cells of ``df`` independently at random.

    Every cell (in every column) is replaced by NaN when a uniform draw for
    that cell falls below ``missing_fraction``.

    Args:
        df: Source DataFrame; it is not modified.
        missing_fraction: Per-cell probability of being set to NaN.
        seed: Value passed to ``np.random.seed``; note this reseeds NumPy's
            global random state.

    Returns:
        A copy of ``df`` with the selected cells set to NaN.
    """
    np.random.seed(seed)
    n_rows, n_cols = df.shape
    # One uniform draw per cell; cells whose draw is below the fraction go missing.
    drop_mask = np.random.rand(n_rows, n_cols) < missing_fraction

    result = df.copy()
    result[drop_mask] = np.nan
    return result

def induce_mar(df, missing_fraction, seed=None):
    """Induce missingness in each column conditioned on another column.

    For every eligible column a different eligible "helper" column is picked
    at random. Rows whose helper value is strictly above the helper's median
    are candidates, and up to ``int(missing_fraction * len(df))`` of them are
    set to NaN in the target column.

    Columns with ``object``, pandas ``string`` or ``category`` dtype are
    neither targets nor helpers, because a median cannot be computed for
    them. A column is left untouched when it has no possible helper or when
    no row lies above the helper's median.

    Args:
        df: Source DataFrame; it is not modified.
        missing_fraction: Fraction of the row count to blank per column
            (capped by the number of candidate rows).
        seed: Value passed to ``np.random.seed``; note this reseeds NumPy's
            global random state.

    Returns:
        A copy of ``df`` with the selected cells set to NaN.
    """
    np.random.seed(seed)
    df_missing = df.copy()

    # ``dtype == 'object'`` alone misses pandas' dedicated string and
    # categorical dtypes, which would then fail in ``.median()``.
    text_dtypes = (pd.StringDtype, pd.CategoricalDtype)
    usable_cols = [
        c for c in df.columns
        if not (df[c].dtype == 'object' or isinstance(df[c].dtype, text_dtypes))
    ]
    n_requested = int(missing_fraction * len(df))

    for col in usable_cols:
        helper_cols = [c for c in usable_cols if c != col]
        if not helper_cols:
            continue
        helper_col = np.random.choice(helper_cols)
        threshold = df[helper_col].median()
        high_mask = df[helper_col] > threshold
        eligible_indices = high_mask[high_mask].index.tolist()
        if not eligible_indices:
            # e.g. a constant helper column: nothing lies above the median,
            # so there is nothing to sample from.
            continue
        n_missing = min(n_requested, len(eligible_indices))
        missing_indices = np.random.choice(eligible_indices, n_missing, replace=False)
        df_missing.loc[missing_indices, col] = np.nan
    return df_missing

def induce_mnar(df, missing_fraction, seed=None):
    """Induce missingness in each column conditioned on its own values.

    For every eligible column a threshold is drawn uniformly between the
    column's minimum and maximum. Rows whose value is strictly above the
    threshold are candidates, and up to ``int(missing_fraction * len(df))``
    of them are set to NaN in that same column.

    Columns with ``object``, pandas ``string`` or ``category`` dtype are
    skipped, because a numeric threshold cannot be drawn for them. A column
    is also left untouched when it has no non-missing values or when no row
    lies above the drawn threshold.

    Args:
        df: Source DataFrame; it is not modified.
        missing_fraction: Fraction of the row count to blank per column
            (capped by the number of candidate rows).
        seed: Value passed to ``np.random.seed``; note this reseeds NumPy's
            global random state.

    Returns:
        A copy of ``df`` with the selected cells set to NaN.
    """
    np.random.seed(seed)
    df_missing = df.copy()

    # ``dtype == 'object'`` alone misses pandas' dedicated string and
    # categorical dtypes, which would then fail in ``np.random.uniform``.
    text_dtypes = (pd.StringDtype, pd.CategoricalDtype)
    n_requested = int(missing_fraction * len(df))

    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == 'object' or isinstance(col_dtype, text_dtypes):
            continue
        col_values = df[col].dropna()
        if col_values.empty:
            continue
        min_val, max_val = col_values.min(), col_values.max()
        threshold = np.random.uniform(min_val, max_val)
        high_mask = df[col] > threshold
        eligible_indices = high_mask[high_mask].index.tolist()
        if not eligible_indices:
            # e.g. a constant column: nothing lies above the threshold,
            # so there is nothing to sample from.
            continue
        n_missing = min(n_requested, len(eligible_indices))
        missing_indices = np.random.choice(eligible_indices, n_missing, replace=False)
        df_missing.loc[missing_indices, col] = np.nan
    return df_missing

# -------------------- UTILITIES --------------------
def save_missing_versions(df, base_path, base_name):
    """Write ``df`` and its MCAR/MAR/MNAR-degraded variants to CSV files.

    The untouched frame goes to ``<base_path>/<base_name>_ground_truth.csv``.
    For each mechanism and each fraction in 0.1..0.5, a degraded copy goes to
    ``<base_path>/<mechanism>/<base_name>_<mechanism>_<fraction>.csv``.
    Files that already exist are left alone, so reruns only fill in gaps.

    Args:
        df: DataFrame to store and degrade.
        base_path: Output directory for this dataset; created if needed.
        base_name: Prefix used in every output file name.
    """
    os.makedirs(base_path, exist_ok=True)

    # The ground truth is written once and never overwritten.
    truth_file = os.path.join(base_path, f"{base_name}_ground_truth.csv")
    if not os.path.exists(truth_file):
        df.to_csv(truth_file, index=False)

    mechanisms = (('MCAR', induce_mcar), ('MAR', induce_mar), ('MNAR', induce_mnar))
    fractions = (0.1, 0.2, 0.3, 0.4, 0.5)

    for method, inducer in mechanisms:
        method_dir = os.path.join(base_path, method)
        os.makedirs(method_dir, exist_ok=True)
        for frac in fractions:
            filepath = os.path.join(method_dir, f"{base_name}_{method}_{frac}.csv")
            if os.path.exists(filepath):
                continue
            # Every variant is generated with the same fixed seed.
            inducer(df, frac, seed=42).to_csv(filepath, index=False)

# -------------------- LOAD AND PROCESS --------------------
# Each entry is a (DataFrame, short name, data type) tuple; the data type
# ('numerical', 'categorical' or 'mixed') later selects the output folder
# and whether the frame is scaled.
datasets = []

# NUMERICAL
# Both ARFF files have their 'Class' column removed right after loading.
eeg_df = pd.DataFrame(arff.loadarff('phplE7q6h.arff')[0]).drop(columns='Class')
datasets += [(eeg_df, 'eeg', 'numerical')]

qsar_df = pd.DataFrame(arff.loadarff('phpGUrE90.arff')[0]).drop(columns='Class')
datasets += [(qsar_df, 'qsar', 'numerical')]

# Only the first 100 columns of arcene are kept.
arcene_df = pd.read_csv('arcene.csv')
arcene_df = arcene_df[arcene_df.columns[:100]]
datasets += [(arcene_df, 'arcene', 'numerical')]

# CATEGORICAL
mushroom_df = pd.read_csv('mushroom.csv')
datasets += [(mushroom_df, 'mushroom', 'categorical')]

car_df = pd.read_csv('car_evaluation.csv')
datasets += [(car_df, 'car', 'categorical')]

nursery_df = pd.read_csv('nursery.csv')
datasets += [(nursery_df, 'nursery', 'categorical')]

# MIXED
student_df = pd.read_csv('student-por.csv', sep=';')
datasets += [(student_df, 'student', 'mixed')]

# The spreadsheet's second row holds the header and its first column the index.
credit_df = pd.read_excel('default of credit card clients.xls', index_col=0, header=1)
datasets += [(credit_df, 'credit', 'mixed')]

heart_df = pd.read_csv('framingham.csv')
datasets += [(heart_df, 'heart', 'mixed')]

# NOTE(review): this replaces the list built above, so only arcene is
# processed below even though all nine files were loaded. Presumably a
# single-dataset run; remove this line to process everything.
datasets = [(arcene_df, 'arcene', 'numerical')]

# -------------------- PROCESS AND SAVE --------------------
# Generate the CSV files for every dataset still listed in `datasets`.
for df, name, dtype in datasets:
    base_path = os.path.join('experimental_data', dtype, name)

    # Numerical and categorical datasets are skipped wholesale once their
    # output folder exists; mixed datasets always continue, relying on the
    # per-file existence checks in save_missing_versions.
    if dtype in ('numerical', 'categorical'):
        if os.path.exists(base_path):
            print(f"Skipping already processed dataset: {name}")
            continue

    # Frames with numeric content are scaled; rows holding NaNs are dropped first.
    if dtype in ('numerical', 'mixed'):
        df = df_scaler(df.dropna(axis=0))

    save_missing_versions(df, base_path, name)
    print("{} complete.".format(name))

arcene complete.
