In [8]:
# plan for simulated dataset
# * 5 numerical variables
# * 2 categorical variables
# * add missingness to x2, x5, cat1

In [9]:
import numpy as np
import pandas as pd

In [10]:
def generate_X(n = 4000, seed = 404):
    rng = np.random.default_rng(seed)

    #numeric features 
    x1 = rng.normal(0, 1, size = n)
    x2 = rng.normal(5 , 2, size = n)
    x3 = rng.exponential(1, size = n)
    x4 = rng.uniform(-3, 3, size = n)
    x5 = rng.lognormal(0, 1, size = n)

    #Categorical features
    cat1_levels = np.array(["A", "B", "C"]) 
    cat2_levels = np.array(["Low", "Medium", "High"])

    # Choice for probabilities of each level 
    # We want to check for categorical variables that are imbalanced, if we were to choose all three levels 
    # with 1/3 probability 
    # it would be super easy for our neural network to learn. 
    # So one categorical variable with mild imbalance, and the other with a higher imbalance

    cat1 = rng.choice(cat1_levels, size=n, p=[0.4, 0.3, 0.3])
    cat2 = rng.choice(cat2_levels, size=n, p=[0.3, 0.5, 0.2])

    df_X = pd.DataFrame({
        "x1": x1, "x2": x2, "x3": x3, "x4": x4, "x5": x5,
        "cat1": cat1, "cat2": cat2
    })

    return df_X

In [11]:
df = generate_X()
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,cat1,cat2
0,0.39633,1.776594,0.172009,0.724065,0.321331,C,High
1,-0.616061,5.384115,7.391653,-2.529603,1.514568,B,Medium
2,0.456879,5.646006,2.594144,1.163738,1.638706,A,Medium
3,0.892955,4.946917,0.106609,-1.498865,2.793379,A,Medium
4,0.299435,4.02081,0.431798,1.189655,2.131191,A,High


In [12]:
def f_true(df_X):
    x1 = df_X["x1"].to_numpy()
    x2 = df_X["x2"].to_numpy()
    x3 = df_X["x3"].to_numpy()
    x4 = df_X["x4"].to_numpy()
    x5 = df_X["x5"].to_numpy()
    cat1 = df_X["cat1"].to_numpy()
    cat2 = df_X["cat2"].to_numpy()

    y_true = (
        5
        + 2 * x1
        - 1.5 * x2
        + 0.8 * (x3 ** 2)
        + 1.2 * np.sin(x4)
        + 0.3 * x5
    )
    y_true += (cat1 == "B") * 0.5
    y_true += (cat1 == "C") * 1.0
    y_true += (cat2 == "High") * 0.8
    y_true += (cat2 == "Low") * (-0.5)

    return y_true

In [13]:
import numpy as np
import pandas as pd

def simulate_dataset(n=4000, target_snr=10.0, seed=404):
    rng = np.random.default_rng(seed)
    
    # 1. Generate predictors
    df_X = generate_X(n=n, seed=seed)
    
    # 2. Compute true signal
    y_true = f_true(df_X)
    
    # 3. Compute noise SD from target SNR
    var_signal = np.var(y_true)
    noise_var = var_signal / target_snr
    noise_sd = np.sqrt(noise_var)
    
    # 4. Generate noise and observed Y
    eps = rng.normal(0, noise_sd, size=n)
    y = y_true + eps
    
    df = df_X.copy()
    df["Y"] = y
    
    # 5. Inject missingness so imputation matters
    
    # x2: 20% missing at random
    mask_x2 = rng.random(n) < 0.20
    df.loc[mask_x2, "x2"] = np.nan
    
    # x5: 25% missing at random
    mask_x5 = rng.random(n) < 0.25
    df.loc[mask_x5, "x5"] = np.nan
    
    # cat1: MAR-ish missingness depending on Y (more missing when Y is large)
    prob_missing_cat1 = 0.10 + 0.20 * (y > y.mean())  # between 0.1 and 0.3
    mask_cat1 = rng.random(n) < prob_missing_cat1
    df.loc[mask_cat1, "cat1"] = np.nan
    
    return df, y_true, eps, noise_sd


In [14]:
df, y_true, eps, noise_sd = simulate_dataset(n=4000, target_snr=10)

print(df.head())
print("\nMissingness rates:")
print(df.isna().mean())

snr_emp = np.var(y_true) / np.var(eps)
print("\nEmpirical SNR ≈", snr_emp)
print("Noise SD used:", noise_sd)


         x1        x2        x3        x4        x5 cat1    cat2          Y
0  0.396330       NaN  0.172009  0.724065  0.321331    C    High   6.543090
1 -0.616061  5.384115  7.391653 -2.529603  1.514568    B  Medium  38.577295
2  0.456879       NaN  2.594144  1.163738  1.638706    A  Medium   5.229301
3  0.892955  4.946917  0.106609 -1.498865       NaN  NaN  Medium   0.593632
4  0.299435       NaN  0.431798  1.189655  2.131191    A    High   2.799175

Missingness rates:
x1      0.00000
x2      0.19825
x3      0.00000
x4      0.00000
x5      0.24475
cat1    0.19150
cat2    0.00000
Y       0.00000
dtype: float64

Empirical SNR ≈ 9.707665933230627
Noise SD used: 1.7670413745793778


### Preprocessing for MLP (low, medium, high)

In [15]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import TargetEncoder

In [16]:
numerical_features = ['x1', 'x2', 'x3', 'x4', 'x5']
cat_features = ['cat1', 'cat2']

# S1 = MedianImpute --> StandardScaler --> One-Hot
numeric_transformer = make_pipeline(SimpleImputer(strategy = "median"), StandardScaler())
cat_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor1 = make_column_transformer(
    (numeric_transformer, numerical_features),
    (cat_transformer, cat_features),
)

In [17]:
preprocessor1

In [20]:
X_train_pp1 = preprocessor1.fit_transform(df)
column_names1 = preprocessor1.get_feature_names_out()
print(column_names1)

['pipeline__x1' 'pipeline__x2' 'pipeline__x3' 'pipeline__x4'
 'pipeline__x5' 'onehotencoder__cat1_A' 'onehotencoder__cat1_B'
 'onehotencoder__cat1_C' 'onehotencoder__cat1_nan'
 'onehotencoder__cat2_High' 'onehotencoder__cat2_Low'
 'onehotencoder__cat2_Medium']


In [22]:
pd.DataFrame(X_train_pp1, columns=column_names1)

Unnamed: 0,pipeline__x1,pipeline__x2,pipeline__x3,pipeline__x4,pipeline__x5,onehotencoder__cat1_A,onehotencoder__cat1_B,onehotencoder__cat1_C,onehotencoder__cat1_nan,onehotencoder__cat2_High,onehotencoder__cat2_Low,onehotencoder__cat2_Medium
0,0.393324,-0.001217,-0.816826,0.415944,-0.652792,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.604160,0.212148,6.216977,-1.471631,0.024661,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.452982,-0.001217,1.542961,0.671015,0.095140,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.882636,-0.036119,-0.880542,-0.873661,-0.272227,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.297856,-0.001217,-0.563724,0.686051,0.374745,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.181615,-0.510069,2.842892,1.519285,0.800046,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3996,0.060665,0.362505,-0.928062,-0.273650,0.041232,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3997,1.113509,1.124341,-0.790517,-0.660016,-0.023123,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3998,-0.875631,0.734026,2.292433,-0.669450,-0.272227,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
MLP_results = {}
