In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Nonlinear time-series generator (Dataset)

In [2]:
def generate_medium_difficulty_dataset(
    n_samples=5000,
    n_cont_features=10,
    n_cat_features=5,
    n_classes=3,
    lstm_sequence_length=None,  # optional
    random_state=42
):
    """
    Medium-difficulty synthetic dataset created by:
    - Random nonlinear MLP (hidden truth function)
    - Numeric interactions
    - Useful + useless noise
    - Balanced classes
    - Suitable reshaping for MLP, CNN, LSTM
    """

    torch.manual_seed(random_state)
    np.random.seed(random_state)

    # ============================================================
    # 1. Generate base features
    # ============================================================

    # Continuous base features
    X_cont = np.random.randn(n_samples, n_cont_features)

    # Add nonlinear transforms â†’ adds medium complexity
    X_nonlin = np.column_stack([
        np.sin(X_cont[:, 0]),
        X_cont[:, 1] * X_cont[:, 2],
        np.tanh(X_cont[:, 3]),
        np.exp(-X_cont[:, 4]**2),
        (X_cont[:, 5] > 0).astype(float)
    ])

    # Noise features (useless)
    X_noise = np.random.randn(n_samples, 10)

    # Categorical
    # Slightly imbalanced, but not extreme
    X_cat = np.column_stack([
        np.random.choice([0,1,2,3,4], size=n_samples, p=[0.4,0.2,0.2,0.1,0.1]),
        np.random.choice([0,1,2,3,4], size=n_samples),
        np.random.choice([0,1,2,3,4], size=n_samples),
        np.random.choice([0,1,2,3,4], size=n_samples, p=[0.5,0.1,0.1,0.1,0.2]),
        np.random.choice([0,1,2,3,4], size=n_samples)
    ])


    # Combine everything
    X = np.hstack([X_cont, X_nonlin, X_noise, X_cat])
    total_features = X.shape[1]


    # ============================================================
    # 2. Hidden Random Neural Network to Generate Class Probabilities
    # ============================================================

    hidden_model = nn.Sequential(
        nn.Linear(total_features, 64),
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, n_classes)
    )

    for p in hidden_model.parameters():
        p.requires_grad = False

    with torch.no_grad():
        logits = hidden_model(torch.tensor(X, dtype=torch.float32))
        probs = torch.softmax(logits, dim=1).numpy()

    y = np.argmax(probs, axis=1)

    # Balance classes (important)
    # Re-sample to balance the dataset moderately
    final_idx = []
    for c in range(n_classes):
        cls_idx = np.where(y == c)[0]
        n_target = n_samples // n_classes
        if len(cls_idx) > n_target:
            cls_idx = np.random.choice(cls_idx, size=n_target, replace=False)
        else:
            cls_idx = np.random.choice(cls_idx, size=n_target, replace=True)
        final_idx.append(cls_idx)

    final_idx = np.concatenate(final_idx)
    np.random.shuffle(final_idx)

    X = X[final_idx]
    y = y[final_idx]


    # ============================================================
    # 3. Prepare MLP + CNN versions
    # ============================================================

    X_mlp = X.copy()
    X_cnn = X.reshape(X.shape[0], 1, -1)   # (batch, channel=1, features)


    # ============================================================
    # 4. Prepare LSTM Version
    # ============================================================

    if lstm_sequence_length is None:
        # choose a divisor of total_features that gives medium sequence length (not too small)
        divisors = [d for d in range(5, total_features+1) if total_features % d == 0]
        divisors.sort()
        lstm_sequence_length = divisors[len(divisors)//2]  # pick mid-level divisor

    if total_features % lstm_sequence_length != 0:
        raise ValueError("Chosen sequence length doesn't divide features")

    features_per_step = total_features // lstm_sequence_length
    X_lstm = X.reshape(X.shape[0], lstm_sequence_length, features_per_step)

    return X_mlp, X_cnn, X_lstm, y


* MLP --> receives the raw feature matrix X
* CNN --> Receives the same 15 features, but arranged as a single channel: This is just a reshape, no new data is created.
* LSTM --> Receives the same 15 features, but split into time steps: This also contains the same numbers, merely reorganized so the LSTM can process it.

In [3]:
X_mlp, X_cnn, X_lstm, y = generate_medium_difficulty_dataset()

In [4]:
df = pd.DataFrame(X_mlp, columns=[f"f{i}" for i in range(X_mlp.shape[1])])

# Convert categorical columns back to int
for c in [f"f{i}" for i in range(25, 30)]:
    df[c] = df[c].astype(int)

In [5]:
num_cols = [f"f{i}" for i in range(25)]
cat_cols = [f"f{i}" for i in range(25, 30)]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)
print("df columns:", df.columns.tolist())

Numeric cols: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24']
Categorical cols: ['f25', 'f26', 'f27', 'f28', 'f29']
df columns: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29']


In [6]:
df[cat_cols] = df[cat_cols].astype("category")

In [7]:
print(df.dtypes)

f0      float64
f1      float64
f2      float64
f3      float64
f4      float64
f5      float64
f6      float64
f7      float64
f8      float64
f9      float64
f10     float64
f11     float64
f12     float64
f13     float64
f14     float64
f15     float64
f16     float64
f17     float64
f18     float64
f19     float64
f20     float64
f21     float64
f22     float64
f23     float64
f24     float64
f25    category
f26    category
f27    category
f28    category
f29    category
dtype: object


In [8]:
preprocess_S1 = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),

        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols)
    ],
    remainder='drop'
)

preprocess_S2 = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("impute", KNNImputer(n_neighbors=5)),
            ("scaler", MinMaxScaler())
        ]), num_cols),

        ("cat", Pipeline([
            ("target", TargetEncoder())
        ]), cat_cols)
    ],
    remainder='drop'
)

identity = FunctionTransformer(lambda x: x)

preprocess_S3 = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("impute", IterativeImputer(max_iter=3)),
            ("scaler", RobustScaler())
        ]), num_cols),

        ("cat", Pipeline([
            ("identity", identity)
        ]), cat_cols)
    ],
    remainder='drop'
)


In [9]:
for name, preproc in [("S1", preprocess_S1), ("S2", preprocess_S2), ("S3", preprocess_S3)]:
    Xp = preproc.fit_transform(df, y)
    print(name, Xp.shape)

S1 (4998, 50)
S2 (4998, 30)
S3 (4998, 30)


In [10]:
print(X_mlp.shape)
print(X_cnn.shape)
print(X_lstm.shape)
print(y.shape)

(4998, 30)
(4998, 1, 30)
(4998, 10, 3)
(4998,)


In [11]:
df = pd.DataFrame(X_mlp, columns=[f"f{i}" for i in range(X_mlp.shape[1])])

num_cols = [f"f{i}" for i in range(25)]
cat_cols = [f"f{i}" for i in range(25, 30)]

df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29
0,-1.710808,0.308763,2.355629,-0.04254,0.180019,-0.31026,0.667262,0.362209,-0.676047,-0.114353,...,0.959218,-0.110982,-0.357152,1.688971,0.092737,3.0,4.0,1.0,3.0,0.0
1,-1.290487,1.542297,1.15933,-0.103989,-0.488313,-0.609441,-2.585653,0.35382,0.780324,0.137902,...,-0.059107,-2.016945,0.774187,0.08896,-1.60596,0.0,0.0,4.0,0.0,0.0
2,-0.287306,-0.189568,-0.060016,0.385205,-0.000263,-0.133395,1.308918,0.565596,-0.958804,0.740674,...,-0.557216,1.644009,1.646138,0.225626,-0.827511,1.0,3.0,0.0,1.0,4.0
3,-1.290487,1.542297,1.15933,-0.103989,-0.488313,-0.609441,-2.585653,0.35382,0.780324,0.137902,...,-0.059107,-2.016945,0.774187,0.08896,-1.60596,0.0,0.0,4.0,0.0,0.0
4,-1.290487,1.542297,1.15933,-0.103989,-0.488313,-0.609441,-2.585653,0.35382,0.780324,0.137902,...,-0.059107,-2.016945,0.774187,0.08896,-1.60596,0.0,0.0,4.0,0.0,0.0


In [12]:
preprocess_S1 = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),

        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ]
)

preprocess_S2 = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("impute", KNNImputer(n_neighbors=5)),
            ("scaler", MinMaxScaler())
        ]), num_cols),

        ("cat", Pipeline([
            ("target", TargetEncoder())
        ]), cat_cols)
    ]
)

identity = FunctionTransformer(lambda x: x)

preprocess_S3 = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("impute", IterativeImputer(max_iter=3)),
            ("scaler", RobustScaler())
        ]), num_cols),

        ("cat", Pipeline([
            ("identity", identity)
        ]), cat_cols)
    ]
)

In [13]:
for name, preproc in [("S1", preprocess_S1), ("S2", preprocess_S2), ("S3", preprocess_S3)]:
    Xp = preproc.fit_transform(df, y)
    print(name, Xp.shape)


S1 (4998, 50)
S2 (4998, 30)
S3 (4998, 30)


In [14]:
print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)
print("df columns:", df.columns.tolist())

Numeric cols: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24']
Categorical cols: ['f25', 'f26', 'f27', 'f28', 'f29']
df columns: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29']


### MLP Experiment (S1, S2 and S3 comparison)

* S1 = MedianImpute â†’ StandardScaler â†’ OneHotEncoder
* S2 = KNNImputer(k=5) â†’ MinMaxScaler â†’ TargetEncode
* S3 = MICE(3 iters) â†’ RobustScaler â†’ Embedding Layer (this will be ignored for MLP)


In [15]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


# ===============================================================
# Utility: Convert sparse matrix â†’ dense
# ===============================================================

def to_dense(X):
    if hasattr(X, "toarray"):
        return X.toarray()
    return X


# ===============================================================
# MLP with Embeddings (Used for S3 Only)
# ===============================================================

class MLP_with_Embeddings(nn.Module):
    def __init__(self, num_numeric, cat_cardinalities, embed_dim=8, num_classes=3):
        super().__init__()

        # Embedding layers for categorical features
        self.embeddings = nn.ModuleList([
            nn.Embedding(card, embed_dim) for card in cat_cardinalities
        ])

        # Lazy initialization for fc1
        self.fc1 = None
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def initialize_fc1(self, input_dim):
        if self.fc1 is None:
            self.fc1 = nn.Linear(input_dim, 64)

    def forward(self, x_num, x_cat=None):

        if x_cat is None:
            # S1/S2
            inp = x_num
        else:
            # S3 â†’ Embed categorical IDs
            embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
            embedded = torch.cat(embedded, dim=1)
            inp = torch.cat([x_num, embedded], dim=1)

        # Lazy init
        if self.fc1 is None:
            self.initialize_fc1(inp.shape[1])

        x = torch.relu(self.fc1(inp))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# ===============================================================
# Training + Evaluation
# ===============================================================

def train(model, optimizer, criterion, X_num, X_cat, y, epochs=20):
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        preds = model(X_num, X_cat)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()


def evaluate(model, X_num, X_cat, y):
    model.eval()
    with torch.no_grad():
        preds = model(X_num, X_cat)
        preds = preds.argmax(dim=1)
        acc = accuracy_score(y, preds)
        f1 = f1_score(y, preds, average="weighted")
    return acc, f1


# ===============================================================
# Split numeric + categorical for S3
# ===============================================================

def split_for_strategy(X, strategy, num_numeric, num_categorical):
    X = to_dense(X)

    if strategy in ["S1", "S2"]:
        # Fully numeric
        return torch.tensor(X, dtype=torch.float32), None

    # ---- S3: numeric + categorical IDs ----
    X_num = X[:, :num_numeric]
    X_cat = X[:, num_numeric:num_numeric+num_categorical]

    # Convert to int and fix negative/out-of-range values
    X_cat = X_cat.astype(int)
    X_cat = np.clip(X_cat, 0, 4)   # because generator uses 0â€“4 categories

    return (
        torch.tensor(X_num, dtype=torch.float32),
        torch.tensor(X_cat, dtype=torch.long)
    )


# ===============================================================
# MLP EXPERIMENT LOOP (S1 / S2 / S3)
# ===============================================================

def run_mlp_experiment():

    results = []

    preprocessors = {
        "S1": preprocess_S1,
        "S2": preprocess_S2,
        "S3": preprocess_S3
    }

    # ---- Load medium-difficulty dataset ----
    X_mlp, _, _, y = generate_medium_difficulty_dataset()

    # Determine numeric + categorical column counts dynamically
    total_cols = X_mlp.shape[1]
    num_categorical = 5                    # fixed by generator
    num_numeric = total_cols - num_categorical

    print(f"\nDetected numeric features: {num_numeric}")
    print(f"Detected categorical features: {num_categorical}")

    # ---- Train/test split ----
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_mlp, y, test_size=0.25, random_state=42
    )

    for strategy, preproc in preprocessors.items():
        print(f"\nðŸš€ Running MLP with {strategy}")

        # Preprocess using sklearn
        X_train_prep = preproc.fit_transform(X_train_raw, y_train)
        X_test_prep = preproc.transform(X_test_raw)

        # Split into numeric + categorical
        X_train_num, X_train_cat = split_for_strategy(
            X_train_prep, strategy,
            num_numeric=num_numeric,
            num_categorical=num_categorical
        )

        X_test_num, X_test_cat = split_for_strategy(
            X_test_prep, strategy,
            num_numeric=num_numeric,
            num_categorical=num_categorical
        )

        y_train_tensor = torch.tensor(y_train, dtype=torch.long)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long)

        # Build model
        model = MLP_with_Embeddings(
            num_numeric=num_numeric,
            cat_cardinalities=[5]*num_categorical,  # correct for generator
            embed_dim=8,
            num_classes=3
        )

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()

        # Train
        train(model, optimizer, criterion,
              X_train_num, X_train_cat, y_train_tensor)

        # Evaluate
        acc, f1 = evaluate(model,
                           X_test_num, X_test_cat, y_test_tensor)

        # Store results
        results.append({
            "Model": "MLP",
            "Preprocessing": strategy,
            "Accuracy": acc,
            "F1 Score": f1
        })

    return pd.DataFrame(results)


In [16]:
results_mlp_df = run_mlp_experiment()
results_mlp_df



Detected numeric features: 25
Detected categorical features: 5

ðŸš€ Running MLP with S1


ValueError: Specifying the columns using strings is only supported for dataframes.

### CNN Experiment (S1, S2 and S3 comparison)

In [None]:
class SimpleCNN(nn.Module):
    def __init__(self, num_features_after_preprocessing, num_classes=3):
        super().__init__()

        # Input: (batch, 1, num_features)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)

        # Determine flattened size dynamically
        self.flatten_dim = None

        self.fc1 = None
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def initialize_fc1(self, x):
        # Called once to initialize the fully connected layer
        if self.fc1 is None:
            self.flatten_dim = x.shape[1]
            self.fc1 = nn.Linear(self.flatten_dim, 64)

    def forward(self, x):
        # x shape: (batch, 1, features)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)

        # Flatten
        x = x.view(x.size(0), -1)

        # Lazy initialization of the first FC layer
        if self.fc1 is None:
            self.initialize_fc1(x)

        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [None]:
def train_cnn(model, optimizer, criterion, X, y, epochs=20):
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        preds = model(X)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()


In [None]:
def evaluate_cnn(model, X, y):
    model.eval()
    with torch.no_grad():
        preds = model(X)
        preds = preds.argmax(dim=1)
        acc = accuracy_score(y, preds)
        f1 = f1_score(y, preds, average="weighted")
    return acc, f1


In [None]:
def reshape_for_cnn(X):
    # after preprocessing X is (batch, features)
    # CNN expects: (batch, 1, features)
    return torch.tensor(X, dtype=torch.float32).unsqueeze(1)


In [None]:
results_cnn = []

# Use the original tabular version X_mlp for preprocessing, not X_cnn
X_mlp, X_cnn_raw, X_lstm_raw, y = generate_random_dataset()

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_mlp, y, test_size=0.25, random_state=42
)

for strategy, preproc in preprocessors.items():
    print(f"\nðŸš€ Running CNN with {strategy}")

    # 1. Preprocess tabular input
    X_train_prep = preproc.fit_transform(X_train_raw, y_train)
    X_test_prep = preproc.transform(X_test_raw)

    # 2. Convert to CNN format: (batch, 1, features)
    X_train_tensor = reshape_for_cnn(X_train_prep)
    X_test_tensor = reshape_for_cnn(X_test_prep)

    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)

    # 3. Build CNN model
    num_features = X_train_prep.shape[1]
    model = SimpleCNN(num_features_after_preprocessing=num_features, num_classes=3)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # 4. Train
    train_cnn(model, optimizer, criterion, X_train_tensor, y_train_tensor)

    # 5. Evaluate
    acc, f1 = evaluate_cnn(model, X_test_tensor, y_test_tensor)

    # 6. Store results
    results_cnn.append({
        "Model": "CNN",
        "Preprocessing": strategy,
        "Accuracy": acc,
        "F1 Score": f1
    })

results_cnn_df = pd.DataFrame(results_cnn)
results_cnn_df
