In [1]:
import pandas as pd
import numpy as np

# Read the training and testing CSVs into DataFrames (paths are relative to the working directory)
training_df, testing_df = map(
    pd.read_csv,
    ('model_ready_data/training_data.csv', 'model_ready_data/testing_data.csv'),
)

In [2]:
# Binary target encoding: '<30' -> 1, while 'NO' and '>30' both -> 0.
_mapping = {'<30': 1, 'NO': 0, '>30': 0}


def _encode_readmitted(labels):
    """
    Encode a 'readmitted' column as 0/1 integers using `_mapping`.

    The function is idempotent: a column that already contains only 0/1
    (e.g. because this cell was run twice) is returned as integers unchanged,
    instead of being mapped to NaN and failing inside `astype(int)`.

    Raises
    ------
    ValueError
        If the column contains a value that is neither a key of `_mapping`
        nor an already-encoded 0/1 (this includes missing values).
    """
    if labels.isin([0, 1]).all():
        # Already encoded -> nothing to map.
        return labels.astype(int)

    encoded = labels.map(_mapping)      # values missing from the dict become NaN
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = labels[unmapped].unique().tolist()
        raise ValueError(f"Unexpected 'readmitted' labels: {unknown}")
    return encoded.astype(int)


training_df['readmitted'] = _encode_readmitted(training_df['readmitted'])
testing_df['readmitted'] = _encode_readmitted(testing_df['readmitted'])

In [3]:
# Training split: labels as a float vector, every non-label column as the float feature matrix
y_train = training_df['readmitted'].values.astype(np.float64)
X_train = training_df.loc[:, training_df.columns != 'readmitted'].values.astype(np.float64)

# Test split, built the same way
y_test = testing_df['readmitted'].values.astype(np.float64)
X_test = testing_df.loc[:, testing_df.columns != 'readmitted'].values.astype(np.float64)

In [4]:
# Standardize to zero mean / unit variance using statistics of the TRAINING split only;
# the same training means and stds are then applied to the test split.
means = np.mean(X_train, axis=0)
stds = np.std(X_train, axis=0)
stds = np.where(stds == 0, 1.0, stds)    # constant features: divide by 1 instead of 0

X_train = (X_train - means) / stds
X_test = (X_test - means) / stds

In [5]:
# 5) Balance the training set

# (a) Re-create a DataFrame so we can sample by class.
# Column names are taken from the loaded training frame (all columns except the
# label) rather than probing for a possibly stale `feature_names` variable, so the
# cell gives the same result on a fresh kernel as on a long-lived one.
feature_names = [col for col in training_df.columns if col != 'readmitted']
assert len(feature_names) == X_train.shape[1], \
    "feature names are out of sync with the columns of X_train"

train_df = pd.DataFrame(X_train, columns=feature_names)
train_df['readmitted'] = y_train

# Separate the two classes. Class 1 is treated as the minority; the undersampling
# step below samples class 0 without replacement, so it needs
# len(majority) >= len(minority).
minority = train_df[train_df['readmitted'] == 1]
majority = train_df[train_df['readmitted'] == 0]
assert len(minority) > 0 and len(majority) > 0, \
    "both classes must be present in the training data to rebalance it"

# —— Random Oversampling ——
# Draw minority rows WITH replacement until they match the majority count,
# then shuffle the combined frame.
minority_os = minority.sample(n=len(majority),
                              replace=True,
                              random_state=42)
oversampled_df = pd.concat([majority, minority_os]).sample(frac=1,
                                                           random_state=42)

X_train_over = oversampled_df.drop('readmitted', axis=1).values
y_train_over = oversampled_df['readmitted'].values

print("After oversampling:", np.bincount(y_train_over.astype(int)))

# —— Random Undersampling ——
# Draw majority rows WITHOUT replacement down to the minority count,
# then shuffle the combined frame.
majority_us = majority.sample(n=len(minority),
                              replace=False,
                              random_state=42)
undersampled_df = pd.concat([minority, majority_us]).sample(frac=1,
                                                            random_state=42)

X_train_under = undersampled_df.drop('readmitted', axis=1).values
y_train_under = undersampled_df['readmitted'].values

print("After undersampling:", np.bincount(y_train_under.astype(int)))

After oversampling: [62055 62055]
After undersampling: [7945 7945]


In [6]:
# Initial logistic-regression parameters: one zero weight per feature column and a zero bias
n_features = np.shape(X_train)[1]
weights = np.zeros(n_features, dtype=np.float64)
bias, learning_rate = 0.0, 0.01

In [7]:
def sigmoid(z):
    """
    Numerically stable logistic function 1 / (1 + exp(-z)).

    `z` may be a scalar or any array-like; it is converted to float64 and the
    result is a float64 array of the same shape. Only exp(-|z|) is ever
    evaluated, so the exponential cannot overflow for large |z|.
    """
    z = np.asarray(z, dtype=np.float64)
    decay = np.exp(-np.abs(z))              # exp(-z) where z >= 0, exp(z) where z < 0
    return np.where(z >= 0,
                    1.0 / (1.0 + decay),    # non-negative inputs
                    decay / (1.0 + decay))  # negative inputs

In [8]:
def compute_cost(y, y_pred):
    """
    Mean binary cross-entropy between labels `y` and predicted probabilities `y_pred`.

    Predictions are clipped to [1e-9, 1 - 1e-9] first so that log(0) never occurs.
    """
    tiny = 1e-9
    p = np.clip(y_pred, tiny, 1 - tiny)
    log_likelihood = y * np.log(p) + (1 - y) * np.log(1 - p)
    return -np.mean(log_likelihood)

In [9]:
def gradient_descent(X, y, weights, bias, lr, num_iters):
    """
    Batch gradient descent for logistic regression.

    Parameters
    ----------
    X : feature matrix; rows are samples, columns are features.
    y : label vector with one entry per row of X.
    weights : initial weight vector (one entry per feature). It is copied to a
        float64 array, so the caller's array is never modified.
    bias : initial bias (scalar).
    lr : learning rate.
    num_iters : number of full-batch update steps.

    Returns
    -------
    (weights, bias) after `num_iters` updates; `weights` is a new float64 array.

    Progress is printed at iteration 1 and then every `num_iters // 5`
    iterations (every iteration when `num_iters < 5`). The printed cost is
    computed from the predictions made *before* that iteration's update.
    """
    m = len(y)
    # Private float copy: avoids mutating the caller's array via the in-place
    # update below and avoids a casting error if integer weights are passed.
    weights = np.array(weights, dtype=np.float64)
    # Clamp to >= 1 so that num_iters < 5 does not cause a modulo-by-zero.
    log_every = max(1, num_iters // 5)

    for i in range(1, num_iters + 1):
        # Forward pass
        z      = X.dot(weights) + bias
        y_pred = sigmoid(z)
        # Gradients of the mean cross-entropy w.r.t. weights and bias
        dz = y_pred - y
        dw = (X.T.dot(dz)) / m
        db = dz.sum() / m
        # Parameter update
        weights -= lr * dw
        bias    -= lr * db
        # Monitor cost
        if i == 1 or i % log_every == 0:
            cost = compute_cost(y, y_pred)
            print(f"Iteration {i}/{num_iters}, Cost: {cost:.6f}")
    return weights, bias

In [10]:
# 9a) Baseline: fit on the original (imbalanced) training data, 5000 iterations
print("Training on ORIGINAL data:")
w_orig, b_orig = gradient_descent(
    X=X_train, y=y_train,
    weights=np.zeros_like(weights), bias=0.0,
    lr=learning_rate, num_iters=5000,
)

# 9b) Fit on the randomly OVERSAMPLED training data, 10000 iterations
print("\nTraining on OVERSAMPLED data:")
w_os, b_os = gradient_descent(
    X=X_train_over, y=y_train_over,
    weights=np.zeros_like(weights), bias=0.0,
    lr=learning_rate, num_iters=10000,
)

# 9c) Fit on the randomly UNDERSAMPLED training data, 10000 iterations
print("\nTraining on UNDERSAMPLED data:")
w_us, b_us = gradient_descent(
    X=X_train_under, y=y_train_under,
    weights=np.zeros_like(weights), bias=0.0,
    lr=learning_rate, num_iters=10000,
)

Training on ORIGINAL data:
Iteration 1/5000, Cost: 0.693147
Iteration 1000/5000, Cost: 0.350178
Iteration 2000/5000, Cost: 0.334408
Iteration 3000/5000, Cost: 0.331956
Iteration 4000/5000, Cost: 0.331331
Iteration 5000/5000, Cost: 0.331101

Training on OVERSAMPLED data:
Iteration 1/10000, Cost: 0.693147
Iteration 2000/10000, Cost: 0.635277
Iteration 4000/10000, Cost: 0.634594
Iteration 6000/10000, Cost: 0.634388
Iteration 8000/10000, Cost: 0.634290
Iteration 10000/10000, Cost: 0.634232

Training on UNDERSAMPLED data:
Iteration 1/10000, Cost: 0.693147
Iteration 2000/10000, Cost: 0.634142
Iteration 4000/10000, Cost: 0.633439
Iteration 6000/10000, Cost: 0.633223
Iteration 8000/10000, Cost: 0.633120
Iteration 10000/10000, Cost: 0.633060


In [11]:
# Evaluate the baseline model (trained on the original data) on the training set.
# The trained parameters are w_orig / b_orig; the module-level `weights` / `bias`
# still hold the all-zero initial values (training ran on copies), so using them
# here would score an untrained model that predicts 0.5 for every sample.
train_prob = sigmoid(X_train.dot(w_orig) + b_orig)
train_pred = (train_prob >= 0.5).astype(int)     # hard labels at a 0.5 threshold
train_acc  = (train_pred == y_train).mean()
train_cost = compute_cost(y_train, train_prob)

# Evaluate the same baseline model on the test set
test_prob  = sigmoid(X_test.dot(w_orig) + b_orig)
test_pred  = (test_prob >= 0.5).astype(int)
test_acc   = (test_pred == y_test).mean()
test_cost  = compute_cost(y_test, test_prob)

def evaluate(X, y, weights, bias, label):
    """
    Print accuracy (0.5 decision threshold) and cross-entropy cost of a
    logistic-regression model (`weights`, `bias`) on the data (`X`, `y`).

    `label` is left-aligned in a 12-character field at the start of the line.
    Returns None; the result is only printed.
    """
    logits = X @ weights + bias
    prob = sigmoid(logits)
    hard_labels = (prob >= 0.5).astype(int)
    acc = np.mean(hard_labels == y)
    cost = compute_cost(y, prob)
    print(f"{label:12s} → Acc: {acc:.3f}, Cost: {cost:.6f}")

print("\n=== EVALUATION ===")
# Each model is scored on the data it was trained on and on the shared test split.
_eval_runs = [
    (X_train,       y_train,       w_orig, b_orig, "Orig Train"),
    (X_test,        y_test,        w_orig, b_orig, "Orig  Test"),
    (X_train_over,  y_train_over,  w_os,   b_os,   "Over Train"),
    (X_test,        y_test,        w_os,   b_os,   "Over  Test"),
    (X_train_under, y_train_under, w_us,   b_us,   "Under Train"),
    (X_test,        y_test,        w_us,   b_us,   "Under Test"),
]
for _run in _eval_runs:
    evaluate(*_run)


=== EVALUATION ===
Orig Train   → Acc: 0.886, Cost: 0.331101
Orig  Test   → Acc: 0.892, Cost: 0.326023
Over Train   → Acc: 0.636, Cost: 0.634232
Over  Test   → Acc: 0.628, Cost: 0.672468
Under Train  → Acc: 0.636, Cost: 0.633060
Under Test   → Acc: 0.620, Cost: 0.679134
