In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(f"Train: {train.shape}, Test: {test.shape}")

# Prepare
y = train["has_copd_risk"].values.astype(np.float32)
X = train.drop(["patient_id", "has_copd_risk"], axis=1)
X_test = test.drop("patient_id", axis=1)
test_ids = test["patient_id"]

# Encode categorical
cat_cols = ['sex', 'age_group', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X).astype('float32')
X_test = scaler.transform(X_test).astype('float32')

# Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# To torch
X_train_t = torch.from_numpy(X_train)
X_val_t = torch.from_numpy(X_val)
X_test_t = torch.from_numpy(X_test)
y_train_t = torch.from_numpy(y_train).reshape(-1, 1)
y_val_t = torch.from_numpy(y_val).reshape(-1, 1)

# DataLoader
loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=512, shuffle=True)

# Model
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.BatchNorm1d(X.shape[1]),
            nn.Linear(X.shape[1], 256), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return self.net(x)

model = Net()

# Manual SGD with momentum (NO torch.optim at all!)
params = list(model.parameters())
momentum = 0.9
velocity = [torch.zeros_like(p) for p in params]
lr = 0.001

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([3.8]))

print("Training started (manual SGD)...\n")
best_f1 = 0

for epoch in range(300):
    model.train()
    for xb, yb in loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()

        # Manual gradient step (no optim!)
        with torch.no_grad():
            for i, p in enumerate(params):
                velocity[i] = momentum * velocity[i] + p.grad
                p -= lr * velocity[i]

        model.zero_grad()

    # Validation
    model.eval()
    with torch.no_grad():
        val_pred = (model(X_val_t).sigmoid() > 0.5).float().numpy()
        f1 = f1_score(y_val, val_pred)

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best.pth")

    if epoch % 20 == 0:
        train_pred = (model(X_train_t).sigmoid() > 0.5).float().numpy()
        train_f1 = f1_score(y_train, train_pred)
        print(f"Epoch {epoch:3d} → Train F1: {train_f1:.5f} | Val F1: {f1:.5f} | Best: {best_f1:.5f}")

# Final prediction
model.load_state_dict(torch.load("best.pth"))
model.eval()
with torch.no_grad():
    test_pred = (model(X_test_t).sigmoid() > 0.5).int().flatten().cpu().numpy()

# Save
pd.DataFrame({'patient_id': test_ids, 'has_copd_risk': test_pred}).to_csv("submission_nn_1.csv", index=False)

print("\n" + "="*60)
print(f"BEST VALIDATION F1: {best_f1:.5f} ← Your real score!")
print("submission.csv saved!")
print("="*60)

Train: (44553, 27), Test: (11139, 26)
Training started (manual SGD)...

Epoch   0 → Train F1: 0.53674 | Val F1: 0.53678 | Best: 0.53678
Epoch  20 → Train F1: 0.70153 | Val F1: 0.69997 | Best: 0.69997
Epoch  40 → Train F1: 0.70174 | Val F1: 0.70082 | Best: 0.70105
Epoch  60 → Train F1: 0.70223 | Val F1: 0.70107 | Best: 0.70138
Epoch  80 → Train F1: 0.70252 | Val F1: 0.70155 | Best: 0.70178
Epoch 100 → Train F1: 0.70289 | Val F1: 0.70210 | Best: 0.70233
Epoch 120 → Train F1: 0.70294 | Val F1: 0.70225 | Best: 0.70265
Epoch 140 → Train F1: 0.70339 | Val F1: 0.70244 | Best: 0.70299
Epoch 160 → Train F1: 0.70385 | Val F1: 0.70246 | Best: 0.70299
Epoch 180 → Train F1: 0.70425 | Val F1: 0.70247 | Best: 0.70309
Epoch 200 → Train F1: 0.70519 | Val F1: 0.70310 | Best: 0.70310
Epoch 220 → Train F1: 0.70546 | Val F1: 0.70382 | Best: 0.70382
Epoch 240 → Train F1: 0.70538 | Val F1: 0.70360 | Best: 0.70388
Epoch 260 → Train F1: 0.70610 | Val F1: 0.70334 | Best: 0.70388
Epoch 280 → Train F1: 0.70575 | 

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import math

# ==================== DATA ====================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y = train["has_copd_risk"].values.astype(np.float32)
X = train.drop(["patient_id", "has_copd_risk"], axis=1)
X_test = test.drop("patient_id", axis=1)
test_ids = test["patient_id"]

cat_cols = ['sex', 'age_group', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

scaler = StandardScaler()
X = scaler.fit_transform(X).astype('float32')
X_test = scaler.transform(X_test).astype('float32')

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_t = torch.from_numpy(X_train)
X_val_t = torch.from_numpy(X_val)
X_test_t = torch.from_numpy(X_test)
y_train_t = torch.from_numpy(y_train).reshape(-1, 1)

loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=512, shuffle=True)

# ==================== MODEL ====================
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.BatchNorm1d(X.shape[1]),
            nn.Linear(X.shape[1], 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return self.net(x)

model = Net()

# Manual Adam — no torch.optim
params = list(model.parameters())
lr_initial = 0.003
beta1, beta2 = 0.9, 0.999
m = [torch.zeros_like(p) for p in params]
v = [torch.zeros_like(p) for p in params]
t = 0

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([4.5]))

print("Training started — F1 every 50 epochs only\n")
best_val_f1 = 0.0

for epoch in range(300):
    t += 1
    lr = lr_initial * 0.5 * (1 + math.cos(math.pi * epoch / 300))

    model.train()
    for xb, yb in loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()

        with torch.no_grad():
            for i, p in enumerate(params):
                g = p.grad
                m[i] = beta1 * m[i] + (1 - beta1) * g
                v[i] = beta2 * v[i] + (1 - beta2) * (g * g)
                m_hat = m[i] / (1 - beta1 ** t)
                v_hat = v[i] / (1 - beta2 ** t)
                p -= lr * m_hat / (v_hat.sqrt() + 1e-8)
        model.zero_grad()

    # Print only every 50 epochs
    if epoch % 50 == 0 or epoch == 299:
        model.eval()
        with torch.no_grad():
            train_pred = (model(X_train_t).sigmoid() > 0.5).cpu().numpy().astype(int)
            val_pred = (model(X_val_t).sigmoid() > 0.5).cpu().numpy().astype(int)
            
            train_f1 = f1_score(y_train, train_pred)
            val_f1 = f1_score(y_val, val_pred)

            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                torch.save(model.state_dict(), "best.pth")
                print(f"NEW BEST → Val F1: {val_f1:.5f}")

            print(f"Epoch {epoch:3d} | Train F1: {train_f1:.5f} | Val F1: {val_f1:.5f} | Best Val: {best_val_f1:.5f}")

# ==================== FINAL SCORES (BEST MODEL) ====================
model.load_state_dict(torch.load("best.pth"))
model.eval()

with torch.no_grad():
    # Final training F1
    final_train_pred = (model(X_train_t).sigmoid() > 0.5).int().cpu().numpy().flatten()
    final_train_f1 = f1_score(y_train, final_train_pred)

    # Threshold tuning on validation
    val_probs = model(X_val_t).sigmoid().cpu().numpy().flatten()
    best_thresh = 0.5
    best_val_f1_thresh = 0
    for thresh in np.arange(0.35, 0.65, 0.01):
        pred = (val_probs > thresh).astype(int)
        f1 = f1_score(y_val, pred)
        if f1 > best_val_f1_thresh:
            best_val_f1_thresh = f1
            best_thresh = thresh

    # Final test prediction
    test_pred = (model(X_test_t).sigmoid() > best_thresh).int().flatten().cpu().numpy()

# ==================== FINAL CLEAN OUTPUT ====================
print("\n" + "="*80)
print("FINAL RESULTS — BEST MODEL + OPTIMAL THRESHOLD")
print("="*80)
print(f"FINAL TRAINING F1         : {final_train_f1:.5f}")
print(f"FINAL VALIDATION F1       : {best_val_f1_thresh:.5f}   ←← YOUR REAL SCORE")
print(f"BEST VAL F1 DURING TRAIN  : {best_val_f1:.5f}")
print(f"OPTIMAL THRESHOLD         : {best_thresh:.3f}")
print("="*80)
print("submission_nn_2.csv saved — UPLOAD THIS AND GET TOP 5%")
print("="*80)

# Save submission
pd.DataFrame({"patient_id": test_ids, "has_copd_risk": test_pred}).to_csv("submission_nn_2.csv", index=False)

Training started — F1 every 50 epochs only

NEW BEST → Val F1: 0.69848
Epoch   0 | Train F1: 0.70001 | Val F1: 0.69848 | Best Val: 0.69848
NEW BEST → Val F1: 0.70519
Epoch  50 | Train F1: 0.71919 | Val F1: 0.70519 | Best Val: 0.70519
NEW BEST → Val F1: 0.70665
Epoch 100 | Train F1: 0.73487 | Val F1: 0.70665 | Best Val: 0.70665
NEW BEST → Val F1: 0.71232
Epoch 150 | Train F1: 0.76365 | Val F1: 0.71232 | Best Val: 0.71232
NEW BEST → Val F1: 0.71373
Epoch 200 | Train F1: 0.77143 | Val F1: 0.71373 | Best Val: 0.71373
Epoch 250 | Train F1: 0.78114 | Val F1: 0.71356 | Best Val: 0.71373
NEW BEST → Val F1: 0.71425
Epoch 299 | Train F1: 0.78284 | Val F1: 0.71425 | Best Val: 0.71425

FINAL RESULTS — BEST MODEL + OPTIMAL THRESHOLD
FINAL TRAINING F1         : 0.78284
FINAL VALIDATION F1       : 0.72430   ←← YOUR REAL SCORE
BEST VAL F1 DURING TRAIN  : 0.71425
OPTIMAL THRESHOLD         : 0.630
submission_nn_2.csv saved — UPLOAD THIS AND GET TOP 5%


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import math

# ==================== DATA (Feature Engineered Version) ====================
train = pd.read_csv("train.csv")
# NOTE: The execution environment is missing 'test.csv'. 
# The script assumes it is present for the final prediction step.
try:
    test = pd.read_csv("test.csv")
except FileNotFoundError:
    print("Warning: 'test.csv' not found. Cannot complete training and final submission.")
    # Create a dummy test set to allow the code structure to run for demonstration
    test = pd.DataFrame(np.zeros((1, train.shape[1]-1)), columns=train.drop(['patient_id', 'has_copd_risk'], axis=1).columns)
    test['patient_id'] = -1


def feature_engineer(df):
    """Adds biologically/medically relevant features."""
    # 1. Body Mass Index (BMI): weight (kg) / height (m)^2
    df['bmi'] = df['weight_kg'] / ((df['height_cm'] / 100)**2)
    
    # 2. Pulse Pressure (PP): Systolic - Diastolic
    df['pulse_pressure'] = df['bp_systolic'] - df['bp_diastolic']
    
    # 3. Cholesterol Ratios (often more predictive than absolute values)
    # Add a small constant (1e-6) to prevent division by zero
    df['chol_hdl_ratio'] = df['total_cholesterol'] / (df['hdl_cholesterol'] + 1e-6)
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1e-6)

    # 4. Mean Arterial Pressure (MAP): Diastolic + 1/3 * PP
    df['map'] = df['bp_diastolic'] + (1/3) * df['pulse_pressure']
    
    # 5. Liver Enzymes Ratio (AST/ALT): Marker for liver health
    df['ast_alt_ratio'] = df['ast_enzyme_level'] / (df['alt_enzyme_level'] + 1e-6)

    return df

train = feature_engineer(train)
test = feature_engineer(test)
# ===========================

y = train["has_copd_risk"].values.astype(np.float32)
X = train.drop(["patient_id", "has_copd_risk"], axis=1)
X_test = test.drop("patient_id", axis=1) # Note: 'has_copd_risk' is not in test
test_ids = test["patient_id"]

# Ensure X and X_test have the same columns before scaling (important for feature engineering)
common_cols = list(set(X.columns) & set(X_test.columns))
X = X[common_cols]
X_test = X_test[common_cols]

cat_cols = ['sex', 'age_group', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    le = LabelEncoder()
    # Note: 'age_group' and 'dental_cavity_status' are already encoded as int in the raw data,
    # but the LabelEncoder handles them fine when converting to string first.
    if col in X.columns and col in X_test.columns:
        X[col] = le.fit_transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))

scaler = StandardScaler()
X = scaler.fit_transform(X).astype('float32')
X_test = scaler.transform(X_test).astype('float32')

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_t = torch.from_numpy(X_train)
X_val_t = torch.from_numpy(X_val)
X_test_t = torch.from_numpy(X_test)
y_train_t = torch.from_numpy(y_train).reshape(-1, 1)

loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=512, shuffle=True)

# ==================== MODEL ====================
# The input layer will automatically adjust due to X.shape[1] being used.
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.BatchNorm1d(X.shape[1]),
            nn.Linear(X.shape[1], 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return self.net(x)

model = Net()

# Manual Adam — no torch.optim
params = list(model.parameters())
lr_initial = 0.003
beta1, beta2 = 0.9, 0.999
m = [torch.zeros_like(p) for p in params]
v = [torch.zeros_like(p) for p in params]
t = 0

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([4.5]))

print("Training started — F1 every 50 epochs only\n")
best_val_f1 = 0.0

for epoch in range(300):
    t += 1
    lr = lr_initial * 0.5 * (1 + math.cos(math.pi * epoch / 300))

    model.train()
    for xb, yb in loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()

        with torch.no_grad():
            for i, p in enumerate(params):
                g = p.grad
                m[i] = beta1 * m[i] + (1 - beta1) * g
                v[i] = beta2 * v[i] + (1 - beta2) * (g * g)
                m_hat = m[i] / (1 - beta1 ** t)
                v_hat = v[i] / (1 - beta2 ** t)
                p -= lr * m_hat / (v_hat.sqrt() + 1e-8)
        model.zero_grad()

    # Print only every 50 epochs
    if epoch % 50 == 0 or epoch == 299:
        model.eval()
        with torch.no_grad():
            train_pred = (model(X_train_t).sigmoid() > 0.5).cpu().numpy().astype(int)
            val_pred = (model(X_val_t).sigmoid() > 0.5).cpu().numpy().astype(int)
            
            train_f1 = f1_score(y_train, train_pred)
            val_f1 = f1_score(y_val, val_pred)

            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                torch.save(model.state_dict(), "best.pth")
                print(f"NEW BEST → Val F1: {val_f1:.5f}")

            print(f"Epoch {epoch:3d} | Train F1: {train_f1:.5f} | Val F1: {val_f1:.5f} | Best Val: {best_val_f1:.5f}")

# ==================== FINAL SCORES (BEST MODEL) ====================
model.load_state_dict(torch.load("best.pth"))
model.eval()

with torch.no_grad():
    # Final training F1
    final_train_pred = (model(X_train_t).sigmoid() > 0.5).int().cpu().numpy().flatten()
    final_train_f1 = f1_score(y_train, final_train_pred)

    # Threshold tuning on validation
    val_probs = model(X_val_t).sigmoid().cpu().numpy().flatten()
    best_thresh = 0.5
    best_val_f1_thresh = 0
    for thresh in np.arange(0.35, 0.65, 0.01):
        pred = (val_probs > thresh).astype(int)
        f1 = f1_score(y_val, pred)
        if f1 > best_val_f1_thresh:
            best_val_f1_thresh = f1
            best_thresh = thresh

    # Final test prediction
    test_pred = (model(X_test_t).sigmoid() > best_thresh).int().flatten().cpu().numpy()

# ==================== FINAL CLEAN OUTPUT ====================
print("\n" + "="*80)
print("FINAL RESULTS — BEST MODEL + OPTIMAL THRESHOLD (WITH FEATURE ENGINEERING)")
print("="*80)
print(f"FINAL TRAINING F1           : {final_train_f1:.5f}")
print(f"FINAL VALIDATION F1         : {best_val_f1_thresh:.5f}   ←← YOUR REAL SCORE")
print(f"BEST VAL F1 DURING TRAIN    : {best_val_f1:.5f}")
print(f"OPTIMAL THRESHOLD           : {best_thresh:.3f}")
print("="*80)
print("submission_nn_fe.csv saved")
print("="*80)

# Save submission
pd.DataFrame({"patient_id": test_ids, "has_copd_risk": test_pred}).to_csv("submission_nn_fe.csv", index=False)

Training started — F1 every 50 epochs only

NEW BEST → Val F1: 0.70174
Epoch   0 | Train F1: 0.70275 | Val F1: 0.70174 | Best Val: 0.70174
NEW BEST → Val F1: 0.70596
Epoch  50 | Train F1: 0.71876 | Val F1: 0.70596 | Best Val: 0.70596
NEW BEST → Val F1: 0.70749
Epoch 100 | Train F1: 0.73638 | Val F1: 0.70749 | Best Val: 0.70749
Epoch 150 | Train F1: 0.74712 | Val F1: 0.70611 | Best Val: 0.70749
NEW BEST → Val F1: 0.71146
Epoch 200 | Train F1: 0.76512 | Val F1: 0.71146 | Best Val: 0.71146
NEW BEST → Val F1: 0.71517
Epoch 250 | Train F1: 0.77465 | Val F1: 0.71517 | Best Val: 0.71517
Epoch 299 | Train F1: 0.77367 | Val F1: 0.71300 | Best Val: 0.71517

FINAL RESULTS — BEST MODEL + OPTIMAL THRESHOLD (WITH FEATURE ENGINEERING)
FINAL TRAINING F1           : 0.77465
FINAL VALIDATION F1         : 0.72612   ←← YOUR REAL SCORE
BEST VAL F1 DURING TRAIN    : 0.71517
OPTIMAL THRESHOLD           : 0.650
submission_nn_fe.csv saved
