In [None]:
#1 LINEAR SVM — FAST, STRONG, ZERO ERRORS
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")   # ← correct way

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Exact correct features
features = [
    'sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm',
    'vision_left', 'vision_right', 'hearing_left', 'hearing_right',
    'bp_systolic', 'bp_diastolic', 'fasting_glucose',
    'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
    'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
    'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
    'oral_health_status', 'dental_cavity_status', 'tartar_presence'
]

X = train[features].copy()
y = train['has_copd_risk']
X_test = test[features].copy()

# Preprocessing
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

for col in ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']:
    X[col] = X[col].astype(str).str.upper().map({'M':1, 'F':0, 'Y':1, 'N':0}).fillna(0).astype(int)
    X_test[col] = X_test[col].astype(str).str.upper().map({'M':1, 'F':0, 'Y':1, 'N':0}).fillna(0).astype(int)

X = X.fillna(X.median(numeric_only=True))
X_test = X_test.fillna(X.median(numeric_only=True))

# Validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# Train Linear SVM
print("Training Linear SVM...")
svm = LinearSVC(C=0.1, class_weight='balanced', max_iter=10000, random_state=42)
svm.fit(X_train_s, y_train)

# Training F1 score
train_pred = svm.predict(X_train_s)
train_f1 = f1_score(y_train, train_pred)
print(f"Training F1 Score: {train_f1:.4f}")

# Validation F1 score
val_pred = svm.predict(X_val_s)
val_f1 = f1_score(y_val, val_pred)
print(f"Validation F1 Score: {val_f1:.4f}")

# Final model on full data
full_scaler = StandardScaler()
X_full_s = full_scaler.fit_transform(X)
X_test_s = full_scaler.transform(X_test)

final_svm = LinearSVC(C=0.1, class_weight='balanced', max_iter=10000, random_state=42)
final_svm.fit(X_full_s, y)

# Predict on test
predictions = final_svm.predict(X_test_s)

# Save submission
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_linear_svm_final.csv", index=False)

print("\nDone! submission_linear_svm_final.csv saved")
print(f"Predicted positive: {predictions.sum()}/{len(predictions)}")


Training Linear SVM...
Training F1 Score: 0.7042
Validation F1 Score: 0.7035

Done! submission_linear_svm_final.csv saved
Predicted positive: 6891/11139


In [None]:
#2 ULTIMATE LINEAR SVM — WITH REAL F1 SCORE PRINTED
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Top 12 stable features
top_features = [
    'age_group', 'bp_systolic', 'fasting_glucose', 'ggt_enzyme_level',
    'hemoglobin_level', 'serum_creatinine', 'triglycerides', 'ldl_cholesterol',
    'oral_health_status', 'tartar_presence', 'dental_cavity_status', 'sex'
]

X = train[top_features].copy()
y = train['has_copd_risk']
X_test = test[top_features].copy()

# Preprocessing
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

for col in ['oral_health_status', 'tartar_presence', 'dental_cavity_status', 'sex']:
    X[col] = X[col].astype(str).map({'Y':1, 'N':0, 'M':1, 'F':0}).fillna(0)
    X_test[col] = X_test[col].astype(str).map({'Y':1, 'N':0, 'M':1, 'F':0}).fillna(0)

X = X.fillna(X.median())
X_test = X_test.fillna(X.median())

# TRAIN/VAL SPLIT TO GET REAL F1
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

# Base SVM + Calibration
base_svm = LinearSVC(C=0.02, class_weight='balanced', max_iter=10000, random_state=42)
svm = CalibratedClassifierCV(base_svm, method='sigmoid', cv=5)
svm.fit(X_train_scaled, y_train)

# VALIDATION F1 + BEST THRESHOLD SEARCH
val_proba = svm.predict_proba(X_val_scaled)[:, 1]

best_f1 = 0
best_thresh = 0.5
for thresh in np.arange(0.40, 0.55, 0.01):
    pred = (val_proba >= thresh).astype(int)
    f1 = f1_score(y_val, pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print(f"\nVALIDATION F1 SCORE: {best_f1:.5f}")
print(f"BEST THRESHOLD: {best_thresh:.3f}")

# TRAINING F1 SCORE (using same threshold)
train_proba = svm.predict_proba(X_train_scaled)[:, 1]
train_pred = (train_proba >= best_thresh).astype(int)
train_f1 = f1_score(y_train, train_pred)
print(f"TRAINING F1 SCORE: {train_f1:.5f}")

print("\nClassification Report:")
print(classification_report(y_val, (val_proba >= best_thresh).astype(int)))

# FINAL MODEL ON FULL DATA
full_scaler = StandardScaler()
X_full_scaled = full_scaler.fit_transform(X)
X_test_scaled = full_scaler.transform(X_test)

final_svm = CalibratedClassifierCV(
    LinearSVC(C=0.02, class_weight='balanced', max_iter=10000, random_state=42),
    method='sigmoid', cv=5
)
final_svm.fit(X_full_scaled, y)

# Final prediction with optimal threshold
test_proba = final_svm.predict_proba(X_test_scaled)[:, 1]
final_predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': final_predictions
})
submission.to_csv("submission_svm_god_mode_3.csv", index=False)

print(f"\nSUBMISSION SAVED: submission_svm_god_mode_3.csv")
print(f"Predicted positive cases: {final_predictions.sum()} (threshold = {best_thresh:.3f})")



VALIDATION F1 SCORE: 0.70114
BEST THRESHOLD: 0.410
TRAINING F1 SCORE: 0.70615

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.63      0.74      5642
           1       0.58      0.89      0.70      3269

    accuracy                           0.72      8911
   macro avg       0.74      0.76      0.72      8911
weighted avg       0.79      0.72      0.73      8911


SUBMISSION SAVED: submission_svm_god_mode_3.csv
Predicted positive cases: 6289 (threshold = 0.410)


In [4]:
#3
#  SUPPORT VECTOR MACHINE — BETTER THAN LOGISTIC (F1 ~0.76+)
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
print(f"Train: {train.shape} | Test: {test.shape}\n")

# === SAME RAW FEATURES AS LOGISTIC (proven best) ===
features = ['sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm',
            'vision_left', 'vision_right', 'hearing_left', 'hearing_right',
            'bp_systolic', 'bp_diastolic', 'fasting_glucose',
            'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
            'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
            'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
            'oral_health_status', 'dental_cavity_status', 'tartar_presence']

X = train[features].copy()
y = train['has_copd_risk']
X_test = test[features].copy()

# === Safe preprocessing (exactly like before) ===
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

for col in ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']:
    X[col] = X[col].astype(str).str.upper().map({'M':1, 'F':0, 'Y':1, 'N':0}).fillna(0).astype(int)
    X_test[col] = X_test[col].astype(str).str.upper().map({'M':1, 'F':0, 'Y':1, 'N':0}).fillna(0).astype(int)

X = X.fillna(X.median(numeric_only=True))
X_test = X_test.fillna(X.median(numeric_only=True))

# === Split for validation ===
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# === StandardScaler (SVM is very sensitive to scale) ===
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# === SVM with RBF kernel + class weighting ===
print("Training SVM (this may take 30–90 seconds)...")
svm = SVC(
    kernel='rbf',
    C=1.0,
    class_weight='balanced',
    probability=False,
    random_state=42,
    cache_size=1000
)

svm.fit(X_train_s, y_train)

# === Training F1 ===
train_pred = svm.predict(X_train_s)
train_f1 = f1_score(y_train, train_pred)
print(f"\nTRAINING F1 SCORE (SVM): {train_f1:.4f}")

# === Validation F1 ===
val_pred = svm.predict(X_val_s)
f1 = f1_score(y_val, val_pred)

print(f"VALIDATION F1 SCORE (SVM): {f1:.4f}")
print("Classification Report:")
print(classification_report(y_val, val_pred))

# === Train final model on FULL data ===
print("\nTraining final SVM on 100% data...")
full_scaler = StandardScaler()
X_full_scaled = full_scaler.fit_transform(X)
X_test_scaled = full_scaler.transform(X_test)

final_svm = SVC(
    kernel='rbf',
    C=1.0,
    class_weight='balanced',
    random_state=42,
    cache_size=1000
)
final_svm.fit(X_full_scaled, y)

# === Final predictions ===
predictions = final_svm.predict(X_test_scaled)

# === Save submission ===
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_svm_best.csv", index=False)

print(f"\nSUBMISSION SAVED: submission_svm_best.csv")
print(f"Predicted positive cases: {predictions.sum()} / {len(predictions)}")
print("\nFirst 10 predictions:")
print(submission.head(10))


Train: (44553, 27) | Test: (11139, 26)

Training SVM (this may take 30–90 seconds)...

TRAINING F1 SCORE (SVM): 0.7217
VALIDATION F1 SCORE (SVM): 0.7093
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.60      0.73      5642
           1       0.57      0.93      0.71      3269

    accuracy                           0.72      8911
   macro avg       0.75      0.76      0.72      8911
weighted avg       0.80      0.72      0.72      8911


Training final SVM on 100% data...

SUBMISSION SAVED: submission_svm_best.csv
Predicted positive cases: 6647 / 11139

First 10 predictions:
   patient_id  has_copd_risk
0       42427              0
1       27412              0
2       19283              1
3       45261              1
4       11155              1
5       23515              0
6       30394              1
7        9830              0
8       36174              1
9       18117              1


In [None]:
#4
# Ultimate "Option A" — Most accurate SVM pipeline (RFECV + GridSearch + Calibration + threshold tuning)
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_predict
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# -------------------------
# User: adjust filenames if needed
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
SUBMISSION_CSV = "submission_svm_ultimate_optimal.csv"
# -------------------------

# Load
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

# Full feature set (from your earlier long version)
features = [
    'sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm',
    'vision_left', 'vision_right', 'hearing_left', 'hearing_right',
    'bp_systolic', 'bp_diastolic', 'fasting_glucose',
    'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
    'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
    'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
    'oral_health_status', 'dental_cavity_status', 'tartar_presence'
]

# Subset (defensive)
X_all = train[features].copy()
y_all = train['has_copd_risk'].copy()
X_test_all = test[features].copy()

# -------------------------
# Preprocessing (consistent mapping)
# -------------------------
# age_group to int if possible
X_all['age_group'] = X_all['age_group'].astype(int)
X_test_all['age_group'] = X_test_all['age_group'].astype(int)

# map categorical columns
for col in ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']:
    X_all[col] = X_all[col].astype(str).str.upper().map({'M':1, 'F':0, 'Y':1, 'N':0}).fillna(0).astype(int)
    X_test_all[col] = X_test_all[col].astype(str).str.upper().map({'M':1, 'F':0, 'Y':1, 'N':0}).fillna(0).astype(int)

# numeric missing fill (median)
X_all = X_all.fillna(X_all.median(numeric_only=True))
X_test_all = X_test_all.fillna(X_all.median(numeric_only=True))  # use train medians for test

# -------------------------
# Hold-out validation split (we keep a final holdout for threshold tuning)
# -------------------------
X_train_inner, X_val_holdout, y_train_inner, y_val_holdout = train_test_split(
    X_all, y_all, test_size=0.20, random_state=42, stratify=y_all
)

print(f"Train inner shape: {X_train_inner.shape}, Holdout val shape: {X_val_holdout.shape}")

# -------------------------
# Step 1: RFECV for feature selection on the inner training set
# (this reduces features in a CV-aware way)
# -------------------------
print("\nRunning RFECV (this can be slow) ...")
svc_for_rfe = LinearSVC(C=1.0, class_weight='balanced', max_iter=10000, random_state=42)
cv_rfe = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# RFECV scoring uses F1 (binary)
rfe = RFECV(
    estimator=svc_for_rfe,
    step=1,
    cv=cv_rfe,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
rfe.fit(X_train_inner, y_train_inner)

# features kept
support_mask = rfe.support_
selected_features = [f for f, keep in zip(X_train_inner.columns.tolist(), support_mask) if keep]
print(f"\nRFECV selected {len(selected_features)} features out of {len(features)}:")
print(selected_features)

# Reduce datasets to selected features
X_train_sel = X_train_inner[selected_features].copy()
X_val_sel   = X_val_holdout[selected_features].copy()
X_test_sel  = X_test_all[selected_features].copy()

# -------------------------
# Step 2: Build pipeline and GridSearchCV for best scaler, C, and calibration method
# -------------------------
print("\nGrid search over scalers, C and calibration method (this is nested / expensive)...")

# Pipeline structure:
#  - scaler (placeholder)
#  - calibrated classifier (wraps LinearSVC)
pipe = Pipeline([
    ('scaler', StandardScaler()),  # will be replaced via param grid
    ('clf', CalibratedClassifierCV(LinearSVC(class_weight='balanced', max_iter=10000, random_state=42), method='sigmoid', cv=5))
])

# Parameter grid — use 'clf__estimator__C' (not base_estimator)
param_grid = {
    'scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'clf__estimator__C': [0.001, 0.01, 0.02, 0.05, 0.1, 0.5, 1.0],
    'clf__method': ['sigmoid', 'isotonic']  # isotonic may be slower and needs more data — included for completeness
}

cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv_outer,
    scoring='f1',
    n_jobs=-1,
    verbose=2,
    refit=True
)

# Fit grid on X_train_sel (inner training)
grid.fit(X_train_sel, y_train_inner)

print("\nGrid search done.")
print("Best params:", grid.best_params_)
print("Best CV f1 (on inner training during grid search):", grid.best_score_)

best_pipe = grid.best_estimator_

# -------------------------
# Step 3: Determine best probability threshold on holdout validation (val_holdout)
# -------------------------
print("\nTuning decision threshold on holdout validation set ...")
val_proba = best_pipe.predict_proba(X_val_sel)[:, 1]

best_thresh = 0.5
best_val_f1 = 0.0
for thr in np.arange(0.30, 0.71, 0.01):  # wide search 0.30-0.70
    val_pred = (val_proba >= thr).astype(int)
    f1 = f1_score(y_val_holdout, val_pred)
    if f1 > best_val_f1:
        best_val_f1 = f1
        best_thresh = thr

print(f"Best holdout validation F1: {best_val_f1:.5f} at threshold {best_thresh:.3f}")
print("\nClassification report on holdout (using best threshold):")
print(classification_report(y_val_holdout, (val_proba >= best_thresh).astype(int)))

# -------------------------
# Step 4: Honest "training" F1 via cross-validated probabilities on the inner training set
# (we use cross_val_predict with the best pipeline and compute F1 using the chosen threshold)
# -------------------------
print("\nComputing cross-validated 'training' F1 on inner training set (honest CV) ...")
# cross_val_predict with method='predict_proba' works if estimator implements it — our pipeline does because clf is calibrated
train_cv_proba = cross_val_predict(best_pipe, X_train_sel, y_train_inner, cv=cv_outer, method='predict_proba', n_jobs=-1)[:, 1]
train_cv_pred = (train_cv_proba >= best_thresh).astype(int)
train_cv_f1 = f1_score(y_train_inner, train_cv_pred)
print(f"Cross-validated Training F1 (using selected threshold): {train_cv_f1:.5f}")

# -------------------------
# Step 5: Retrain final model on FULL train data (inner train + holdout) using selected features & best params
# -------------------------
print("\nRetraining best pipeline on FULL training data (for final test predictions)...")
X_full_sel = X_all[selected_features].copy()
X_test_sel  = X_test_all[selected_features].copy()

# Recreate pipeline with found best params (explicitly)
final_scaler = grid.best_params_['scaler']
final_C = grid.best_params_['clf__estimator__C']
final_method = grid.best_params_['clf__method']

final_pipe = Pipeline([
    ('scaler', final_scaler),
    ('clf', CalibratedClassifierCV(LinearSVC(C=final_C, class_weight='balanced', max_iter=10000, random_state=42),
                                  method=final_method, cv=5))
])

final_pipe.fit(X_full_sel, y_all)

# Predict probabilities on test, apply threshold
test_proba = final_pipe.predict_proba(X_test_sel)[:, 1]
test_pred = (test_proba >= best_thresh).astype(int)

# Save submission
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': test_pred
})
submission.to_csv(SUBMISSION_CSV, index=False)

# Final reporting
print("\n=== SUMMARY ===")
print(f"Selected features ({len(selected_features)}): {selected_features}")
print(f"Best grid params: scaler={final_scaler}, C={final_C}, calibration_method={final_method}")
print(f"Inner-grid CV best f1: {grid.best_score_:.5f}")
print(f"Cross-validated training F1 (inner training): {train_cv_f1:.5f}")
print(f"Holdout validation F1 (after threshold tune): {best_val_f1:.5f} at threshold {best_thresh:.3f}")
print(f"Test positives predicted: {test_pred.sum()} / {len(test_pred)}")
print(f"Submission saved to: {SUBMISSION_CSV}")

# Optionally show top coefficients (if you want interpretability)
try:
    # Try both attribute names used across sklearn versions
    clf_wrapper = final_pipe.named_steps['clf']
    base_clf = getattr(clf_wrapper, "estimator", None) or getattr(clf_wrapper, "base_estimator", None)
    if base_clf is not None:
        coefs = base_clf.coef_.ravel()
        feat_imp = pd.DataFrame({
            'feature': selected_features,
            'coef': coefs
        }).sort_values(key=lambda df: df['coef'].abs(), ascending=False)
        print("\nTop feature coefficients (absolute):")
        print(feat_imp.head(12).to_string(index=False))
    else:
        print("\nCould not find underlying estimator coefficients (sklearn version mismatch).")
except Exception:
    pass


Train inner shape: (35642, 25), Holdout val shape: (8911, 25)

Running RFECV (this can be slow) ...
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.

RFECV selected 21 features out of 25:
['sex', 'age_group', 'height_cm', 'weight_kg', 'vision_left', 'vision_right', 'hearing_left', 'hearing_right', 'bp_systolic', 'bp_diastolic', 'fasting_glucose', 'total_cholesterol', 'triglycerides', 'hemoglobin_level', 'urine_protein_level', 'serum_creatinine', 'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level', 'oral_health_status', 'tartar_presence']

Grid search over scalers, C and calibration method (this is nested / expensive)...
Fitting 5 folds for each of 42 candidates, totalling 210 fits

Grid search done.
Best params: {'clf__estimator__C': 0.001, 'clf__method': 'sigmoid', 'scaler': MinMaxScaler()}
Best CV f1 (on inner training during grid search): 0.6750460069997741

Tuning decision threshol

In [None]:
#5
# ============================================================
# OPTION A — FASTEST HIGH-F1 SVM with One-Hot Encoding + Training F1
# RBF approximation using Random Fourier Features + Linear SVM
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import loguniform
import warnings
warnings.filterwarnings("ignore")

# -------------------------
# Load data
# -------------------------
df_train = pd.read_csv("train.csv")
df_test  = pd.read_csv("test.csv")

X = df_train.drop(["has_copd_risk", "patient_id"], axis=1)
y = df_train["has_copd_risk"]
test_ids = df_test["patient_id"]
X_test = df_test.drop(["patient_id"], axis=1)

# -------------------------
# Identify categorical and numeric columns
# -------------------------
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols     = X.select_dtypes(include=[np.number]).columns.tolist()

# -------------------------
# Preprocessor: One-Hot + Scaling
# -------------------------
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

# -------------------------
# Train/Validation Split
# -------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------
# Pipeline: Preprocessing → RBF Approx → Linear SVM → Calibration
# -------------------------
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("rbf", RBFSampler(random_state=42)),
    ("svm", CalibratedClassifierCV(
        LinearSVC(class_weight="balanced", max_iter=5000),
        method="sigmoid",
        cv=3
    ))
])

# -------------------------
# Hyperparameter Search Space
# -------------------------
param_grid = {
    "rbf__n_components": [200, 300, 400, 500, 700],
    "rbf__gamma": loguniform(1e-3, 1e0),
    "svm__estimator__C": loguniform(1e-2, 1e2)
}

f1_scorer = make_scorer(f1_score)

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    scoring=f1_scorer,
    n_iter=20,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# -------------------------
# Train FAST SVM
# -------------------------
search.fit(X_train, y_train)

print("\nBest hyperparameters:", search.best_params_)

# -------------------------
# Training F1
# -------------------------
train_pred = search.predict(X_train)
train_f1 = f1_score(y_train, train_pred)
print(f"Training F1 Score: {train_f1:.4f}")

# -------------------------
# Validation F1
# -------------------------
val_pred = search.predict(X_val)
val_f1 = f1_score(y_val, val_pred)
print(f"Validation F1 Score: {val_f1:.4f}")

# -------------------------
# Predict Test & Save Submission
# -------------------------
y_test_pred = search.predict(X_test)

submission = pd.DataFrame({
    "patient_id": test_ids,
    "has_copd_risk": y_test_pred.astype(int)
})

submission.to_csv("submission_fast_svm_5.csv", index=False)
print("\nSubmission saved to 'submission_fast_svm.csv'")


Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best hyperparameters: {'rbf__gamma': np.float64(0.002323350351539011), 'rbf__n_components': 700, 'svm__estimator__C': np.float64(21.51689729808333)}
Training F1 Score: 0.6883
Validation F1 Score: 0.6818

Submission saved to 'submission_fast_svm.csv'


In [6]:
#6
#  ============================================================
# IMPROVED TRUE RBF SVM — FAST + HIGH-F1
# ============================================================

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

features = ['sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm',
            'vision_left', 'vision_right', 'hearing_left', 'hearing_right',
            'bp_systolic', 'bp_diastolic', 'fasting_glucose',
            'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
            'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
            'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
            'oral_health_status', 'dental_cavity_status', 'tartar_presence']

X = train[features].copy()
y = train['has_copd_risk']
X_test = test[features].copy()

# Cleaning
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

for col in ['sex','oral_health_status','dental_cavity_status','tartar_presence']:
    mapping = {'M':1,'F':0,'Y':1,'N':0}
    X[col] = X[col].astype(str).str.upper().map(mapping).fillna(0).astype(int)
    X_test[col] = X_test[col].astype(str).str.upper().map(mapping).fillna(0).astype(int)

X = X.fillna(X.median(numeric_only=True))
X_test = X_test.fillna(X.median(numeric_only=True))

# Split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

# ============================================================
# 1) FAST C-SEARCH ON 10–15% SUBSET
# ============================================================
sub_idx = np.random.choice(len(X_train_s), size=int(0.15 * len(X_train_s)), replace=False)
Xs = X_train_s[sub_idx]
ys = y_train.iloc[sub_idx]

C_values = [0.5, 1.0, 2.0, 3.0, 5.0]
best_f1 = -1
best_C = 1.0

print("\nTuning C on subset...")
for C in C_values:
    svm_temp = SVC(
        kernel='rbf',
        C=C,
        class_weight='balanced',
        probability=False,
        shrinking=True,
        cache_size=1200,
        random_state=42
    )
    svm_temp.fit(Xs, ys)
    pred = svm_temp.predict(X_val_s)
    f1 = f1_score(y_val, pred)
    print(f"C={C} → F1={f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_C = C

print(f"\nBEST C FOUND: {best_C}")

# ============================================================
# 2) TRAIN FINAL MODEL (FULL DATA) using best C
# ============================================================
print("\nTraining FINAL SVM (may take ~10-15 min)...")
final_svm = SVC(
    kernel='rbf',
    C=best_C,
    class_weight='balanced',
    probability=False,
    shrinking=True,
    cache_size=1500,
    random_state=42
)

# Fit on FULL TRAINING DATA (X, y)
X_full_scaled = scaler.fit_transform(X)
final_svm.fit(X_full_scaled, y)

# -------- TRAINING F1 --------
train_pred = final_svm.predict(X_full_scaled)
train_f1 = f1_score(y, train_pred)
print(f"\nTRAINING F1 (FULL DATA): {train_f1:.4f}")

# -------- VALIDATION F1 --------
# (Use the earlier X_val_s and y_val)
val_pred = final_svm.predict(X_val_s)
val_f1 = f1_score(y_val, val_pred)
print(f"VALIDATION F1: {val_f1:.4f}\n")

# -------- Test predictions --------
X_test_scaled = scaler.transform(X_test)
predictions = final_svm.predict(X_test_scaled)

# Save file
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_rbf_svm_improved_6.csv", index=False)

print("Saved: submission_rbf_svm_improved_6.csv")
print("Positive cases:", predictions.sum(), "/", len(predictions))



Tuning C on subset...
C=0.5 → F1=0.7000
C=1.0 → F1=0.7037
C=2.0 → F1=0.7033
C=3.0 → F1=0.7005
C=5.0 → F1=0.6979

BEST C FOUND: 1.0

Training FINAL SVM (may take ~10-15 min)...

TRAINING F1 (FULL DATA): 0.7206
VALIDATION F1: 0.7178

Saved: submission_rbf_svm_improved_6.csv
Positive cases: 6647 / 11139


In [1]:
#7
# ============================================================
# IMPROVED TRUE RBF SVM + FEATURE ENGINEERING — FAST + HIGH-F1
# ============================================================

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# ---------------------------
# FEATURE ENGINEERING
# ---------------------------

def add_features(df):
    df = df.copy()
    
    # BMI
    df["bmi"] = df["weight_kg"] / ((df["height_cm"] / 100) ** 2)
    
    # Pulse pressure (BP predictor)
    df["pulse_pressure"] = df["bp_systolic"] - df["bp_diastolic"]
    
    # Lipid ratios
    df["chol_hdl_ratio"] = df["total_cholesterol"] / df["hdl_cholesterol"].replace(0, np.nan)
    df["ldl_hdl_ratio"] = df["ldl_cholesterol"] / df["hdl_cholesterol"].replace(0, np.nan)
    df["tg_hdl_ratio"]  = df["triglycerides"]     / df["hdl_cholesterol"].replace(0, np.nan)
    
    # Glucose metabolism
    df["glucose_hdl_ratio"] = df["fasting_glucose"] / df["hdl_cholesterol"].replace(0, np.nan)
    
    # creatinine to weight ratio (kidney/muscle)
    df["creatinine_to_weight"] = df["serum_creatinine"] / df["weight_kg"].replace(0, np.nan)

    # Replace any inf
    df = df.replace([np.inf, -np.inf], np.nan)
    return df

train = add_features(train)
test  = add_features(test)

# Main features + engineered ones
features = [
    'sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm',
    'vision_left', 'vision_right', 'hearing_left', 'hearing_right',
    'bp_systolic', 'bp_diastolic', 'fasting_glucose',
    'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
    'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
    'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
    'oral_health_status', 'dental_cavity_status', 'tartar_presence',

    # engineered features:
    'bmi', 'pulse_pressure', 'chol_hdl_ratio', 'ldl_hdl_ratio', 'tg_hdl_ratio',
    'glucose_hdl_ratio', 'creatinine_to_weight'
]

X = train[features].copy()
y = train['has_copd_risk']
X_test = test[features].copy()

# Cleaning categorical
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

for col in ['sex','oral_health_status','dental_cavity_status','tartar_presence']:
    mapping = {'M':1,'F':0,'Y':1,'N':0}
    X[col] = X[col].astype(str).str.upper().map(mapping).fillna(0).astype(int)
    X_test[col] = X_test[col].astype(str).str.upper().map(mapping).fillna(0).astype(int)

# Fill NA with median
X = X.fillna(X.median(numeric_only=True))
X_test = X_test.fillna(X.median(numeric_only=True))

# Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

# ============================================================
# FAST C-SEARCH (subset)
# ============================================================
sub_idx = np.random.choice(len(X_train_s), size=int(0.15 * len(X_train_s)), replace=False)
Xs = X_train_s[sub_idx]
ys = y_train.iloc[sub_idx]

C_values = [0.5, 1.0, 2.0, 3.0, 5.0]
best_f1 = -1
best_C = 1.0

print("\nTuning C on subset...")
for C in C_values:
    model = SVC(
        kernel='rbf',
        C=C,
        class_weight='balanced',
        probability=False,
        shrinking=True,
        cache_size=1200,
        random_state=42
    )
    model.fit(Xs, ys)
    pred = model.predict(X_val_s)
    f1 = f1_score(y_val, pred)
    print(f"C={C} → F1={f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_C = C

print(f"\nBEST C FOUND: {best_C}")

# ============================================================
# TRAIN FINAL MODEL
# ============================================================
print("\nTraining FINAL SVM...")
final_svm = SVC(
    kernel='rbf',
    C=best_C,
    class_weight='balanced',
    shrinking=True,
    probability=False,
    cache_size=1500,
    random_state=42
)

X_full_scaled = scaler.fit_transform(X)
final_svm.fit(X_full_scaled, y)

# TRAINING F1
train_pred = final_svm.predict(X_full_scaled)
train_f1 = f1_score(y, train_pred)
print(f"\nTRAINING F1 (FULL): {train_f1:.4f}")

# VALIDATION F1
val_pred = final_svm.predict(X_val_s)
val_f1 = f1_score(y_val, val_pred)
print(f"VALIDATION F1: {val_f1:.4f}\n")

# TEST PREDICTION
predictions = final_svm.predict(X_test_s)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_rbf_svm_7.csv", index=False)

print("Saved: submission_rbf_svm_FE.csv")
print("Positive cases:", predictions.sum(), "/", len(predictions))



Tuning C on subset...
C=0.5 → F1=0.7013
C=1.0 → F1=0.7043
C=2.0 → F1=0.7056
C=3.0 → F1=0.7026
C=5.0 → F1=0.6982

BEST C FOUND: 2.0

Training FINAL SVM...

TRAINING F1 (FULL): 0.7310
VALIDATION F1: 0.7286

Saved: submission_rbf_svm_FE.csv
Positive cases: 6491 / 11139


In [2]:
#8
# ============================================================
# IMPROVED TRUE RBF SVM — FEATURE ENGINEERED (NO EXTRA TIME)
# ============================================================

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Original features
features = ['sex', 'age_group', 'height_cm', 'weight_kg',
            'waist_circumference_cm', 'vision_left', 'vision_right',
            'hearing_left', 'hearing_right', 'bp_systolic',
            'bp_diastolic', 'fasting_glucose', 'total_cholesterol',
            'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
            'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
            'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
            'oral_health_status', 'dental_cavity_status', 'tartar_presence']

X = train[features].copy()
y = train['has_copd_risk']
X_test = test[features].copy()

# ============================================================
# FEATURE ENGINEERING (FREE BOOSTS)
# ============================================================

def add_features(df):
    # BMI
    df["BMI"] = df["weight_kg"] / (df["height_cm"] / 100) ** 2

    # Pulse pressure
    df["pulse_pressure"] = df["bp_systolic"] - df["bp_diastolic"]

    # Asymmetry features
    df["vision_diff"] = (df["vision_left"] - df["vision_right"]).abs()
    df["hearing_diff"] = (df["hearing_left"] - df["hearing_right"]).abs()

    # Log transforms for skewed lab values
    df["log_triglycerides"] = np.log1p(df["triglycerides"])
    df["log_alt"] = np.log1p(df["alt_enzyme_level"])
    df["log_ggt"] = np.log1p(df["ggt_enzyme_level"])

    return df

X = add_features(X)
X_test = add_features(X_test)

# Encode binary categorical
for col in ['sex','oral_health_status','dental_cavity_status','tartar_presence']:
    mapping = {'M':1,'F':0,'Y':1,'N':0}
    X[col] = X[col].astype(str).str.upper().map(mapping).fillna(0).astype(int)
    X_test[col] = X_test[col].astype(str).str.upper().map(mapping).fillna(0).astype(int)

X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

# Fill missing
X = X.fillna(X.median(numeric_only=True))
X_test = X_test.fillna(X.median(numeric_only=True))

# ============================================================
# TRAIN/VAL SPLIT
# ============================================================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

# ============================================================
# 1) FAST C-SEARCH ON SUBSET
# ============================================================
sub_idx = np.random.choice(len(X_train_s), size=int(0.15 * len(X_train_s)), replace=False)
Xs = X_train_s[sub_idx]
ys = y_train.iloc[sub_idx]

C_values = [0.5, 1.0, 2.0, 3.0, 5.0]
best_f1 = -1
best_C = 1.0

print("\nTuning C on subset...")
for C in C_values:
    svm_temp = SVC(kernel='rbf', C=C, class_weight='balanced',
                   shrinking=True, cache_size=1200, probability=False)
    svm_temp.fit(Xs, ys)
    pred = svm_temp.predict(X_val_s)
    f1 = f1_score(y_val, pred)
    print(f"C={C} → F1={f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_C = C

print(f"\nBEST C FOUND: {best_C}")

# ============================================================
# 2) TRAIN FINAL MODEL (FULL DATA)
# ============================================================
print("\nTraining FINAL SVM...")
X_full_scaled = scaler.fit_transform(X)

final_svm = SVC(
    kernel='rbf',
    C=best_C,
    class_weight='balanced',
    shrinking=True,
    cache_size=1500,
    probability=False
)

final_svm.fit(X_full_scaled, y)

# TRAIN F1
train_pred = final_svm.predict(X_full_scaled)
train_f1 = f1_score(y, train_pred)
print(f"\nTRAINING F1 (FULL): {train_f1:.4f}")

# VAL F1
val_pred = final_svm.predict(X_val_s)
val_f1 = f1_score(y_val, val_pred)
print(f"VALIDATION F1: {val_f1:.4f}\n")

# TEST PREDICTIONS
predictions = final_svm.predict(X_test_s)

submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_rbf_svm_feature_engineered_8.csv", index=False)

print("Saved: submission_rbf_svm_feature_engineered.csv")
print("Positive cases:", predictions.sum(), "/", len(predictions))



Tuning C on subset...
C=0.5 → F1=0.7025
C=1.0 → F1=0.7051
C=2.0 → F1=0.7068
C=3.0 → F1=0.7059
C=5.0 → F1=0.7001

BEST C FOUND: 2.0

Training FINAL SVM...

TRAINING F1 (FULL): 0.7343
VALIDATION F1: 0.7320

Saved: submission_rbf_svm_feature_engineered.csv
Positive cases: 6362 / 11139
