In [1]:
# SUPPORT VECTOR MACHINE (RBF kernel) — intended to beat logistic regression (target F1 ~0.76+; the recorded run reached ~0.71 on validation)
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Cell overview: load the train/test CSVs, fit an RBF-kernel SVC on an 80/20
# stratified split to measure validation F1, then refit on every training row
# and write the test-set predictions to a submission CSV.

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
print(f"Train: {train.shape} | Test: {test.shape}\n")

# === Raw feature columns (same list the author used for the logistic model) ===
features = ['sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm',
            'vision_left', 'vision_right', 'hearing_left', 'hearing_right',
            'bp_systolic', 'bp_diastolic', 'fasting_glucose',
            'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
            'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
            'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
            'oral_health_status', 'dental_cavity_status', 'tartar_presence']

X = train[features].copy()
y = train['has_copd_risk']
X_test = test[features].copy()

# === Preprocessing ===
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

# Single lookup shared by train and test so both frames are encoded identically.
# Anything outside M/F/Y/N (including missing values, which astype(str) turns
# into the string 'nan') is not in the map and therefore ends up as 0.
binary_codes = {'M': 1, 'F': 0, 'Y': 1, 'N': 0}
binary_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for frame in (X, X_test):
    for col in binary_cols:
        frame[col] = frame[col].astype(str).str.upper().map(binary_codes).fillna(0).astype(int)

# === Split for validation ===
# Split BEFORE imputing: the scaler below is fit on the training fold only, and
# the imputation medians must follow the same rule or validation rows leak into
# the fill values and the validation score is (slightly) optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

# === CRITICAL: StandardScaler (SVM is very sensitive to scale) ===
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
# The test set is deliberately not transformed here: the submission is produced
# by the full-data scaler/model further down, never by this validation scaler.

# === SVM with RBF kernel + class weighting ===
# One parameter dict feeds both the validation model and the final model so the
# submitted model is guaranteed to be the configuration that was validated.
svm_params = dict(
    kernel='rbf',
    C=1.0,
    class_weight='balanced',   # reweights classes inversely to their frequency
    probability=False,         # only hard labels are used below
    random_state=42,
    cache_size=1000,
)

print("Training SVM (this may take 30–90 seconds)...")
svm = SVC(**svm_params)
svm.fit(X_train_s, y_train)

# === Validation F1 ===
val_pred = svm.predict(X_val_s)
f1 = f1_score(y_val, val_pred)

print(f"\nVALIDATION F1 SCORE (SVM): {f1:.4f}")
print("Classification Report:")
print(classification_report(y_val, val_pred))

# === Train final model on FULL data ===
print("\nTraining final SVM on 100% data...")
# For the final fit every training row is available, so medians come from all of X
# and are reused for the test set.
full_medians = X.median(numeric_only=True)
X = X.fillna(full_medians)
X_test = X_test.fillna(full_medians)

full_scaler = StandardScaler()
X_full_scaled = full_scaler.fit_transform(X)
X_test_scaled = full_scaler.transform(X_test)

final_svm = SVC(**svm_params)
final_svm.fit(X_full_scaled, y)

# === Final predictions ===
predictions = final_svm.predict(X_test_scaled)

# === Save submission ===
submission_path = "submission_svm_best.csv"
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv(submission_path, index=False)

print(f"\nSUBMISSION SAVED: {submission_path}")
print(f"Predicted positive cases: {predictions.sum()} / {len(predictions)}")
print("\nFirst 10 predictions:")
print(submission.head(10))

Train: (44553, 27) | Test: (11139, 26)

Training SVM (this may take 30–90 seconds)...

VALIDATION F1 SCORE (SVM): 0.7093
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.60      0.73      5642
           1       0.57      0.93      0.71      3269

    accuracy                           0.72      8911
   macro avg       0.75      0.76      0.72      8911
weighted avg       0.80      0.72      0.72      8911


Training final SVM on 100% data...

SUBMISSION SAVED: submission_svm_best.csv
Predicted positive cases: 6647 / 11139

First 10 predictions:
   patient_id  has_copd_risk
0       42427              0
1       27412              0
2       19283              1
3       45261              1
4       11155              1
5       23515              0
6       30394              1
7        9830              0
8       36174              1
9       18117              1


In [4]:
# LINEAR SVM — FAST, STRONG, ZERO ERRORS
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")   # ← correct way

# Cell overview: same data and preprocessing as the RBF cell, but with a
# LinearSVC. Reports validation F1 on an 80/20 stratified split, then refits on
# all training rows and writes the test predictions to a submission CSV.

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Feature columns fed to the model
features = [
    'sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm',
    'vision_left', 'vision_right', 'hearing_left', 'hearing_right',
    'bp_systolic', 'bp_diastolic', 'fasting_glucose',
    'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol',
    'hemoglobin_level', 'urine_protein_level', 'serum_creatinine',
    'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level',
    'oral_health_status', 'dental_cavity_status', 'tartar_presence'
]

X = train[features].copy()
y = train['has_copd_risk']
X_test = test[features].copy()

# Preprocessing
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

# Shared lookup so train and test are encoded by exactly the same rule.
# Values outside M/F/Y/N (including missing ones, stringified to 'nan') map to 0.
binary_codes = {'M': 1, 'F': 0, 'Y': 1, 'N': 0}
binary_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for frame in (X, X_test):
    for col in binary_cols:
        frame[col] = frame[col].astype(str).str.upper().map(binary_codes).fillna(0).astype(int)

# Validation split — done before imputation so the fill values, like the scaler,
# are learned from the training fold only (no validation rows leak in).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

# Scale (fit on the training fold, apply to validation)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)

# Train Linear SVM
# The same settings are reused for the final model so that what is submitted is
# what was validated.
linear_svm_params = dict(C=0.1, class_weight='balanced', max_iter=10000, random_state=42)

print("Training Linear SVM...")
svm = LinearSVC(**linear_svm_params)
svm.fit(X_train_s, y_train)

# Validation score
val_pred = svm.predict(X_val_s)
f1 = f1_score(y_val, val_pred)
print(f"\nValidation F1 Score: {f1:.4f}")

# Final model on full data: medians and scaler are now learned from every
# training row and then applied unchanged to the test set.
full_medians = X.median(numeric_only=True)
X = X.fillna(full_medians)
X_test = X_test.fillna(full_medians)

full_scaler = StandardScaler()
X_full_s = full_scaler.fit_transform(X)
X_test_s = full_scaler.transform(X_test)

final_svm = LinearSVC(**linear_svm_params)
final_svm.fit(X_full_s, y)

# Predict
predictions = final_svm.predict(X_test_s)

# Save
submission_path = "submission_linear_svm_final.csv"
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv(submission_path, index=False)

print(f"\nDone! {submission_path} saved")
print(f"Predicted positive: {predictions.sum()}/{len(predictions)}")

Training Linear SVM...

Validation F1 Score: 0.7035

Done! submission_linear_svm_final.csv saved
Predicted positive: 6891/11139


In [3]:
#3 ULTIMATE LINEAR SVM — WITH REAL F1 SCORE PRINTED
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Cell overview: LinearSVC on a 12-feature subset, wrapped in sigmoid
# calibration so probabilities are available. A decision threshold is picked on
# the validation split by maximising F1, then the model is refit on all training
# rows and the chosen threshold is applied to the test-set probabilities.

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Feature subset used by this cell
top_features = [
    'age_group', 'bp_systolic', 'fasting_glucose', 'ggt_enzyme_level',
    'hemoglobin_level', 'serum_creatinine', 'triglycerides', 'ldl_cholesterol',
    'oral_health_status', 'tartar_presence', 'dental_cavity_status', 'sex'
]

X = train[top_features].copy()
y = train['has_copd_risk']
X_test = test[top_features].copy()

# Preprocessing
X['age_group'] = X['age_group'].astype(int)
X_test['age_group'] = X_test['age_group'].astype(int)

# Encode the flag columns the same way as the other SVM cells: upper-case first
# so 'y'/'m' are not silently turned into 0, and finish as integers.
# Values outside M/F/Y/N (including missing ones, stringified to 'nan') map to 0.
binary_codes = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
binary_cols = ['oral_health_status', 'tartar_presence', 'dental_cavity_status', 'sex']
for frame in (X, X_test):
    for col in binary_cols:
        frame[col] = frame[col].astype(str).str.upper().map(binary_codes).fillna(0).astype(int)

# TRAIN/VAL SPLIT TO GET REAL F1
# Split before imputing so the fill medians, like the scaler, come from the
# training fold only and the validation rows stay unseen.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

# Scale (fit on the training fold, apply to validation)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)

# Base SVM + Calibration
# Settings are declared once and reused for the final model so the submitted
# model matches the validated one.
linear_svc_params = dict(C=0.02, class_weight='balanced', max_iter=10000, random_state=42)
calibration_params = dict(method='sigmoid', cv=5)

base_svm = LinearSVC(**linear_svc_params)
svm = CalibratedClassifierCV(base_svm, **calibration_params)
svm.fit(X_train_scaled, y_train)

# VALIDATION F1 + BEST THRESHOLD SEARCH
val_proba = svm.predict_proba(X_val_scaled)[:, 1]

# The threshold is selected on the same rows the F1 is reported on, so the
# printed score is an optimistic estimate of held-out performance.
# NOTE(review): the recorded run selected 0.41, near the lower edge of this
# window — consider widening the grid and confirming the optimum is interior.
threshold_grid = np.arange(0.40, 0.55, 0.01)

best_f1 = 0
best_thresh = 0.5
for thresh in threshold_grid:
    pred = (val_proba >= thresh).astype(int)
    f1 = f1_score(y_val, pred)
    if f1 > best_f1:  # strict '>' keeps the lowest threshold on ties
        best_f1 = f1
        best_thresh = thresh

print(f"\nVALIDATION F1 SCORE: {best_f1:.5f}")
print(f"BEST THRESHOLD: {best_thresh:.3f}")
print("\nClassification Report:")
print(classification_report(y_val, (val_proba >= best_thresh).astype(int)))

# FINAL MODEL ON FULL DATA
# Medians and scaler are relearned from every training row, then applied
# unchanged to the test set.
full_medians = X.median(numeric_only=True)
X = X.fillna(full_medians)
X_test = X_test.fillna(full_medians)

full_scaler = StandardScaler()
X_full_scaled = full_scaler.fit_transform(X)
X_test_scaled = full_scaler.transform(X_test)

final_svm = CalibratedClassifierCV(LinearSVC(**linear_svc_params), **calibration_params)
final_svm.fit(X_full_scaled, y)

# Final prediction with the threshold chosen on validation
test_proba = final_svm.predict_proba(X_test_scaled)[:, 1]
final_predictions = (test_proba >= best_thresh).astype(int)

# Save
# The path is held in one variable so the confirmation message cannot disagree
# with the file actually written (the message previously dropped the trailing "3").
submission_path = "submission_svm_god_mode_3.csv"
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': final_predictions
})
submission.to_csv(submission_path, index=False)

print(f"\nSUBMISSION SAVED: {submission_path}")
print(f"Predicted positive cases: {final_predictions.sum()} (using threshold = {best_thresh:.3f})")


VALIDATION F1 SCORE: 0.70114
BEST THRESHOLD: 0.410

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.63      0.74      5642
           1       0.58      0.89      0.70      3269

    accuracy                           0.72      8911
   macro avg       0.74      0.76      0.72      8911
weighted avg       0.79      0.72      0.73      8911


SUBMISSION SAVED: submission_svm_god_mode_.csv
Predicted positive cases: 6289 (using threshold = 0.410)
