In [132]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

RANDOM_STATE = 42



In [140]:
# Paths
# The project root defaults to the author's local folder. Set the HW2_BASE_DIR
# environment variable to run the notebook from another machine or checkout
# without editing this cell.
BASE = Path(os.environ.get(
    "HW2_BASE_DIR",
    "/Users/YvanLongin/Google Drive/My Drive/Boston College Applied Econ Program/Big Data Econometrics (Fall 25)/Assignments/HW 2",
))
RAW = BASE / "RawData"
PROCESSED = BASE / "ProcessedData"
OUTPUT = BASE / "Output"

# Create the project folders if they do not exist yet.
for p in (RAW, PROCESSED, OUTPUT):
    p.mkdir(parents=True, exist_ok=True)

# Input files read by the notebook.
TRAIN_PATH = RAW / "train.csv"
TEST_PATH = RAW / "test.csv"

In [143]:
# Load the raw tables here so that `train` and `test` exist after a fresh
# Restart & Run All instead of relying on variables left over in the kernel.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
print("train shape:", train.shape, "test shape:", test.shape)


train shape: (12243, 12) test shape: (3061, 11)


In [160]:

# Identify ID, target, features
# Lower-cased name -> actual column label, used for case-insensitive lookups.
lower_map = {str(col).lower(): col for col in train.columns}

# Prefer an exact "id" / "ID" label, then any other capitalisation (e.g. "Id"),
# and only then fall back to the first column (typical Kaggle layout).
if "id" in train.columns:
    ID_COL = "id"
elif "ID" in train.columns:
    ID_COL = "ID"
else:
    ID_COL = lower_map.get("id", train.columns[0])

TARGET = "stroke"  # dataset target; adjust if different

# If the dataset capitalises the target differently, switch to the actual label.
if TARGET not in train.columns:
    if TARGET.lower() in lower_map:
        TARGET = lower_map[TARGET.lower()]
    else:
        raise ValueError(f"Target column {TARGET!r} not found. Columns: {train.columns.tolist()}")

# Basic cleaning rules (simple, reproducible)
def preprocess_tables(train_df, test_df, target_col, id_col=None):
    """Split the raw tables into model inputs and build an unfitted preprocessor.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training table containing the id column, the target and the features.
    test_df : pandas.DataFrame
        Test table containing the id column and the features.
    target_col : str
        Name of the target column in ``train_df``.
    id_col : str, optional
        Name of the identifier column. Defaults to the notebook-level
        ``ID_COL`` so existing three-argument calls keep working.

    Returns
    -------
    tuple
        ``(X, y, X_test, preprocessor, numeric_cols, categorical_cols, test_ids)``
        where ``X``/``X_test`` are the feature frames without the id (and,
        for ``X``, without the target), ``y`` is the target cast to ``int``,
        ``preprocessor`` is an unfitted ``ColumnTransformer`` and ``test_ids``
        is a copy of the test id column.

    Notes
    -----
    Neither input frame is modified; the function works on copies. Columns
    whose dtype is not int64/float64/object/category/bool are in neither
    column list and are therefore dropped by the transformer
    (``remainder='drop'``).
    """
    if id_col is None:
        id_col = ID_COL  # fall back to the notebook-level identifier column

    # Work on copies so the caller's frames are left untouched.
    train_df = train_df.copy()
    test_df = test_df.copy()
    test_ids = test_df[id_col].copy()

    # Keep only the first row for each id in the training table.
    train_df = train_df.drop_duplicates(subset=id_col)

    # 'bmi' may arrive as text (e.g. 'N/A'); coerce it to numeric so that
    # unparseable entries become NaN. Each frame is checked on its own so a
    # table without the column does not raise.
    for df in (train_df, test_df):
        if 'bmi' in df.columns:
            df['bmi'] = pd.to_numeric(df['bmi'], errors='coerce')

    # Separate features and target
    X = train_df.drop(columns=[target_col, id_col])
    y = train_df[target_col].astype(int)

    X_test = test_df.drop(columns=[id_col])

    # Identify numerical vs categorical feature columns from the training dtypes.
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # Numeric features: median imputation followed by standardisation.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    # Categorical features: mode imputation followed by one-hot encoding;
    # categories unseen during fitting are ignored at transform time.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ], remainder='drop')

    return X, y, X_test, preprocessor, numeric_cols, categorical_cols, test_ids

# Run the cleaning step once, then unpack its seven outputs.
prepared = preprocess_tables(train, test, TARGET)
X, y, X_test, preprocessor, numeric_cols, categorical_cols, test_ids = prepared

print(f"Numeric cols: {numeric_cols}")
print(f"Categorical cols: {categorical_cols}")

# Hold out 20% of the training rows for validation, stratified on the target
# so both parts keep the same class balance.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = split_parts

# Candidate classifiers (allowed methods).
# The two logistic variants share every setting except the penalty type.
logit_settings = dict(
    max_iter=2000,
    solver='saga',
    class_weight='balanced',
    random_state=RANDOM_STATE,
)
models = {
    "logistic_l2": LogisticRegression(penalty='l2', **logit_settings),
    "logistic_l1": LogisticRegression(penalty='l1', **logit_settings),
    "knn": KNeighborsClassifier(n_neighbors=5),
}

# Fit each candidate in its own pipeline and score it on the validation split.
results = {}
for name, clf in models.items():
    # Pipeline fits its steps in place, so every pipeline gets its own clone of
    # the preprocessor. Sharing one instance would let a later fit (including
    # the final refit on all data) silently change the preprocessing inside
    # the pipelines already stored in `results`.
    pipe = Pipeline(steps=[('pre', clone(preprocessor)), ('clf', clf)])
    print(f"\nTraining {name} ...")
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    # Positive-class probability when the final estimator provides one.
    probs = pipe.predict_proba(X_val)[:, 1] if hasattr(pipe, "predict_proba") else None

    # zero_division=0 on all three metrics: a model that predicts no positives
    # scores 0 without emitting an undefined-metric warning.
    f1 = f1_score(y_val, preds, zero_division=0)
    prec = precision_score(y_val, preds, zero_division=0)
    rec = recall_score(y_val, preds, zero_division=0)
    auc = roc_auc_score(y_val, probs) if probs is not None else float('nan')

    results[name] = {"pipeline": pipe, "f1": f1, "precision": prec, "recall": rec, "auc": auc}
    print(f"{name} -> F1: {f1:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, AUC: {auc:.4f}")

# Choose the candidate with the highest validation F1 (first one wins a tie).
best_name, best_entry = max(results.items(), key=lambda item: item[1]['f1'])
best = best_entry["pipeline"]
print(f"\nBest model by F1: {best_name} (F1={results[best_name]['f1']:.4f})")


# Record the validation F1 of the selected model. The loop variable `f1` still
# holds the score of whichever model was evaluated last, so the value is read
# from `results` instead. The file goes to the configured OUTPUT folder, next
# to the submission, rather than to a folder relative to the working directory.
OUTPUT.mkdir(parents=True, exist_ok=True)
with open(OUTPUT / "evaluation_results.txt", "w") as f:
    f.write(f"F1-score: {results[best_name]['f1']:.4f}\n")

# Fit best model on full training data
print("Refitting best model on full training set...")
best.fit(X, y)

# Predict binary 0/1 labels for the test set.
if hasattr(best, "predict_proba"):
    # Positive-class probability, cut at 0.5.
    test_probs = best.predict_proba(X_test)[:, 1]
    test_pred = (test_probs >= 0.5).astype(int)
else:
    # Without probabilities, use the estimator's own decision rule. Rescaling
    # decision_function scores to [0, 1] and cutting at 0.5 would move the
    # decision boundary (which sits at score 0) and divide by zero when all
    # scores are equal.
    test_pred = np.asarray(best.predict(X_test)).astype(int)
    # Kept so any code reading `test_probs` still finds a numeric array.
    test_probs = test_pred.astype(float)

# Assemble the submission table: one row per test id with its 0/1 prediction,
# re-indexed from zero so no index from the test table carries over.
# NOTE(review): the prediction column is literally named "TARGET", not the
# value of the TARGET variable; confirm this matches the required format.
submission = pd.DataFrame({ID_COL: test_ids, "TARGET": test_pred}).reset_index(drop=True)

# Write the file into the configured output folder.
OUT_PATH = OUTPUT / "submission4.csv"
submission.to_csv(OUT_PATH, index=False)
print(f"Submission saved to {OUT_PATH}")


Numeric cols: ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
Categorical cols: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

Training logistic_l2 ...
logistic_l2 -> F1: 0.2321, Precision: 0.1357, Recall: 0.8020, AUC: 0.8684

Training logistic_l1 ...
logistic_l1 -> F1: 0.2225, Precision: 0.1292, Recall: 0.8020, AUC: 0.8688

Training knn ...
knn -> F1: 0.0672, Precision: 0.2222, Recall: 0.0396, AUC: 0.6548

Best model by F1: logistic_l2 (F1=0.2321)
Refitting best model on full training set...
Submission saved to /Users/YvanLongin/Google Drive/My Drive/Boston College Applied Econ Program/Big Data Econometrics (Fall 25)/Assignments/HW 2/Output/submission4.csv
