In [None]:
## Logistic regression w

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# ----------------------------
# 1. Load datasets
# ----------------------------
app_train = pd.read_csv(r"C:\Users\sina_\OneDrive\Documents\University of utah cybersecurity\capstone 2\application_train.csv")
app_test = pd.read_csv(r"C:\Users\sina_\OneDrive\Documents\University of utah cybersecurity\capstone 2\application_test.csv")
bureau = pd.read_csv(r"C:\Users\sina_\OneDrive\Documents\University of utah cybersecurity\capstone 2\bureau.csv")

# ----------------------------
# 2. Aggregate bureau features
# ----------------------------
bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'AMT_CREDIT_SUM': ['mean', 'max', 'sum'],
    'DAYS_CREDIT': ['min', 'mean'],
    'CREDIT_ACTIVE': lambda x: (x == 'Active').sum(),
    'CREDIT_TYPE': 'nunique'
})
bureau_agg.columns = ['_'.join(col).strip() for col in bureau_agg.columns.values]
bureau_agg.reset_index(inplace=True)

# ----------------------------
# 3. Merge with train/test
# ----------------------------
train_merged = app_train.merge(bureau_agg, on='SK_ID_CURR', how='left')
test_merged  = app_test.merge(bureau_agg, on='SK_ID_CURR', how='left')

# ----------------------------
# 4. Separate numeric and categorical columns
# ----------------------------
numeric_cols = train_merged.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train_merged.select_dtypes(exclude=[np.number]).columns.tolist()

# Remove TARGET from numeric list
numeric_cols = [col for col in numeric_cols if col != 'TARGET']

# ----------------------------
# 5. Fill missing values
# ----------------------------
# Numeric → median
train_merged[numeric_cols] = train_merged[numeric_cols].fillna(train_merged[numeric_cols].median())
test_merged[numeric_cols]  = test_merged[numeric_cols].fillna(train_merged[numeric_cols].median())

# Categorical → mode
for col in categorical_cols:
    mode_val = train_merged[col].mode()[0]
    train_merged[col] = train_merged[col].fillna(mode_val)
    test_merged[col]  = test_merged[col].fillna(mode_val)

# ----------------------------
# 6. One-hot encode categorical features
# ----------------------------
train_encoded = pd.get_dummies(train_merged[categorical_cols], drop_first=True)
test_encoded  = pd.get_dummies(test_merged[categorical_cols], drop_first=True)

# Align columns so train and test match
train_encoded, test_encoded = train_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

# ----------------------------
# 7. Combine numeric + categorical
# ----------------------------
X = pd.concat([train_merged[numeric_cols], train_encoded], axis=1)
y = train_merged['TARGET']
X_test = pd.concat([test_merged[numeric_cols], test_encoded], axis=1)

# ----------------------------
# 8. Scale numeric features only
# ----------------------------
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Final train shape:", X.shape)
print("Final test shape:", X_test.shape)

# ----------------------------
# 9. Train/Validation Split
# ----------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ----------------------------
# 10. Logistic Regression Model
# ----------------------------
model = LogisticRegression(max_iter=5000, solver='saga', C=0.5, n_jobs=-1)
model.fit(X_train, y_train)

# ----------------------------
# 11. Standard Metrics on Validation
# ----------------------------
y_valid_pred = model.predict(X_valid)
y_valid_proba = model.predict_proba(X_valid)[:,1]

print("Validation Accuracy:", accuracy_score(y_valid, y_valid_pred))
print("Validation AUC:", roc_auc_score(y_valid, y_valid_proba))
print("Validation Log Loss:", log_loss(y_valid, y_valid_proba))

# ----------------------------
# 12. Cross-Validation AUC
# ----------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')
print("\nCross-validation AUC scores:", cv_scores)
print("Mean CV AUC:", cv_scores.mean())

# ----------------------------
# 13. Kaggle Submission File
# ----------------------------
test_preds = model.predict_proba(X_test)[:,1]

submission = pd.DataFrame({
    'SK_ID_CURR': test_merged['SK_ID_CURR'],
    'TARGET': test_preds
})

submission.to_csv("submission.csv", index=False)
print("\nSubmission file created: submission.csv")


Final train shape: (307511, 236)
Final test shape: (48744, 236)
Validation Accuracy: 0.919337268100743
Validation AUC: 0.7501270825810031
Validation Log Loss: 0.2486389836633079

Cross-validation AUC scores: [0.74204458 0.75116269 0.7471765  0.75109602 0.74183978]
Mean CV AUC: 0.7466639105813014

Submission file created: submission.csv


In [None]:
the true benchmark is:

Accuracy ≈ 91.9%

AUC = 0.5

Model beats the benchmark in the metric that matters most here: AUC.

## Logistic regression with scaler and targeted interactions

In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# --- Starting point: build X and y from your merged data ---
# ----------------------------
# 1. Load datasets
# ----------------------------
app_train = pd.read_csv(r"C:\Users\sina_\OneDrive\Documents\University of utah cybersecurity\capstone 2\application_train.csv")
app_test = pd.read_csv(r"C:\Users\sina_\OneDrive\Documents\University of utah cybersecurity\capstone 2\application_test.csv")
bureau = pd.read_csv(r"C:\Users\sina_\OneDrive\Documents\University of utah cybersecurity\capstone 2\bureau.csv")

# ----------------------------
# 2. Aggregate bureau features
# ----------------------------
bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'AMT_CREDIT_SUM': ['mean', 'max', 'sum'],
    'DAYS_CREDIT': ['min', 'mean'],
    'CREDIT_ACTIVE': lambda x: (x == 'Active').sum(),
    'CREDIT_TYPE': 'nunique'
})
bureau_agg.columns = ['_'.join(col).strip() for col in bureau_agg.columns.values]
bureau_agg.reset_index(inplace=True)

# ----------------------------
# 3. Merge with train/test
# ----------------------------
train_merged = app_train.merge(bureau_agg, on='SK_ID_CURR', how='left')
test_merged  = app_test.merge(bureau_agg, on='SK_ID_CURR', how='left')

# If you already have train_merged and test_merged (with bureau aggregations), run the following prep.

# Separate columns
numeric_cols = train_merged.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'TARGET']
categorical_cols = train_merged.select_dtypes(exclude=[np.number]).columns.tolist()

# Impute
train_merged[numeric_cols] = train_merged[numeric_cols].fillna(train_merged[numeric_cols].median())
test_merged[numeric_cols]  = test_merged[numeric_cols].fillna(train_merged[numeric_cols].median())

for c in categorical_cols:
    mode_val = train_merged[c].mode()[0]
    train_merged[c] = train_merged[c].fillna(mode_val)
    test_merged[c]  = test_merged[c].fillna(mode_val)

# One-hot encode categorical
train_encoded = pd.get_dummies(train_merged[categorical_cols], drop_first=True)
test_encoded  = pd.get_dummies(test_merged[categorical_cols], drop_first=True)
train_encoded, test_encoded = train_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

# Feature matrices
X_num = train_merged[numeric_cols].copy()
X_cat = train_encoded.copy()
y = train_merged['TARGET'].copy()

X_num_test = test_merged[numeric_cols].copy()
X_cat_test = test_encoded.copy()

# --- Utility: evaluate a feature matrix with scaling on numeric subset ---
def evaluate_logistic(X_df, y_vec, numeric_feature_names, model_kwargs=None, random_state=42):
    if model_kwargs is None:
        model_kwargs = dict(max_iter=5000, solver='saga', C=0.5, n_jobs=-1)

    # Train/valid split (stratified)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_df, y_vec, test_size=0.2, random_state=random_state, stratify=y_vec
    )

    # Scale numeric columns only
    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_valid_scaled = X_valid.copy()
    if len(numeric_feature_names) > 0:
        X_train_scaled[numeric_feature_names] = scaler.fit_transform(X_train[numeric_feature_names])
        X_valid_scaled[numeric_feature_names] = scaler.transform(X_valid[numeric_feature_names])

    # Fit logistic regression
    model = LogisticRegression(**model_kwargs)
    model.fit(X_train_scaled, y_train)

    # Metrics
    y_valid_pred = model.predict(X_valid_scaled)
    y_valid_proba = model.predict_proba(X_valid_scaled)[:, 1]
    acc = accuracy_score(y_valid, y_valid_pred)
    auc = roc_auc_score(y_valid, y_valid_proba)
    return acc, auc, model, scaler

# --- 1) Baseline numeric-only model ---
X_baseline = X_num.copy()
acc_base, auc_base, model_base, scaler_base = evaluate_logistic(X_baseline, y, numeric_cols)
print(f"Baseline numeric-only -> Accuracy: {acc_base:.6f} | AUC: {auc_base:.6f}")

# --- 2) Numeric + categorical (full) ---
X_full = pd.concat([X_num, X_cat], axis=1)
acc_full, auc_full, model_full, scaler_full = evaluate_logistic(X_full, y, numeric_cols)
print(f"Numeric + categorical -> Accuracy: {acc_full:.6f} | AUC: {auc_full:.6f}")

# --- 3) Add targeted interaction terms ---
# Choose a small set of informative numeric features for interactions (customize as needed).
# Pick stable, non-sparse features to avoid blow-up. Examples (adjust to your column names):
selected_numeric_for_interactions = [
    c for c in numeric_cols
    if c in [
        'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL',
        'DAYS_BIRTH', 'DAYS_EMPLOYED',
        'AMT_CREDIT_SUM_mean', 'AMT_CREDIT_SUM_sum'
    ] and c in X_num.columns
]

# Build polynomial (interaction-only) features on the selected numeric subset
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_inter_num = X_num[selected_numeric_for_interactions].copy()
X_inter_arr = poly.fit_transform(X_inter_num.values)
inter_feature_names = poly.get_feature_names_out(selected_numeric_for_interactions)

X_inter_df = pd.DataFrame(X_inter_arr, index=X_num.index, columns=inter_feature_names)

# Combine: original numeric + interactions + categorical
X_with_interactions = pd.concat([X_num, X_cat, X_inter_df], axis=1)

# Evaluate
# Numeric columns for scaling include the original numeric plus the interaction columns (all are numeric)
numeric_cols_with_interactions = list(X_num.columns) + list(X_inter_df.columns)
acc_inter, auc_inter, model_inter, scaler_inter = evaluate_logistic(
    X_with_interactions, y, numeric_cols_with_interactions
)
print(f"With targeted interactions -> Accuracy: {acc_inter:.6f} | AUC: {auc_inter:.6f}")

# --- Optional: simple ranked printout ---
results = [1`
    ("Baseline numeric-only", acc_base, auc_base),
    ("Numeric + categorical", acc_full, auc_full),
    ("With targeted interactions", acc_inter, auc_inter),
]
print("\nModel comparison (sorted by AUC):")
for name, acc, auc in sorted(results, key=lambda r: r[2], reverse=True):
    print(f"{name:28s} | Acc: {acc:.6f} | AUC: {auc:.6f}")


Baseline numeric-only -> Accuracy: 0.919272 | AUC: 0.737980
Numeric + categorical -> Accuracy: 0.919386 | AUC: 0.750191
With targeted interactions -> Accuracy: 0.919370 | AUC: 0.750895

Model comparison (sorted by AUC):
With targeted interactions   | Acc: 0.919370 | AUC: 0.750895
Numeric + categorical        | Acc: 0.919386 | AUC: 0.750191
Baseline numeric-only        | Acc: 0.919272 | AUC: 0.737980


## Using the baseline Logistic Regression, Random Forest and Gradient Boosting

In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

X = pd.concat([X_num, X_cat], axis=1)
y = train_merged['TARGET']

# --- Use your prepared X (features) and y (TARGET) ---
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

def evaluate_model(model, X_train, y_train, X_valid, y_valid, name="Model"):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    y_proba = model.predict_proba(X_valid)[:,1]
    acc = accuracy_score(y_valid, y_pred)
    auc = roc_auc_score(y_valid, y_proba)
    print(f"{name:25s} | Accuracy: {acc:.6f} | AUC: {auc:.6f}")
    return acc, auc

# Logistic Regression (baseline)
log_reg = LogisticRegression(max_iter=5000, solver='saga', C=0.5, n_jobs=-1)
evaluate_model(log_reg, X_train, y_train, X_valid, y_valid, "Logistic Regression")

# Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    min_samples_split=50,
    n_jobs=-1,
    random_state=42
)
evaluate_model(rf, X_train, y_train, X_valid, y_valid, "Random Forest")

# Gradient Boosting
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
evaluate_model(gb, X_train, y_train, X_valid, y_valid, "Gradient Boosting")


Logistic Regression       | Accuracy: 0.919272 | AUC: 0.549845
Random Forest             | Accuracy: 0.919272 | AUC: 0.742343
Gradient Boosting         | Accuracy: 0.919760 | AUC: 0.758034


(0.919760011706746, 0.7580335011250177)

In [None]:
### Perform the data transformations required by a given algorithm.  For example, some algorithms require numeric data and perform better when it has been standardized or normalized

## Using the downsampling and upsampling methods

In [5]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

# --- Split train/valid ---
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Upsampling ---
ros = RandomOverSampler(random_state=42)
X_train_up, y_train_up = ros.fit_resample(X_train, y_train)

# --- Downsampling ---
rus = RandomUnderSampler(random_state=42)
X_train_down, y_train_down = rus.fit_resample(X_train, y_train)

# --- Define scaler ---
scaler = StandardScaler()

# --- Baseline (no resampling, but scaled) ---
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train_scaled, y_train)
auc_base = roc_auc_score(y_valid, log_reg.predict_proba(X_valid_scaled)[:,1])
acc_base = accuracy_score(y_valid, log_reg.predict(X_valid_scaled))

# --- Upsampled + Scaled ---
X_train_up_scaled = scaler.fit_transform(X_train_up)
X_valid_scaled = scaler.transform(X_valid)   # always transform valid with same scaler

log_reg.fit(X_train_up_scaled, y_train_up)
auc_up = roc_auc_score(y_valid, log_reg.predict_proba(X_valid_scaled)[:,1])
acc_up = accuracy_score(y_valid, log_reg.predict(X_valid_scaled))

# --- Downsampled + Scaled ---
X_train_down_scaled = scaler.fit_transform(X_train_down)
X_valid_scaled = scaler.transform(X_valid)

log_reg.fit(X_train_down_scaled, y_train_down)
auc_down = roc_auc_score(y_valid, log_reg.predict_proba(X_valid_scaled)[:,1])
acc_down = accuracy_score(y_valid, log_reg.predict(X_valid_scaled))

# --- Print results ---
print(f"Baseline (scaled)    -> Accuracy: {acc_base:.4f} | AUC: {auc_base:.4f}")
print(f"Upsampled + Scaled   -> Accuracy: {acc_up:.4f} | AUC: {auc_up:.4f}")
print(f"Downsampled + Scaled -> Accuracy: {acc_down:.4f} | AUC: {auc_down:.4f}")


Baseline (scaled)    -> Accuracy: 0.9194 | AUC: 0.7502
Upsampled + Scaled   -> Accuracy: 0.6912 | AUC: 0.7502
Downsampled + Scaled -> Accuracy: 0.6892 | AUC: 0.7483


## Using the Ensemble method to find AUC

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Define base models
log_reg = LogisticRegression(max_iter=5000, class_weight='balanced')
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(n_estimators=200, random_state=42)

# Voting ensemble (soft = average predicted probabilities)
ensemble = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf), ('gb', gb)],
    voting='soft'
)

# Fit on training data
ensemble.fit(X_train, y_train)

# Evaluate
y_proba = ensemble.predict_proba(X_valid)[:,1]
auc_ensemble = roc_auc_score(y_valid, y_proba)

print(f"Ensemble AUC: {auc_ensemble:.4f}")


Ensemble AUC: 0.7570
