In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Required Models
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
print("="*70)
print("LOADING DATA")
print("="*70)

# Load full training data (no split for final training)
train_df = pd.read_csv('train1.csv')
test_df = pd.read_csv('test.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Separate features and target
X_full = train_df.drop(['target', 'id'], axis=1)
y_full = train_df['target']
X_test_final = test_df.drop(['id'], axis=1)
test_ids = test_df['id']

print(f"\nClass distribution:")
print(y_full.value_counts(normalize=True))


LOADING DATA
Training data shape: (296209, 67)
Test data shape: (126948, 66)

Class distribution:
target
0    0.948732
1    0.051268
Name: proportion, dtype: float64


In [4]:
# ============================================================================
# STEP 2: CREATE VALIDATION SPLIT FOR MODEL COMPARISON
# ============================================================================
print("\n" + "="*70)
print("CREATING VALIDATION SPLIT FOR MODEL EVALUATION")
print("="*70)

X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, 
    test_size=0.3, 
    random_state=29173, 
    stratify=y_full
)

print(f"Train set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")



CREATING VALIDATION SPLIT FOR MODEL EVALUATION
Train set: (207346, 65)
Validation set: (88863, 65)


In [5]:
# ============================================================================
# STEP 3: PREPROCESSING FUNCTIONS
# ============================================================================

def identify_feature_types(df):
    """Identify categorical, binary, and numerical features"""
    categorical_cols = [col for col in df.columns if col.endswith('_cat')]
    binary_cols = [col for col in df.columns if col.endswith('_bin')]
    numerical_cols = [col for col in df.columns 
                     if not col.endswith(('_cat', '_bin'))]
    return categorical_cols, binary_cols, numerical_cols


def class_based_imputer(X, y, categorical_columns):
    """Class-based imputation for categorical columns"""
    X_imputed = X.copy()
    
    for col in categorical_columns:
        if X_imputed[col].isnull().sum() == 0:
            continue
        
        for class_val in [0, 1]:
            class_mask = (y == class_val)
            missing_mask = X_imputed[col].isnull()
            impute_mask = class_mask & missing_mask
            
            if impute_mask.sum() > 0:
                class_values = X_imputed.loc[class_mask & ~missing_mask, col]
                if len(class_values) > 0:
                    fill_value = class_values.mode()[0]
                else:
                    fill_value = X_imputed[col].mode()[0] if len(X_imputed[col].mode()) > 0 else 0
                X_imputed.loc[impute_mask, col] = fill_value
    
    return X_imputed


def class_based_num_imputer(X, y, num_cols):
    """Class-based imputation for numerical columns"""
    X_imputed = X.copy()
    
    for col in num_cols:
        if X_imputed[col].isnull().sum() == 0:
            continue
        
        for class_val in [0, 1]:
            class_mask = (y == class_val)
            missing_mask = X_imputed[col].isnull()
            impute_mask = class_mask & missing_mask
            
            if impute_mask.sum() > 0:
                class_values = X_imputed.loc[class_mask & ~missing_mask, col]
                if len(class_values) > 0:
                    fill_value = class_values.mean()
                else:
                    fill_value = X_imputed[col].mean()
                X_imputed.loc[impute_mask, col] = fill_value
    
    return X_imputed


def preprocess_data(X_train, X_val, y_train, y_val, X_test=None):
    """Complete preprocessing pipeline"""
    
    print("\n" + "="*70)
    print("PREPROCESSING DATA")
    print("="*70)
    
    # Identify feature types
    categorical_cols, binary_cols, numerical_cols = identify_feature_types(X_train)
    
    print(f"Categorical features: {len(categorical_cols)}")
    print(f"Binary features: {len(binary_cols)}")
    print(f"Numerical features: {len(numerical_cols)}")
    
    # Convert types
    X_train[categorical_cols] = X_train[categorical_cols].astype('category')
    X_val[categorical_cols] = X_val[categorical_cols].astype('category')
    X_train[binary_cols] = X_train[binary_cols].astype(int)
    X_val[binary_cols] = X_val[binary_cols].astype(int)
    
    if X_test is not None:
        X_test[categorical_cols] = X_test[categorical_cols].astype('category')
        X_test[binary_cols] = X_test[binary_cols].astype(int)
    
    # Impute missing values
    print("\nImputing missing values...")
    X_train = class_based_imputer(X_train, y_train, categorical_cols)
    X_train = class_based_num_imputer(X_train, y_train, numerical_cols)
    
    # For validation and test, use global mode/mean
    for col in categorical_cols:
        if X_val[col].isnull().sum() > 0:
            X_val[col].fillna(X_train[col].mode()[0], inplace=True)
        if X_test is not None and X_test[col].isnull().sum() > 0:
            X_test[col].fillna(X_train[col].mode()[0], inplace=True)
    
    for col in numerical_cols:
        if X_val[col].isnull().sum() > 0:
            X_val[col].fillna(X_train[col].mean(), inplace=True)
        if X_test is not None and X_test[col].isnull().sum() > 0:
            X_test[col].fillna(X_train[col].mean(), inplace=True)
    
    # Convert categorical to numeric codes for models that need it
    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()
    X_test_encoded = X_test.copy() if X_test is not None else None
    
    for col in categorical_cols:
        X_train_encoded[col] = X_train_encoded[col].cat.codes
        X_val_encoded[col] = X_val_encoded[col].cat.codes
        if X_test_encoded is not None:
            X_test_encoded[col] = X_test_encoded[col].cat.codes
    
    print("Preprocessing complete!")
    
    return (X_train_encoded, X_val_encoded, X_test_encoded, 
            categorical_cols, binary_cols, numerical_cols)

In [6]:
# ============================================================================
# STEP 4: PREPROCESS ALL DATA
# ============================================================================

X_train_proc, X_val_proc, X_test_proc, cat_cols, bin_cols, num_cols = preprocess_data(
    X_train.copy(), X_val.copy(), y_train, y_val, X_test_final.copy()
)


PREPROCESSING DATA
Categorical features: 14
Binary features: 17
Numerical features: 34

Imputing missing values...
Preprocessing complete!


In [7]:
# ============================================================================
# STEP 5: DEFINE AND TRAIN MODELS
# ============================================================================

print("\n" + "="*70)
print("TRAINING AND EVALUATING MODELS")
print("="*70)

results = []

# -------------------------
# 1. CATEGORICAL NAIVE BAYES
# -------------------------
print("\n[1/5] Categorical Naive Bayes")
print("-" * 50)

# For CategoricalNB, ensure all features are non-negative integers
X_train_nb = X_train_proc.copy()
X_val_nb = X_val_proc.copy()

# Make all values non-negative
for col in X_train_nb.columns:
    min_val = X_train_nb[col].min()
    if min_val < 0:
        X_train_nb[col] = X_train_nb[col] - min_val
        X_val_nb[col] = X_val_nb[col] - min_val

try:
    nb_model = CategoricalNB()
    nb_model.fit(X_train_nb, y_train)
    y_pred_nb = nb_model.predict_proba(X_val_nb)[:, 1]
    auroc_nb = roc_auc_score(y_val, y_pred_nb)
    print(f"‚úì Validation AUROC: {auroc_nb:.4f}")
    results.append({
        'Model': 'Categorical Naive Bayes',
        'AUROC': auroc_nb,
        'Notes': 'Baseline probabilistic model'
    })
except Exception as e:
    print(f"‚úó Error: {e}")
    results.append({
        'Model': 'Categorical Naive Bayes',
        'AUROC': None,
        'Notes': f'Failed: {str(e)[:50]}'
    })

# -------------------------
# 2. K-NEAREST NEIGHBORS
# -------------------------
print("\n[2/5] K-Nearest Neighbors")
print("-" * 50)

# KNN requires scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_proc)
X_val_scaled = scaler.transform(X_val_proc)

# Try different k values
best_knn_auroc = 0
best_k = 5

for k in [3, 5, 7, 10, 15]:
    knn_model = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn_model.fit(X_train_scaled, y_train)
    y_pred_knn = knn_model.predict_proba(X_val_scaled)[:, 1]
    auroc_knn = roc_auc_score(y_val, y_pred_knn)
    print(f"  k={k}: AUROC = {auroc_knn:.4f}")
    
    if auroc_knn > best_knn_auroc:
        best_knn_auroc = auroc_knn
        best_k = k

print(f"‚úì Best k={best_k}, Validation AUROC: {best_knn_auroc:.4f}")
results.append({
    'Model': f'KNN (k={best_k})',
    'AUROC': best_knn_auroc,
    'Notes': 'With StandardScaler'
})

# -------------------------
# 3. DECISION TREE
# -------------------------
print("\n[3/5] Decision Tree")
print("-" * 50)

# Use best parameters from Task 1
dt_model = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=1,
    criterion='entropy',
    random_state=29173
)
dt_model.fit(X_train_proc, y_train)
y_pred_dt = dt_model.predict_proba(X_val_proc)[:, 1]
auroc_dt = roc_auc_score(y_val, y_pred_dt)
print(f"‚úì Validation AUROC: {auroc_dt:.4f}")
results.append({
    'Model': 'Decision Tree (Tuned)',
    'AUROC': auroc_dt,
    'Notes': 'max_depth=5, entropy criterion'
})

# -------------------------
# 4. RANDOM FOREST
# -------------------------
print("\n[4/5] Random Forest")
print("-" * 50)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=29173,
    n_jobs=-1,
    class_weight='balanced'
)
print("Training Random Forest (this may take a few minutes)...")
rf_model.fit(X_train_proc, y_train)
y_pred_rf = rf_model.predict_proba(X_val_proc)[:, 1]
auroc_rf = roc_auc_score(y_val, y_pred_rf)
print(f"‚úì Validation AUROC: {auroc_rf:.4f}")
results.append({
    'Model': 'Random Forest',
    'AUROC': auroc_rf,
    'Notes': '100 trees, balanced weights'
})

# -------------------------
# 5. ADABOOST
# -------------------------
print("\n[5/5] AdaBoost")
print("-" * 50)

# Use decision tree as base estimator
base_dt = DecisionTreeClassifier(max_depth=3, random_state=29173)
ada_model = AdaBoostClassifier(
    base_estimator=base_dt,
    n_estimators=50,
    learning_rate=1.0,
    random_state=29173
)
print("Training AdaBoost...")
ada_model.fit(X_train_proc, y_train)
y_pred_ada = ada_model.predict_proba(X_val_proc)[:, 1]
auroc_ada = roc_auc_score(y_val, y_pred_ada)
print(f"‚úì Validation AUROC: {auroc_ada:.4f}")
results.append({
    'Model': 'AdaBoost',
    'AUROC': auroc_ada,
    'Notes': '50 estimators, base_depth=3'
})

# ============================================================================
# STEP 6: RESULTS COMPARISON
# ============================================================================

print("\n" + "="*70)
print("MODEL COMPARISON RESULTS")
print("="*70)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('AUROC', ascending=False)
print(results_df.to_string(index=False))

best_model_name = results_df.iloc[0]['Model']
best_auroc = results_df.iloc[0]['AUROC']

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   Validation AUROC: {best_auroc:.4f}")

# ============================================================================
# STEP 7: TRAIN BEST MODEL ON FULL DATA AND GENERATE PREDICTIONS
# ============================================================================

print("\n" + "="*70)
print("TRAINING BEST MODEL ON FULL DATASET")
print("="*70)

# Preprocess full data
X_full_proc = X_full.copy()
categorical_cols_full, binary_cols_full, numerical_cols_full = identify_feature_types(X_full_proc)

X_full_proc[categorical_cols_full] = X_full_proc[categorical_cols_full].astype('category')
X_full_proc[binary_cols_full] = X_full_proc[binary_cols_full].astype(int)
X_full_proc = class_based_imputer(X_full_proc, y_full, categorical_cols_full)
X_full_proc = class_based_num_imputer(X_full_proc, y_full, numerical_cols_full)

for col in categorical_cols_full:
    X_full_proc[col] = X_full_proc[col].cat.codes

# Train best model (assuming Random Forest or AdaBoost won)
print(f"Training {best_model_name} on full training data...")

if 'Random Forest' in best_model_name:
    final_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        random_state=29173,
        n_jobs=-1,
        class_weight='balanced'
    )
elif 'AdaBoost' in best_model_name:
    base_dt = DecisionTreeClassifier(max_depth=3, random_state=29173)
    final_model = AdaBoostClassifier(
        base_estimator=base_dt,
        n_estimators=50,
        learning_rate=1.0,
        random_state=29173
    )
else:  # Decision Tree or others
    final_model = DecisionTreeClassifier(
        max_depth=5,
        min_samples_split=10,
        min_samples_leaf=1,
        criterion='entropy',
        random_state=29173
    )

final_model.fit(X_full_proc, y_full)

# Generate predictions on test set
print("\nGenerating predictions on test set...")
y_test_pred = final_model.predict_proba(X_test_proc)[:, 1]

# ============================================================================
# STEP 8: CREATE SUBMISSION FILE
# ============================================================================

print("\n" + "="*70)
print("CREATING SUBMISSION FILE")
print("="*70)

submission = pd.DataFrame({
    'id': test_ids,
    'target': y_test_pred
})

submission.to_csv('submission.csv', index=False)
print("‚úì Submission file created: submission.csv")
print(f"  Shape: {submission.shape}")
print(f"  Sample predictions:")
print(submission.head(10))

print("\n" + "="*70)
print("TASK 2 COMPLETE!")
print("="*70)
print("\nNext steps:")
print("1. Upload 'submission.csv' to Kaggle")
print("2. Record your public leaderboard score")
print("3. Complete the Task 2 report with:")
print("   - Model comparison table (printed above)")
print("   - Kaggle public leaderboard AUROC")
print("   - Analysis of best model performance")


TRAINING AND EVALUATING MODELS

[1/5] Categorical Naive Bayes
--------------------------------------------------
‚úó Error: index 10 is out of bounds for axis 1 with size 10

[2/5] K-Nearest Neighbors
--------------------------------------------------
  k=3: AUROC = 0.5105
  k=5: AUROC = 0.5165
  k=7: AUROC = 0.5220
  k=10: AUROC = 0.5272
  k=15: AUROC = 0.5348
‚úì Best k=15, Validation AUROC: 0.5348

[3/5] Decision Tree
--------------------------------------------------
‚úì Validation AUROC: 0.5490

[4/5] Random Forest
--------------------------------------------------
Training Random Forest (this may take a few minutes)...
‚úì Validation AUROC: 0.5904

[5/5] AdaBoost
--------------------------------------------------


TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [8]:
# ============================================================================ 
# STEP 9: DEFINE HYPERPARAMETER GRIDS
# ============================================================================

# 1. CATEGORICAL NAIVE BAYES
cat_nb_param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0],       # smoothing parameter
    'fit_prior': [True, False]
}

# 2. K-NEAREST NEIGHBORS
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 10, 15, 20],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# 3. DECISION TREE
dt_param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# 4. RANDOM FOREST
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', None]
}

# 5. ADABOOST
ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'base_estimator__max_depth': [1, 2, 3, 5]  # base tree depth
}

print("Hyperparameter grids created for all models!")


Hyperparameter grids created for all models!


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import CategoricalNB,GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Store results
grid_results = []

In [16]:
print("\n[1/5] Grid Search: Categorical Naive Bayes")
cat_nb = GaussianNB()
grid_search_nb = GridSearchCV(
    estimator=cat_nb,
    param_grid=cat_nb_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
grid_search_nb.fit(X_train_nb, y_train)
grid_results.append({
    'Model': 'Categorical Naive Bayes',
    'Best AUROC': grid_search_nb.best_score_,
    'Best Params': grid_search_nb.best_params_
})
print(f"‚úì Best Params: {grid_search_nb.best_params_}, AUROC: {grid_search_nb.best_score_:.4f}")



[1/5] Grid Search: Categorical Naive Bayes


ValueError: Invalid parameter 'alpha' for estimator GaussianNB(). Valid parameters are: ['priors', 'var_smoothing'].

In [18]:
X_train_scaled.shape

(207346, 65)

In [17]:
# -------------------------
# 2. K-NEAREST NEIGHBORS
# -------------------------
print("\n[2/5] Grid Search: KNN")
knn = KNeighborsClassifier(n_jobs=-1)
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=knn_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
grid_search_knn.fit(X_train_scaled, y_train)
grid_results.append({
    'Model': 'KNN',
    'Best AUROC': grid_search_knn.best_score_,
    'Best Params': grid_search_knn.best_params_
})
print(f"‚úì Best Params: {grid_search_knn.best_params_}, AUROC: {grid_search_knn.best_score_:.4f}")



[2/5] Grid Search: KNN


KeyboardInterrupt: 

In [None]:
print("\n[3/5] Grid Search: Decision Tree")
dt = DecisionTreeClassifier(random_state=29173)
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=dt_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
grid_search_dt.fit(X_train_proc, y_train)
grid_results.append({
    'Model': 'Decision Tree',
    'Best AUROC': grid_search_dt.best_score_,
    'Best Params': grid_search_dt.best_params_
})
print(f"‚úì Best Params: {grid_search_dt.best_params_}, AUROC: {grid_search_dt.best_score_:.4f}")


In [19]:
# -------------------------
# 4. RANDOM FOREST
# -------------------------
print("\n[4/5] Grid Search: Random Forest")
rf = RandomForestClassifier(random_state=29173, n_jobs=-1)
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
grid_search_rf.fit(X_train_proc, y_train)
grid_results.append({
    'Model': 'Random Forest',
    'Best AUROC': grid_search_rf.best_score_,
    'Best Params': grid_search_rf.best_params_
})
print(f"‚úì Best Params: {grid_search_rf.best_params_}, AUROC: {grid_search_rf.best_score_:.4f}")



[4/5] Grid Search: Random Forest


KeyboardInterrupt: 

In [None]:
print("\n[5/5] Grid Search: AdaBoost")
base_dt = DecisionTreeClassifier(random_state=29173)
ada = AdaBoostClassifier(base_estimator=base_dt, random_state=29173)
grid_search_ada = GridSearchCV(
    estimator=ada,
    param_grid=ada_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
grid_search_ada.fit(X_train_proc, y_train)
grid_results.append({
    'Model': 'AdaBoost',
    'Best AUROC': grid_search_ada.best_score_,
    'Best Params': grid_search_ada.best_params_
})
print(f"‚úì Best Params: {grid_search_ada.best_params_}, AUROC: {grid_search_ada.best_score_:.4f}")



