In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd


In [None]:
def _take_rows(data, positions):
    """Return the rows of *data* at integer *positions* (pandas or NumPy input)."""
    # pandas objects need positional `.iloc`; plain arrays index directly.
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def _score_row(name, y_true, y_pred):
    """Build one metrics-table row for *name* from true and predicted labels."""
    return {
        'Model': name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
    }


def stacking_with_kfold(X_train, Y_train, X_test, Y_test, models, n_folds=5):
    """Train a stacked ensemble with out-of-fold predictions and report metrics.

    Each base model is fitted on the training part of every K-fold split and
    its ``predict_proba(...)[:, 1]`` output on the held-out part becomes one
    column of the meta-feature matrix. A ``LogisticRegression`` meta-model is
    then fitted on those out-of-fold columns. For the test set, every base
    model is refitted on the full training data before its probabilities are
    fed to the meta-model.

    Assumes a binary classification task: column 1 of ``predict_proba`` is
    used as the single meta-feature per model, and the precision/recall/F1
    scorers are called with their default settings.

    Args:
        X_train: Training features (pandas DataFrame or NumPy array).
        Y_train: Training labels aligned row-for-row with ``X_train``.
        X_test: Test features.
        Y_test: Test labels.
        models: Sequence of estimators exposing ``fit``, ``predict`` and
            ``predict_proba``. They are fitted in place; on return each one
            is fitted on the full training set.
        n_folds: Number of K-fold splits (shuffled, ``random_state=47``).

    Returns:
        pandas.DataFrame with columns ``Model``, ``Accuracy``, ``Precision``,
        ``Recall`` and ``F1 Score``: one row per base model per fold
        (validation metrics) followed by one ``'Stacked Logistic Regression'``
        row (test metrics).

    Raises:
        ValueError: If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("models must contain at least one base estimator")

    metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0.
    rows = []

    meta_model = LogisticRegression()
    meta_X_train = np.zeros((X_train.shape[0], len(models)))
    meta_X_test = np.zeros((X_test.shape[0], len(models)))

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=47)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
        print(f"Fold {fold+1}/{n_folds}")

        X_train_fold = _take_rows(X_train, train_index)
        X_val_fold = _take_rows(X_train, val_index)
        Y_train_fold = _take_rows(Y_train, train_index)
        Y_val_fold = _take_rows(Y_train, val_index)

        for i, model in enumerate(models):
            name = type(model).__name__
            model.fit(X_train_fold, Y_train_fold)

            # Out-of-fold probabilities: every training row gets a prediction
            # from a model that never saw it, so the meta-model is not fitted
            # on leaked labels.
            meta_X_train[val_index, i] = model.predict_proba(X_val_fold)[:, 1]

            row = _score_row(name, Y_val_fold, model.predict(X_val_fold))
            rows.append(row)
            print(f"{name} Validation Metrics: Accuracy={row['Accuracy']:.4f}, Precision={row['Precision']:.4f}, Recall={row['Recall']:.4f}, F1={row['F1 Score']:.4f}")

    meta_model.fit(meta_X_train, Y_train)

    # Refit on the full training set before scoring the test set; otherwise
    # the test meta-features would come from models trained on the last
    # fold's subset only.
    for i, model in enumerate(models):
        model.fit(X_train, Y_train)
        meta_X_test[:, i] = model.predict_proba(X_test)[:, 1]

    y_test_pred = meta_model.predict(meta_X_test)

    row = _score_row('Stacked Logistic Regression', Y_test, y_test_pred)
    rows.append(row)
    print(f"Stacked Logistic Regression Test Metrics: Accuracy={row['Accuracy']:.4f}, Precision={row['Precision']:.4f}, Recall={row['Recall']:.4f}, F1={row['F1 Score']:.4f}")

    return pd.DataFrame(rows, columns=metric_columns)

Possible creative extensions:
1. Two layers of stacking
2. Unsupervised learning on the predictions


# ML Project Structure Guidelines

## File Structure

1. **data/**
   - `raw/`: Contains the raw, unprocessed data files.
   - `cleaned/`: Stores the cleaned and preprocessed data files.
   - `processed/`: Holds the final, transformed data files ready for modeling.

2. **notebooks/**
   - `01_data_exploration.ipynb`: Exploratory data analysis, data visualization, and initial insights.
   - `02_data_cleaning.ipynb`: Data cleaning, handling missing values, and other preprocessing steps.
   - `03_feature_engineering.ipynb`: Feature engineering, including creating new features, transforming existing features, and feature selection.
   - `04_model_training.ipynb`: Model training, hyperparameter tuning, and evaluation.
   - `05_model_deployment.ipynb`: Saving the trained model, creating a deployment-ready artifact, and writing inference scripts.

3. **src/**
   - `utils.py`: Utility functions used across the project, such as data loading, data transformation, and model evaluation.
   - `models.py`: Custom model definitions and training/inference functions.

4. **requirements.txt**: A file listing the Python dependencies required for the project.
5. **README.md**: A detailed project overview, instructions for setup, and usage guidelines.

## Workflow

1. **Data Exploration**: In `01_data_exploration.ipynb`, perform your initial exploratory data analysis (EDA), including data visualization and gaining initial insights about the dataset. Work from the raw files in the `raw/` directory of the `data/` folder; cleaned and preprocessed data is saved to the `cleaned/` directory in the data-cleaning step.

2. **Data Cleaning**: In `02_data_cleaning.ipynb`, handle missing values, remove outliers, and perform other data cleaning tasks. Save the cleaned data in the `cleaned/` directory.

3. **Feature Engineering**: In `03_feature_engineering.ipynb`, create new features, transform existing features, and perform feature selection. Save the final, transformed data in the `processed/` directory.

4. **Model Training**: In `04_model_training.ipynb`, load the processed data from the `processed/` directory, train your machine learning models, and evaluate their performance.

5. **Model Deployment**: In `05_model_deployment.ipynb`, save the trained model, create a deployment-ready artifact, and write inference scripts to use the model in a production environment.

Throughout the project, utilize the utility functions in `utils.py` and the custom model definitions in `models.py` to ensure consistency and reusability of your code.