In [1]:
from pathlib import Path
from typing import Tuple
import pandas as pd
import pytest
from ktools.models import LGBMModel
from ktools.preprocessing.categorical import CategoricalEncoder
from ktools.preprocessing.numerical import StandardScale
from ktools.preprocessing.pipe import PreprocessingPipeline
from ktools.config.dataset import DatasetConfig
from ktools.fitting.pipe import ModelPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

 __  ___ .___________.  ______     ______    __          _______.
|  |/  / |           | /  __  \   /  __  \  |  |        /       |
|  '  /  `---|  |----`|  |  |  | |  |  |  | |  |       |   (----`
|    <       |  |     |  |  |  | |  |  |  | |  |        \   \    
|  .  \      |  |     |  `--'  | |  `--'  | |  `----.----)   |   
|__|\__\     |__|      \______/   \______/  |_______|_______/    
                                                                 



In [2]:
# Location of the competition CSVs and the name of the label column.
DATA_PATH = Path("./data/diabetes_prediction/")
TARGET = "diagnosed_diabetes"

# Positional split point: the first `split_id` rows of train.csv become
# `train_data`, every row after that becomes `val_data`.
split_id = 678000

train_data = pd.read_csv(DATA_PATH / "train.csv", index_col=0)
test_data = pd.read_csv(DATA_PATH / "test.csv", index_col=0)
train_data, val_data = train_data.iloc[:split_id], train_data.iloc[split_id:]

# Zero-row view of the feature columns only. Column names and dtypes are all
# that is needed to build the column lists, so this avoids copying the full
# frame, and guarantees that every list below excludes the target column
# (previously the categorical list was selected from the frame that still
# contained TARGET).
feature_schema = train_data.head(0).drop(columns=TARGET)

training_col_names = feature_schema.columns.tolist()
numerical_col_names = feature_schema.select_dtypes(
    include=["number"]
).columns.tolist()
categorical_col_names = feature_schema.select_dtypes(
    include=["object"]
).columns.tolist()

# Column-role description passed to the ktools preprocessors and model pipeline.
config = DatasetConfig(
    training_col_names=training_col_names,
    numerical_col_names=numerical_col_names,
    categorical_col_names=categorical_col_names,
    target_col_name=TARGET,
)

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Per-row training weights for the two groups of rows each fold trains on.
# Both are 1.0 (uniform); they are named so that either group can be
# re-weighted in one place.
FOLD_ROW_WEIGHT = 1.0
VAL_DATA_ROW_WEIGHT = 1.0

# Out-of-fold predictions for the rows of `train_data`. NaN-initialised so a
# row that no fold wrote to is detectable (np.empty would leave arbitrary
# memory contents there instead).
train_oof_preds = np.full(train_data.shape[0], np.nan)
# Test predictions, accumulated as the mean over all folds.
test_oof_preds = np.zeros(test_data.shape[0])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, val_index in kfold.split(train_data, train_data[TARGET]):
    val_fold = train_data.iloc[val_index]
    # Every fold trains on its own K-1 folds plus all of `val_data`
    # (the rows of train.csv after `split_id`).
    train_fold = pd.concat([train_data.iloc[train_index], val_data], axis=0)
    weights = np.concatenate(
        [
            np.full(len(train_index), FOLD_ROW_WEIGHT),
            np.full(val_data.shape[0], VAL_DATA_ROW_WEIGHT),
        ]
    )

    assert train_fold.shape[0] == weights.shape[0], f"train_fold shape: {train_fold.shape[0]}, weights shape: {weights.shape[0]}"

    # Fresh preprocessor instances per fold, so no fitted state can carry over
    # from one fold's pipeline to the next. (Whether the ktools preprocessors
    # would re-fit cleanly when reused is not visible from this notebook.)
    preprocessors = [StandardScale(config), CategoricalEncoder(config)]
    pipe = ModelPipeline(
        model=LGBMModel(),
        config=config,
        preprocessor=PreprocessingPipeline(preprocessors=preprocessors),
    )

    # NOTE(review): `validation_data` is presumably used by the model for
    # early stopping / monitoring; if so, the fold score below is measured on
    # data the fit has already seen and is optimistic. Confirm in ktools.
    pipe.fit(train_fold, validation_data=val_fold, weights=weights)
    y_pred = pipe.predict(val_fold)
    test_pred = pipe.predict(test_data)

    score = roc_auc_score(val_fold[TARGET], y_pred)
    train_oof_preds[val_index] = y_pred
    test_oof_preds += test_pred / kfold.n_splits

    print(f"Fold ROC AUC Score: {score}")

# Every row of `train_data` belongs to exactly one validation fold, so no NaN
# placeholder may survive the loop.
assert not np.isnan(train_oof_preds).any(), "some training rows never received an out-of-fold prediction"



Fold ROC AUC Score: 0.7073390350508415




Fold ROC AUC Score: 0.7048167044650171




Fold ROC AUC Score: 0.7092384695699288




Fold ROC AUC Score: 0.7038544759312703




Fold ROC AUC Score: 0.7068799515089901


In [5]:
# Write the fold-averaged test predictions in the sample-submission layout.
# NOTE(review): the file name says "valdatax20", but the fold loop in this
# notebook weights the `val_data` rows at 1.0. Confirm the name matches the
# run that produced `test_oof_preds`.
sub_name = "submissions/diabetes_prediction_21-12-25_with_valdatax20_datasubmission.csv"

sample_sub = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col=0)

# The predictions are assigned positionally, so the submission rows must be
# the same ids, in the same order, as the rows that were predicted.
assert sample_sub.index.equals(test_data.index), "sample_submission rows are not aligned with test_data"
sample_sub[TARGET] = test_oof_preds

# to_csv does not create missing directories.
Path(sub_name).parent.mkdir(parents=True, exist_ok=True)
sample_sub.to_csv(sub_name)