In [1]:
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import pytest
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from ktools.config.dataset import DatasetConfig
from ktools.fitting.pipe import ModelPipeline
from ktools.models import LGBMModel
from ktools.preprocessing.categorical import CategoricalEncoder
from ktools.preprocessing.numerical import StandardScale
from ktools.preprocessing.pipe import PreprocessingPipeline

 __  ___ .___________.  ______     ______    __          _______.
|  |/  / |           | /  __  \   /  __  \  |  |        /       |
|  '  /  `---|  |----`|  |  |  | |  |  |  | |  |       |   (----`
|    <       |  |     |  |  |  | |  |  |  | |  |        \   \    
|  .  \      |  |     |  `--'  | |  `--'  | |  `----.----)   |   
|__|\__\     |__|      \______/   \______/  |_______|_______/    
                                                                 



In [2]:
# Directory that holds the competition CSV files.
DATA_PATH = Path("data") / "diabetes_prediction"
# Name of the label column.
TARGET = "diagnosed_diabetes"

# Split point (noted as an id split): train rows before this position are
# flagged data == 1, rows from this position onward are flagged data == 0.
split_id = 678_000

In [3]:
# Load the three CSV sources; train and test use their first column as the index.
orig_data = pd.read_csv(DATA_PATH / "original.csv")
train_data = pd.read_csv(DATA_PATH / "train.csv", index_col=0)

# Every test row gets the source flag data == 0, stored as a categorical column.
test_data = (
    pd.read_csv(DATA_PATH / "test.csv", index_col=0)
    .assign(data=0)
    .astype({"data": "category"})
)
# Dropping "physical_activity_minutes_per_week" from the three frames is currently disabled.

In [4]:
# Every train column must also exist in the original dataset. The check uses a set
# difference so a mismatch surfaces as an AssertionError that names the offending
# columns; comparing the column index element-wise against its intersection would
# raise a ValueError instead, because the two indexes then have different lengths.
missing_in_orig = train_data.columns.difference(orig_data.columns)
assert missing_in_orig.empty, (
    f"train columns missing from original data: {missing_in_orig.to_list()}"
)

In [5]:
# Reduce the original dataset to the columns it shares with train and flag it as source 2.
# NOTE(review): orig_data is not referenced by any later cell shown in this notebook —
# confirm whether it was meant to be appended to the training data.
extra_orig_cols = orig_data.columns.difference(train_data.columns).to_list()
orig_data = orig_data.drop(columns=extra_orig_cols).assign(data=2)

# Flag train rows by position: the first `split_id` rows get 1.0, the remainder 0.0.
before_split = np.arange(len(train_data)) < split_id
train_data = train_data.assign(data=np.where(before_split, 1.0, 0.0))

In [6]:
# Column bookkeeping for the ktools dataset config.
feature_frame = train_data.drop(columns=TARGET)

# All non-target columns are training columns; the numeric subset includes the
# float-typed `data` source flag.
training_col_names = list(feature_frame.columns)
numerical_col_names = list(feature_frame.select_dtypes(include=["number"]).columns)

# Object-typed columns are selected from the full frame (target not dropped here).
categorical_col_names = list(train_data.select_dtypes(include=["object"]).columns)

config = DatasetConfig(
    training_col_names=training_col_names,
    numerical_col_names=numerical_col_names,
    categorical_col_names=categorical_col_names,
    target_col_name=TARGET,
)

In [7]:
# Stratification label per train row: "<target value>_<data flag>". The target column
# is looked up through TARGET so this stays in sync with the rest of the notebook.
categories_of_interest = (
    train_data[TARGET].astype(str) + "_" + train_data["data"].astype(str)
)

In [8]:
# Stratified 5-fold cross-validation.
# - Strata come from `categories_of_interest` (target combined with the `data` flag).
# - Each fold is scored only on held-out rows flagged data == 0.
# - Test predictions are averaged over the folds into `test_oof_preds`.
# - `mean_score` ends up as the mean of the per-fold ROC AUC scores.
N_SPLITS = 5
FLAGGED_ROW_WEIGHT = 1.5  # sample weight for training rows with data == 0; others get 1.0

# NOTE(review): the same preprocessor instances are handed to every fold's pipeline —
# confirm that refitting fully resets their state between folds.
preprocessors = [StandardScale(config), CategoricalEncoder(config)]

# NaN-initialised so rows that are never scored (data != 0) stay recognisably empty
# instead of holding uninitialised memory.
train_oof_preds = np.full(train_data.shape[0], np.nan)
test_oof_preds = np.zeros(test_data.shape[0])

mean_score: float = 0.0
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
for train_index, val_index in kfold.split(train_data, categories_of_interest):
    train_fold: pd.DataFrame = train_data.iloc[train_index].copy()

    # Positions of the held-out rows flagged data == 0; only these are validated on.
    scored_mask = (train_data["data"].iloc[val_index] == 0.0).to_numpy()
    scored_index = val_index[scored_mask]
    val_fold: pd.DataFrame = train_data.iloc[scored_index]

    # NOTE(review): only the training fold's `data` column is cast to category;
    # val_fold keeps the float column — confirm the pipeline handles the mismatch.
    train_fold["data"] = train_fold["data"].astype("category")

    weights = np.where(train_fold["data"] == 0, FLAGGED_ROW_WEIGHT, 1.0)

    pipe = ModelPipeline(
        model=LGBMModel(),
        config=config,
        preprocessor=PreprocessingPipeline(preprocessors=preprocessors),
    )

    pipe.fit(train_fold, validation_data=val_fold, weights=weights)
    y_pred = pipe.predict(val_fold)
    test_pred = pipe.predict(test_data)

    score = roc_auc_score(val_fold[TARGET], y_pred)
    # Store out-of-fold predictions at the positions of the rows that were scored.
    train_oof_preds[scored_index] = np.asarray(y_pred)
    test_oof_preds += test_pred / kfold.n_splits
    mean_score += score / kfold.n_splits
    print(f"Fold ROC AUC Score: {score}")



Fold ROC AUC Score: 0.6985212356468088




Fold ROC AUC Score: 0.6981313931748121




Fold ROC AUC Score: 0.6932308931040515




Fold ROC AUC Score: 0.7041196724092615




Fold ROC AUC Score: 0.6978942792924645


In [11]:
# Display the mean ROC AUC over the folds, as accumulated by the cross-validation loop.
mean_score

0.6983794947254798

In [10]:
# Write the fold-averaged test predictions in the sample-submission layout.
sub_name = "submissions/diabetes_3xtestdata_submission.csv"

sample_sub = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col=0)
# Predictions are assigned by position, so the submission rows must line up with test_data.
assert sample_sub.index.equals(test_data.index), (
    "sample submission ids do not match test ids"
)
sample_sub[TARGET] = test_oof_preds

# Create the output directory on first use; to_csv does not create missing directories.
Path(sub_name).parent.mkdir(parents=True, exist_ok=True)
sample_sub.to_csv(sub_name)