In [16]:
from pathlib import Path
from typing import Tuple
import pandas as pd
import numpy as np
import pytest
from ktools.models import LGBMModel
from ktools.preprocessing.categorical import CategoricalEncoder
from ktools.preprocessing.numerical import StandardScale
from ktools.preprocessing.pipe import PreprocessingPipeline
from ktools.config.dataset import DatasetConfig
from ktools.fitting.pipe import ModelPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Location of the competition files and the name of the label column.
DATA_PATH = Path("./data/diabetes_prediction/")
TARGET = "diagnosed_diabetes"

# Row position at which train_data is later split into two groups
# (used by the cell that fills the `data` flag column).
split_id = 678_000

# The original dataset is read without an index column; train and test use
# their first CSV column as the index.
orig_data = pd.read_csv(DATA_PATH / "original.csv")
train_data = pd.read_csv(DATA_PATH / "train.csv", index_col=0)

# Test rows get a constant `data` flag of 0, stored as a categorical column.
test_data = (
    pd.read_csv(DATA_PATH / "test.csv", index_col=0)
    .assign(data=0)
    .astype({"data": "category"})
)

# Disabled experiment: dropping "physical_activity_minutes_per_week" from
# orig_data, train_data and test_data.

In [4]:
# Sanity check: every column of train_data must also exist in orig_data.
# Comparing `columns` against `columns.intersection(...)` element-wise raises a
# length-mismatch ValueError as soon as one column is missing, so the missing
# columns are computed explicitly and reported in the assertion message instead.
missing_in_orig = train_data.columns.difference(orig_data.columns)
assert missing_in_orig.empty, (
    f"train_data columns missing from orig_data: {missing_in_orig.tolist()}"
)

In [5]:
# Dtypes of the columns that exist only in the original dataset.
orig_data.dtypes.loc[orig_data.columns.difference(train_data.columns)]

diabetes_risk_score     float64
diabetes_stage           object
glucose_fasting           int64
glucose_postprandial      int64
hba1c                   float64
insulin_level           float64
dtype: object

In [6]:
# Disabled experiment (kept for reference):
# orig_data = orig_data.drop(columns=orig_data.columns.difference(train_data.columns).to_list())
# orig_data = orig_data.assign(data=2)

# Flag the first `split_id` rows (by position) with data=1.0 and the remaining
# rows with data=0.0. The column is written by name rather than through
# `iloc[..., -1]`, so re-running this cell after more columns have been appended
# cannot overwrite whichever column happens to be last.
# Assumes split_id is non-negative.
row_position = np.arange(len(train_data))
train_data = train_data.assign(data=np.where(row_position < split_id, 1.0, 0.0))

In [7]:
TARGET = 'diagnosed_diabetes'

# Prefix shared by every feature created in this cell.
ORIG_PREFIX = "orig_mean_"

# Columns of orig_data whose per-group mean becomes a feature.
ORIG_TARGETS = [TARGET, 'glucose_fasting', 'glucose_postprandial', 'hba1c']

# Base columns to group by. Previously generated orig_mean_* columns are
# excluded so that re-running the cell does not try to group orig_data by
# columns it does not have.
BASE = [
    col
    for col in train_data.columns
    if col not in ['id', TARGET, 'data'] and not col.startswith(ORIG_PREFIX)
]

ORIG = []
train_features = {}
test_features = {}

for col in BASE:
    # MEAN: one groupby per base column, reused for every target.
    grouped = orig_data.groupby(col)

    for tgt in ORIG_TARGETS:
        new_mean_col_name = f"orig_mean_{tgt}_grouped_by_{col}"
        mean_map = grouped[tgt].mean()

        print(col, tgt)
        # Series.map performs the same lookup as a left merge on `col`
        # (values absent from orig_data become NaN) but keeps the frame's
        # index and row order and avoids copying the whole frame each time.
        train_features[new_mean_col_name] = train_data[col].map(mean_map)
        test_features[new_mean_col_name] = test_data[col].map(mean_map)
        ORIG.append(new_mean_col_name)

    # COUNT (disabled experiment)
    # new_count_col_name = f"orig_count_{col}"
    # count_map = orig_data.groupby(col).size().reset_index(name=new_count_col_name)
    # train_data = train_data.merge(count_map, on=col, how='left')
    # test_data = test_data.merge(count_map, on=col, how='left')
    # ORIG.append(new_count_col_name)

# Attach all new columns at once. Any copies left over from an earlier run are
# dropped first so the column names stay unique.
train_data = pd.concat(
    [train_data.drop(columns=ORIG, errors="ignore"), pd.DataFrame(train_features)],
    axis=1,
)
test_data = pd.concat(
    [test_data.drop(columns=ORIG, errors="ignore"), pd.DataFrame(test_features)],
    axis=1,
)

print(len(ORIG), 'Orig Features Created!!')

24 Base Features:['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history']


In [9]:
# Model inputs: every column of train_data except the target.
training_col_names = train_data.drop(columns=TARGET).columns.tolist()

# Numeric feature columns (target excluded).
numerical_col_names = (
    train_data.drop(columns=TARGET)
    .select_dtypes(include=["number"])
    .columns.tolist()
)
# String-typed (object) columns.
categorical_col_names = train_data.select_dtypes(
    include=["object"]
).columns.tolist()

# The ORIG features are numeric columns of train_data, so they are already
# picked up by select_dtypes above; concatenating ORIG unconditionally listed
# each of them twice. dict.fromkeys removes duplicates while preserving order
# and still appends ORIG if this cell is run before the features exist.
numerical_col_names = list(dict.fromkeys(numerical_col_names + ORIG))

config = DatasetConfig(
    training_col_names=training_col_names,
    numerical_col_names=numerical_col_names,
    categorical_col_names=categorical_col_names,
    target_col_name=TARGET,
)

In [10]:
# Stratification key: "<target>_<data flag>" for every training row, so that
# folds are balanced on the combination of label and data flag.
target_as_text = train_data["diagnosed_diabetes"].astype(str)
data_flag_as_text = train_data["data"].astype(str)
categories_of_interest = target_as_text + "_" + data_flag_as_text

In [11]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Out-of-fold predictions for every training row (NaN until a fold fills the
# row) and the running average of the per-fold test predictions.
train_oof_preds = np.full(train_data.shape[0], np.nan)
test_oof_preds = np.zeros(test_data.shape[0])

# Mean ROC AUC over folds, accumulated incrementally.
mean_score: float = 0.0
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, val_index in kfold.split(train_data, categories_of_interest):
    train_fold: pd.DataFrame = train_data.iloc[train_index].copy()
    val_fold: pd.DataFrame = train_data.iloc[val_index]
    # Early-stopping / scoring subset: only validation rows with data == 0.
    subsetval_fold = val_fold[val_fold["data"] == 0.0]
    # NOTE(review): `data` is categorical in train_fold but stays float in
    # val_fold / subsetval_fold; confirm the pipeline treats both the same way.
    train_fold["data"] = train_fold["data"].astype("category")

    # Uniform sample weights (placeholder for weighting rows by `data` flag).
    weights = np.ones(train_fold.shape[0])

    # Fresh preprocessor instances per fold so that no fitted state can be
    # carried over from a previous fold.
    preprocessors = [StandardScale(config), CategoricalEncoder(config)]
    pipe = ModelPipeline(
        model=LGBMModel(),
        config=config,
        preprocessor=PreprocessingPipeline(preprocessors=preprocessors),
    )

    pipe.fit(train_fold, validation_data=subsetval_fold, weights=weights)
    y_pred = pipe.predict(subsetval_fold)
    test_pred = pipe.predict(test_data)
    oof_pred = pipe.predict(val_fold)

    # The fold score is computed on the data == 0 subset only, while the OOF
    # array stores predictions for the whole validation fold.
    score = roc_auc_score(subsetval_fold[TARGET], y_pred)
    train_oof_preds[val_index] = oof_pred
    test_oof_preds += test_pred / kfold.n_splits
    mean_score += score / kfold.n_splits
    print(f"Fold ROC AUC Score: {score}")



Fold ROC AUC Score: 0.7063007245707997




Fold ROC AUC Score: 0.7031573724177882




Fold ROC AUC Score: 0.700027425220127




Fold ROC AUC Score: 0.709197733200075




Fold ROC AUC Score: 0.6968453964746286


In [12]:
import uuid

# Random (version 4) UUID used to tag the prediction files written for this run.
guid = uuid.uuid4()
print(str(guid))

7a337b63-612f-4a11-968a-7028bbb182c7


In [15]:
# Display the mean of the per-fold ROC AUC scores accumulated in the CV loop.
mean_score

0.7031057303766837

In [13]:
save_path = Path("./data/diabetes_prediction/")

# Output folders for out-of-fold and test predictions. They are created on
# demand so the writes below do not fail on a fresh checkout.
oof_dir = save_path / "oofs"
test_pred_dir = save_path / "test_preds"
for out_dir in (oof_dir, test_pred_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Each file holds a single prediction column named after the run GUID; the
# default positional index is written as the first CSV column.
pd.DataFrame({
    f"{guid}" : train_oof_preds,
}).to_csv(oof_dir / f"oof_preds_{guid}.csv")

pd.DataFrame({
    f"{guid}" : test_oof_preds,
}).to_csv(test_pred_dir / f"test_preds_{guid}.csv")

In [2]:
# sub_name = f"submissions/diabetes_submission.csv"
import pandas as pd

# Take the prediction column from an existing submission file and write it
# into the sample-submission frame (values are copied by position, not by id).
submission_source = "/workspaces/Kaggle-tools/submission (49).csv"
pred = pd.read_csv(submission_source)["diagnosed_diabetes"]

sample_sub = pd.read_csv("data/diabetes_prediction/sample_submission.csv", index_col=0)
sample_sub = sample_sub.assign(diagnosed_diabetes=pred.to_numpy())
sample_sub.to_csv("mikhail->ktools_notrain.csv")

In [18]:
# Combine fold-level predictions into single OOF and averaged test predictions
import pandas as pd
import numpy as np
from pathlib import Path
import uuid

save_path = Path("./data/diabetes_prediction/")
n_folds = 5

# Get total number of training samples from train_data
n_train_samples = train_data.shape[0]

# OOF buffer starts as NaN (not uninitialised memory) so that rows no fold
# file covers can be detected instead of being saved as garbage.
combined_oof = np.full(n_train_samples, np.nan)

# Load OOF predictions and place them at the correct indices
for i in range(n_folds):
    fold_df = pd.read_csv(save_path / "oofs" / f"oof_preds_fold_{i}.csv", index_col=0)
    # The index column contains the original row indices from the CV split
    indices = fold_df.index.values
    predictions = fold_df.iloc[:, 0].values
    combined_oof[indices] = predictions

# Refuse to save an incomplete OOF vector.
n_missing = int(np.isnan(combined_oof).sum())
if n_missing:
    raise ValueError(
        f"{n_missing} of {n_train_samples} OOF rows are missing or NaN after "
        f"combining {n_folds} fold files"
    )

print(f"Combined OOF shape: {combined_oof.shape}")

# Load and average test predictions
test_dfs = []
for i in range(n_folds):
    fold_df = pd.read_csv(save_path / "test_preds" / f"test_preds_fold_{i}.csv")
    # Use the last column: it is the prediction column whether or not the file
    # was written with a leading index column (flattening the whole frame would
    # interleave index values with predictions in that case).
    test_dfs.append(fold_df.iloc[:, -1].to_numpy())

# All folds must predict the same number of test rows before averaging.
fold_lengths = {len(fold_pred) for fold_pred in test_dfs}
if len(fold_lengths) != 1:
    raise ValueError(f"Test prediction files differ in length: {sorted(fold_lengths)}")

# Average across folds
averaged_test = np.mean(test_dfs, axis=0)
print(f"Averaged test shape: {averaged_test.shape}")

# Generate a unique ID for this combined prediction set
combined_guid = uuid.uuid4()
print(f"Combined predictions GUID: {combined_guid}")

# Save combined OOF predictions
pd.DataFrame({
    f"{combined_guid}": combined_oof,
}).to_csv(save_path / "oofs" / f"oof_preds_{combined_guid}.csv")

# Save averaged test predictions
pd.DataFrame({
    f"{combined_guid}": averaged_test,
}).to_csv(save_path / "test_preds" / f"test_preds_{combined_guid}.csv")

print(f"Saved OOF predictions to: oofs/oof_preds_{combined_guid}.csv")
print(f"Saved test predictions to: test_preds/test_preds_{combined_guid}.csv")

Combined OOF shape: (700000,)
Averaged test shape: (300000,)
Combined predictions GUID: ef225044-04a9-4761-83b3-eb7bb507da92
Saved OOF predictions to: oofs/oof_preds_ef225044-04a9-4761-83b3-eb7bb507da92.csv
Saved test predictions to: test_preds/test_preds_ef225044-04a9-4761-83b3-eb7bb507da92.csv
