In [67]:
# Imports
import os
import warnings

import mlflow
import pandas as pd
from dotenv import load_dotenv
from mlflow.data.pandas_dataset import from_pandas
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# The star import is kept because later cells use names it provides (e.g.
# ConvertObjectToCategorical). Explicit ktools imports follow it so that they
# take precedence over anything the star import might also export.
from ktools.preprocessing.basic_feature_transformers import *
from ktools.fitting.safe_cross_validation_executor import SafeCrossValidationExecutor
from ktools.modelling.ktools_models.lgbm_model import LGBMModel
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings

In [68]:
warnings.simplefilter("ignore")

In [69]:
# Every column name considered in this notebook, including the target
# ("diagnosed_diabetes") as the final entry.
all_features = [
    # measurements
    "age",
    "alcohol_consumption_per_week",
    "physical_activity_minutes_per_week",
    "diet_score",
    "sleep_hours_per_day",
    "screen_time_hours_per_day",
    "bmi",
    "waist_to_hip_ratio",
    "systolic_bp",
    "diastolic_bp",
    "heart_rate",
    "cholesterol_total",
    "hdl_cholesterol",
    "ldl_cholesterol",
    "triglycerides",
    # demographic / lifestyle descriptors (excluded from `numeric` below)
    "gender",
    "ethnicity",
    "education_level",
    "income_level",
    "smoking_status",
    "employment_status",
    # history flags and the target
    "family_history_diabetes",
    "hypertension_history",
    "cardiovascular_history",
    "diagnosed_diabetes",
]

# Columns that are left out of the numeric subset.
_categorical_features = frozenset(
    {
        "gender",
        "ethnicity",
        "education_level",
        "income_level",
        "smoking_status",
        "employment_status",
    }
)

# Numeric subset of `all_features`, in the same relative order.
numeric = [col for col in all_features if col not in _categorical_features]

In [70]:
# Experiment identity. `name` is the column under study; RUN_NAME is derived
# from it and is reused for the MLflow run name and the submission file name.
DESC = ""  # free-text note for the (currently disabled) Kaggle submit message
name = "family_history_diabetes"
RUN_NAME = f"adding_{name}_as_categorical"

In [71]:
# Configuration
train_csv_path = "data/diabetes_prediction/train.csv"
test_csv_path = "data/diabetes_prediction/test.csv"
target_col_name = "diagnosed_diabetes"
N_FOLDS = 5             # single source of truth: drives the splitter AND the logged "n_folds" param
CV_RANDOM_STATE = 42    # seed for the shuffled StratifiedKFold
NUM_BOOST_ROUND = 1000  # forwarded to LGBMModel

# Load and prepare data
settings = DataSciencePipelineSettings(
    train_csv_path,
    test_csv_path,
    target_col_name,
)
train, test_df = settings.update()

# Add a categorical copy of the column under study. The cast is done on the
# stacked frame so train and test end up with one shared category set; missing
# values are mapped to -1 first so they get their own category.
all_df = pd.concat([train, test_df], keys=["train", "test"])
all_df["categorical_" + name] = all_df[name].fillna(-1).astype("category")
train, test_df = all_df.loc["train"], all_df.loc["test"]

# Set MLflow experiment - all runs will be grouped under this experiment
mlflow.set_experiment("diabetes-prediction-cv")

# Preprocessing transforms handed to the CV executor
transforms = [
    ConvertObjectToCategorical.transform,
]

# Start a parent run to log the datasets and the overall CV metrics
with mlflow.start_run(run_name=RUN_NAME) as parent_run:
    # Log the training dataset (with the target column declared)
    train_dataset = from_pandas(
        train,
        source=train_csv_path,
        targets=target_col_name,
        name="diabetes_train_data",
    )
    mlflow.log_input(train_dataset, context="training")

    # Log the test dataset as well
    test_dataset = from_pandas(
        test_df,
        source=test_csv_path,
        name="diabetes_test_data",
    )
    mlflow.log_input(test_dataset, context="testing")

    # Enable LightGBM autologging before any model is trained.
    # (Original author's note: each fold is expected to appear as a child run - TODO confirm.)
    mlflow.lightgbm.autolog()

    # Set up cross-validation executor
    cv = SafeCrossValidationExecutor(
        sklearn_model_instance=LGBMModel(num_boost_round=NUM_BOOST_ROUND),
        evaluation_metric=roc_auc_score,
        kfold_object=StratifiedKFold(
            n_splits=N_FOLDS,
            shuffle=True,
            random_state=CV_RANDOM_STATE,
        ),
        train_csv_path=train_csv_path,
        test_csv_path=test_csv_path,
        target_col_name=target_col_name,
        num_classes=2,
        pipeline_transforms=transforms,
    )

    # Run cross-validation
    score_tuple, oof_predictions, model_list, test_predictions = cv.run(train, test_data=test_df)

    # Log overall CV metrics to the parent run: element 0 is recorded as the
    # out-of-fold score, element 1 as the mean per-fold score.
    mlflow.log_metric("cv_oof_score", score_tuple[0])
    mlflow.log_metric("cv_mean_score", score_tuple[1])

    # Log the fold count from the same constant the splitter used, so the
    # recorded value cannot drift from the actual split.
    mlflow.log_param("n_folds", N_FOLDS)

    print(f"\nParent run ID: {parent_run.info.run_id}")
    print(f"View results at: {mlflow.get_tracking_uri()}")

INFO:cross_validation_log:The CV results of the current fold is 0.7252647974175123
INFO:cross_validation_log:The CV results of the current fold is 0.7246800761347548
INFO:cross_validation_log:The CV results of the current fold is 0.7242439268765224
INFO:cross_validation_log:The CV results of the current fold is 0.7257604075865645
INFO:cross_validation_log:The CV results of the current fold is 0.7258381595421911
5it [02:56, 35.27s/it]

####################################################################################################
OOF prediction score :  0.7251522030581062
Mean 5-cv results : 0.7251574735115089 +- 0.0006165804248685917
####################################################################################################

Parent run ID: be55c758dac94b9285799466992d9d72
View results at: sqlite:///mlflow.db





In [72]:
# Write the test-set predictions into the sample-submission layout.
sub_name = f"submissions/diabetes_prediction_{RUN_NAME}_submission.csv"

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs(os.path.dirname(sub_name), exist_ok=True)

sample_sub = pd.read_csv("data/diabetes_prediction/sample_submission.csv", index_col=0)
sample_sub[target_col_name] = test_predictions

# Guard against silent index misalignment (if the predictions carry their own
# index) or NaN predictions ending up in the submitted file.
assert sample_sub[target_col_name].notna().all(), "submission contains missing predictions"

sample_sub.to_csv(sub_name)

In [73]:
# Load variables from the local .env file into the process environment
# (the disabled submission command below reads KAGGLE_API_TOKEN from it).
# `load_dotenv` and `os` are imported in the notebook's top import cell.
load_dotenv()

True

In [74]:
# Kaggle submission command, currently commented out.
# NOTE(review): if this is re-enabled, prefer subprocess.run([...]) with an
# argument list: interpolating DESC / sub_name into a shell string breaks on
# quotes and is injection-prone. The `export` also looks redundant, since the
# token is read from this process's environment, which child processes
# inherit - confirm before relying on it.
# os.system(f"export KAGGLE_API_TOKEN={os.getenv('KAGGLE_API_TOKEN')} && kaggle competitions submit -c playground-series-s5e12 -f {sub_name} -m '{DESC}'")