In [1]:
# Imports
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings
from ktools.preprocessing.basic_feature_transformers import *
from ktools.models import LGBMModel
from ktools.fitting.safe_cross_validation_executor import SafeCrossValidationExecutor
import mlflow
from mlflow.data.pandas_dataset import from_pandas
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
import os

In [2]:
# Install a blanket "ignore" filter so library warnings do not clutter the cell outputs.
warnings.simplefilter(action="ignore")

In [3]:
from pathlib import Path
import dotenv
import pandas as pd  # explicit import: `pd` was otherwise only reachable through a star import
from ktools.utils.find_kaggle_datasets import find_competition_info


# Read variables from a local .env file into the process environment.
dotenv.load_dotenv()

# os.getenv returns None when the variable is missing, and Path(None) fails with an
# uninformative TypeError, so check explicitly and report what is wrong.
data_dir_env = os.getenv("DATA_DIR")
if data_dir_env is None:
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; define it in the shell or in a .env file."
    )
data_dir = Path(data_dir_env)

# Resolve the CSV locations and the target column name for the competition folder.
train_csv_path, test_csv_path, sample_sub_csv_path, target_col_name = find_competition_info(data_dir / "diabetes_prediction")

# Load both frames, using the first CSV column as the index.
train_df = pd.read_csv(train_csv_path, index_col=0)
test_df = pd.read_csv(test_csv_path, index_col=0)

In [4]:
# Row position separating the rows used for fitting from the hold-out tail.
TRAIN_ROW_COUNT = 677469
train, valid = train_df.iloc[:TRAIN_ROW_COUNT], train_df.iloc[TRAIN_ROW_COUNT:]

In [5]:
# Remove the target column from the hold-out rows, leaving only the other columns.
valid = valid.drop(target_col_name, axis="columns")

In [6]:
class AppendCloseTestData(IFeatureTransformer):
    """Pipeline transform that stacks the test rows underneath the training rows.

    The rebuilt ``combined_df`` holds two keyed groups: 'train' (training rows
    followed by the test rows) and 'test' (the test rows alone).

    NOTE(review): the test rows therefore appear in both groups. This looks
    intentional given the class name, but confirm it is the desired behaviour.
    """

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose ``combined_df`` has been rebuilt.

        Args:
            original_settings: pipeline settings; not mutated, a deep copy is edited.

        Returns:
            The copied settings object with ``combined_df`` replaced.
        """
        updated_settings = deepcopy(original_settings)
        # update() is unpacked as a (train, test) pair — presumably the two frames
        # split back out of combined_df; verify against DataSciencePipelineSettings.
        train_part, test_part = updated_settings.update()
        augmented_train = pd.concat([train_part, test_part])
        updated_settings.combined_df = pd.concat(
            [augmented_train, test_part],
            keys=['train', 'test'],
        )
        return updated_settings

In [7]:
# Add the target column to the test frame with no values (mutates test_df in place).
# NOTE(review): presumably so test_df carries the same columns as the training frame
# when AppendCloseTestData concatenates the two — confirm.
test_df[target_col_name] = None

In [8]:
# Display the test frame; the target column set to None in the previous cell shows up empty.
test_df

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,70,...,Female,White,Highschool,Middle,Former,Employed,0,0,0,
700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,74,...,Female,White,Highschool,Middle,Never,Unemployed,0,0,0,
700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,71,...,Male,White,Highschool,Low,Never,Employed,0,0,0,
700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,81,...,Male,White,Graduate,Middle,Former,Employed,0,0,0,
700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,78,...,Male,White,Graduate,Low,Current,Unemployed,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,59,3,185,6.3,7.3,4.4,22.8,0.81,108,75,...,Male,White,Highschool,Upper-Middle,Former,Employed,1,0,0,
999996,50,2,25,5.8,7.8,4.5,29.6,0.93,112,70,...,Male,Asian,Postgraduate,Lower-Middle,Never,Employed,0,0,0,
999997,63,1,252,5.2,7.5,8.5,25.1,0.77,129,63,...,Female,White,Highschool,Middle,Never,Employed,0,0,0,
999998,48,3,72,4.9,6.9,1.8,27.7,0.89,121,67,...,Male,White,Highschool,Low,Current,Retired,0,1,0,


In [9]:
# Preprocessing steps handed to the executor through `pipeline_transforms`.
transforms = [
    AppendCloseTestData.transform,
    NanUnknownCategoricals.transform,
    FillNullValues.transform,
    ConvertObjectToCategorical.transform,
]

# Estimator and fold splitter, built up front so the executor call stays short.
lgbm_model = LGBMModel(num_boost_round=1000)
stratified_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation executor for the two-class target, scored with ROC AUC.
cv = SafeCrossValidationExecutor(
    sklearn_model_instance=lgbm_model,
    evaluation_metric=roc_auc_score,
    kfold_object=stratified_folds,
    train_csv_path=train_csv_path,
    test_csv_path=test_csv_path,
    target_col_name=target_col_name,
    num_classes=2,
    pipeline_transforms=transforms,
)

# Run cross-validation on `train`, passing `test_df` as the test data; the result is
# unpacked into scores, out-of-fold predictions, fitted models and test predictions.
score_tuple, oof_predictions, model_list, test_predictions = cv.run(train, test_data=test_df)

0it [00:00, ?it/s]

Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status as they are unknown in train set.
Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status 

INFO:cross_validation_log:The CV results of the current fold is 0.7671085175241416
1it [00:31, 31.94s/it]

Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status as they are unknown in train set.
Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status 

INFO:cross_validation_log:The CV results of the current fold is 0.7657479292882121
2it [01:04, 32.24s/it]

Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status as they are unknown in train set.
Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status 

INFO:cross_validation_log:The CV results of the current fold is 0.7656220376312837
3it [01:37, 32.84s/it]

Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status as they are unknown in train set.
Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status 

INFO:cross_validation_log:The CV results of the current fold is 0.7650398217488337
4it [02:10, 32.54s/it]

Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status as they are unknown in train set.
Setting 0 values to NaN in test column gender as they are unknown in train set.
Setting 0 values to NaN in test column ethnicity as they are unknown in train set.
Setting 0 values to NaN in test column education_level as they are unknown in train set.
Setting 0 values to NaN in test column income_level as they are unknown in train set.
Setting 0 values to NaN in test column smoking_status as they are unknown in train set.
Setting 0 values to NaN in test column employment_status 

INFO:cross_validation_log:The CV results of the current fold is 0.7661479133574869
5it [02:42, 32.59s/it]

####################################################################################################
OOF prediction score :  0.7659332712601457
Mean 5-cv results : 0.7659332439099916 +- 0.0006865486224304354
####################################################################################################





In [16]:
# Destination of the submission file, relative to the current working directory.
sub_name = "submissions/diabetes_prediction_withvaldata_1000trees_submission.csv"

# Create the output folder first: to_csv does not create missing parent directories,
# so a fresh checkout without `submissions/` would otherwise fail here.
os.makedirs(os.path.dirname(sub_name), exist_ok=True)

# Start from the sample submission (first CSV column as index), overwrite the target
# column with the test predictions, and write the result out.
sample_sub = pd.read_csv(sample_sub_csv_path, index_col=0)
sample_sub[target_col_name] = test_predictions
sample_sub.to_csv(sub_name)

In [73]:
from dotenv import load_dotenv
import os

# Load variables from the .env file into the process environment (the cell output shows True).
# NOTE(review): presumably needed for KAGGLE_API_TOKEN, which the commented-out submit
# command in the next cell reads via os.getenv — confirm.
load_dotenv()

True

In [74]:
# NOTE(review): disabled Kaggle submission command, kept for reference. `DESC` is not
# defined in the cells visible here, and the API token would be interpolated into a
# shell string; prefer passing it through the environment if this is re-enabled.
# os.system(f"export KAGGLE_API_TOKEN={os.getenv('KAGGLE_API_TOKEN')} && kaggle competitions submit -c playground-series-s5e12 -f {sub_name} -m '{DESC}'")