In [69]:
import pandas as pd

from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings
from ktools.preprocessing.basic_feature_transformers import *
from ktools.modelling.ktools_models.lgbm_model import LGBMModel
from ktools.fitting.safe_cross_validation_executor import SafeCrossValidationExecutor

In [70]:
# Notebook-wide configuration: the label column and the train/test CSV locations
# (relative paths, so they resolve against the kernel's working directory).
target_col_name = "diagnosed_diabetes"
train_csv_path, test_csv_path = (
    "data/diabetes_prediction/train.csv",
    "data/diabetes_prediction/test.csv",
)

In [71]:
# Load the raw training split for exploratory inspection and display its column names.
# `pd` is imported in the notebook's first import cell, so this cell also runs on a
# fresh kernel (Restart & Run All) instead of relying on a later `import pandas` cell.
df = pd.read_csv(train_csv_path)
df.columns.tolist()

['id',
 'age',
 'alcohol_consumption_per_week',
 'physical_activity_minutes_per_week',
 'diet_score',
 'sleep_hours_per_day',
 'screen_time_hours_per_day',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'diastolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'gender',
 'ethnicity',
 'education_level',
 'income_level',
 'smoking_status',
 'employment_status',
 'family_history_diabetes',
 'hypertension_history',
 'cardiovascular_history',
 'diagnosed_diabetes']

In [97]:
# Relabel the integer-cast target as readable text, then pass it together with the
# employment status column to plot_cross_tabulation.
diabetes_flag = df[target_col_name].astype(int)
diabetes_label = diabetes_flag.apply(
    lambda flag: "Has diabetes" if flag == 1 else "No diabetes"
)
plot_cross_tabulation(diabetes_label, df["employment_status"])

In [None]:
# NOTE(review): `reduce` is no longer used in this cell; the import is kept in case a
# later cell relies on it.
from functools import reduce


# `train_csv_path`, `test_csv_path` and `target_col_name` are defined once in the
# configuration cell near the top of the notebook. They are deliberately not redefined
# here, so the dataset location has a single source of truth.
settings = DataSciencePipelineSettings(train_csv_path,
                                        test_csv_path,
                                        target_col_name
                                        )

# update() was previously called twice with the first result thrown away; a single
# call is kept and its (train, test) pair is unpacked.
# NOTE(review): this assumes update() is idempotent - confirm against the ktools source.
train, test_df = settings.update()

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Preprocessing callables handed to the executor as `pipeline_transforms`.
transforms = [FillNullValues.transform, ConvertObjectToCategorical.transform]

# Model and splitter are built up front so the executor call below reads as plain wiring.
lgbm_model = LGBMModel(num_boost_round=1000)
stratified_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation executor: 5 shuffled stratified folds, scored with ROC AUC.
cv = SafeCrossValidationExecutor(
    sklearn_model_instance=lgbm_model,
    evaluation_metric=roc_auc_score,
    kfold_object=stratified_folds,
    train_csv_path=train_csv_path,
    test_csv_path=test_csv_path,
    target_col_name=target_col_name,
    num_classes=2,
    pipeline_transforms=transforms,
)

In [None]:
# Run cross-validation on the training frame, passing the test frame as `test_data`,
# then unpack the four values that run() returns.
cv_results = cv.run(train, test_data=test_df)
score_tuple, oof_predictions, model_list, test_predictions = cv_results

In [None]:
import pandas as pd
from scipy.stats import rankdata

In [None]:
# Previously written submission files that feed the blend below.
rankingnn_submission_path = "submissions/diabetes_prediction_rankingnn_submission.csv"
lgbm_submission_path = "submissions/diabetes_prediction_lgbm_1000trees_submission.csv"

pred_1 = pd.read_csv(rankingnn_submission_path)
pred_2 = pd.read_csv(lgbm_submission_path)

In [None]:
# Weighted blend of the two submissions on rank rather than raw score:
# weight 0.1 on pred_1 and 0.9 on pred_2.
# NOTE(review): rows are combined positionally, so both files must list their rows
# in the same order - confirm.
weight_1, weight_2 = 0.1, 0.9
ranks_1 = rankdata(pred_1["diagnosed_diabetes"])
ranks_2 = rankdata(pred_2["diagnosed_diabetes"])
ensemble = weight_1 * ranks_1 + weight_2 * ranks_2

In [None]:
# Start from the sample submission (first CSV column used as the index), replace its
# prediction column with the blended scores, and write the result to disk.
submission_template_path = "data/diabetes_prediction/sample_submission.csv"
blended_submission_path = "submissions/diabetes_prediction_lgbm+ranknn_submission.csv"

sample_sub = pd.read_csv(submission_template_path, index_col=0).assign(
    diagnosed_diabetes=ensemble
)
sample_sub.to_csv(blended_submission_path)