In [1]:
import pandas as pd
import numpy as np
import openfe
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from ktools.fitting.cross_validation_executor import CrossValidationExecutor
from ktools.modelling.create_oof_from_model import create_oofs_from_model
from ktools.modelling.models.catboost_model import CatBoostModel
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings

In [2]:
train_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/mental_health/train.csv"
original_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/mental_health/original.csv"
test_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/mental_health/test.csv"
sample_sub_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/mental_health/sample_submission.csv"
target_col_name = "Depression"

In [3]:
# from openfe import OpenFE, transform

# ofe = OpenFE()
# features = ofe.fit(data=X, 
#                    label=y,
#                    task='classification',
#                    # categorical_features=categorical_col_names,
#                    metric='binary_logloss',
#                    min_candidate_features=100,
#                    verbose=False)

In [8]:
def convert_to_class(x):
    x = np.where(x >= 0.5, 1, 0)
    return x

In [3]:
def original_process_func(df : pd.DataFrame):
    df['Age'] = df['Age'].astype(float)
    df['Work/Study Hours'] = df['Work/Study Hours'].astype(float)
    df['Financial Stress'] = df['Financial Stress'].astype(float)
    df['Depression'] = df['Depression'].map({'Yes' : 0, 'No' : 1})
    return df

In [4]:
def create_oofs_from_model(cross_validation_executor : CrossValidationExecutor,
                           X_train,
                           y_train,
                           X_test,
                           additional_data = None,
                           model_string : str = None,
                           directory_path : str = None,
                           sample_submission_file : str = None
                           ):
    score_tuple, oof_predictions, model_list = cross_validation_executor.run(X_train, y_train, 
                                                                             additional_data=additional_data,
                                                                             output_transform_list=[convert_to_class])
    num_splits = cross_validation_executor._num_splits

    test_predictions = np.zeros(X_test.shape[0])
    for model in model_list:
        test_predictions += model.predict(X_test)/num_splits

    model_string = str(cross_validation_executor.model) if model_string is None else model_string
    if directory_path is not None:
        pd.Series(oof_predictions).to_csv(directory_path + model_string + "_oofs.csv")
        pd.Series(test_predictions).to_csv(directory_path + model_string + "_test.csv")

        if sample_submission_file is not None:
            sample_sub = pd.read_csv(sample_submission_file)
            sample_sub.iloc[:, 1] =  test_predictions
            sample_sub.to_csv(f"{model_string}_submission.csv", index=False)
            sample_sub.head()
    
    return score_tuple, oof_predictions, test_predictions, model_list

In [21]:
from copy import deepcopy


class OnlyAllowOriginalFeatureValues:
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        settings = deepcopy(original_settings)
        train_df, test_df = settings.update()
        train = train_df[train_df['source'] == '0']
        original = train_df[train_df['source'] == '1']

        for col in settings.categorical_col_names:
            common_categories = train[col].cat.categories.union(original[col].cat.categories)
            removed_categories = train[col].cat.categories.difference(common_categories)

            print("Removed categories: ", removed_categories)
            train[col] = train[col].cat.set_categories(common_categories)
            test_df[col] = test_df[col].cat.set_categories(common_categories)
            # original[col] = original[col].cat.set_categories(common_categories)
        
        train = pd.concat([train, original])
        settings.combined_df = pd.concat([train, test_df], keys=['train', 'test'])
        return settings
    
class ThresholdRareCategories:
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, threshold=100):
        settings = deepcopy(original_settings)

        for col in settings.categorical_col_names:
            value_counts = settings.combined_df[col].value_counts()
            rare_categories = value_counts[value_counts < threshold].index
            settings.combined_df[col] = settings.combined_df[col].replace(rare_categories, np.nan)
            
        return settings

In [28]:
from functools import reduce
from ktools.preprocessing.basic_feature_transformers import *


settings = DataSciencePipelineSettings(train_csv_path,
                                        test_csv_path,
                                        target_col_name, 
                                        original_csv_path=original_csv_path,
                                        original_csv_processing=original_process_func
                                        )
transforms = [
            ThresholdRareCategories.transform,
            FillNullValues.transform,
            ConvertAllToCategorical.transform,
            # ConvertObjectToCategorical.transform,
            ]

settings = reduce(lambda acc, func: func(acc), transforms, settings)
settings.update()

train_df, test_df = settings.update()
test_df.drop(columns=[target_col_name], inplace=True)

In [29]:
train_df

Unnamed: 0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,source
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,-1.0,5.0,-1.0,-1.0,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0.0,0
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,-1.0,4.0,-1.0,-1.0,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1.0,0
2,Yuvraj,Male,33.0,Visakhapatnam,Student,missing,5.0,-1.0,8.97,2.0,-1.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1.0,0
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,-1.0,5.0,-1.0,-1.0,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1.0,0
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,-1.0,1.0,-1.0,-1.0,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143251,Raghavendra,Male,25.0,Bangalore,Working Professional,Consultant,-1.0,1.0,-1.0,-1.0,5.0,5-6 hours,Healthy,BBA,Yes,12.0,3.0,Yes,1.0,1
143252,Pihu,Female,23.0,Pune,Working Professional,Teacher,-1.0,3.0,-1.0,-1.0,1.0,Less than 5 hours,Moderate,MA,Yes,8.0,3.0,No,0.0,1
143253,Sara,Female,24.0,Srinagar,Working Professional,HR Manager,-1.0,1.0,-1.0,-1.0,4.0,Less than 5 hours,Moderate,BA,Yes,4.0,4.0,No,1.0,1
143254,Eshita,Female,56.0,Bangalore,Working Professional,Business Analyst,-1.0,2.0,-1.0,-1.0,3.0,7-8 hours,Healthy,BBA,No,4.0,5.0,Yes,1.0,1


In [30]:
train = train_df[train_df['source'] == '0']
original = train_df[train_df['source'] == '1']

train.drop(columns='source', inplace=True)
original.drop(columns='source', inplace=True)
test_df.drop(columns='source', inplace=True)

X, y = train.drop(columns=target_col_name), train[[target_col_name]]
Xog, yog = original.drop(columns=target_col_name), original[[target_col_name]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(columns='source', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original.drop(columns='source', inplace=True)


In [31]:
params = {
            'task_type'           : "CPU",
            'loss_function'       : 'Logloss',
            'eval_metric'         : "Accuracy",
            'bagging_temperature' : 0.25,
            'colsample_bylevel'   : 0.40,
            'num_boost_round'     : 5_000,
            'learning_rate'       : 0.045,
            'max_depth'           : 7,
            'l2_leaf_reg'         : 0.80,
            'min_data_in_leaf'    : 30,
            'random_strength'     : 0.25,
            'random_state'        : 42,
            'early_stopping_rounds': 200,
            'use_best_model'       : True,
            'grow_policy' : "SymmetricTree"
}

cat_model = CatBoostModel(**params, predict_type='prob')

kf = StratifiedKFold(10, shuffle=True, random_state=42)

cve = CrossValidationExecutor(cat_model,
                            accuracy_score,
                            kf,
                            verbose=2
                            )

_ = create_oofs_from_model(cve,
                       X,
                       y,
                       test_df,
                    #    additional_data=[Xog, yog],
                       model_string="scriptchef_cat",
                     #   directory_path="data/mental_health/oofs/",
                       sample_submission_file=sample_sub_csv_path)

The CV results of the current fold is 0.938592750533049
The CV results of the current fold is 0.9411513859275054
The CV results of the current fold is 0.93909026297086
The CV results of the current fold is 0.9388770433546553
The CV results of the current fold is 0.9415067519545132
The CV results of the current fold is 0.9398720682302771
The CV results of the current fold is 0.9414356787491116
The CV results of the current fold is 0.9440653873489694
The CV results of the current fold is 0.9412224591329069
The CV results of the current fold is 0.9407960199004975
####################################################################################################
OOF prediction score :  0.9406609808102345
Mean 10-cv results : 0.9406609808102345 +- 0.0015501292579130584
####################################################################################################


In [9]:
params = {
            'task_type'           : "CPU",
            'loss_function'       : 'Logloss',
            'eval_metric'         : "Accuracy",
            'bagging_temperature' : 0.25,
            'colsample_bylevel'   : 0.40,
            'num_boost_round'     : 5_000,
            'learning_rate'       : 0.045,
            'max_depth'           : 7,
            'l2_leaf_reg'         : 0.80,
            'min_data_in_leaf'    : 30,
            'random_strength'     : 0.25,
            'random_state'        : 42,
            'early_stopping_rounds': 200,
            'use_best_model'       : True,
            'grow_policy' : "Depthwise"
}

cat_model = CatBoostModel(**params, predict_type='prob')

kf = StratifiedKFold(10, shuffle=True, random_state=42)

cve = CrossValidationExecutor(cat_model,
                            accuracy_score,
                            kf,
                            verbose=2
                            )

_ = create_oofs_from_model(cve,
                       X,
                       y,
                       test_df,
                    #    additional_data=[Xog, yog],
                       model_string="scriptchef_cat_dw",
                       directory_path="data/mental_health/oofs/",
                       sample_submission_file=sample_sub_csv_path)

The CV results of the current fold is 0.9380241648898365
The CV results of the current fold is 0.940724946695096
The CV results of the current fold is 0.9384506041222459
The CV results of the current fold is 0.9378109452736318
The CV results of the current fold is 0.9394456289978678
The CV results of the current fold is 0.9400142146410803
The CV results of the current fold is 0.9406538734896944
The CV results of the current fold is 0.9421464108031272
The CV results of the current fold is 0.938592750533049
The CV results of the current fold is 0.9385216773276475
####################################################################################################
OOF prediction score :  0.9394385216773277
Mean 10-cv results : 0.9394385216773277 +- 0.0013407993635204964
####################################################################################################


  sample_sub.iloc[:, 1] =  test_predictions


In [10]:
params = {
            'task_type'           : "CPU",
            'loss_function'       : 'Logloss',
            'eval_metric'         : "Accuracy",
            'bagging_temperature' : 0.25,
            'colsample_bylevel'   : 0.40,
            'num_boost_round'     : 5_000,
            'learning_rate'       : 0.045,
            'max_depth'           : 7,
            'l2_leaf_reg'         : 0.80,
            'min_data_in_leaf'    : 30,
            'random_strength'     : 0.25,
            'random_state'        : 42,
            'early_stopping_rounds': 200,
            'use_best_model'       : True,
            'grow_policy' : "Lossguide"
}

cat_model = CatBoostModel(**params, predict_type='prob')

kf = StratifiedKFold(10, shuffle=True, random_state=42)

cve = CrossValidationExecutor(cat_model,
                            accuracy_score,
                            kf,
                            verbose=2
                            )

_ = create_oofs_from_model(cve,
                       X,
                       y,
                       test_df,
                    #    additional_data=[Xog, yog],
                       model_string="scriptchef_cat_lg",
                       directory_path="data/mental_health/oofs/",
                       sample_submission_file=sample_sub_csv_path)

The CV results of the current fold is 0.9373845060412225
The CV results of the current fold is 0.9396588486140725
The CV results of the current fold is 0.9391613361762615
The CV results of the current fold is 0.9378820184790334
The CV results of the current fold is 0.938592750533049
The CV results of the current fold is 0.9399431414356787
The CV results of the current fold is 0.9389481165600568
The CV results of the current fold is 0.9417910447761194
The CV results of the current fold is 0.9388059701492537
The CV results of the current fold is 0.9385216773276475
####################################################################################################
OOF prediction score :  0.9390689410092395
Mean 10-cv results : 0.9390689410092395 +- 0.0011570097896773624
####################################################################################################


  sample_sub.iloc[:, 1] =  test_predictions


In [21]:
df = pd.read_csv("scriptchef_cat_1xog_submission.csv")
df['Depression'] = convert_to_class(df['Depression'])
df.to_csv("scriptchef_cat_1xog_submission.csv", index=False)

In [32]:
X, y = train_df.drop(columns=target_col_name), train_df[[target_col_name]]

In [28]:
from ktools.modelling.models.xgb_model import XGBoostModel


xgb_params = {"max_bin" : 10000, "early_stopping_rounds" : 200, "num_boost_round" : 10000, "objective" : "binary:logistic", "eval_metric": 'logloss', "booster" : "gbtree", "grow_policy" : "lossguide", "sampling_method" : "uniform", 'learning_rate': 0.018987111356915002, 'max_depth': 29, 'gamma': 3.8178499121301717, 'min_child_weight': 26.925766438650484, 'subsample': 0.9311065946772249, 'colsample_bytree': 0.5668803479675172, 'colsample_bylevel': 0.713440247814915, 'colsample_bynode': 0.6843838669968836, 'reg_alpha': 0.0020902285326352865, 'reg_lambda': 9.3513032909424e-05, 'max_cat_threshold': 29, 'scale_pos_weight': 1.174724631745976}
xgb_model = XGBoostModel(**xgb_params)

kf = StratifiedKFold(10, shuffle=True, random_state=42)

cve = CrossValidationExecutor(xgb_model,
                                accuracy_score,
                                kf,
                                verbose=2
                                )

_ = create_oofs_from_model(cve,
                       X,
                       y,
                       test_df,
                    #    additional_data=[Xog, yog],
                       model_string="scriptchef_xgblg_v1",
                       directory_path="data/mental_health/oofs/",
                       sample_submission_file=sample_sub_csv_path)


The CV results of the current fold is 0.9395167022032693
The CV results of the current fold is 0.9412224591329069
The CV results of the current fold is 0.9375977256574272
The CV results of the current fold is 0.9383084577114428
The CV results of the current fold is 0.9398009950248756
The CV results of the current fold is 0.9411513859275054
The CV results of the current fold is 0.9398009950248756
The CV results of the current fold is 0.9433546552949538
The CV results of the current fold is 0.9380241648898365
The CV results of the current fold is 0.9389481165600568
####################################################################################################
OOF prediction score :  0.939772565742715
Mean 10-cv results : 0.9397725657427151 +- 0.0016586725965389972
####################################################################################################


ValueError: feature_names mismatch: ['Name', 'Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness'] ['Name', 'Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']
training data did not have the following fields: Depression

In [None]:
0.939516
0.93987

In [17]:
from ktools.modelling.models.lgbm_model import LGBMModel


lgb_params = {'early_stopping_rounds':200, 'num_boost_round' : 10000, 'objective': 'binary', 'metric': 'binary_logloss', "boosting_type" : "gbdt", "data_sample_strategy" : "bagging",'num_leaves': 451, 'max_depth': 26, 'learning_rate': 0.010610947927564743, 'subsample': 0.7773063942362304, 'colsample_bytree': 0.6356226796669067, 'reg_alpha': 0.0019295870197797132, 'reg_lambda': 1.4174471390150498e-06, 'min_data_in_leaf': 96, 'feature_fraction': 0.5805911201119025, 'max_bin': 454, 'min_child_weight': 14.361285637650104, 'scale_pos_weight': 1.0192451243940603, 'cat_smooth': 98.8188160993827}
# lgb_params = {'early_stopping_rounds':200, 'num_boost_round' : 10000, 'objective': 'binary', 'metric': 'binary_logloss', "boosting_type" : "gbdt", "data_sample_strategy" : "goss", 'num_leaves': 210, 'max_depth': 41, 'learning_rate': 0.01008246800628356, 'subsample': 0.623952789752688, 'colsample_bytree': 0.9015315945121252, 'reg_alpha': 1.1503157130258752e-05, 'reg_lambda': 0.003349460447257045, 'min_data_in_leaf': 73, 'feature_fraction': 0.7054559290882589, 'max_bin': 999, 'min_child_weight': 0.006965495868558221, 'scale_pos_weight': 1.0101069429100957, 'cat_smooth': 72.72251503864902}
lgb_model = LGBMModel(**lgb_params)

kf = StratifiedKFold(10, shuffle=True, random_state=42)

cve = CrossValidationExecutor(lgb_model,
                                accuracy_score,
                                kf,
                                verbose=2
                                )

_ = create_oofs_from_model(cve,
                       X,
                       y,
                       test_df,
                    #    additional_data=[Xog, yog],
                       model_string="scriptchef_lgbbag_v1",
                       directory_path="data/mental_health/oofs/",
                       sample_submission_file=sample_sub_csv_path)
# _ = cve.run(X, y, 
#             additional_data=[Xog, yog],
#             output_transform_list=[convert_to_class])

The CV results of the current fold is 0.9395877754086709
The CV results of the current fold is 0.9427860696517413
The CV results of the current fold is 0.9387348969438521
The CV results of the current fold is 0.9398009950248756
The CV results of the current fold is 0.9398720682302771
The CV results of the current fold is 0.9407960199004975
The CV results of the current fold is 0.9400852878464819
The CV results of the current fold is 0.9428571428571428
The CV results of the current fold is 0.9398009950248756
The CV results of the current fold is 0.9398009950248756
####################################################################################################
OOF prediction score :  0.9404122245913291
Mean 10-cv results : 0.9404122245913289 +- 0.0012950934447639702
####################################################################################################


 8.49952159e-01 9.27837548e-03]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  sample_sub.iloc[:, 1] =  test_predictions


In [18]:
df = pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/scriptchef_lgbbag_v1_submission.csv")
df[target_col_name] = np.round(df[target_col_name])
df[target_col_name] = df[target_col_name].astype(int)
df

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0
...,...,...
93795,234495,0
93796,234496,1
93797,234497,0
93798,234498,1


In [19]:
df.to_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/scriptchef_lgbbag_v1_submission.csv", index=False)