In [1]:
import pandas as pd
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings

In [2]:
# Directory that holds the loan-approval CSV files. This is an absolute,
# machine-specific location; it is the single place to edit when the data moves.
DATA_DIR = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval"

# Train, original and test CSV files, all located under DATA_DIR.
train_csv_path = f"{DATA_DIR}/train.csv"
original_csv_path = f"{DATA_DIR}/original.csv"
test_csv_path = f"{DATA_DIR}/test.csv"
# Column handed to the pipeline as the prediction target.
target_col_name = "loan_status"

In [7]:
from copy import deepcopy
from functools import reduce
import numpy as np
from ktools.preprocessing.basic_feature_transformers import ConvertObjectToCategorical, FillNullValues


class AddOGData:
    """Pipeline transform that appends an extra ("original") dataset to the training frame."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings,
                  original_csv_path : str = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval/original.csv"):
        """Return a deep copy of the settings with the extra rows added to ``train_df``.

        Args:
            original_settings: settings object to start from; it is deep-copied
                and never modified.
            original_csv_path: CSV file holding the extra rows, read with its
                first column as the index. Defaults to the path that used to be
                hard-coded here, so single-argument callers behave as before.

        Returns:
            The copied settings, where
            * ``train_df`` is the previous training frame (``source=0``) followed
              by the extra rows (``source=1``), re-indexed from zero;
            * ``combined_df`` is rebuilt from the new ``train_df`` and ``test_df``
              (``source=0``) under the outer keys ``'train'`` / ``'test'``;
            * ``categorical_col_names`` additionally contains ``'person_income'``.
        """
        settings = deepcopy(original_settings)
        original_data = pd.read_csv(original_csv_path, index_col=0)
        # Tag every row with its origin before stacking so the two sources stay distinguishable.
        settings.train_df = pd.concat([settings.train_df.assign(source=0), original_data.assign(source=1)]).reset_index(drop=True)
        # combined_df has to be rebuilt because it is derived from the (now longer) train_df.
        settings.combined_df = pd.concat([settings.train_df, settings.test_df.assign(source=0)], keys=['train', 'test'])
        settings.categorical_col_names += ['person_income']
        return settings
    
class AddLoanPrediction:
    """Pipeline transform that adds the ``loantoincome`` feature to ``combined_df``."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with ``loantoincome`` added.

        ``loantoincome`` is ``loan_amnt / person_income`` minus the existing
        ``loan_percent_income`` column. The new column name is also appended to
        ``training_col_names``. The input settings object is left untouched.
        """
        updated = deepcopy(original_settings)
        frame = updated.combined_df
        recomputed_ratio = frame['loan_amnt'] / frame['person_income']
        frame['loantoincome'] = recomputed_ratio - frame['loan_percent_income']
        updated.training_col_names += ['loantoincome']
        return updated
        
class ConvertObjectToCategorical():
    """Pipeline transform that casts the declared categorical columns to ``category``.

    This local definition replaces the class of the same name imported from
    ``ktools.preprocessing.basic_feature_transformers`` earlier in this cell.
    """

    # Declared static like the sibling transforms, so it can be called on the
    # class or on an instance without an implicit ``self`` being passed.
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with categorical columns re-typed.

        Every column listed in ``categorical_col_names`` is converted to ``str``
        and then to ``category`` inside ``combined_df``. The input settings
        object is left untouched.
        """
        settings = deepcopy(original_settings)
        cat_cols = settings.categorical_col_names
        settings.combined_df[cat_cols] = settings.combined_df[cat_cols].astype(str).astype('category')
        return settings
    
class ConvertAllToCategorical:
    """Pipeline transform that casts every training column to ``category``."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with all training columns re-typed.

        Every column listed in ``training_col_names`` is converted to ``str``
        and then to ``category`` inside ``combined_df``. The input settings
        object is left untouched.
        """
        updated = deepcopy(original_settings)
        feature_cols = updated.training_col_names
        as_category = updated.combined_df[feature_cols].astype(str).astype('category')
        updated.combined_df[feature_cols] = as_category
        return updated

In [8]:
# Build the pipeline settings from the train/test CSV paths and the target column name.
settings = DataSciencePipelineSettings(train_csv_path,
                                        test_csv_path,
                                        target_col_name, 
                                        # original_csv_path=original_csv_path
                                        )
# Transforms run in list order: each one is called with the current settings
# object and its return value becomes the input of the next.
transforms = [
            AddOGData.transform,
            AddLoanPrediction.transform,
            FillNullValues.transform,
            ConvertAllToCategorical.transform,
            # ConvertObjectToCategorical.transform,
            ]

# Left fold over the transform list, starting from the freshly built settings.
settings = reduce(lambda acc, func: func(acc), transforms, settings)
# NOTE(review): update() is called twice and this first return value is discarded.
# Confirm whether the call is needed for a side effect or can be removed.
settings.update()

# update() is unpacked into the processed train and test frames.
train_df, test_df = settings.update()
# In-place drop: this raises KeyError if the cell is re-run on the same test_df object.
test_df.drop(columns=[target_col_name], inplace=True)
# Feature matrix and target series taken from the processed training frame.
X, y = train_df.drop(columns=target_col_name), train_df[target_col_name]

In [9]:
# train = train_df[train_df['source'] == 0].drop(columns='source')
# original = train_df[train_df['source'] == 1].drop(columns='source')

In [12]:
train_df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,source,loantoincome
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0.0,0,0.001428571428571418
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0.0,0,0.001428571428571418
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0.0,0,-0.0016666666666666496
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0.0,0,0.001428571428571418
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91221,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0.0,1,-0.0005660377358490537
91222,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0.0,1,-0.003124999999999989
91223,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1.0,1,0.0005263157894736525
91224,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.1,N,26,0.0,1,0.0


In [11]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from ktools.fitting.cross_validation_executor import CrossValidationExecutor
from ktools.modelling.create_oof_from_model import create_oofs_from_model
from ktools.modelling.models.catboost_model import CatBoostModel
from ktools.modelling.models.lgbm_model import LGBMModel


# LightGBM configuration for a binary objective; up to 5000 boosting rounds with
# early stopping after 200 rounds.
lgb_params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_boost_round' : 5000, 'early_stopping_rounds' : 200}
lgb_model = LGBMModel(**lgb_params)

# 10-fold stratified split, shuffled with a fixed seed so folds are reproducible.
kf = StratifiedKFold(10, shuffle=True, random_state=42)

# Cross-validation driver: model, scoring function (ROC AUC) and splitter.
cve = CrossValidationExecutor(lgb_model,
                                roc_auc_score,
                                kf,
                                verbose=2
                                )

# Features and target are both passed as DataFrames (note the double brackets on the target).
# NOTE(review): the meaning of the trailing positional arguments (None, "") is not
# visible in this notebook — confirm against create_oofs_from_model's signature.
_ = create_oofs_from_model(cve,
                       train_df.drop(columns=target_col_name),
                       train_df[[target_col_name]],
                       test_df,
                       None,
                       "")

The CV results of the current fold is 0.9611847900152654
The CV results of the current fold is 0.9647547789225956
The CV results of the current fold is 0.9593130193659881
The CV results of the current fold is 0.9607571835579897
The CV results of the current fold is 0.9566103655766302
The CV results of the current fold is 0.9601574759591951
The CV results of the current fold is 0.9616758773891179
The CV results of the current fold is 0.9538220120249794
The CV results of the current fold is 0.9633363375321538
The CV results of the current fold is 0.9550227381080857
####################################################################################################
OOF prediction score :  0.959595445318167
Mean 10-cv results : 0.9596634578452001 +- 0.003351404449022676
####################################################################################################


In [20]:
test_df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.10,Y,4
2,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.90,0.14,N,7
4,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4
...,...,...,...,...,...,...,...,...,...,...,...
39093,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.10,N,4
39094,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3
39095,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25
39096,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4


In [21]:
# Same cross-validation set-up as the previous run; reuses lgb_model and kf from
# the earlier cell.
cve = CrossValidationExecutor(lgb_model,
                              roc_auc_score,
                              kf,
                              verbose=2
                              )

# NOTE(review): `train` and `original` are not defined by any active cell in this
# notebook; the only visible definitions (splitting train_df on its `source` column)
# are commented out in an earlier cell. On a fresh kernel this cell raises NameError.
# The `original` rows are passed separately through additional_data as a
# [features, target] pair rather than as part of the main training frame.
_ = create_oofs_from_model(cve,
                       train.drop(columns=target_col_name), 
                       train[[target_col_name]],
                       test_df,
                       additional_data=[original.drop(columns=target_col_name), original[[target_col_name]]])

# .run(train.drop(columns=target_col_name), train[[target_col_name]], 
#                                                                   additional_data=[original.drop(columns=target_col_name), original[[target_col_name]]])

The CV results of the current fold is 0.95966905155891
The CV results of the current fold is 0.9550431542481638
The CV results of the current fold is 0.9678543112581992
The CV results of the current fold is 0.9658008833228175
The CV results of the current fold is 0.9613402221402125
The CV results of the current fold is 0.9595995918284727
The CV results of the current fold is 0.9612865738001032
The CV results of the current fold is 0.9630207074417481
The CV results of the current fold is 0.9627268429932738
The CV results of the current fold is 0.9612644268035813
####################################################################################################
OOF prediction score :  0.9616277340933997
Mean 10-cv results : 0.9617605765395482 +- 0.0033285870479824622
####################################################################################################


In [4]:
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import StratifiedKFold
# from ktools.fitting.cross_validation_executor import CrossValidationExecutor
# from ktools.modelling.models.catboost_model import CatBoostModel


# cat_params = {'loss_function' : "Logloss", 'eval_metric': 'AUC', 'early_stopping_rounds': 48, 'max_bin': 357, 'learning_rate': 0.039926066217791725, 'depth': 11, 'num_boost_round': 4349, 'bagging_temperature': 5.753214521222872, 'subsample': 0.9181545312144901, 'colsample_bylevel': 0.6006232656816098, 'min_data_in_leaf': 353.5824003756882, 'l2_leaf_reg': 6.122291037651919, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 2, 'scale_pos_weight': 2.2637139313632977, 'random_strength': 6.881176315494768, 'leaf_estimation_method': 'Newton'}

# cat_model = CatBoostModel(**cat_params)

# kf = StratifiedKFold(10, shuffle=True, random_state=42)

# score_tuple, oof_preds, model_list = CrossValidationExecutor(cat_model,
#                                                             roc_auc_score,
#                                                             kf,
#                                                             verbose=2
#                                                             ).run(X, y)

In [43]:
import os

def _strip_index_and_prefix(frame, prefix):
    """Drop a leading 0, 1, 2, ... index column if present, then prefix every column name."""
    if frame.shape[1] > 0:
        # Inspect at most the first three values of the first column. Comparing
        # against a range of the same length keeps the check valid for frames
        # with fewer than three rows.
        leading = frame.iloc[:3, 0].tolist()
        if leading and leading == list(range(len(leading))):
            frame = frame.drop(columns=frame.columns[0])
    return frame.rename(columns={name: prefix + name for name in frame.columns})


def oofs_reader(directory):
    """Collect saved out-of-fold and test prediction CSVs from ``directory``.

    Files are visited in sorted name order. A file whose name contains
    ``"arch"`` is skipped; otherwise a name containing ``"oof"`` is treated as
    an out-of-fold file, and failing that a name containing ``"test"`` is
    treated as a test file. Any other file is ignored. For each file read, a
    leading column that simply counts 0, 1, 2, ... is dropped and every
    remaining column is renamed to the first seven characters of the file name
    followed by the original column name.

    Args:
        directory: path of the folder to scan.

    Returns:
        ``(oof_frame, test_frame)``: the out-of-fold frames and the test frames,
        each concatenated side by side (``axis=1``).

    Raises:
        ValueError: if no out-of-fold file or no test file is found.
    """
    collected = {"oof": [], "test": []}
    for file in sorted(os.listdir(directory)):
        if "arch" in file:
            continue
        # "oof" is checked first, so a name containing both markers counts as oof.
        if "oof" in file:
            kind = "oof"
        elif "test" in file:
            kind = "test"
        else:
            continue
        frame = pd.read_csv(os.path.join(directory, file)).reset_index(drop=True)
        collected[kind].append(_strip_index_and_prefix(frame, file[:7]))

    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    for kind, frames in collected.items():
        if not frames:
            raise ValueError(f"No '{kind}' prediction files found in {directory!r}")

    return pd.concat(collected["oof"], axis=1), pd.concat(collected["test"], axis=1)

# Load every saved out-of-fold / test prediction file from the loan_prediction folder.
# NOTE: this rebinds the global `test_df`, which earlier cells use for the processed
# test features; after this cell it holds stacked test predictions instead.
oof_df, test_df = oofs_reader("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction")

In [28]:
def trans(x):
    """Logistic sigmoid 1 / (1 + e^(-x)), evaluated elementwise through NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Load the saved bestcat_1 out-of-fold and test predictions (first CSV column is the index).
df = pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/bestcat_1_oofs.csv", index_col=0)
tdf = pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/bestcat_1_test.csv", index_col=0)


# Apply the sigmoid `trans` to every value.
# NOTE(review): presumably the files hold raw scores rather than probabilities — confirm.
df = trans(df)
tdf = trans(tdf)
tdf

Unnamed: 0,0
0,0.999883
1,0.015100
2,0.443245
3,0.004797
4,0.024369
...,...
39093,0.057530
39094,0.001843
39095,0.003785
39096,0.210342


In [29]:
# WARNING: these are the same two files the previous cell reads. Writing the
# transformed frames back means that re-running both cells applies the sigmoid
# again to values that have already been transformed.
df.to_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/bestcat_1_oofs.csv")
tdf.to_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/bestcat_1_test.csv")

In [None]:
# Literal list of column-name tuples (two to four names each). The cell has no
# execution count and the list is not bound to a name, so nothing else can use it.
# NOTE(review): presumably a saved set of feature combinations from an earlier
# search — TODO confirm, assign it to a variable and record where it came from.
[('person_home_ownership', 'loan_intent'),
('person_home_ownership', 'loan_percent_income'),
('person_home_ownership', 'loan_grade', 'loan_percent_income'),
('loan_intent', 'loan_grade'),
('person_home_ownership', 'loan_intent', 'loan_grade'),
('person_age', 'person_income', 'loan_amnt', 'loan_int_rate'),
('person_emp_length', 'loan_grade'),
('person_income', 'person_home_ownership'),
('loan_grade', 'cb_person_default_on_file', 'source'),
('person_income', 'loan_intent'),
('person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade'),
('loan_intent', 'loan_grade', 'cb_person_default_on_file', 'source'),
('loan_intent', 'loan_grade', 'source'),
('person_home_ownership', 'person_emp_length', 'loan_grade'),
('person_age', 'loan_intent', 'source'),
('person_income', 'cb_person_default_on_file'),
('person_income', 'loan_percent_income'),
('loan_intent', 'source'),
('loan_amnt', 'loan_percent_income'),
('person_home_ownership', 'loan_grade'),
('person_home_ownership', 'loan_amnt', 'loan_percent_income'),
('cb_person_cred_hist_length', 'source'),
('person_home_ownership', 'loan_intent', 'loan_grade', 'source'),
('person_home_ownership', 'loan_percent_income', 'source'),
('person_home_ownership', 'loan_intent', 'cb_person_default_on_file'),
('person_income', 'source'),
('loan_grade', 'loan_percent_income', 'source'),
('person_age', 'loan_intent', 'cb_person_cred_hist_length', 'source'),
('loan_intent', 'loan_int_rate'),
('person_home_ownership', 'loan_grade', 'source'),
('person_emp_length', 'loan_grade', 'loan_int_rate'),
('person_income', 'person_home_ownership', 'loan_amnt'),
('loan_intent', 'loan_grade', 'cb_person_default_on_file'),
('person_home_ownership', 'cb_person_cred_hist_length'),
('person_income', 'person_home_ownership', 'cb_person_default_on_file'),
('loan_intent', 'loan_amnt', 'loan_percent_income'),
('person_emp_length', 'loan_grade', 'source'),
('person_emp_length', 'loan_percent_income'),
('loan_intent', 'loan_grade', 'loan_amnt', 'cb_person_default_on_file'),
('loan_intent', 'loan_percent_income', 'cb_person_cred_hist_length'),
('person_income', 'person_home_ownership', 'loan_intent', 'cb_person_default_on_file'),
('person_income', 'person_home_ownership', 'loan_intent'),
('person_home_ownership', 'loan_grade', 'cb_person_default_on_file'),
('loan_intent', 'loan_int_rate', 'cb_person_default_on_file'),
('loan_grade', 'loan_percent_income', 'cb_person_default_on_file', 'source'),
('person_home_ownership', 'loan_intent', 'loan_amnt'),
('loan_int_rate', 'source')]

In [58]:
pd.read_csv("/Users/yuwei-1/Downloads/bestcat_2_test.csv", index_col=0).values

array([[ 8.63715881],
       [-4.03324115],
       [-0.18332708],
       ...,
       [-6.2867994 ],
       [-1.1359047 ],
       [ 3.23419269]])

In [33]:
# Remove the target column from the test frame in place. errors="ignore" keeps the
# cell re-runnable: without it a second run (or a frame from which the column was
# already removed) raises KeyError.
test_df.drop(columns="loan_status", inplace=True, errors="ignore")

In [36]:
# Two saved test-prediction tables, each read with its first column as the index
# and then re-indexed from zero so they line up row by row.
ag_test_preds = pd.read_csv(
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/ag_addidata_test_v2.csv",
    index_col=0,
).reset_index(drop=True)
martynov_test_preds = pd.read_csv(
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/martynov-clean-test.csv",
    index_col=0,
).reset_index(drop=True)

# Side-by-side concatenation: one column per saved model prediction.
test_x = pd.concat([ag_test_preds, martynov_test_preds], axis=1)

In [37]:
test_x

Unnamed: 0,LightGBMXT_BAG_L1,LightGBM_BAG_L1,RandomForestGini_BAG_L1,RandomForestEntr_BAG_L1,CatBoost_BAG_L1,XGBoost_BAG_L1,NeuralNetTorch_BAG_L1,LightGBMLarge_BAG_L1,CatBoost_r177_BAG_L1,NeuralNetTorch_r79_BAG_L1,...,CatBoost_r50_BAG_L2,WeightedEnsemble_L3,LGB1,LGB2,CAT,CAT1,CAT2,XGB0,XGB1,E1
0,0.999782,0.996940,0.971333,0.981333,0.999857,0.999890,0.993219,0.996528,0.999782,0.995067,...,0.989981,0.994533,0.999955,0.998753,0.999939,0.998844,0.999170,0.934882,0.998903,0.999122
1,0.030247,0.008282,0.033333,0.034333,0.016408,0.013243,0.038308,0.010045,0.019432,0.036820,...,0.020210,0.017961,0.017512,0.030356,0.016468,0.020656,0.018543,0.040253,0.038495,0.025796
2,0.533729,0.419893,0.613000,0.607000,0.508671,0.392825,0.850909,0.475263,0.518086,0.280695,...,0.526234,0.490603,0.625537,0.491923,0.453507,0.403703,0.446856,0.492839,0.496028,0.546212
3,0.004681,0.003685,0.015000,0.014000,0.005674,0.004259,0.003913,0.003600,0.005356,0.003581,...,0.004887,0.003979,0.003318,0.004932,0.006137,0.006430,0.007067,0.027091,0.003866,0.004753
4,0.045740,0.049957,0.111333,0.128000,0.025796,0.046305,0.090181,0.043588,0.021682,0.043770,...,0.032104,0.029351,0.020414,0.035369,0.027602,0.031905,0.037643,0.126633,0.037888,0.032808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39093,0.050781,0.049688,0.066667,0.060667,0.082740,0.058249,0.122060,0.040283,0.078572,0.101660,...,0.072282,0.076062,0.017971,0.048487,0.072559,0.074261,0.082678,0.084184,0.039725,0.040606
39094,0.004882,0.013830,0.000667,0.001333,0.002460,0.005798,0.014152,0.012516,0.002843,0.008644,...,0.005827,0.005419,0.001371,0.005599,0.002218,0.002705,0.002017,0.023074,0.005590,0.004535
39095,0.016162,0.009434,0.015333,0.017333,0.005839,0.012295,0.011422,0.009886,0.006897,0.011757,...,0.011048,0.009905,0.001643,0.006687,0.004546,0.003922,0.004266,0.019017,0.006226,0.005211
39096,0.264254,0.201689,0.425000,0.418000,0.182001,0.217152,0.297378,0.194487,0.197148,0.292000,...,0.194889,0.164657,0.056469,0.145481,0.220278,0.247020,0.209275,0.408997,0.215451,0.132591


In [38]:
# Average the predictions of every model in model_list over the rows of test_x.
# The accumulator is sized from test_x, the frame that is actually predicted on.
# NOTE(review): model_list must contain fitted models exposing predict(); its only
# visible source in this notebook is a commented-out cross-validation cell — confirm.
test_preds = np.zeros(len(test_x))
for model in model_list:
    # Divide by the real number of models instead of a hard-coded 10, so the
    # result is still a mean when the number of folds/models changes.
    test_preds += model.predict(test_x) / len(model_list)

In [39]:
test_preds

array([0.99304059, 0.01830278, 0.46464564, ..., 0.00491329, 0.18039983,
       0.97982268])

In [51]:
tp = pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/basics-widedeep-stack_test.csv", index_col=0).values

In [23]:
# Fill the sample submission's loan_status column with saved test predictions,
# write it out as submission_v13.csv, and display it.
sample_sub = pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval/sample_submission.csv")
# NOTE(review): the prediction file name embeds an object repr
# ("<...LGBMModel object at 0x...>"), which contains a memory address and will not
# be the same in another session — presumably the model object ended up in the
# output file name when the predictions were saved; confirm and give the file a stable name.
sample_sub['loan_status'] =  pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/<ktools.modelling.models.lgbm_model.LGBMModel object at 0x344834580>_test.csv", index_col=0).values
sample_sub.to_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/submissions/loan_approval/submission_v13.csv", index=False)
sample_sub

Unnamed: 0,id,loan_status
0,58645,0.997912
1,58646,0.022062
2,58647,0.467068
3,58648,0.009314
4,58649,0.060857
...,...,...
39093,97738,0.070677
39094,97739,0.009927
39095,97740,0.005141
39096,97741,0.243219


In [11]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from ktools.modelling.create_hill_climber_from_params import CreateHillClimber
from ktools.modelling.models.catboost_model import CatBoostModel
from ktools.modelling.models.lgbm_model import LGBMModel
from ktools.modelling.models.xgb_model import XGBoostModel

# {'loss_function' : "Logloss", 'eval_metric': 'AUC', 'early_stopping_rounds': 48, 'max_bin': 357, 'learning_rate': 0.039926066217791725, 'depth': 11, 'num_boost_round': 4349, 'bagging_temperature': 5.753214521222872, 'subsample': 0.9181545312144901, 'colsample_bylevel': 0.6006232656816098, 'min_data_in_leaf': 353.5824003756882, 'l2_leaf_reg': 6.122291037651919, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 2, 'scale_pos_weight': 2.2637139313632977, 'random_strength': 6.881176315494768, 'leaf_estimation_method': 'Newton'}
# LightGBM hyper-parameters.
# NOTE(review): LightGBM documents `subsample` as an alias of `bagging_fraction` and
# `colsample_bytree` as an alias of `feature_fraction`; each pair is set here with
# different values — confirm which one the wrapper/LightGBM actually uses.
# NOTE(review): the values look like the output of an automated search — TODO record provenance.
lgb_params = {'objective': 'binary', 'metric': 'binary_logloss', 'early_stopping_rounds': 106, 'num_leaves': 269, 'max_depth': 19, 'learning_rate': 0.013021537162390245, 'num_boost_round': 855, 'subsample': 0.6447447783553862, 'colsample_bytree': 0.9820268958585274, 'reg_alpha': 0.0002825530953671368, 'reg_lambda': 0.010999933343859751, 'min_data_in_leaf': 1, 'feature_fraction': 0.6473240524904155, 'bagging_fraction': 0.881236488962646, 'bagging_freq': 4, 'min_child_weight': 0.0022282898086662837, 'cat_smooth': 7.573381488731368}
lgb_model = LGBMModel(**lgb_params)

# CatBoost hyper-parameters (Logloss objective, AUC as the evaluation metric).
cat_params = {'loss_function' : "Logloss", 'eval_metric': 'AUC', 'early_stopping_rounds': 16, 'max_bin': 462, 'learning_rate': 0.12391490493192958, 'depth': 14, 'iterations': 214, 'bagging_temperature': 4.149493858174805, 'subsample': 0.8299281128444709, 'colsample_bylevel': 0.8071761616090766, 'min_data_in_leaf': 385.7381455851395, 'l2_leaf_reg': 0.6124495112903168, 'grow_policy': 'Depthwise', 'leaf_estimation_iterations': 3, 'random_strength': 1.16839732170397, 'leaf_estimation_method': 'Newton'}
cat_model = CatBoostModel(**cat_params)

# XGBoost hyper-parameters (binary:logistic objective, logloss as the evaluation metric).
xgb_params = {"objective" : "binary:logistic", "eval_metric": 'logloss', 'max_bin': 422, 'learning_rate': 0.01412732955835383, 'max_depth': 46, 'num_boost_round': 902, 'gamma': 1.2242267941097515, 'min_child_weight': 0.36649080798388867, 'subsample': 0.9798023274013429, 'colsample_bytree': 0.7762824974417832, 'colsample_bylevel': 0.7315173512004598, 'colsample_bynode': 0.6666802095159101, 'reg_alpha': 2.9781427235209075e-06, 'reg_lambda': 0.2675868649587926, 'max_cat_threshold': 998, 'grow_policy': 'depthwise'}
xgb_model = XGBoostModel(**xgb_params)


# Models and their names are paired by position: lgb, cat, xgb.
# NOTE: this rebinds the global `model_list` to three model wrappers; the
# prediction-averaging loop elsewhere in this notebook also iterates `model_list`.
model_list = [lgb_model, cat_model, xgb_model]
model_names = ['lgb', 'cat', 'xgb']
# 10-fold stratified split, shuffled with a fixed seed so folds are reproducible.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-model feature selection, keyed by model name; None for every model here.
# NOTE(review): presumably None means "use all columns" — confirm in CreateHillClimber.
model_features = {'lgb' : None,
                  'xgb' : None,
                  'cat' : None}

# Hill-climbing ensemble builder scored with ROC AUC (objective "maximize"),
# with negative_weights switched on.
hcobj = CreateHillClimber(train_df,
                  test_df,
                  model_list,
                  model_names,
                  model_features,
                  roc_auc_score,
                  target_col_name=target_col_name,
                  kfold=kf,
                  objective="maximize",
                  negative_weights=True
                  )

In [12]:
hillclimber = hcobj.fit()

####################################################################################################
OOF prediction score :  0.9658259204928148
Mean 10-cv results : 0.9657944910811527 +- 0.003012343891116537
####################################################################################################
####################################################################################################
OOF prediction score :  0.9646633118689777
Mean 10-cv results : 0.9647136537571595 +- 0.0024591467314678916
####################################################################################################
####################################################################################################
OOF prediction score :  0.9655216356707257
Mean 10-cv results : 0.9655134586943592 +- 0.0032356641826998787
####################################################################################################


In [6]:
# Keep only the lgb / xgb / cat columns, in that order, in both prediction frames.
# NOTE(review): the recorded output of this cell is a NameError — it was executed in
# a kernel where `hillclimber` did not exist yet. It must run after the cell that
# assigns `hillclimber = hcobj.fit()`.
hillclimber.train_oof_pred = hillclimber.train_oof_pred[['lgb', 'xgb', 'cat']]
hillclimber.test_pred = hillclimber.test_pred[['lgb', 'xgb', 'cat']]

NameError: name 'hillclimber' is not defined

In [14]:
# Pass the 'cat' out-of-fold column through the sigmoid `trans` (defined in a
# separate cell of this notebook), overwriting the column in place.
# NOTE(review): presumably the CatBoost wrapper returns raw scores — confirm.
# WARNING: not idempotent; re-running this cell applies the sigmoid a second time.
hillclimber.train_oof_pred['cat'] = hillclimber.train_oof_pred['cat'].apply(trans)
hillclimber.train_oof_pred

Unnamed: 0,lgb,cat,xgb
0,0.011416,0.035596,0.012699
1,0.002840,0.004010,0.007108
2,0.003492,0.001944,0.006788
3,0.002796,0.004896,0.006102
4,0.003788,0.004768,0.004880
...,...,...,...
91221,0.016551,0.027081,0.030286
91222,0.001707,0.003484,0.002870
91223,0.969462,0.978935,0.951568
91224,0.001502,0.006854,0.002142


In [15]:
# Pass the 'cat' test-prediction column through the sigmoid `trans` (defined in a
# separate cell of this notebook), overwriting the column in place.
# WARNING: not idempotent; re-running this cell applies the sigmoid a second time.
hillclimber.test_pred['cat'] = hillclimber.test_pred['cat'].apply(trans)
hillclimber.test_pred

Unnamed: 0,lgb,cat,xgb
0,0.996683,0.997072,0.991981
1,0.010240,0.022134,0.012267
2,0.581940,0.505568,0.606058
3,0.003692,0.005245,0.004791
4,0.041508,0.019356,0.033987
...,...,...,...
39093,0.030081,0.078537,0.040317
39094,0.005740,0.004374,0.008593
39095,0.006953,0.006722,0.009102
39096,0.184721,0.335335,0.213805


In [9]:
# hillclimber.test_pred = hillclimber.test_pred[['lgb', 'cat', 'xgb']]
# hillclimber.train_oof_pred = hillclimber.train_oof_pred[['lgb', 'cat', 'xgb']]

In [10]:
# Remove the GMM and BGM columns from the saved base-estimator prediction files,
# rewriting each file in place (first CSV column is the index).
# errors="ignore" makes the cell safe to re-run: once the columns are gone, a plain
# drop() would raise KeyError on the second execution.
for _pred_path in (
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/base_estimators_oofs.csv",
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/base_estimators_tests.csv",
):
    pd.read_csv(_pred_path, index_col=0).drop(columns=['GMM', 'BGM'], errors="ignore").to_csv(_pred_path)

In [21]:
# Externally produced predictions to add to the hill climber.
# Each entry maps a label to a pair of CSV paths: (out-of-fold file, test file).
# Only "ag-additional-data" and "martynov_oofs" are active; the other entries are
# commented out.
additional_predictions = {
                          "ag-additional-data" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/ag_addidata_train_oof_v2.csv",
                                                  "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/ag_addidata_test_v2.csv"),
                            # "ag-nofe" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/ag_train_oof.csv",
                            #              "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/ag_test.csv"),
                            # "laml-additional-data" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/train_oofs_laml_withogdata.csv",
                            #                           "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/test_laml_withogdata.csv"),
                            # "deeptable_martynov" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/deeptable_oof.csv",
                            #                         "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/deeptable_test.csv"),
                            # "deeptable_martynov_2" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/deeptable_oof (1).csv",
                            #                         "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/deeptable_test (1).csv"),
                            "martynov_oofs" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/martynov_oof.csv",
                                               "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/martynov_test.csv"),
                            # "tabnet" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/tabnet_oofs_with_martynovfeats.csv",
                            #             "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/tabnet_test_with_martynovfeats.csv"),
                            # "svc_undersampling" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/svc_undersampling_oof.csv",
                            #                        "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/svc_undersampling_test.csv"),
                            # "basic_estimators" : ("/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/base_estimators_oofs.csv",
                            #                       "/Users/yuwei-1/Documents/projects/Kaggle-tools/ktools/modelling/Data/loan_prediction/base_estimators_tests.csv")
                        }

# NOTE(review): presumably this appends the loaded columns to hillclimber.train_oof_pred
# and hillclimber.test_pred; the displayed frames in this notebook show column names
# ending in the dictionary key (e.g. "...ag-additional-data"). Confirm in the ktools source.
hillclimber.load_saved_prediction_files(additional_predictions)

In [23]:
hillclimber.train_oof_pred.to_csv("loan_oof_v5.csv")

In [24]:
hillclimber.test_pred.to_csv("loan_test_v5.csv")

In [14]:
hillclimber.train_oof_pred

Unnamed: 0,lgb,cat,xgb,LightGBMXT_BAG_L1ag-additional-data,LightGBM_BAG_L1ag-additional-data,RandomForestGini_BAG_L1ag-additional-data,RandomForestEntr_BAG_L1ag-additional-data,CatBoost_BAG_L1ag-additional-data,ExtraTreesGini_BAG_L1ag-additional-data,ExtraTreesEntr_BAG_L1ag-additional-data,...,QDAbasic_estimators,LDAbasic_estimators,XTbasic_estimators,decision_treebasic_estimators,adaboostbasic_estimators,gradientboostbasic_estimators,baggingbasic_estimators,XTSbasic_estimators,RFbasic_estimators,GPCbasic_estimators
0,0.014226,0.021258,0.020801,0.025074,0.010545,0.083333,0.138889,0.031832,0.092593,0.101852,...,0.120132,0.164303,0.2,0.0,0.388409,0.091961,0.10,0.162,0.090,0.138659
1,0.003147,0.002719,0.006064,0.005489,0.001916,0.029412,0.019608,0.002424,0.019608,0.009804,...,0.043716,0.053651,0.0,0.2,0.349088,0.053824,0.10,0.162,0.202,0.247230
2,0.003697,0.009254,0.006637,0.004046,0.016477,0.028302,0.037736,0.010584,0.009434,0.018868,...,0.003805,0.047983,0.0,0.2,0.326175,0.073487,0.28,0.104,0.202,0.143569
3,0.002413,0.003068,0.004092,0.001776,0.003833,0.044248,0.000000,0.003786,0.017699,0.017699,...,0.020451,0.048660,0.0,0.0,0.345135,0.022456,0.04,0.022,0.034,0.046544
4,0.003931,0.005164,0.006252,0.003480,0.004840,0.000000,0.000000,0.002612,0.009009,0.027027,...,0.006403,0.030718,0.0,0.0,0.398441,0.041849,0.00,0.024,0.004,0.073208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91221,0.012657,0.025221,0.036036,0.013164,0.004357,0.219298,0.140351,0.012337,0.078947,0.087719,...,,,,,,,,,,
91222,0.001326,0.001638,0.002615,0.004856,0.003078,0.000000,0.000000,0.000651,0.000000,0.000000,...,,,,,,,,,,
91223,0.981978,0.985224,0.941642,0.985294,0.984741,0.949153,0.991525,0.975828,0.974576,0.974576,...,,,,,,,,,,
91224,0.001442,0.004272,0.002232,0.002508,0.000892,0.018182,0.027273,0.003540,0.000000,0.009091,...,,,,,,,,,,


In [32]:
from sklearn.linear_model import Lasso, LogisticRegression

# Level-2 stacker: logistic regression over the base-model prediction columns.
# Only rows where every base model produced a prediction are kept (any NaN in
# a row drops that row); the target is looked up in `train_df` by the surviving
# index labels, so `train_df` must share its index with `train_oof_pred`.
full_oof_pred = hillclimber.train_oof_pred.dropna(axis=0, how="any")

# C=100 -> weak L2 regularisation; no intercept term is fitted.
# NOTE(review): the name `las` is kept because later cells call
# `las.predict_proba`; the model is a LogisticRegression, not a Lasso.
las = LogisticRegression(C=100, fit_intercept=False)
las.fit(
    X=full_oof_pred,
    y=train_df.loc[full_oof_pred.index, target_col_name],
)

In [33]:
# Inspect the per-model prediction table for the test rows (one column per
# base model).
# NOTE(review): in the recorded outputs the trailing columns shown here differ
# from the trailing columns shown for `train_oof_pred` — confirm both frames
# hold the same columns in the same order before using them with one model.
hillclimber.test_pred

Unnamed: 0,lgb,cat,xgb,LightGBMXT_BAG_L1ag-additional-data,LightGBM_BAG_L1ag-additional-data,RandomForestGini_BAG_L1ag-additional-data,RandomForestEntr_BAG_L1ag-additional-data,CatBoost_BAG_L1ag-additional-data,ExtraTreesGini_BAG_L1ag-additional-data,ExtraTreesEntr_BAG_L1ag-additional-data,...,RandomForest_r39_BAG_L1ag-nofe,CatBoost_r167_BAG_L1ag-nofe,NeuralNetFastAI_r95_BAG_L1ag-nofe,NeuralNetTorch_r41_BAG_L1ag-nofe,XGBoost_r98_BAG_L1ag-nofe,LightGBM_r15_BAG_L1ag-nofe,NeuralNetTorch_r158_BAG_L1ag-nofe,CatBoost_r86_BAG_L1ag-nofe,WeightedEnsemble_L2ag-nofe,0laml-additional-data
0,0.997030,0.997428,0.992341,0.999813,0.995796,0.983333,0.976667,0.999628,0.923333,0.946667,...,0.996333,0.996693,0.930515,0.957836,0.986479,0.994915,0.937757,0.937724,0.995175,0.998348
1,0.010851,0.023365,0.011272,0.029075,0.009837,0.040000,0.046667,0.016720,0.043333,0.073333,...,0.002778,0.019664,0.023771,0.024897,0.016375,0.013611,0.042633,0.063430,0.015376,0.010639
2,0.562143,0.498887,0.593771,0.549851,0.410048,0.623333,0.570000,0.483347,0.660000,0.633333,...,0.563341,0.707948,0.305220,0.447161,0.801875,0.569208,0.664766,0.416801,0.652779,0.595305
3,0.003650,0.006296,0.004365,0.004551,0.003820,0.023333,0.020000,0.005655,0.020000,0.033333,...,0.014722,0.012357,0.032955,0.039867,0.011212,0.009408,0.051369,0.094317,0.009955,0.003465
4,0.036904,0.023923,0.035411,0.043241,0.045630,0.116667,0.090000,0.023746,0.260000,0.270000,...,0.054694,0.052700,0.068436,0.113836,0.056274,0.060336,0.158422,0.248756,0.042491,0.056937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39093,0.028807,0.082250,0.042965,0.049189,0.054528,0.060000,0.070000,0.083882,0.056667,0.030000,...,0.005000,0.044348,0.051155,0.026284,0.015909,0.036416,0.091418,0.097896,0.038428,0.056589
39094,0.005697,0.003305,0.009070,0.004969,0.014048,0.000000,0.003333,0.002705,0.006667,0.006667,...,0.000000,0.004100,0.007533,0.004880,0.002843,0.005311,0.017091,0.045498,0.004431,0.004835
39095,0.006678,0.007320,0.008760,0.017796,0.009321,0.016667,0.006667,0.005421,0.040000,0.013333,...,0.021444,0.007687,0.019337,0.020693,0.009874,0.008082,0.025310,0.054293,0.007689,0.006360
39096,0.198394,0.352681,0.236086,0.250446,0.195642,0.393333,0.426667,0.191345,0.496667,0.500000,...,0.189325,0.170397,0.244831,0.230160,0.228370,0.250709,0.302021,0.334895,0.204178,0.164957


In [34]:
# Stacker probabilities for the test rows: column 1 of `predict_proba`, i.e.
# the second entry of `las.classes_`.
#
# The test frame is indexed by the column labels of `full_oof_pred` (the frame
# `las` was fitted on) so that every feature reaches the coefficient learned
# for that same base model. Passing `hillclimber.test_pred` as-is relies on
# both frames sharing an identical column order, which the displayed tables do
# not guarantee. A base model missing from the test frame now raises a
# KeyError instead of silently shifting features.
las.predict_proba(hillclimber.test_pred[full_oof_pred.columns])[:, 1]

array([0.86538166, 0.08495817, 0.85884227, ..., 0.18229488, 0.00194373,
       0.98636737])

In [35]:
# Build submission v6 from the stacked logistic-regression model.
sample_sub = pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval/sample_submission.csv")

# Select the test columns by name, in the order of the frame the stacker was
# fitted on (`full_oof_pred`), so each coefficient is applied to its own base
# model. A missing column raises a KeyError rather than misaligning features.
# Column 1 of `predict_proba` is the second entry of `las.classes_`.
# NOTE(review): assumes the rows of `hillclimber.test_pred` are in the same
# order as the ids in the sample submission — confirm.
sample_sub['loan_status'] = las.predict_proba(hillclimber.test_pred[full_oof_pred.columns])[:, 1]

# index=False: the file contains only the columns of `sample_sub`.
sample_sub.to_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/submissions/loan_approval/submission_v6.csv", index=False)
sample_sub

Unnamed: 0,id,loan_status
0,58645,0.865382
1,58646,0.084958
2,58647,0.858842
3,58648,0.046862
4,58649,0.000377
...,...,...
39093,97738,0.062906
39094,97739,0.153032
39095,97740,0.182295
39096,97741,0.001944


In [6]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from ktools.fitting.cross_validation_executor import CrossValidationExecutor
from ktools.modelling.models.lgbm_model import LGBMModel


# Baseline LightGBM run: 5-fold stratified CV (shuffled, seed 42), scored with
# ROC AUC. Features are every column of `train_df` except the target.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgb_model = LGBMModel(objective='binary', metric='binary_logloss')

# `run` is unpacked into (scores, out-of-fold predictions, models);
# `model_list` is iterated later to average test predictions.
score_tuple, oof_preds, model_list = CrossValidationExecutor(
    lgb_model, roc_auc_score, kf
).run(
    train_df.drop(columns=target_col_name),
    train_df[target_col_name],
)

####################################################################################################
OOF prediction score :  0.9621543992612214
Mean 5-cv results : 0.9621896554584352 +- 0.0020320088294656026
####################################################################################################


In [7]:
# Remove the target column from the test frame before predicting.
# Still mutates `test_df` in place (other references to the same frame see the
# change), but `errors='ignore'` makes the cell idempotent: re-running it after
# the column is already gone no longer raises a KeyError.
test_df.drop(columns='loan_status', inplace=True, errors='ignore')

In [8]:
import numpy as np


# Equal-weight average of the per-fold models' test predictions: each model
# contributes predict(test_df) / number_of_models to a zero-initialised
# accumulator with one entry per test row.
n_fold_models = len(model_list)
test_preds = np.zeros(test_df.shape[0])
for model in model_list:
    test_preds += model.predict(test_df) / n_fold_models

In [9]:
# Build submission v3: take the sample submission as the template and replace
# its `loan_status` column with the fold-averaged `test_preds`.
# NOTE(review): assumes `test_preds` is ordered like the sample submission
# rows — confirm.
sample_sub = (
    pd.read_csv("/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval/sample_submission.csv")
    .assign(loan_status=test_preds)
)

# index=False: the file contains only the columns of `sample_sub`.
sample_sub.to_csv(
    "/Users/yuwei-1/Documents/projects/Kaggle-tools/submissions/loan_approval/submission_v3.csv",
    index=False,
)
sample_sub

Unnamed: 0,id,loan_status
0,58645,0.990776
1,58646,0.010952
2,58647,0.399783
3,58648,0.004929
4,58649,0.079045
...,...,...
39093,97738,0.054753
39094,97739,0.020923
39095,97740,0.010368
39096,97741,0.232539


In [10]:
# Display the current `test_preds` array.
# NOTE(review): the recorded values below differ from the `loan_status` column
# shown for submission v3, so the stored outputs come from different runs.
test_preds

array([0.98138743, 0.01408556, 0.59922634, ..., 0.01065085, 0.30067872,
       0.9498485 ])

In [26]:
from ktools.modelling.models.xgb_model import XGBoostModel


# XGBoost baseline: binary logistic objective, log-loss eval metric, 5-fold
# shuffled CV (seed 42) scored with ROC AUC.
# NOTE(review): this rebinds `kf` to a plain (non-stratified) KFold; the
# hill-climber cell later passes this same `kf` as its `kfold` argument.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
xgb_model = XGBoostModel(objective="binary:logistic", eval_metric='logloss')

# The result is not assigned, so the (scores, OOF predictions, models) tuple is
# only shown as the cell output.
CrossValidationExecutor(xgb_model, roc_auc_score, kf).run(
    train_df.drop(columns=target_col_name),
    train_df[target_col_name],
)

####################################################################################################
OOF prediction score :  0.953419646838146
Mean 5-cv results : 0.9534432969418296 +- 0.0021088037568141494
####################################################################################################


((0.953419646838146, 0.9534432969418296),
 array([0.04427775, 0.00445247, 0.00233521, ..., 0.36242941, 0.06712675,
        0.00217719]),
 [<ktools.modelling.models.xgb_model.XGBoostModel at 0x34d180df0>,
  <ktools.modelling.models.xgb_model.XGBoostModel at 0x34c6e63b0>,
  <ktools.modelling.models.xgb_model.XGBoostModel at 0x34d1eeda0>,
  <ktools.modelling.models.xgb_model.XGBoostModel at 0x34d180bb0>,
  <ktools.modelling.models.xgb_model.XGBoostModel at 0x34d182e60>])

In [35]:
from ktools.modelling.create_hill_climber_from_params import CreateHillClimber
from ktools.modelling.models.hgb_model import HGBModel

# Base models for the hill-climbing ensemble; `model_names[i]` labels
# `model_list[i]`.
# NOTE(review): `model_list` is rebound here — an earlier cell used the same
# name for the list of fitted LightGBM fold models.
model_names = ['xgb', 'lgb', 'cat']
model_list = [xgb_model, lgb_model, cat_model]

# Every model maps to None as its feature selection.
# NOTE(review): presumably None means "use all features" — confirm against
# CreateHillClimber.
model_features = dict.fromkeys(model_names)

# ROC AUC is the metric to maximise; `kf` is whichever splitter was bound last.
hcobj = CreateHillClimber(
    train_df,
    test_df,
    model_list,
    model_names,
    model_features,
    roc_auc_score,
    target_col_name='loan_status',
    kfold=kf,
    objective="maximize",
)

In [36]:
# Fit the hill-climber builder and keep the returned object as `hillclimb`.
# The recorded output prints one OOF / mean-CV score block per base model
# (three blocks for the three configured models).
hillclimb = hcobj.fit()

####################################################################################################
OOF prediction score :  0.953419646838146
Mean 5-cv results : 0.9534432969418296 +- 0.0021088037568141494
####################################################################################################
####################################################################################################
OOF prediction score :  0.9560351768875015
Mean 5-cv results : 0.9560568097181154 +- 0.0033859420310994436
####################################################################################################
####################################################################################################
OOF prediction score :  0.9456478977624828
Mean 5-cv results : 0.9458090534730907 +- 0.0046198481744466995
####################################################################################################


In [37]:
# Run the naive hill-climbing ensemble search. In the recorded output it ranks
# the solo models (lgb best), adds xgb with weight 0.29, and returns an array.
# NOTE(review): the returned array is presumably the blended test predictions —
# confirm; it is only displayed here, not stored.
hillclimb.naive_hill_climb()

[1m[34m   /\  
  /__\  hillclimbers[0m[1m 
 /    \
/______\ 
[0m
[1m[33mModels to be ensembled | (3 total):[0m 

[1m[32mlgb: 0.95604 (best solo model)[0m
[1mxgb: 0.95342[0m
[1mcat: 0.94565[0m

[1m[33m[Data preparation completed successfully] - [Initiate hill climbing][0m 

[1m[32mIteration: 1 | Model added: xgb | Best weight: 0.29 | Best roc_auc_score: 0.95696[0m


array([0.98591825, 0.01407396, 0.59817675, ..., 0.01050387, 0.24724401,
       0.94434341])