# import

In [12]:
import pandas as pd
import os
import numpy as np
import gc

In [13]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

# Function

In [14]:
import pandas as pd

def set_table_dtypes(df):
    for col in df.columns:
        if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
            df[col] = df[col].astype('Int64')
        elif col in ["date_decision"]:
            df[col] = pd.to_datetime(df[col])
        elif col[-1] in ("P", "A"):
            df[col] = df[col].astype(float)
        elif col[-1] in ("D",):
            df[col] = pd.to_datetime(df[col])
    return df

def convert_strings(df):
    for col in df.columns:
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            df[col] = df[col].astype(pd.CategoricalDtype(categories=new_categories, ordered=True))
    return df


In [15]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        try:
          if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
          else:
            continue
        except: continue
    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# read feature file you do already

In [None]:
df_train = pd.read_parquet('/content/train_fea_final_v2.parquet')
df_train = set_table_dtypes(df_train)
df_train

Unnamed: 0,case_id,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,...,annuity_853A,credacc_credlmt_575A,credamount_590A,currdebt_94A,downpmt_134A,mainoccupationinc_437A,maxdpdtolerance_577P,outstandingdebt_522A,amount_4527230A,amount_4917619A
0,0,0,,,1917.6000,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1,0,,,3134.0000,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,2,0,,,4937.0000,0.0,0.0,0.0,0.0,0.0,...,1161.3,0.0,13000.0,,0.0,8200.0,,,,
3,3,0,,,4643.6000,0.0,0.0,1.0,0.0,2.0,...,6140.0,0.0,59999.8,,0.0,11000.0,,,,
4,4,1,,,3390.2000,0.0,0.0,1.0,0.0,0.0,...,2556.6,0.0,40000.0,,0.0,16000.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,0,0.0,176561.36,3675.4001,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1526655,2703451,0,0.0,301276.47,7088.6000,6191.6,0.0,0.0,5.0,0.0,...,,,,,,,,,,
1526656,2703452,0,0.0,14232.40,7788.8003,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1526657,2703453,0,0.0,197371.58,1195.4000,2827.2,0.0,0.0,36.0,0.0,...,,,,,,,,,,


In [16]:
df_test = pd.read_parquet('/content/test_fea_v2xxx.parquet')
df_test = set_table_dtypes(df_test)
df_test

Unnamed: 0,case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,...,annuity_853A,credacc_credlmt_575A,credamount_590A,currdebt_94A,downpmt_134A,mainoccupationinc_437A,maxdpdtolerance_577P,outstandingdebt_522A,amount_4527230A,amount_4917619A
0,14256,0.0,261076.58,7214.8003,8845.8000,0.0,0.0,0.0,0.0,0.0,...,3758.000008,0.000000,39667.014429,8251.832400,0.000000,66200.000000,4.333333,10746.497200,,
1,1348,0.0,43920.20,1113.2001,0.0000,0.0,0.0,0.0,0.0,0.0,...,2109.600025,0.000000,13250.000000,0.000000,0.000000,39600.000000,1.666667,0.000000,968.800000,
2,13475,0.0,348415.47,1746.0000,1059.0000,0.0,10.0,0.0,0.0,0.0,...,3455.210075,4500.000000,49287.670000,19475.871667,0.000000,25983.333333,1.666667,36462.112571,,
3,1120,0.0,68781.40,9718.2000,1726.6000,0.0,1.0,0.0,0.0,0.0,...,4475.533350,36300.466667,37383.166667,22651.650000,0.000000,33666.666667,8.250000,22651.650000,4476.600100,34885.633333
4,11878,0.0,52085.20,1788.0000,5960.8003,0.0,0.0,0.0,0.0,0.0,...,1955.525038,2283.750000,25225.975000,5875.285714,500.000000,45428.571429,0.166667,804.052571,,16080.371571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,3444,0.0,15261.80,1343.6000,5272.8003,0.0,0.0,0.0,0.0,0.0,...,3700.200000,0.000000,42300.033333,9782.333500,0.000000,40000.000000,0.666667,11738.800200,1649.420699,
19996,19016,0.0,25281.00,9427.2000,0.0000,0.0,0.0,0.0,0.0,0.0,...,3288.100000,0.000000,40550.000000,0.000000,0.000000,45750.000000,0.000000,0.000000,,12175.028714
19997,4171,0.0,40704.00,4530.0000,0.0000,0.0,0.0,0.0,0.0,0.0,...,3202.442864,6024.680067,46166.293467,0.000000,64.253335,39249.333333,3.000000,3124.750000,2115.500083,
19998,10063,0.0,0.00,1136.8000,0.0000,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,572.450000


# train test split

In [None]:
list_col = df_test.columns.tolist()
list_col

['case_id',
 'actualdpdtolerance_344P',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'applicationscnt_867L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avginstallast24m_3658937A',
 'avgmaxdpdlast9m_3716943P',
 'clientscnt12m_3712952L',
 'clientscnt3m_3712950L',
 'clientscnt6m_3712949L',
 'clientscnt_100L',
 'clientscnt_1022L',
 'clientscnt_1071L',
 'clientscnt_1130L',
 'clientscnt_157L',
 'clientscnt_257L',
 'clientscnt_304L',
 'clientscnt_360L',
 'clientscnt_493L',
 'clientscnt_533L',
 'clientscnt_887L',
 'clientscnt_946L',
 'cntincpaycont9m_3716944L',
 'cntpmts24_3658933L',
 'commnoinclast6m_3546845L',
 'credamount_770A',
 'credtype_322L',
 'currdebt_22A',
 'currdebtcredtyperange_828A',
 'daysoverduetolerancedd_3976961L',
 'deferredmnthsnum_166L',
 '

In [None]:
typee = []
for i in list_col:
    # typee.append(df_test[i].dtype.name)
  if df_test[i].dtype.name not in ['Int64', 'float64']:
    df_test.drop(columns=i, inplace=True)
    df_train.drop(columns=i, inplace=True)

In [None]:
df_train = convert_strings(df_train)
df_train = reduce_mem_usage(df_train)

df_test = convert_strings(df_test)

Memory usage after optimization is: 446.97 MB
Decreased by 69.6%


In [None]:
df_train.columns.tolist()

['case_id',
 'target',
 'actualdpdtolerance_344P',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'applicationscnt_867L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avginstallast24m_3658937A',
 'avgmaxdpdlast9m_3716943P',
 'clientscnt12m_3712952L',
 'clientscnt3m_3712950L',
 'clientscnt6m_3712949L',
 'clientscnt_100L',
 'clientscnt_1022L',
 'clientscnt_1071L',
 'clientscnt_1130L',
 'clientscnt_157L',
 'clientscnt_257L',
 'clientscnt_304L',
 'clientscnt_360L',
 'clientscnt_493L',
 'clientscnt_533L',
 'clientscnt_887L',
 'clientscnt_946L',
 'cntincpaycont9m_3716944L',
 'cntpmts24_3658933L',
 'commnoinclast6m_3546845L',
 'credamount_770A',
 'currdebt_22A',
 'currdebtcredtyperange_828A',
 'daysoverduetolerancedd_3976961L',
 'deferredmnthsnum_166L',
 'disburs

In [None]:
feature = [
 'actualdpdtolerance_344P',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'applicationscnt_867L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avginstallast24m_3658937A',
 'avgmaxdpdlast9m_3716943P',
 'clientscnt12m_3712952L',
 'clientscnt3m_3712950L',
 'clientscnt6m_3712949L',
 'clientscnt_100L',
 'clientscnt_1022L',
 'clientscnt_1071L',
 'clientscnt_1130L',
 'clientscnt_157L',
 'clientscnt_257L',
 'clientscnt_304L',
 'clientscnt_360L',
 'clientscnt_493L',
 'clientscnt_533L',
 'clientscnt_887L',
 'clientscnt_946L',
 'cntincpaycont9m_3716944L',
 'cntpmts24_3658933L',
 'commnoinclast6m_3546845L',
 'credamount_770A',
 'currdebt_22A',
 'currdebtcredtyperange_828A',
 'daysoverduetolerancedd_3976961L',
 'deferredmnthsnum_166L',
 'disbursedcredamount_1113A',
 'downpmt_116A',
 'eir_270L',
 'homephncnt_628L',
 'interestrate_311L',
 'lastapprcredamount_781A',
 'maininc_215A',
 'mastercontrelectronic_519L',
 'mastercontrexist_109L',
 'maxannuity_159A',
 'maxdbddpdlast1m_3658939P',
 'maxdbddpdtollast12m_3658940P',
 'maxdbddpdtollast6m_4187119P',
 'maxdebt4_972A',
 'maxdpdfrom6mto36m_3546853P',
 'maxdpdinstlnum_3546846P',
 'maxdpdlast12m_727P',
 'maxdpdlast24m_143P',
 'maxdpdlast3m_392P',
 'maxdpdlast6m_474P',
 'maxdpdlast9m_1059P',
 'maxdpdtolerance_374P',
 'maxinstallast24m_3658928A',
 'mindbddpdlast24m_3658935P',
 'mindbdtollast24m_4525191P',
 'mobilephncnt_593L',
 'monthsannuity_845L',
 'numactivecreds_622L',
 'numactivecredschannel_414L',
 'numactiverelcontr_750L',
 'numcontrs3months_479L',
 'numincomingpmts_3546848L',
 'numinstlallpaidearly3d_817L',
 'numinstls_657L',
 'numinstlsallpaid_934L',
 'numinstlswithdpd10_728L',
 'numinstlswithdpd5_4187116L',
 'numinstlswithoutdpd_562L',
 'numinstpaidearly3d_3546850L',
 'numinstpaidearly5d_1087L',
 'numinstpaidearly_338L',
 'numinstpaidlate1d_3546852L',
 'numinstregularpaid_973L',
 'numinsttopaygr_769L',
 'numinstunpaidmax_3546851L',
 'numnotactivated_1143L',
 'numpmtchanneldd_318L',
 'numrejects9m_859L',
 'pctinstlsallpaidearl3d_427L',
 'pctinstlsallpaidlat10d_839L',
 'pctinstlsallpaidlate1d_3546856L',
 'pctinstlsallpaidlate4d_3546849L',
 'pctinstlsallpaidlate6d_3546844L',
 'pmtnum_254L',
 'posfpd10lastmonth_333P',
 'posfpd30lastmonth_3976960P',
 'posfstqpd30lastmonth_3976962P',
 'price_1097A',
 'sellerplacecnt_915L',
 'sellerplacescnt_216L',
 'sumoutstandtotal_3546847A',
 'totaldebt_9A',
 'totalsettled_863A',
 'days120_123L',
 'days180_256L',
 'days30_165L',
 'days360_512L',
 'days90_310L',
 'firstquarter_103L',
 'fourthquarter_440L',
 'numberofqueries_373L',
 'pmtssum_45A',
 'secondquarter_766L',
 'thirdquarter_1082L',
 'actualdpd_943P',
 'annuity_853A',
 'credacc_credlmt_575A',
 'credamount_590A',
 'currdebt_94A',
 'downpmt_134A',
 'mainoccupationinc_437A',
 'maxdpdtolerance_577P',
 'outstandingdebt_522A',
 'amount_4527230A',
 'amount_4917619A']

target = ['target']

In [None]:
X = df_train[feature]
y = df_train[target]

In [None]:
# train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Train + validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [None]:
# check shape
print(f"Train: {X_train.shape}")
print(f"Train: {X_val.shape}")
print(f"Test: {X_test.shape}")

Train: (1236593, 124)
Train: (137400, 124)
Test: (152666, 124)


# for GridSearch part

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'num_leaves': [28, 31, 33],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.005],
    'n_estimators': [700, 800, 900]
}

In [None]:
lgb_model = lgb.LGBMClassifier()

In [None]:
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 25980, number of negative: 798415
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.140576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17492
[LightGBM] [Info] Number of data points in the train set: 824395, number of used features: 122
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031514 -> initscore=-3.425302
[LightGBM] [Info] Start training from score -3.425302
[LightGBM] [Info] Number of positive: 25981, number of negative: 798415
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.153080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17483
[LightGBM] [Info] Number of data points 

In [None]:
# using param in grid_search.best_params_

# For model

In [None]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 5,
    "num_leaves": 31,
    "learning_rate": 0.01,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 800,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(20)]
)

Training until validation scores don't improve for 20 rounds
[50]	valid_0's auc: 0.743996
[100]	valid_0's auc: 0.751009
[150]	valid_0's auc: 0.760091
[200]	valid_0's auc: 0.766355
[250]	valid_0's auc: 0.770732
[300]	valid_0's auc: 0.774402
[350]	valid_0's auc: 0.776939
[400]	valid_0's auc: 0.779354
[450]	valid_0's auc: 0.780965
[500]	valid_0's auc: 0.78255
[550]	valid_0's auc: 0.783939
[600]	valid_0's auc: 0.78543
[650]	valid_0's auc: 0.786486
[700]	valid_0's auc: 0.78737
[750]	valid_0's auc: 0.788177
[800]	valid_0's auc: 0.788898
Did not meet early stopping. Best iteration is:
[800]	valid_0's auc: 0.788898


In [None]:
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred

array([0.00574188, 0.03281887, 0.00590299, ..., 0.01558536, 0.02072047,
       0.05665999])

# Select Feature and Predict

In [None]:
df_test.columns.tolist()

['case_id',
 'actualdpdtolerance_344P',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'applicationscnt_867L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avginstallast24m_3658937A',
 'avgmaxdpdlast9m_3716943P',
 'clientscnt12m_3712952L',
 'clientscnt3m_3712950L',
 'clientscnt6m_3712949L',
 'clientscnt_100L',
 'clientscnt_1022L',
 'clientscnt_1071L',
 'clientscnt_1130L',
 'clientscnt_157L',
 'clientscnt_257L',
 'clientscnt_304L',
 'clientscnt_360L',
 'clientscnt_493L',
 'clientscnt_533L',
 'clientscnt_887L',
 'clientscnt_946L',
 'cntincpaycont9m_3716944L',
 'cntpmts24_3658933L',
 'commnoinclast6m_3546845L',
 'credamount_770A',
 'currdebt_22A',
 'currdebtcredtyperange_828A',
 'daysoverduetolerancedd_3976961L',
 'deferredmnthsnum_166L',
 'disbursedcredamoun

In [25]:
featuress =[
 'actualdpdtolerance_344P',
 'amtinstpaidbefduel24m_4187115A',
 'annuity_780A',
 'annuitynextmonth_57A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_629L',
 'applicationscnt_867L',
 'avgdbddpdlast24m_3658932P',
 'avgdbddpdlast3m_4187120P',
 'avgdbdtollast24m_4525197P',
 'avgdpdtolclosure24_3658938P',
 'avginstallast24m_3658937A',
 'avgmaxdpdlast9m_3716943P',
 'clientscnt12m_3712952L',
 'clientscnt3m_3712950L',
 'clientscnt6m_3712949L',
 'clientscnt_100L',
 'clientscnt_1022L',
 'clientscnt_1071L',
 'clientscnt_1130L',
 'clientscnt_157L',
 'clientscnt_257L',
 'clientscnt_304L',
 'clientscnt_360L',
 'clientscnt_493L',
 'clientscnt_533L',
 'clientscnt_887L',
 'clientscnt_946L',
 'cntincpaycont9m_3716944L',
 'cntpmts24_3658933L',
 'commnoinclast6m_3546845L',
 'credamount_770A',
 'currdebt_22A',
 'currdebtcredtyperange_828A',
 'daysoverduetolerancedd_3976961L',
 'deferredmnthsnum_166L',
 'disbursedcredamount_1113A',
 'downpmt_116A',
 'eir_270L',
 'homephncnt_628L',
 'interestrate_311L',
 'lastapprcredamount_781A',
 'maininc_215A',
 'mastercontrelectronic_519L',
 'mastercontrexist_109L',
 'maxannuity_159A',
 'maxdbddpdlast1m_3658939P',
 'maxdbddpdtollast12m_3658940P',
 'maxdbddpdtollast6m_4187119P',
 'maxdebt4_972A',
 'maxdpdfrom6mto36m_3546853P',
 'maxdpdinstlnum_3546846P',
 'maxdpdlast12m_727P',
 'maxdpdlast24m_143P',
 'maxdpdlast3m_392P',
 'maxdpdlast6m_474P',
 'maxdpdlast9m_1059P',
 'maxdpdtolerance_374P',
 'maxinstallast24m_3658928A',
 'mindbddpdlast24m_3658935P',
 'mindbdtollast24m_4525191P',
 'mobilephncnt_593L',
 'monthsannuity_845L',
 'numactivecreds_622L',
 'numactivecredschannel_414L',
 'numactiverelcontr_750L',
 'numcontrs3months_479L',
 'numincomingpmts_3546848L',
 'numinstlallpaidearly3d_817L',
 'numinstls_657L',
 'numinstlsallpaid_934L',
 'numinstlswithdpd10_728L',
 'numinstlswithdpd5_4187116L',
 'numinstlswithoutdpd_562L',
 'numinstpaidearly3d_3546850L',
 'numinstpaidearly5d_1087L',
 'numinstpaidearly_338L',
 'numinstpaidlate1d_3546852L',
 'numinstregularpaid_973L',
 'numinsttopaygr_769L',
 'numinstunpaidmax_3546851L',
 'numnotactivated_1143L',
 'numpmtchanneldd_318L',
 'numrejects9m_859L',
 'pctinstlsallpaidearl3d_427L',
 'pctinstlsallpaidlat10d_839L',
 'pctinstlsallpaidlate1d_3546856L',
 'pctinstlsallpaidlate4d_3546849L',
 'pctinstlsallpaidlate6d_3546844L',
 'pmtnum_254L',
 'posfpd10lastmonth_333P',
 'posfpd30lastmonth_3976960P',
 'posfstqpd30lastmonth_3976962P',
 'price_1097A',
 'sellerplacecnt_915L',
 'sellerplacescnt_216L',
 'sumoutstandtotal_3546847A',
 'totaldebt_9A',
 'totalsettled_863A',
 'days120_123L',
 'days180_256L',
 'days30_165L',
 'days360_512L',
 'days90_310L',
 'firstquarter_103L',
 'fourthquarter_440L',
 'numberofqueries_373L',
 'pmtssum_45A',
 'secondquarter_766L',
 'thirdquarter_1082L',
 'actualdpd_943P',
 'annuity_853A',
 'credacc_credlmt_575A',
 'credamount_590A',
 'currdebt_94A',
 'downpmt_134A',
 'mainoccupationinc_437A',
 'maxdpdtolerance_577P',
 'outstandingdebt_522A',
 'amount_4527230A',
 'amount_4917619A']

In [26]:
final_test = df_test[featuress]
final_test

Unnamed: 0,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,...,annuity_853A,credacc_credlmt_575A,credamount_590A,currdebt_94A,downpmt_134A,mainoccupationinc_437A,maxdpdtolerance_577P,outstandingdebt_522A,amount_4527230A,amount_4917619A
0,0.0,261076.58,7214.8003,8845.8000,0.0,0.0,0.0,0.0,0.0,0.0,...,3758.000008,0.000000,39667.014429,8251.832400,0.000000,66200.000000,4.333333,10746.497200,,
1,0.0,43920.20,1113.2001,0.0000,0.0,0.0,0.0,0.0,0.0,3.0,...,2109.600025,0.000000,13250.000000,0.000000,0.000000,39600.000000,1.666667,0.000000,968.800000,
2,0.0,348415.47,1746.0000,1059.0000,0.0,10.0,0.0,0.0,0.0,0.0,...,3455.210075,4500.000000,49287.670000,19475.871667,0.000000,25983.333333,1.666667,36462.112571,,
3,0.0,68781.40,9718.2000,1726.6000,0.0,1.0,0.0,0.0,0.0,6.0,...,4475.533350,36300.466667,37383.166667,22651.650000,0.000000,33666.666667,8.250000,22651.650000,4476.600100,34885.633333
4,0.0,52085.20,1788.0000,5960.8003,0.0,0.0,0.0,0.0,0.0,8.0,...,1955.525038,2283.750000,25225.975000,5875.285714,500.000000,45428.571429,0.166667,804.052571,,16080.371571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.0,15261.80,1343.6000,5272.8003,0.0,0.0,0.0,0.0,0.0,5.0,...,3700.200000,0.000000,42300.033333,9782.333500,0.000000,40000.000000,0.666667,11738.800200,1649.420699,
19996,0.0,25281.00,9427.2000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,...,3288.100000,0.000000,40550.000000,0.000000,0.000000,45750.000000,0.000000,0.000000,,12175.028714
19997,0.0,40704.00,4530.0000,0.0000,0.0,0.0,0.0,0.0,0.0,11.0,...,3202.442864,6024.680067,46166.293467,0.000000,64.253335,39249.333333,3.000000,3124.750000,2115.500083,
19998,0.0,0.00,1136.8000,0.0000,0.0,0.0,0.0,0.0,0.0,2.0,...,,,,,,,,,,572.450000


# final predict

In [None]:
# for lgbm only!!!
y_pred = gbm.predict(final_test, num_iteration=gbm.best_iteration)
y_pred

array([0.01942732, 0.02413704, 0.06092794, ..., 0.06232982, 0.02453776,
       0.0189972 ])

In [27]:
final_answer = pd.DataFrame()
final_answer['case_id'] = df_test['case_id']
final_answer

Unnamed: 0,case_id
0,14256
1,1348
2,13475
3,1120
4,11878
...,...
19995,3444
19996,19016
19997,4171
19998,10063


In [None]:
final_answer['target'] = y_pred

## merge for true index with sample submition

In [30]:
sample_submition = pd.read_csv('/content/final_submition_v2_gridsearch.csv')
sample_submition.drop(columns='target',inplace=True)
sample_submition

Unnamed: 0,case_id
0,16791
1,12423
2,19352
3,17099
4,7491
...,...
19995,1268
19996,17450
19997,15421
19998,10109


In [31]:
final_merged = pd.merge(sample_submition, final_answer, on='case_id', how='left')
final_merged

Unnamed: 0,case_id,target
0,16791,0.000523
1,12423,0.093841
2,19352,0.006999
3,17099,0.016256
4,7491,0.022787
...,...,...
19995,1268,0.052254
19996,17450,0.008908
19997,15421,0.002098
19998,10109,0.002431


In [32]:
final_merged.to_csv('submission.csv', index=False)