In [1]:
import os, sys, gc, warnings, random
import numpy as np 
import pandas as pd 
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import lightgbm as lgb  
from catboost import CatBoostClassifier ,Pool

from sklearn.metrics import auc, classification_report, roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [115]:
class CFG:
    """Central configuration read by the notebook cells."""

    # Seed value and number of cross-validation folds.
    SEED = 42
    n_splits = 5

    # Keyword settings for a CatBoost classifier.
    catboost_params = dict(
        learning_rate=0.05,
        iterations=10000,
        eval_metric='AUC',
        use_best_model=True,
        verbose=100,
        random_seed=0,
        devices='0:1',
        task_type="GPU",
    )

    # Parameters passed to LightGBM.
    # Options left disabled by the author:
    #   'n_estimators': 500, 'sub_sample': 0.7, 'colsample_bytree': 0.6,
    #   'seed': SEED, 'silent': False, 'early_stopping_rounds': 100
    lgb_params = dict(
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        n_estimators=1500,
    )

    # remove_features = ['ID', 'country', 'region', 'target']

    # Count-encoded columns that are declared categorical to LightGBM.
    categ_features = [
        'count_' + base
        for base in (
            'approveddate_y',
            'bank_account_type',
            'birthdate',
            'creationdate_y',
            'employment_status_clients',
            'bank_name_clients',
            'firstrepaid',
            'firstdue',
            'creation_y',
            'closeddate',
        )
    ]

    # Name of the label column.
    TARGET_COL = 'good_bad_flag'

In [116]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global generator with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [117]:
def _add_date_parts(frame, source, parts):
    """Add calendar-part columns derived from the datetime column ``source``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Modified in place; ``frame[source]`` must already have a datetime dtype.
    source : str
        Name of the datetime column to read.
    parts : dict
        Maps each output column name to the calendar part stored in it:
        'month', 'day', 'year', 'weekday' (Monday=0) or 'week' (ISO week number).
    """
    dates = frame[source].dt
    for out_col, part in parts.items():
        if part == 'week':
            # isocalendar() returns a nullable integer column; cast to float so
            # missing dates become NaN like the other derived columns.
            frame[out_col] = dates.isocalendar().week.astype('float64')
        else:
            # Accessor properties return values; 'weekday' in particular yields
            # integers rather than method objects.
            frame[out_col] = getattr(dates, part)


def get_processed_data(data_dir='.'):
    """Load, merge and feature-engineer the loan data.

    For both the train and the test files, previous loans are inner-joined
    with the performance table and then with the demographics table on
    ``customerid`` (merge suffix ``_x`` = previous-loan columns, ``_y`` =
    performance columns). Train and test rows are stacked so that date parts
    and value counts are computed over both sets together.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the six CSV files. Defaults to the current
        working directory.

    Returns
    -------
    train : pandas.DataFrame
        Engineered training rows, indexed 0..len(train)-1.
    test : pandas.DataFrame
        Engineered test rows (index continues after the training rows).
    features : list of str
        Columns of ``train`` to feed to the model.
    """
    def _load(name):
        return pd.read_csv(os.path.join(data_dir, name))

    def _merge(prefix):
        prev_loans = _load(prefix + 'prevloans.csv')
        perf = _load(prefix + 'perf.csv')
        demographics = _load(prefix + 'demographics.csv')
        return prev_loans.merge(perf, on='customerid').merge(demographics, on='customerid')

    train_rows = _merge('train')
    test_rows = _merge('test')
    # Remember where the training rows end so the split below is positional
    # and cannot be confused by a customerid that occurs in both sets.
    n_train = len(train_rows)
    data = pd.concat([train_rows, test_rows]).reset_index(drop=True)

    # NOTE: several output names are historical and do not match their content
    # (the 'year*' columns hold the day of month); they are kept unchanged so
    # the feature names stay the same.
    data['approveddate_y'] = pd.to_datetime(data['approveddate_y'], errors='coerce')
    _add_date_parts(data, 'approveddate_y', {
        'monthappy': 'month', 'yearappy': 'day', 'yeary': 'year',
        'yy': 'weekday', 'yy1': 'week', 'uu': 'week',
    })

    data['approveddate_x'] = pd.to_datetime(data['approveddate_x'])
    data['creation_x'] = pd.to_datetime(data['creationdate_x'])

    data['creation_y'] = pd.to_datetime(data['creationdate_y'], errors='coerce')
    _add_date_parts(data, 'creation_y', {
        'year0': 'day', 'wday0': 'weekday', 'wyear0': 'week', 'wweek0': 'week',
    })

    data['firstdue'] = pd.to_datetime(data['firstduedate'])
    _add_date_parts(data, 'firstdue', {
        'month01': 'month', 'year01': 'day', 'yea01': 'year',
        'wday01': 'weekday', 'wyear01': 'week', 'wweek01': 'week',
    })

    data['firstrepaid'] = pd.to_datetime(data['firstrepaiddate'])
    _add_date_parts(data, 'firstrepaid', {
        'month011': 'month', 'year011': 'day', 'yea011': 'year',
        'wday011': 'weekday', 'wyear011': 'week', 'wweek011': 'week',
    })

    data['closeddate'] = pd.to_datetime(data['closeddate'])
    _add_date_parts(data, 'closeddate', {
        'monthclose': 'month', 'yearclose': 'day', 'yeaclose': 'year',
        'wdayclose': 'weekday', 'wyearclose': 'week', 'wweekclose': 'week',
    })

    data['birthdate'] = pd.to_datetime(data['birthdate'])
    _add_date_parts(data, 'birthdate', {
        'monthclose1': 'month', 'yearclose1': 'day', 'yeaclose1': 'year',
    })

    # Missing employment status becomes its own category.
    data['employment_status_clients'] = data['employment_status_clients'].fillna('grow')

    # Amount owed on top of the principal for the current loan.
    data['elapse'] = data['totaldue_y'] - data['loanamount_y']

    # Frequency encoding: how often each value occurs across train + test.
    count_sources = [
        'approveddate_y', 'bank_account_type', 'closeddate', 'creationdate_y',
        'birthdate', 'employment_status_clients', 'bank_name_clients',
        'firstrepaid', 'firstdue', 'creation_y',
    ]
    for name in count_sources:
        data['count_' + name] = data[name].map(data[name].value_counts())

    # Positional split: the first n_train rows came from the training files.
    train = data.iloc[:n_train].copy()
    test = data.iloc[n_train:].copy()

    # Identifiers, raw dates/strings, the target and the weekday columns are
    # not used as model inputs (same exclusions as before, so the feature list
    # is unchanged).
    excluded = {
        'systemloanid_y', 'firstrepaid', 'firstdue', 'creation_y', 'creation_x',
        'wday011', 'approveddate_x', 'creationdate_x', 'bank_name_clients',
        'good_bad_flag', 'wday0', 'wday01', 'yy', 'yy1', 'wdayclose', 'customerid',
        'firstduedate', 'firstrepaiddate', 'systemloanid_x', 'loanamount_x',
        'totaldue_x', 'termdays_x', 'loannumber_x', 'bank_branch_clients',
        'referredby_y', 'referredby_x', 'level_of_education_clients', 'closeddate',
        'approveddate_y', 'creationdate_y', 'bank_account_type', 'birthdate',
        'wday', 'employment_status_clients',
    }
    features = [column for column in train.columns if column not in excluded]
    return train, test, features

In [118]:
# Build the engineered train/test frames and the list of model feature columns.
train , test , features = get_processed_data()

In [119]:
# Model inputs: the selected feature columns, and the target encoded as
# 1 for 'Good' and 0 for 'Bad'.
target_codes = {'Good': 1, 'Bad': 0}
X = train[features]
y = train['good_bad_flag'].map(target_codes)

In [120]:
# Stratified K-fold cross-validation with LightGBM.
# Produces out-of-fold predictions for the training rows (`oof_lgb`) and one
# vector of test predictions per fold (`lgb_preds`).
#
# NOTE(review): get_processed_data yields one row per previous loan, so a
# customer with several previous loans presumably has rows in both the
# training and the validation part of a fold. If so, the OOF AUC is
# optimistic; consider grouping folds by customerid. TODO confirm.
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True, random_state=524)

X , y   = train[features] , train[CFG.TARGET_COL].map({'Good':1,'Bad':0})

oof_lgb = np.zeros((train.shape[0],))
# Placeholder target on the test frame; it is not read in this cell.
test[CFG.TARGET_COL]= 0
lgb_preds = []

for fold_, (trn_idx, val_idx) in enumerate(skf.split(X,y)):
    print(50*'-')
    print('Fold:',fold_+1)

    # skf.split yields positional indices, so select by position for both the
    # features and the target (label-based y[idx] only works when the index
    # happens to be 0..n-1).
    tr_x, tr_y = X.iloc[trn_idx,:], y.iloc[trn_idx]
    vl_x, vl_y = X.iloc[val_idx,:], y.iloc[val_idx]

    train_data = lgb.Dataset(tr_x, label=tr_y,categorical_feature=CFG.categ_features)
    valid_data = lgb.Dataset(vl_x, label=vl_y,categorical_feature=CFG.categ_features)

    # NOTE(review): `verbose_eval` was removed from lgb.train in LightGBM 4.0;
    # on that version use callbacks=[lgb.log_evaluation(100)] instead.
    estimator = lgb.train(CFG.lgb_params,train_data,valid_sets = [train_data,valid_data],verbose_eval = 100)

    # Out-of-fold predictions for this fold's validation rows.
    y_pred_val = estimator.predict(vl_x,num_iteration=estimator.best_iteration)
    oof_lgb[val_idx] = y_pred_val

    # Test predictions from this fold's model; the folds are averaged later.
    y_pred_test = estimator.predict(test[features],num_iteration=estimator.best_iteration)
    lgb_preds.append(y_pred_test)
    print(50*'-')

print('OOF score :',roc_auc_score(y, oof_lgb))

--------------------------------------------------
Fold: 1
[LightGBM] [Info] Number of positive: 8932, number of negative: 2022
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1489
[LightGBM] [Info] Number of data points in the train set: 10954, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.815410 -> initscore=1.485553
[LightGBM] [Info] Start training from score 1.485553
[100]	training's auc: 0.998909	valid_1's auc: 0.97439
[200]	training's auc: 0.999995	valid_1's auc: 0.981202
[300]	training's auc: 1	valid_1's auc: 0.983389
[400]	training's auc: 1	valid_1's auc: 0.983848
[500]	training's auc: 1	valid_1's auc: 0.985346
[600]	training's auc: 1	valid_1's auc: 0.985344
[700]	training's auc: 1	valid_1's auc: 0.985695
[800]	training's auc: 1	valid_1's auc: 0.986887
[900]	training's auc: 1	valid_1's auc: 0.987001
[1000]	training's auc: 1	valid_1's auc: 0.987559
[1100]	training's auc: 1	valid_1's auc: 0.987611
[1200]	train

In [121]:
# Load the sample submission and write one thresholded prediction per test row.
cv = pd.read_csv('SampleSubmission.csv')
SUB_FILE_NAME = 'ee.csv'

# Average the per-fold test predictions; a row is labelled 1 when the mean
# exceeds 0.5, otherwise 0.
mean_test_pred = np.mean(lgb_preds, axis=0)
sub_df = test[['customerid']].copy()
sub_df['ggood_Bad_flag'] = (mean_test_pred > 0.5) * 1

# to_csv returns None when given a path, so `v` is None after this line.
v = sub_df.to_csv(SUB_FILE_NAME, index=False)

In [122]:
# Build the final submission by matching predictions to the sample
# submission on the customer id (not on row position).
# Relies on `cv`, the SampleSubmission.csv frame loaded in the previous cell.
v = pd.read_csv('ee.csv')
v = v.rename(columns={'customerid': 'hope'})

# ee.csv can hold several rows per customer; reduce them to one label per
# customer by majority vote (1 only when more than half of the rows are 1,
# mirroring the > 0.5 threshold used when the rows were written).
votes = v.groupby('hope', as_index=False)['ggood_Bad_flag'].mean()
votes['ggood_Bad_flag'] = (votes['ggood_Bad_flag'] > 0.5).astype(int)

# Id column of the sample submission: the first column that is not the
# placeholder label (presumably 'customerid' - TODO confirm).
id_col = next(c for c in cv.columns if c != 'Good_Bad_flag')

# Left-join keeps the sample submission's ids and order.
t = cv.merge(votes, how='left', left_on=id_col, right_on='hope')
t = t.drop(['hope', 'Good_Bad_flag'], axis=1)

# Customers without any prediction are dropped, as before.
t = t.dropna()
t['ggood_Bad_flag'] = t['ggood_Bad_flag'].astype(int)

tv = t.rename(columns={'ggood_Bad_flag': 'Good_Bad_flag'})

tv.to_csv('YusufJimoh.csv', index=False)