In [53]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier 
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Application Training Data

## Analyst

In [2]:
train_data = pd.read_csv('Data/application_train.csv')
test_data = pd.read_csv('Data/application_test.csv')

In [3]:
corr_array = train_data.corr(method ='pearson')
corr_array['TARGET'].sort_values(ascending=False).head(53)

TARGET                          1.000000
DAYS_BIRTH                      0.078239
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
DAYS_LAST_PHONE_CHANGE          0.055218
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
DAYS_REGISTRATION               0.041975
OWN_CAR_AGE                     0.037612
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032248
DEF_60_CNT_SOCIAL_CIRCLE        0.031276
FLAG_WORK_PHONE                 0.028524
AMT_REQ_CREDIT_BUREAU_YEAR      0.019930
CNT_CHILDREN                    0.019187
CNT_FAM_MEMBERS                 0.009308
OBS_30_CNT_SOCIAL_CIRCLE        0.009131
OBS_60_CNT_SOCIAL_CIRCLE        0.009022
REG_REGION_NOT_WORK_REGION      0.006942
REG_REGION_NOT_LIVE_REGION      0.005576
FLAG_DOCUMENT_2                 0.005417
FLAG_DOCUMENT_21

In [4]:
corr_array['TARGET'].sort_values(ascending=True).head(53)

EXT_SOURCE_3                   -0.178919
EXT_SOURCE_2                   -0.160472
EXT_SOURCE_1                   -0.155317
DAYS_EMPLOYED                  -0.044932
FLOORSMAX_AVG                  -0.044003
FLOORSMAX_MEDI                 -0.043768
FLOORSMAX_MODE                 -0.043226
AMT_GOODS_PRICE                -0.039645
REGION_POPULATION_RELATIVE     -0.037227
ELEVATORS_AVG                  -0.034199
ELEVATORS_MEDI                 -0.033863
FLOORSMIN_AVG                  -0.033614
FLOORSMIN_MEDI                 -0.033394
LIVINGAREA_AVG                 -0.032997
LIVINGAREA_MEDI                -0.032739
FLOORSMIN_MODE                 -0.032698
TOTALAREA_MODE                 -0.032596
ELEVATORS_MODE                 -0.032131
LIVINGAREA_MODE                -0.030685
AMT_CREDIT                     -0.030369
APARTMENTS_AVG                 -0.029498
APARTMENTS_MEDI                -0.029184
FLAG_DOCUMENT_6                -0.028602
APARTMENTS_MODE                -0.027284
LIVINGAPARTMENTS

In [5]:
categorical_columns = [col for col in train_data.columns if train_data[col].dtype == 'object']
categorical_columns

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

# Functions for pre-processing

In [6]:
def categorical_to_onehot(data, nan_flag = True):
    columns_list = list(data.columns)
    categorical_feature = [col for col in data.columns if data[col].dtype == 'object']
    result = pd.get_dummies(data, columns= categorical_feature, dummy_na= nan_flag)
    new_columns = [c for c in result.columns if c not in columns_list]
    return result, new_columns

def groupby_rename(data, groupby_key, agg_list, rename_prefix = None):
    result = data.groupby(groupby_key).agg(agg_list)
    if rename_prefix == None:
        result.columns = pd.Index([feature[0] + "_" + feature[1].upper() for feature in result.columns.tolist()])
    else:
        result.columns = pd.Index([rename_prefix + "_" +feature[0] + "_" + feature[1].upper() for feature in result.columns.tolist()])
    return result

def test_corr (df1, df2):
    join_df = df1.merge(df2, how='left', on='SK_ID_CURR')
    corr_array = join_df.corr(method ='pearson')
    print(corr_array['TARGET'].sort_values(ascending=False).head(60))
    return

## Pre-Processing

In [7]:
data = train_data.append(test_data).reset_index()
data = data[data['CODE_GENDER']!= 'XNA'] # drop the row with CODE_GENDER = XNA

In [8]:
# feature intergation on EXT_SOURCES related features

data['EXT_SOURCES_SUM'] = data['EXT_SOURCE_1'] + data['EXT_SOURCE_2'] + data['EXT_SOURCE_3']
data['EXT_SOURCES_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
data['EXT_SOURCES_STD'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)

#fill nan value with mean
data['EXT_SOURCE_1'] = data['EXT_SOURCE_1'].fillna(data['EXT_SOURCE_1'].mean())
data['EXT_SOURCE_2'] = data['EXT_SOURCE_2'].fillna(data['EXT_SOURCE_2'].mean())
data['EXT_SOURCE_3'] = data['EXT_SOURCE_3'].fillna(data['EXT_SOURCE_3'].mean())

#fill nan value with mean
data['EXT_SOURCES_SUM'] = data['EXT_SOURCES_SUM'].fillna(data['EXT_SOURCES_SUM'].mean())
data['EXT_SOURCES_MEAN'] = data['EXT_SOURCES_MEAN'].fillna(data['EXT_SOURCES_MEAN'].mean())
data['EXT_SOURCES_STD'] = data['EXT_SOURCES_STD'].fillna(data['EXT_SOURCES_STD'].mean())


In [9]:
# feature intergation on other features
data['HAS_CHILDREN'] = (data['CNT_CHILDREN']!=0).astype(int)

data['AGE_GROUP'] = data['DAYS_BIRTH']//-365
data = data[data['AGE_GROUP']> 0]
data = data[data['AGE_GROUP']< 120]
bins= [0,30,45,65,120]
labels = [1,2,3,4]
data['AGE_GROUP'] = pd.cut(data['AGE_GROUP'], bins=bins, labels=labels, right=False)

In [10]:
# Transform categorical_feature into one hot vectors
categorical_feature = ['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                       'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'WALLSMATERIAL_MODE','NAME_INCOME_TYPE']

data = pd.get_dummies(data, columns=categorical_feature, dummy_na=True)


In [11]:
# drop the column with low pearson correlation with TARGET or more than 60% is null value
drop_column = ['FLAG_DOCUMENT_2','FLAG_DOCUMENT_4','FLAG_DOCUMENT_5','FLAG_DOCUMENT_7','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10','FLAG_DOCUMENT_11','FLAG_DOCUMENT_12','FLAG_DOCUMENT_13', 
               'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21', 'LIVE_REGION_NOT_WORK_REGION', 
               'CNT_CHILDREN', 'FLAG_CONT_MOBILE', 'FLAG_EMP_PHONE','FLAG_WORK_PHONE', 'FLAG_PHONE','FLAG_EMAIL','CNT_FAM_MEMBERS','HOUR_APPR_PROCESS_START','REG_REGION_NOT_LIVE_REGION',
               'REG_REGION_NOT_WORK_REGION','BASEMENTAREA_AVG', 'BASEMENTAREA_MODE', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BEGINEXPLUATATION_MODE','YEARS_BUILD_AVG',
              'YEARS_BUILD_MODE', 'YEARS_BUILD_MEDI', 'COMMONAREA_AVG', 'COMMONAREA_MODE', 'COMMONAREA_MEDI', 'ELEVATORS_MODE', 'ELEVATORS_MEDI', 'ENTRANCES_MODE', 'ENTRANCES_MEDI', 
              'FLOORSMIN_AVG', 'FLOORSMIN_MODE', 'FLOORSMIN_MEDI', 'LANDAREA_AVG', 'LANDAREA_MODE', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_MEDI',
              'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_AVG', 'NONLIVINGAREA_MODE', 'NONLIVINGAREA_MEDI',
              'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_WEEK',
              'AMT_REQ_CREDIT_BUREAU_QRT', 'EMERGENCYSTATE_MODE',]

data.drop(columns=drop_column,axis=1, inplace=True)

In [12]:
# handle reminding nan values
data['OWN_CAR_AGE'] = data['OWN_CAR_AGE'].fillna(value=0)
#data = data.fillna(value=data.mean())

# handle abnormal value:
data['DAYS_EMPLOYED'].replace(365243, 0, inplace= True)

label_curr = data[['SK_ID_CURR', 'TARGET']]

# Bureau Data and bureau_balance Data

In [13]:
bureau_data  = pd.read_csv('Data/bureau.csv')
bureau_balance_data = pd.read_csv('Data/bureau_balance.csv')

In [14]:
# Transform categorical_feature into one hot vector
bureau_data, bureau_new_col = categorical_to_onehot(bureau_data, True)
bureau_balance_data, bureau_balance_new_col = categorical_to_onehot(bureau_balance_data, True)


In [15]:
# Perform aggregations on bureau_balance 
aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
for col in bureau_balance_new_col:
    aggregations[col] = ['mean']
agg_result = groupby_rename(bureau_balance_data, 'SK_ID_BUREAU', aggregations)
# merge with bureau.csv
bureau_data = bureau_data.merge(agg_result, how='left', on='SK_ID_BUREAU')
bureau_data.drop(['SK_ID_BUREAU'], axis=1, inplace= True)

In [16]:
# aggregations on numerical features
numerical_agg = {
    'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'DAYS_CREDIT_UPDATE': ['mean'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'AMT_ANNUITY': ['max', 'mean'],
    'CNT_CREDIT_PROLONG': ['sum'],
    'MONTHS_BALANCE_MIN': ['min'],
    'MONTHS_BALANCE_MAX': ['max'],
    'MONTHS_BALANCE_SIZE': ['mean', 'sum']
}

categorical_agg = {}
for col in bureau_new_col: 
    categorical_agg[col] = ['mean']
for col in bureau_balance_new_col: 
    categorical_agg[col + "_MEAN"] = ['mean'] 

aggregations = {**numerical_agg, **categorical_agg}

In [17]:
bureau_agg_result = groupby_rename(bureau_data, 'SK_ID_CURR', aggregations, rename_prefix='BUREAU')

bureau_active = bureau_data[bureau_data['CREDIT_ACTIVE_Active'] == 1]
bureau_active_agg = groupby_rename(bureau_active, 'SK_ID_CURR', numerical_agg, rename_prefix='ACTIVE')
bureau_agg_result = bureau_agg_result.merge(bureau_active_agg, how='left', on='SK_ID_CURR')

bureau_closed = bureau_data[bureau_data['CREDIT_ACTIVE_Closed'] == 1]
bureau_closed_agg = groupby_rename(bureau_closed, 'SK_ID_CURR', numerical_agg, rename_prefix='CLOSED')

bureau_agg_result = bureau_agg_result.merge(bureau_closed_agg, how='left', on='SK_ID_CURR')

In [18]:
data = data.merge(bureau_agg_result, how='left', on='SK_ID_CURR')

# previous_applications.csv

In [19]:
previous_data = pd.read_csv('Data/previous_application.csv')

In [20]:
# Transform categorical_feature into one hot vector
previous_data, previous_new_col = categorical_to_onehot(previous_data, True)

# handle abnormal value:
abnormal_column = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION']
for col in abnormal_column: previous_data[col].replace(365243, np.nan, inplace= True)


In [21]:
# setup aggregations setting on numerical and categorical feature
numerical_agg = {
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_APPLICATION': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
}

categorical_agg = {}
for col in previous_new_col: 
    categorical_agg[col] = ['mean']
    
aggregations = {**numerical_agg, **categorical_agg}

In [22]:
# start aggregations
previous_agg_result = groupby_rename(previous_data, 'SK_ID_CURR', aggregations, rename_prefix='PREVIOUS')

previous_approved = previous_data[previous_data['NAME_CONTRACT_STATUS_Approved'] == 1]
previous_refused = previous_data[previous_data['NAME_CONTRACT_STATUS_Refused'] == 1]

previous_approved_agg = groupby_rename(previous_approved, 'SK_ID_CURR', numerical_agg, rename_prefix='APPROVED')
previous_refused_agg = groupby_rename(previous_refused, 'SK_ID_CURR', numerical_agg, rename_prefix='REFUSED')

previous_agg_result = previous_agg_result.merge(previous_approved_agg, how='left', on='SK_ID_CURR')
previous_agg_result = previous_agg_result.merge(previous_refused_agg, how='left', on='SK_ID_CURR')


In [23]:
data = data.merge(previous_agg_result, how='left', on='SK_ID_CURR')

# credit_card_balance.csv

In [24]:
credit_card_data = pd.read_csv('Data/credit_card_balance.csv')

In [25]:
# Transform categorical_feature into one hot vector
credit_card_data, credit_card_new_col = categorical_to_onehot(credit_card_data, True)

In [26]:
# setup aggregations setting on numerical and categorical feature
numerical_agg = {
    'MONTHS_BALANCE': ['min', 'max', 'mean'],
    'AMT_BALANCE': ['min', 'max', 'mean'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['min', 'max', 'mean'],
    'AMT_DRAWINGS_ATM_CURRENT': ['min', 'max', 'mean'],
    'AMT_DRAWINGS_CURRENT': ['min', 'max', 'mean'],
    'AMT_DRAWINGS_OTHER_CURRENT': ['min', 'max', 'mean'],
    'AMT_INST_MIN_REGULARITY': ['min', 'mean'],
    'AMT_PAYMENT_CURRENT': ['mean'],
    'AMT_PAYMENT_TOTAL_CURRENT':['mean', 'sum'],
    'AMT_RECEIVABLE_PRINCIPAL':['mean'],
    'AMT_RECIVABLE':['mean'],
    'AMT_TOTAL_RECEIVABLE':['mean'],
    'CNT_DRAWINGS_ATM_CURRENT':['min', 'max', 'mean'],
    'CNT_DRAWINGS_CURRENT':['min', 'max', 'mean'],
    'CNT_DRAWINGS_OTHER_CURRENT':['min', 'max', 'mean']
}

categorical_agg = {}
for col in credit_card_new_col: 
    categorical_agg[col] = ['mean']
    
aggregations = {**numerical_agg, **categorical_agg}

In [27]:
# start aggregations
credit_card_agg_result = groupby_rename(credit_card_data, 'SK_ID_CURR', aggregations, rename_prefix='CREDIT')

In [28]:
data = data.merge(credit_card_agg_result, how='left', on='SK_ID_CURR')

# installments_payments.csv

In [29]:
install_pay_data = pd.read_csv('Data/installments_payments.csv')

In [30]:
# Transform categorical_feature into one hot vector
install_pay_data, install_pay_new_col = categorical_to_onehot(install_pay_data, True)

In [31]:
# Feature engineering
install_pay_data['PAY_DAY_DIFF'] = install_pay_data['DAYS_ENTRY_PAYMENT'] - install_pay_data['DAYS_INSTALMENT']
install_pay_data['PAY_DAY_DIFF'] = install_pay_data['PAY_DAY_DIFF'].apply(lambda x: 1 if x >= 0 else 0) #transform into binary: early or on time pay; 1 late pay

install_pay_data['AMT_DIFF'] = install_pay_data['AMT_INSTALMENT'] - install_pay_data['AMT_PAYMENT']

In [32]:
# setup aggregations setting on numerical and categorical feature
numerical_agg = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'PAY_DAY_DIFF': ['mean', 'sum'],
    'AMT_DIFF': ['mean', 'sum'],
    'AMT_INSTALMENT': ['max', 'min'],
    'AMT_PAYMENT': ['mean'], 
}

categorical_agg = {}
for col in install_pay_new_col: 
    categorical_agg[col] = ['mean']

aggregations = {**numerical_agg, **categorical_agg}

In [33]:
install_pay_agg_result = groupby_rename(install_pay_data, 'SK_ID_CURR', aggregations, rename_prefix='INSTALL')

In [34]:
data = data.merge(install_pay_agg_result, how='left', on='SK_ID_CURR')

# POS_CASH_balance.csv

In [35]:
pos_data = pd.read_csv('Data/POS_CASH_balance.csv')

In [36]:
pos_data.dtypes

SK_ID_PREV                 int64
SK_ID_CURR                 int64
MONTHS_BALANCE             int64
CNT_INSTALMENT           float64
CNT_INSTALMENT_FUTURE    float64
NAME_CONTRACT_STATUS      object
SK_DPD                     int64
SK_DPD_DEF                 int64
dtype: object

In [37]:
# Transform categorical_feature into one hot vector
pos_data, pos_new_col = categorical_to_onehot(pos_data, True)

In [38]:
# setup aggregations setting on numerical and categorical feature
numerical_agg = {
    'MONTHS_BALANCE': ['min', 'max', 'mean'],
    'CNT_INSTALMENT': ['mean', 'sum'],
    'CNT_INSTALMENT_FUTURE': ['mean', 'sum'] 
}

categorical_agg = {}
for col in pos_new_col: 
    categorical_agg[col] = ['mean']

aggregations = {**numerical_agg, **categorical_agg}

In [39]:
pos_agg_result = groupby_rename(pos_data, 'SK_ID_CURR', aggregations, rename_prefix='POS')

In [40]:
data = data.merge(pos_agg_result, how='left', on='SK_ID_CURR')

## Model Construction

In [41]:
import re
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [42]:
# Separate pre-processed data into train and test set
final_train_data = data[data['TARGET'].notnull()]
final_test_data = data[data['TARGET'].isnull()]

In [43]:
# setup model setting
fold_num = 5
seed = 501
kf = StratifiedKFold(n_splits= fold_num, shuffle=True, random_state=seed)
sample = final_train_data.drop(['TARGET','SK_ID_CURR'], axis=1)
label = final_train_data['TARGET']
result = np.zeros(final_train_data.shape[0])

test_sample = final_test_data.drop(['TARGET','SK_ID_CURR'], axis=1)
test_result = np.zeros(final_test_data.shape[0])

In [44]:
for fold, (train_set, val_set) in enumerate(kf.split(sample,label)):
    train_x, train_y = sample.iloc[train_set], label.iloc[train_set]
    val_x, val_y = sample.iloc[val_set], label.iloc[val_set]
    
    clf = LGBMClassifier(
        n_estimators=1500,
        learning_rate=0.03,
        num_leaves=35,
        subsample=0.6,
        max_depth=10,
        reg_alpha=0.04,
        min_split_gain = 0.02,
        )

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
        eval_metric= 'auc', verbose= 100, early_stopping_rounds= 100)

    result[val_set] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    test_result += clf.predict_proba(test_sample, num_iteration=clf.best_iteration_)[:, 1] / fold_num
    
    print('Fold: {} AUC: {}'.format(fold + 1, roc_auc_score(val_y, result[val_set])))
    

print('Full AUC score: {}'.format( roc_auc_score(label, result)))

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.785509	training's binary_logloss: 0.239174	valid_1's auc: 0.77156	valid_1's binary_logloss: 0.243319
[200]	training's auc: 0.812919	training's binary_logloss: 0.228797	valid_1's auc: 0.78276	valid_1's binary_logloss: 0.238677
[300]	training's auc: 0.833193	training's binary_logloss: 0.221321	valid_1's auc: 0.786708	valid_1's binary_logloss: 0.237055
[400]	training's auc: 0.850005	training's binary_logloss: 0.21509	valid_1's auc: 0.788324	valid_1's binary_logloss: 0.236343
[500]	training's auc: 0.864153	training's binary_logloss: 0.20966	valid_1's auc: 0.788906	valid_1's binary_logloss: 0.236081
[600]	training's auc: 0.876493	training's binary_logloss: 0.204702	valid_1's auc: 0.789155	valid_1's binary_logloss: 0.235956
[700]	training's auc: 0.887241	training's binary_logloss: 0.200063	valid_1's auc: 0.789335	valid_1's binary_logloss: 0.235851
[800]	training's auc: 0.896427	training's binary_logloss: 0.

In [45]:
submit_result =  final_test_data[['SK_ID_CURR', 'TARGET']]
submit_result['TARGET'] = test_result
submit_result.to_csv('submit_result.csv', index= False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## other model

In [57]:
data['AGE_GROUP'].dtypes

CategoricalDtype(categories=[1, 2, 3, 4], ordered=True)

In [54]:
result = np.zeros(final_train_data.shape[0])
test_result = np.zeros(final_test_data.shape[0])

In [58]:
for fold, (train_set, val_set) in enumerate(kf.split(sample,label)):
    train_x, train_y = sample.iloc[train_set], label.iloc[train_set]
    val_x, val_y = sample.iloc[val_set], label.iloc[val_set]
   
    xgboostModel = XGBClassifier(n_estimators=250, learning_rate= 0.3, tree_method='gpu_hist', enable_categorical=True)
    xgboostModel.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
        eval_metric= 'auc', verbose= 100, early_stopping_rounds= 100)
    

    result[val_set] = xgboostModel.predict_proba(val_x)[:, 1]
    test_result += xgboostModel.predict_proba(test_sample)[:, 1] / fold_num

    print('Fold: {} AUC: {}'.format(fold + 1, roc_auc_score(val_y, result[val_set])))
    

print('Full AUC score: {}'.format( roc_auc_score(label, result)))



[0]	validation_0-auc:0.73018	validation_1-auc:0.72972
[100]	validation_0-auc:0.89845	validation_1-auc:0.77721
[143]	validation_0-auc:0.92500	validation_1-auc:0.77289
Fold: 1 AUC: 0.7796735255638543




[0]	validation_0-auc:0.73163	validation_1-auc:0.72139
[100]	validation_0-auc:0.89778	validation_1-auc:0.76619
[153]	validation_0-auc:0.92693	validation_1-auc:0.76273
Fold: 2 AUC: 0.7706395393005296




[0]	validation_0-auc:0.73171	validation_1-auc:0.72209
[100]	validation_0-auc:0.89839	validation_1-auc:0.77024
[156]	validation_0-auc:0.92993	validation_1-auc:0.76642
Fold: 3 AUC: 0.7723904924680774




[0]	validation_0-auc:0.73119	validation_1-auc:0.72739
[100]	validation_0-auc:0.90314	validation_1-auc:0.77331
[169]	validation_0-auc:0.93719	validation_1-auc:0.76662
Fold: 4 AUC: 0.7741980263428833




[0]	validation_0-auc:0.73477	validation_1-auc:0.71710
[100]	validation_0-auc:0.89775	validation_1-auc:0.76602
[159]	validation_0-auc:0.93205	validation_1-auc:0.76275
Fold: 5 AUC: 0.7678535691541655
Full AUC score: 0.7728949886026604


In [46]:
print('bureau_agg_result')
test_corr(label_curr,bureau_agg_result)
print('previous_agg_result')
test_corr(label_curr,previous_agg_result)
print('install_pay_agg_result')
test_corr(label_curr,install_pay_agg_result)
print('credit_card_agg_result')
test_corr(label_curr,credit_card_agg_result)
print('pos_agg_result')
test_corr(label_curr,pos_agg_result)

bureau_agg_result
TARGET                                                            1.000000
BUREAU_DAYS_CREDIT_MEAN                                           0.089731
BUREAU_CREDIT_ACTIVE_Active_MEAN                                  0.077356
BUREAU_DAYS_CREDIT_MIN                                            0.075248
BUREAU_MONTHS_BALANCE_MIN_MIN                                     0.073225
BUREAU_DAYS_CREDIT_UPDATE_MEAN                                    0.068929
ACTIVE_DAYS_CREDIT_MEAN                                           0.064041
CLOSED_MONTHS_BALANCE_MIN_MIN                                     0.061319
CLOSED_DAYS_CREDIT_MIN                                            0.061194
BUREAU_STATUS_1_MEAN_MEAN                                         0.061183
ACTIVE_DAYS_CREDIT_MAX                                            0.060414
CLOSED_DAYS_CREDIT_MEAN                                           0.058491
BUREAU_DAYS_CREDIT_MAX                                            0.049785
BUREAU_