### Gradient Boosting with CatBoost - Day 3

In [0]:
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score    
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
import itertools
from numpy import median


In [0]:
import numpy as np 
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

/kaggle/input/data-storm-10/credit_card_default_train.csv
/kaggle/input/data-storm-10/DATA STORM 1.0 - First Round Competition Guidlines.pdf
/kaggle/input/data-storm-10/credit_card_default_test.csv
/kaggle/input/data-storm-10/Credit_card_default - Business Problem - Assessment Criteria - Data Dictionary.xlsx


# NOTE(review): these two reads are repeated verbatim in the
# "Loading data and initialization" cell below — confirm whether both are needed.
credit_card_default_train=pd.read_csv('/kaggle/input/data-storm-10/credit_card_default_train.csv')
credit_card_default_test=pd.read_csv('/kaggle/input/data-storm-10/credit_card_default_test.csv')

#### Loading data and initialization

In [0]:
# Read the train and test CSV files from the Kaggle input directory.
credit_card_default_train, credit_card_default_test = (
    pd.read_csv('/kaggle/input/data-storm-10/credit_card_default_train.csv'),
    pd.read_csv('/kaggle/input/data-storm-10/credit_card_default_test.csv'),
)

In [0]:
# Display names for the two classes in the classification reports.
target_names = ['0', '1']

def print_report(X_train, X_val, y_train, y_val, model):
    '''Print classification reports and weighted f1-scores for both splits.

    For the training split and then the validation split, predicts with
    `model.predict` and prints a heading followed by the classification report.
    Afterwards prints the weighted f1-score of each split.
    '''
    predicted = {}
    for heading, split, inputs, labels in (
        ('Train Report', 'train', X_train, y_train),
        ('Validation Report', 'val', X_val, y_val),
    ):
        print(heading)
        predicted[split] = model.predict(inputs)
        print(classification_report(labels, predicted[split], target_names=target_names))

    train_f1 = f1_score(y_train, predicted['train'], average='weighted')
    print('Training f1-score(weighted): %f' % train_f1)
    val_f1 = f1_score(y_val, predicted['val'], average='weighted')
    print('Validation f1-score(weighted): %f' % val_f1)

In [0]:
_id = 'Client_ID'
target = 'NEXT_MONTH_DEFAULT'

# Model inputs: every training column except the label and the client id.
X = credit_card_default_train.drop([target, _id], axis=1)
y = credit_card_default_train[target]

# Save Client_IDs of the tests in order to append at the end.
# pop() returns the column and removes it from the test frame in place.
test_client_ids = credit_card_default_test.pop(_id)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# Independent copies of the raw splits; these names are rebound to the
# preprocessed frames further down.
X_train_copy, X_val_copy = X_train.copy(), X_val.copy()

In [0]:
# Raw columns dropped at the end of preprocessing (encoded versions replace them).
unused_cols = [
    'Balance_Limit_V1',
    'EDUCATION_STATUS',
    'MARITAL_STATUS',
    'AGE',
]

# Monthly PAY_* columns, July through December.
pay_cols = [
    'PAY_JULY', 'PAY_AUG', 'PAY_SEP',
    'PAY_OCT', 'PAY_NOV', 'PAY_DEC',
]

# Monthly paid-amount columns, July through December.
paid_cols = [
    'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP',
    'PAID_AMT_OCT', 'PAID_AMT_NOV', 'PAID_AMT_DEC',
]

# Monthly due-amount columns, July through December.
due_cols = [
    'DUE_AMT_JULY', 'DUE_AMT_AUG', 'DUE_AMT_SEP',
    'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC',
]

# Encoded columns that are pairwise combined into interaction features.
categorical_cols = [
    'EDU_ENC',
    'MARITAL_ENC',
    'AGE_ENC',
    'Gender',
]

#### Setting up encoders

In [0]:
def balance_limit_encoder(value):
    '''Convert a balance-limit string with a 'K' or 'M' suffix into millions.

    The last character selects the multiplier ('K' -> 1e3, 'M' -> 1e6) applied
    to the numeric prefix, and the product is divided by 1e6. Any other suffix
    yields 0.0 without parsing the prefix.
    '''
    multiplier = {'K': 1e3, 'M': 1e6}.get(value[-1])
    if multiplier is None:
        return 0 / 1e6
    return float(value[:-1]) * multiplier / 1e6
    
# Module-level label encoders; they are fitted by fit_encoders() on the
# training split before preprocess() uses them.
marital_status_encoder = LabelEncoder()
edu_status_encoder = LabelEncoder()
# NOTE(review): age_encoder is fitted but no transform call on it appears in
# the visible notebook (AGE is encoded with func_age_encode) — confirm it is needed.
age_encoder = LabelEncoder()

def balance_limit_to_num(dataset):
    '''Return a new frame with a numeric `Balance_Limit_Enc` column added.

    The column is obtained by mapping `balance_limit_encoder` over
    `Balance_Limit_V1`; the input frame itself is not modified.
    '''
    encoded_limit = dataset['Balance_Limit_V1'].map(balance_limit_encoder)
    return dataset.assign(Balance_Limit_Enc=encoded_limit)

def func_age_encode(data):
    '''Map an AGE bracket label to a representative numeric age.

    Parameters
    ----------
    data : str
        One of 'Less than 30', '31-45', '46-65' or 'More than 65'.

    Returns
    -------
    int or float
        The value assigned to the bracket, or 0 for any unrecognised label.
    '''
    bracket_values = {
        'Less than 30': 15,
        '31-45': (31 + 45) / 2,
        # Midpoint of the bracket. Computing (65 - 46) / 2 == 9.5 here would
        # place this group below every other bracket and break the ordering.
        '46-65': (46 + 65) / 2,
        'More than 65': 65,
    }
    return bracket_values.get(data, 0)

# Fit Encoders
def fit_encoders(fit_data):
    '''Fit the module-level label encoders on the matching columns of `fit_data`.'''
    encoder_columns = (
        (marital_status_encoder, 'MARITAL_STATUS'),
        (edu_status_encoder, 'EDUCATION_STATUS'),
        (age_encoder, 'AGE'),
    )
    for encoder, column in encoder_columns:
        encoder.fit(fit_data[column])


In [0]:
# Shared scaler: preprocess() fits it when called with train=True and uses it
# to transform every frame it returns.
scaler = StandardScaler()

#### Feature Generation Functions

In [0]:
def generate_pay_value_frequency_features(dataset):
    '''Count, per row, how often each PAY_* status value occurs.

    Returns a new DataFrame indexed like `dataset` with one integer column per
    status value in range(-2, 10), named 'PAY_-2' ... 'PAY_9'. Each cell holds
    the number of `pay_cols` columns in that row whose value, converted to
    int, equals that status.

    The counting is vectorised instead of looping over rows with `iterrows`.
    Values outside range(-2, 10) are simply not counted.
    '''
    # Cast through pandas so that missing values raise instead of being
    # silently converted to an arbitrary integer.
    pay_values = dataset[pay_cols].astype(int).to_numpy()
    return pd.DataFrame(
        {
            'PAY_' + str(status): (pay_values == status).sum(axis=1)
            for status in range(-2, 10)
        },
        index=dataset.index,
    )

def generate_pay_due_diff_features(dataset):
    '''Add, in place, one column per month holding paid amount minus due amount.

    For every name in `paid_cols` (e.g. 'PAID_AMT_JULY') the new column is named
    after the month alone ('JULY') and equals the paid column minus the
    matching 'DUE_AMT_<MONTH>' column.
    '''
    for paid_col in paid_cols:
        month_suffix = paid_col[8:]  # e.g. '_JULY'
        due_col = 'DUE_AMT' + month_suffix
        dataset[month_suffix[1:]] = dataset[paid_col] - dataset[due_col]

def generate_last_to_other_months_avg_ratio_feature(dataset):
    '''Add, in place, the column 'ltomar': DUE_AMT_DEC divided by the row-wise
    mean of the other `due_cols` columns.

    Non-finite results are replaced by sentinels: +inf -> 1e10, NaN -> -2e10,
    -inf -> -1e10.
    '''
    other_months = [col for col in due_cols if col != 'DUE_AMT_DEC']
    ratio = dataset['DUE_AMT_DEC'] / dataset[other_months].mean(axis=1)
    # Assign the replaced Series back explicitly: calling
    # dataset['ltomar'].replace(..., inplace=True) acts on a temporary object
    # and does not update the frame when pandas Copy-on-Write is active.
    dataset['ltomar'] = (
        ratio
        .replace([np.inf], 1e10)
        .replace([np.nan], -2e10)
        .replace([-np.inf], -1e10)
    )

def generate_last_to_other_months_ratio_featute(dataset):
    '''Add, in place, columns 'due-4' ... 'due-0': DUE_AMT_DEC divided by each
    of the other `due_cols` columns.

    The loop walks `due_cols` backwards starting at the second-to-last entry,
    so it assumes DUE_AMT_DEC is the last entry of `due_cols`.

    Non-finite results are replaced by sentinels: +inf -> 1e10, NaN -> -2e10,
    -inf -> -1e10 (the same values used for the 'ltomar' feature).
    '''
    for i in range(len(due_cols) - 2, -1, -1):
        ratio = dataset['DUE_AMT_DEC'] / dataset[due_cols[i]]
        # Assign the result back instead of using a chained inplace replace,
        # which does not update the frame when pandas Copy-on-Write is active.
        dataset['due-' + str(i)] = (
            ratio
            .replace([np.inf], 1e10)
            .replace([np.nan], -2e10)
            .replace([-np.inf], -1e10)
        )
        
# Label encoders for the interaction columns, one per pair of categorical
# columns, stored in the order the pairs are generated.
interaction_encoders = []

# Code modified from kaggle course 'Feature Generation'
def generate_interaction_features(dataset, train):
    '''Return `dataset` joined with label-encoded pairwise interaction columns.

    For every 2-combination of `categorical_cols` the string forms of the two
    columns are concatenated with "_" and label-encoded into a column named
    '<col1>_<col2>'. With `train` truthy a fresh encoder is fitted per pair and
    stored in the module-level `interaction_encoders`; otherwise the stored
    encoder at the same position is reused.
    '''
    global interaction_encoders

    if train:
        interaction_encoders = []

    pair_features = pd.DataFrame(index=dataset.index)
    pairs = itertools.combinations(categorical_cols, 2)
    for position, (left, right) in enumerate(pairs):
        combined = dataset[left].map(str) + "_" + dataset[right].map(str)

        if train:
            encoder = LabelEncoder()
            interaction_encoders.append(encoder)
            encoded = encoder.fit_transform(combined)
        else:
            encoder = interaction_encoders[position]
            encoded = encoder.transform(combined)

        pair_features['_'.join([left, right])] = encoded

    return dataset.join(pair_features)

#### Preprocessing Function

In [0]:
# Transform Categorical Data
def preprocess(data, train = False):
    '''Build the scaled feature frame for `data` without modifying it.

    Steps, in order: encode the balance limit, gender, age, education and
    marital status; add the paid-minus-due columns; prepend the PAY_* value
    frequency columns; add the interaction, 'ltomar' and 'due-*' features; drop
    `unused_cols`; scale every column with the shared `scaler`.

    With `train` truthy the interaction encoders and the scaler are (re)fitted
    on this frame; otherwise the previously fitted ones are reused.
    '''
    frame = balance_limit_to_num(data.copy())

    frame['Gender'] = frame['Gender'].map(lambda gender: int(gender == 'M'))
    frame['AGE_ENC'] = frame['AGE'].map(func_age_encode)
    frame['EDU_ENC'] = pd.Series(
        edu_status_encoder.transform(frame['EDUCATION_STATUS']),
        index=frame.index,
    )
    frame['MARITAL_ENC'] = pd.Series(
        marital_status_encoder.transform(frame['MARITAL_STATUS']),
        index=frame.index,
    )

    generate_pay_due_diff_features(frame)
    pay_frequencies = generate_pay_value_frequency_features(frame)
    # Frequency columns go first so the column order stays as before.
    frame = pd.concat([pay_frequencies, frame], axis=1)

    frame = generate_interaction_features(frame, train)
    generate_last_to_other_months_avg_ratio_feature(frame)
    generate_last_to_other_months_ratio_featute(frame)
    frame = frame.drop(unused_cols, axis=1)

    if train:
        scaler.fit(frame)
    scaled = scaler.transform(frame)
    return pd.DataFrame(scaled, columns=frame.columns, index=frame.index)

In [0]:
# Fit the label encoders on the training split only, then preprocess both
# splits; the training call also fits the interaction encoders and the scaler.
fit_encoders(X_train)
X_train_copy = preprocess(X_train, train=True)
X_val_copy = preprocess(X_val, train=False)
X_train_copy.head()

#### Correlation Matrix

In [0]:
# Pairwise correlations between the preprocessed training features and the label.
corr = X_train_copy.assign(target=y_train).corr()

# Hide the upper triangle (including the diagonal) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which recent NumPy releases
# no longer provide.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(22, 18))
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax)

The features below were selected using models, chi-squared tests, etc. (the selection code is in another notebook).

In [0]:
# Columns of the preprocessed frame that are fed to the models (order preserved).
features = [
    'PAY_JULY',
    'PAY_2',
    'PAID_AMT_JULY',
    'PAID_AMT_AUG',
    'Balance_Limit_Enc',
    'ltomar',
    'PAY_7',
    'PAY_3',
    'PAY_1',
    'PAY_0',
    'PAID_AMT_SEP',
    'PAID_AMT_OCT',
    'OCT',
]

#### Evaluation method

In [0]:

def f1_eval(y_pred, dtrain):
    '''Return ('f1_err', 1 - f1) for rounded predictions against `dtrain` labels.

    `dtrain` must provide `get_label()`; predictions are rounded with
    `np.round` before scoring.
    '''
    labels = dtrain.get_label()
    rounded_predictions = np.round(y_pred)
    return 'f1_err', 1 - f1_score(labels, rounded_predictions)

In [0]:
def print_score(score):
    '''Print the mean and median of an array of scores on a single line.'''
    print('Mean: %f\t Median: %f' % (score.mean(), median(score)))

In [0]:
# Keyword arguments handed to cross_val_score via `fit_params` in the cells
# below: an early-stopping setting and the preprocessed validation split.
fit_params = dict(
    early_stopping_rounds=5,
    eval_set=[(X_val_copy[features], y_val)],
)

Using cross-validation to identify the best classifier for the dataset with the new features

In [0]:
from xgboost import XGBClassifier

# XGBoost with default constructor arguments, cross-validated with 5 folds
# on the selected features.
xgb = XGBClassifier()

scores = cross_val_score(
    xgb,
    X_train_copy[features],
    y_train,
    cv=5,
    fit_params=fit_params,
)
print_score(scores)

In [0]:
from catboost import CatBoostClassifier, Pool

# CatBoost with default constructor arguments, cross-validated with 5 folds
# on the selected features.
cat = CatBoostClassifier()

scores = cross_val_score(
    cat,
    X_train_copy[features],
    y_train,
    cv=5,
    fit_params=fit_params,
)
print_score(scores)

In [0]:
from lightgbm import LGBMClassifier
# LightGBM with default constructor arguments, cross-validated with 5 folds
# on the selected features.
lgb = LGBMClassifier()

scores = cross_val_score(
    lgb,
    X_train_copy[features],
    y_train,
    cv=5,
    fit_params=fit_params,
)
print_score(scores)


It is clear that CatBoost is slightly better than the other two for the selected features

### Final Classifier

In [0]:
model = CatBoostClassifier()

# Re-split the raw data, holding out only 2% as the early-stopping set.
X_train, X_val, y_train, y_val =  train_test_split(X, y, test_size=0.02, random_state = 32)

# Refit the label encoders on the new training split.
fit_encoders(X_train)

# The training call refits the interaction encoders and the scaler; the
# validation and test frames reuse them.
X_train = preprocess(X_train, True)
X_val = preprocess(X_val)
X_test = preprocess(credit_card_default_test)

model.fit(X_train[features], y_train, early_stopping_rounds=5, eval_set=[(X_val[features], y_val)] )

# Here validation report is not useful because the set is small and because we have used it to detect
# overfitting
# The model was fitted on `features` only, so the report must predict on the
# same columns rather than on the full preprocessed frames.
print_report(X_train[features], X_val[features], y_train, y_val, model)

In [0]:
# Predict on the same feature columns the model was fitted on; passing the
# full preprocessed frame would give the model a different column set.
y_pred = model.predict(X_test[features])

# Pair each prediction with the client id saved before the id column was dropped.
df = pd.DataFrame({
    'Client_ID' : test_client_ids,
    'NEXT_MONTH_DEFAULT': y_pred
})

In [0]:
# Preview the first rows of the prediction frame.
df.head()

In [0]:
# Write the predictions to 'predictions.csv' without the DataFrame index column.
df.to_csv('predictions.csv', index = False)