# Imports

In [141]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from random import sample

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

pd.options.mode.chained_assignment = None

In [142]:
data = pd.read_csv('data/hw1_devsample.csv', sep = ',', encoding = 'utf-8', low_memory=False, index_col='SK_ID_CURR')
print(f'Number of rows:      {data.shape[0]}')
print(f'Number of columns:   {data.shape[1]}')

Number of rows:      80000
Number of columns:   196


In [143]:
data_test = pd.read_csv('data/hw1_outofsample.csv', sep = ',', encoding = 'utf-8', low_memory=False, index_col='SK_ID_CURR')
print(f'Number of rows:      {data_test.shape[0]}')
print(f'Number of columns:   {data_test.shape[1]}')

Number of rows:      20000
Number of columns:   195


# Data Preparation

In [145]:
target = 'TARGET'

technical_columns = [
    'SK_ID_CURR',
    'TARGET',
    'TIME',
    'BASE',
    'DAY',
    'MONTH'
]

predictors = [col for col in data.columns if col not in technical_columns]

## Mean target encoding

In [146]:
def mean_target_encoding(dt, predictor, target, alpha = 0.01):

    total_count = len(dt)
    total_default_rate = np.mean(dt[target])
    
    data_group = dt.groupby(predictor).agg(
        category_default_rate = (target, np.mean),
        category_count = (target, len)
    )
    
    data_group['category_frequency'] = data_group['category_count'] / total_count
    data_group['category_encoding'] = (data_group['category_frequency'] * data_group['category_default_rate'] + alpha * total_default_rate) / (data_group['category_frequency'] + alpha)
    
    vector = dt[predictor]
    encoding = {}
    for value in vector.dropna().unique():
        if value in data_group.index:
            vector = vector.replace(value, data_group.loc[value]["category_encoding"])
            encoding[value] = data_group.loc[value]["category_encoding"]
        else:
            vector = vector.replace(value, total_default_rate)
            encoding[value] = total_default_rate
    
    return encoding

In [147]:
encoding = {}
for col in predictors:
    if data[col].dtype == 'O':
        encoding[col] = mean_target_encoding(data, col, target, alpha = 0.01)

In [148]:
for col, encoder in encoding.items():
    data[col] = data[col].replace(encoder)
    data_test[col] = data_test[col].replace(encoder)

## Missing data and infinity

In [149]:
data[predictors] = data[predictors].fillna(data[predictors].mean())
data_test[predictors] = data_test[predictors].fillna(data_test[predictors].mean())

data = data.replace(np.inf, 99999999)
data_test = data_test.replace(np.inf, 99999999)

In [150]:
# Level Maternity leave was not in dev data, therefore mean target is not computed
data_test.loc[data_test['NAME_INCOME_TYPE']=='Maternity leave', 'NAME_INCOME_TYPE'] = data[target].mean()

# Train & Predict

In [151]:
# We split data into data_dev and data_train. data_dev will be for computation of gini
data_train, data_dev = train_test_split(
    data, test_size=0.15, random_state=42, stratify = data[target]
)

In [160]:
# Declare hyperparameters
random.seed(17)
criterion = "gini"
splitter = "best"
data_fraction = 0.8
predictor_fraction = 0.8
n_trees = 10
max_depth=5

In [161]:
# We compute prediction iteratively, we start with 0
data_train['prediction'] = 0
data_dev['prediction'] = 0
data_test['prediction'] = 0

data_train['sample_weight'] = 1 # sample_weight parameter for training, we start with equal weights
auc_sum = 0 # sum of AUC coef. of all trees (used for convex combination), we can use AUC or GINI with the same results as one is obtained from the other as a linear trans.

for n in range(n_trees):
    
    # Subsample predictors and data (we keep row indices)
    data_iter = np.random.choice(data_train.index, round(data_train.shape[0]*data_fraction), replace=False)
    pred_iter = np.random.choice(predictors, round(len(predictors)*predictor_fraction))
    
    # Train single tree
    model = DecisionTreeClassifier(
        random_state=n,
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
    )
    clf = model.fit(
        data_train.loc[data_iter, pred_iter],
        data_train.loc[data_iter, target],
        sample_weight=data_train.loc[data_iter, 'sample_weight'],
    )
    
    # Compute prediction using single tree
    data_train['prediction_single'] = clf.predict_proba(data_train[pred_iter])[:, 1]
    data_dev['prediction_single'] = clf.predict_proba(data_dev[pred_iter])[:, 1]
    data_test['prediction_single'] = clf.predict_proba(data_test[pred_iter])[:, 1]
    
    # Evaluate AUC
    auc = roc_auc_score(data_dev[target], data_dev['prediction_single'])
    print(f'AUC for tree n. {n+1}: {auc :.4f}')
    
    # Update prediction as a convex combination of all trees
    data_train['prediction'] = (auc_sum*data_train['prediction']+auc*data_train['prediction_single'])/(auc_sum+auc)
    data_dev['prediction'] = (auc_sum*data_dev['prediction']+auc*data_dev['prediction_single'])/(auc_sum+auc)
    data_test['prediction'] = (auc_sum*data_test['prediction']+auc*data_test['prediction_single'])/(auc_sum+auc)
    auc_sum += auc
    
    # Updata sample_weight
    data_train['sample_weight'] = abs(data_train['prediction']-data_train[target])
    
    
# Evaluate AUC for ensemble  
auc_ensemble = roc_auc_score(data_dev[target], data_dev['prediction'])
print(f'AUC for ensemble: {auc_ensemble :.4f}')

AUC for tree n. 1: 0.6986
AUC for tree n. 2: 0.6255
AUC for tree n. 3: 0.6707
AUC for tree n. 4: 0.6922
AUC for tree n. 5: 0.6030
AUC for tree n. 6: 0.6874
AUC for tree n. 7: 0.7025
AUC for tree n. 8: 0.6939
AUC for tree n. 9: 0.6534
AUC for tree n. 10: 0.6983
AUC for ensemble: 0.7357


# Store prediction

In [40]:
data_test[['prediction']].to_csv('test.csv')