In [1]:
import pandas as pd
import numpy as np
from utils import peek
import joblib
import pickle
import config

from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
import lightgbm as lgbm
import sklearn

from functools import partial
from scipy.optimize import fmin

## LR

In [3]:
test = pd.read_csv(config.TEST)
model = joblib.load(config.BASELINE)
test_score_lr = model.predict_proba(test.drop('order_status_key', axis=1))[:, 1]
roc_auc_score(test['order_status_key'], test_score_lr)

0.7297500469836496

## Random Forest

In [4]:
test = pd.read_csv(config.TEST2)
rf = joblib.load(config.RF)
test_score_rf = rf.predict_proba(test.drop('order_status_key', axis=1))[:, 1]
roc_auc_score(test['order_status_key'], test_score_rf)

0.7362977406084905

## XGBoost

In [66]:
test = pd.read_csv(config.TEST3)
cat_features = ['gender', 'degree', 'card_num']
test[cat_features] = test[cat_features].astype('category')
test_x, test_y = test.drop(['order_status_key'], axis=1), test['order_status_key']
dtest = xgb.DMatrix(test_x, test_y, enable_categorical=True)
bst = xgb.Booster()
bst.load_model(config.XGBoost)
test_score = bst.predict(dtest)
roc_auc_score(test_y, test_score)

0.7354070089164527

## Catboost

In [8]:
test = pd.read_csv(config.TEST3)
cat_features = ['gender', 'degree', 'card_num']
test[cat_features] = test[cat_features].astype('category')
test_x, test_y = test.drop(['order_status_key'], axis=1), test['order_status_key']
pool = Pool(test_x, test_y, cat_features= ['gender', 'degree', 'card_num'])
bst = CatBoostClassifier()
bst.load_model(config.CATBOOST)
test_score = bst.predict_proba(pool)[:, 1]
roc_auc_score(test_y, test_score)

0.732985393305352

## LightGBM 

In [2]:
test = pd.read_csv(config.TEST3)
test_x, test_y = test.drop(['order_status_key'], axis=1), test['order_status_key']
bst = lgbm.Booster(model_file=config.LIGHTGBM)
test_score_lgbm = bst.predict(test_x)
roc_auc_score(test_y, test_score_lgbm)

0.7409008331767213

## Ensemble

In [15]:
ensemble = pd.DataFrame({'label':test['order_status_key'], 'lr':test_score_lr, 'rf':test_score_rf, 'lightgbm':test_score})
ensemble['mean_score'] = ensemble.loc[:,'lr':'lightgbm'].apply(np.mean, axis=1)
roc_auc_score(ensemble['label'], ensemble['mean_score'])

0.7369254943724027

In [20]:
class OptimizeAUC:
    """
    Class for optimizing AUC.
    This class is all you need to find best weights for
    any model and for any metric and for any types of predictions.
    With very small changes, this class can be used for optimization of
    weights in ensemble models of _any_ type of predictions
    """
    def __init__(self):
        self.coef_ = 0
        
    def _auc(self, coef, X, y):
        """
        This functions calulates and returns AUC.
        :param coef: coef list, of the same length as number of models
        :param X: predictions, in this case a 2d array
        :param y: targets, in our case binary 1d array
        """
        # multiply coefficients with every column of the array
        # with predictions.
        # this means: element 1 of coef is multiplied by column 1
        # of the prediction array, element 2 of coef is multiplied
        # by column 2 of the prediction array and so on!
        x_coef = X * coef
        # create predictions by taking row wise sum
        predictions = np.sum(x_coef, axis=1)
        # calculate auc score
        auc_score = roc_auc_score(y, predictions)
        # return negative auc
        return -1.0 * auc_score

    def fit(self, X, y):
        # remember partial from hyperparameter optimization chapter?
        loss_partial = partial(self._auc, X=X, y=y)
        # dirichlet distribution. you can use any distribution you want
        # to initialize the coefficients
        # we want the coefficients to sum to 1
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size=1)
        # use scipy fmin to minimize the loss function, in our case auc
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)

    def predict(self, X):
        # this is similar to _auc function
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions

In [50]:
optimizer = OptimizeAUC()
optimizer.fit(ensemble.loc[:, "lr":"lightgbm"], ensemble['label'])

Optimization terminated successfully.
         Current function value: -0.737093
         Iterations: 27
         Function evaluations: 63


In [6]:
def binning(series):
    percentiles = list(range(0, 101, 10))
    val = np.percentile(series, percentiles).tolist()
    bins = list(sorted(set([min(val) - 1] + val)))
    series_bin = pd.cut(series, bins, labels=list(range(len(bins) - 1))).astype("int")
    return series_bin

test_bin = test.copy()
test_bin['score'] = test_bin[['score']].apply(binning)

In [7]:
def calc_iv(matrix, feature, target, pr=False):
    df = matrix.copy()
    lst = []
    df[feature] = df[feature].fillna("NULL")

    for i in range(df[feature].nunique()):
        val = list(df[feature].unique())[i]
        lst.append([feature,                                                        # Variable
                    val,                                                            # Value
                    df[df[feature] == val].count()[feature],                        # All
                    df[(df[feature] == val) & (df[target] == 1)].count()[feature],  # Good (think: Fraud == 0)
                    df[(df[feature] == val) & (df[target] == 0)].count()[feature]]) # Bad (think: Fraud == 1)

    data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])

    data['Distribution Good'] = (data['Good'] + 1) / data['Good'].sum()
    data['Distribution Bad'] = (data['Bad'] + 1) / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])
    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    data['IV'] = (data['Distribution Good'] - data['Distribution Bad']) * data['WoE']

    if pr:
        print(data.sort_values('Value'))
        print('IV = ', round(data['IV'].sum(), 6))

    return round(data['IV'].sum(), 6)

In [8]:
# just for fun by comparing the IV with that of the features IN EDA
calc_iv(test_bin, 'score', 'order_status_key', True)

   Variable  Value  All  Good  Bad  Distribution Good  Distribution Bad  \
10    score      0    1     0    1           0.001634          0.001597   
3     score      1  186     6  180           0.011438          0.144569   
7     score      2  186    19  167           0.032680          0.134185   
0     score      3  186    36  150           0.060458          0.120607   
2     score      4  187    47  140           0.078431          0.112620   
1     score      5  186    55  131           0.091503          0.105431   
9     score      6  186    77  109           0.127451          0.087859   
4     score      7  187    62  125           0.102941          0.100639   
6     score      8  186   100   86           0.165033          0.069489   
5     score      9  186   104   82           0.171569          0.066294   
8     score     10  187   106   81           0.174837          0.065495   

         WoE            IV  
10  0.022618  8.265292e-07  
3  -2.536822  3.377290e-01  
7  -1.412466

0.841872

In [4]:
percentiles = list(range(10, 101, 10))
test['score'] = test_score_lgbm
test['percentile'] = test['score'].rank(pct=True, ascending=False)
result = []
for p in percentiles:
    qualified = test.query(f'percentile < {p / 100}')
    filter_rate = np.round(1 - qualified.shape[0] / test.shape[0], 2)
    recall_rate = np.round(qualified['order_status_key'].sum() /test['order_status_key'].sum(), 2)
    appr_rate = np.round(qualified['order_status_key'].sum() / qualified.shape[0], 2)
    result.append([p, filter_rate, recall_rate, appr_rate])
result = pd.DataFrame(result, columns=['top', 'filter_rate', 'recall_rate', 'approve_rate'])
result

Unnamed: 0,top,filter_rate,recall_rate,approve_rate
0,10,0.9,0.17,0.55
1,20,0.8,0.34,0.56
2,30,0.7,0.5,0.55
3,40,0.6,0.64,0.53
4,50,0.5,0.74,0.49
5,60,0.4,0.84,0.46
6,70,0.3,0.91,0.43
7,80,0.2,0.96,0.4
8,90,0.1,0.99,0.36
9,100,0.0,1.0,0.33
