<a href="https://www.kaggle.com/code/tunguz/xgb-fe-0?scriptVersionId=162994759" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the read-only Kaggle input directory,
# printing one full path per line.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/porto-seguro-safe-driver-prediction/sample_submission.csv
/kaggle/input/porto-seguro-safe-driver-prediction/train.csv
/kaggle/input/porto-seguro-safe-driver-prediction/test.csv


In [2]:
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import gc
import shap
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini coefficient of `pred` with respect to `actual`.

    Rows are ranked by descending prediction; ties are broken by original
    position so the ranking is deterministic. The cumulative share of
    `actual` along that ranking is summed and compared with the value
    (n + 1) / 2, and the difference is divided by n.

    Parameters
    ----------
    actual : sequence of numbers
        Observed outcomes (converted to float).
    pred : sequence of numbers
        Scores used to rank the rows; must have the same length as `actual`.
    cmpcol, sortcol : int
        Accepted for signature compatibility; not read by the implementation.

    Returns
    -------
    float
        The Gini value. If `actual` sums to zero the division yields NaN.
    """
    n_obs = len(actual)
    assert( n_obs == len(pred) )

    # Columns: 0 = actual, 1 = prediction, 2 = original row position.
    table = np.asarray(np.c_[ actual, pred, np.arange(n_obs) ], dtype=float)

    # lexsort treats its LAST key as primary: descending prediction first,
    # then ascending original position to break ties.
    ranking = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked = table[ranking]

    losses = ranked[:, 0]
    cumulative_share = losses.cumsum().sum() / losses.sum()

    return (cumulative_share - (n_obs + 1) / 2.) / n_obs
 
def gini_normalized(a, p):
    """Return the Gini of predictions `p` scaled by the Gini of a perfect ranking.

    The denominator is `gini(a, a)`, i.e. the value obtained when the
    actual outcomes themselves are used as the ranking scores.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

In [4]:
# Read the three competition files: training data, test data and the
# submission template (in that order).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/porto-seguro-safe-driver-prediction/train.csv',
        '../input/porto-seguro-safe-driver-prediction/test.csv',
        '../input/porto-seguro-safe-driver-prediction/sample_submission.csv',
    )
)

In [5]:
# Candidate predictors: every column except the row identifier and the label.
# Selecting by name rather than by position (`columns[2:]`) keeps the list
# correct if the CSV column order ever changes, and raises a KeyError instead
# of silently mis-selecting when one of the two columns is absent.
# Assumes the identifier column in train.csv is called 'id' (as it is in the
# submission template) — adjust if the schema differs.
features = train.columns.drop(['id', 'target'])

In [6]:
# Display the candidate feature names as the cell output.
features

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='object')

In [7]:
# Columns that are filtered out of the model's feature list.
# NOTE(review): the notebook does not record why these are considered faulty.
faulty_columns = [
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_ind_14',
    'ps_car_10_cat',
]

In [8]:
# Keep every candidate feature that is not in the exclusion list, preserving order.
new_features = [column for column in features if column not in faulty_columns]

In [9]:
# Hand-crafted interaction features: each new column is the element-wise
# product of two existing columns.
# Keys are the new column names, values are the two factor columns.
interaction_features = {
    'ps_ind_03-ps_ind_02_cat': ('ps_ind_03', 'ps_ind_02_cat'),
    'ps_car_13-ps_ind_03': ('ps_car_13', 'ps_ind_03'),
}

# Apply the identical transformation to train and test from one definition,
# so the two frames cannot drift apart through copy-paste edits.
for frame in (train, test):
    for new_column, (left, right) in interaction_features.items():
        frame[new_column] = frame[left] * frame[right]

# Register each new column only once. A plain `+=` appends duplicates every
# time this cell is re-run, and duplicated names would then select duplicated
# columns when the feature list is used to index the frames.
new_features += [name for name in interaction_features if name not in new_features]

In [10]:
# Feature matrices for training and test (same columns, same order) ...
X, X_test = train[new_features], test[new_features]
# ... and the training labels as a plain NumPy array.
Y = train['target'].values

In [11]:
# Booster configuration handed to xgb.train for every fold.
params = {
    # Task and hardware
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'device': 'cuda',            # requires a CUDA-capable GPU at run time
    # Regularization
    'lambda': 4.645511,          # L2 penalty on leaf weights
    'alpha': 0.654147,           # L1 penalty on leaf weights
    # Column / row sampling
    'colsample_bytree': 0.917,
    'subsample': 0.66,
    # Tree growth
    'learning_rate': 0.013,
    'max_depth': 7,
    'min_child_weight': 194,
    # Metric reported for evaluation sets
    'eval_metric': 'logloss',
}

In [12]:
# Wrap the test features in a DMatrix once, up front; the same object is then
# scored by the model of every cross-validation fold.
# NOTE(review): enable_categorical=True only has an effect on columns with a
# pandas `category` dtype; no such conversion is visible in this notebook — confirm it is needed.
dtest = xgb.DMatrix(X_test, enable_categorical=True)

In [13]:
%%time
train_oof = np.zeros((X.shape[0], ))
test_preds = 0
train_oof.shape
num_round = 1000

n_splits = 5
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

for jj, (train_index, val_index) in enumerate(kf.split(X)):
        print("Fitting fold", jj+1)
        train_features = X.loc[train_index]
        train_target = Y[train_index]

        val_features = X.loc[val_index]
        val_target = Y[val_index]

        dtrain = xgb.DMatrix(train_features, train_target, enable_categorical=True)
        dval = xgb.DMatrix(val_features, val_target, enable_categorical=True)

        model = xgb.train(params, dtrain, num_round)
        #model.set_param({'predictor': 'cpu_predictor'})
        val_pred = model.predict(dval)
        train_oof[val_index] = val_pred
        print("Fold normalized:", gini_normalized(val_target, val_pred))
        test_preds += model.predict(dtest)/n_splits
        del train_features, train_target, val_features, val_target
        gc.collect()

Fitting fold 1
Fold normalized: 0.28734930085231536
Fitting fold 2
Fold normalized: 0.2939313711483233
Fitting fold 3
Fold normalized: 0.2756968199008448
Fitting fold 4
Fold normalized: 0.28697904231902255
Fitting fold 5
Fold normalized: 0.27985625480284937
CPU times: user 49.6 s, sys: 1.38 s, total: 51 s
Wall time: 43.3 s


In [14]:
# Normalized Gini of the out-of-fold predictions over the whole training set
# (each row is scored by the fold model that did not train on it).
gini_normalized(Y, train_oof)

0.284550421282249

In [15]:
# Write the fold-averaged test predictions into the submission template's
# `target` column, then preview the first rows.
# NOTE(review): this relies on sample_submission.csv listing its rows in the
# same order as test.csv — confirm.
sample_submission['target'] = test_preds
sample_submission.head()

Unnamed: 0,id,target
0,0,0.027276
1,1,0.026245
2,2,0.02387
3,3,0.015366
4,4,0.034977


In [16]:
# Save the submission to the current working directory; index=False keeps the
# DataFrame index out of the file.
sample_submission.to_csv('submission.csv', index=False)