In [17]:
MAX_ROUNDS = 650
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.05

In [18]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from numba import jit

In [19]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [20]:
# Read data
train_df = pd.read_csv('D:/Driver/ohe_train.csv', na_values="-1") # .iloc[0:200,:]
test_df = pd.read_csv('D:/Driver/ohe_test.csv', na_values="-1")

In [21]:
# Process data
id_test = test_df['id'].values
id_train = train_df['id'].values

train_df = train_df.fillna(999)
test_df = test_df.fillna(999)

col_to_drop = train_df.columns[train_df.columns.str.startswith('ps_calc_')]
train_df = train_df.drop(col_to_drop, axis=1)  
test_df = test_df.drop(col_to_drop, axis=1)  

for c in train_df.select_dtypes(include=['float64']).columns:
    train_df[c]=train_df[c].astype(np.float32)
    test_df[c]=test_df[c].astype(np.float32)
for c in train_df.select_dtypes(include=['int64']).columns[2:]:
    train_df[c]=train_df[c].astype(np.int8)
    test_df[c]=test_df[c].astype(np.int8)
    
y = train_df['target']
X = train_df.drop(['target', 'id'], axis=1)
y_valid_pred = 0*y
X_test = test_df.drop(['id'], axis=1)
y_test_pred = 0

In [22]:
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)

In [23]:
model = CatBoostClassifier(
    learning_rate=LEARNING_RATE, 
    depth=6, 
    l2_leaf_reg = 8, 
    iterations = MAX_ROUNDS,
#    verbose = True,
    loss_function='Logloss'
)

In [24]:
# Run CV

for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:], X.iloc[test_index,:]
    print( "\nFold ", i)
    
    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        fit_model = model.fit( X_train, y_train, 
                               eval_set=[X_valid, y_valid],
                               use_best_model=True
                             )
        print( "  N trees = ", model.tree_count_ )
    else:
        fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
y_test_pred /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)

('\nFold ', 0)
('  Gini = ', 1)
('\nFold ', 1)
('  Gini = ', 1)
('\nFold ', 2)
('  Gini = ', 1)
('\nFold ', 3)
('  Gini = ', 1)
('\nFold ', 4)
('  Gini = ', 1)

Gini for full training set:


1L

In [25]:
# Save validation predictions for stacking/ensembling
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv('ohe_cat_valid.csv', float_format='%.6f', index=False)

In [26]:
# Create submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_test_pred
sub.to_csv('ohe_cat_submit.csv', float_format='%.6f', index=False)

In [27]:
sub.head()

Unnamed: 0,id,target
0,0,0.047673
1,1,0.073973
2,2,0.068869
3,3,0.038522
4,4,0.064724


In [11]:
sub.head()

Unnamed: 0,id,target
0,0,0.025863
1,1,0.027111
2,2,0.02426
3,3,0.014383
4,4,0.034117


In [12]:
avg = pd.read_csv('avg.csv')
avg['cat'] = sub['target']
avg.head()

Unnamed: 0,id,target,xgb,lgb,0.75xgb+0.25lgb,0.25xgb+0.75lgb,0.5xgb+0.5lgb,cat
0,0,0,0.027262,0.027969,0.027439,0.027792,0.027615,0.025863
1,1,0,0.023622,0.025589,0.024114,0.025097,0.024605,0.027111
2,2,0,0.0236,0.025659,0.024115,0.025145,0.02463,0.02426
3,3,0,0.015024,0.015382,0.015114,0.015292,0.015203,0.014383
4,4,0,0.037317,0.036267,0.037055,0.036529,0.036792,0.034117


In [13]:
avg['xgb+lgb+cat'] = (avg['lgb']+avg['xgb']+avg['cat'])/3

In [14]:
avg.head()

Unnamed: 0,id,target,xgb,lgb,0.75xgb+0.25lgb,0.25xgb+0.75lgb,0.5xgb+0.5lgb,cat,xgb+lgb+cat
0,0,0,0.027262,0.027969,0.027439,0.027792,0.027615,0.025863,0.027031
1,1,0,0.023622,0.025589,0.024114,0.025097,0.024605,0.027111,0.02544
2,2,0,0.0236,0.025659,0.024115,0.025145,0.02463,0.02426,0.024507
3,3,0,0.015024,0.015382,0.015114,0.015292,0.015203,0.014383,0.01493
4,4,0,0.037317,0.036267,0.037055,0.036529,0.036792,0.034117,0.0359


In [15]:
res = avg[['id', 'xgb+lgb+cat']]
res.columns = ['id', 'target']
res.to_csv('xgb_lgb_cat.csv',index=False)

In [16]:
res.head()

Unnamed: 0,id,target
0,0,0.027031
1,1,0.02544
2,2,0.024507
3,3,0.01493
4,4,0.0359
