In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb

from geopy.distance import geodesic 

In [2]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
target_train = df_train['y'].values
id_test = df_test['id'].values



In [3]:
df_train["var7"], _ = pd.factorize(pd.cut(df_train["var7"], 4))
df_test["var7"], _ =  pd.factorize(pd.cut(df_test["var7"], 4))
df_train["var9"], _ = pd.factorize(pd.cut(df_train["var9"], 4))
df_test["var9"], _ =  pd.factorize(pd.cut(df_test["var9"], 4))

In [4]:
dummies = 20

for col in df_test.columns: # 100
    if (len(df_test[col].unique()) < dummies) and (len(df_test[col].unique()) == len(df_train[col].unique())):
        df_train = pd.get_dummies(df_train, columns=[col])
        df_test = pd.get_dummies(df_test, columns=[col])

In [5]:
train = np.array(df_train.drop(['y','id'], axis = 1))
test = np.array(df_test.drop(['id'], axis = 1))

xgb_preds = []

In [6]:
K = 5
kf = KFold(n_splits = K, random_state = 3228, shuffle = True)

In [7]:
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
def xgb_f1(y, t, threshold=0.5):
    t = t.get_label()
    y_bin = (y > threshold).astype(int) # works for both type(y) == <class 'numpy.ndarray'> and type(y) == <class 'pandas.core.series.Series'>
    return 'f1',f1_score(t,y_bin)
err = 0
err2 = 0
err3 = 0
err4 = 0
err5 = 0
errLL = 0
cutPoint = 0.36
for train_index, test_index in kf.split(train):
    train_X, valid_X = train[train_index], train[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    # params configuration also from the1owl's kernel
    # https://www.kaggle.com/the1owl/forza-baseline
    xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}
    #xgb_params = {'eta': 0.11, 'max_depth': 5, 'subsample': 0.4, 'colsample_bytree': 0.4, 'objective': 'binary:logistic', 'min_child_weight': 15, 'eval_metric': 'auc', 'seed': 99, 'silent': True}

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    d_test = xgb.DMatrix(test)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, maximize=True, verbose_eval=50, early_stopping_rounds=100)
                        
    xgb_pred = model.predict(d_test)
    pG = model.predict(d_valid)

    errLL += log_loss(valid_y, pG)
    p = pG
    p[p > cutPoint] = 1
    p[p != 1] = 0
    err += f1_score(valid_y, p)
    p = pG
    p[p > (cutPoint+0.01)] = 1
    p[p != 1] = 0
    err2 += f1_score(valid_y, p)
    p = pG
    p[p > (cutPoint+0.02)] = 1
    p[p != 1] = 0
    err3 += f1_score(valid_y, p)
    p = pG
    p[p > (cutPoint-0.02)] = 1
    p[p != 1] = 0
    err4 += f1_score(valid_y, p)
    p = pG
    p[p > (cutPoint-0.01)] = 1
    p[p != 1] = 0
    err5 += f1_score(valid_y, p)
    print(err)
    print(model.get_score(importance_type='gain'))
    xgb_preds.append(list(xgb_pred))

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.82521	valid-auc:0.80470
[50]	train-auc:0.88170	valid-auc:0.85571
[100]	train-auc:0.88689	valid-auc:0.86199
[150]	train-auc:0.89592	valid-auc:0.87034
[200]	train-auc:0.90638	valid-auc:0.87639
[250]	train-auc:0.91473	valid-auc:0.88087
[300]	train-auc:0.92187	valid-auc:0.88308
[350]	train-auc:0.92762	valid-auc:0.88521
[400]	train-auc:0.93322	valid-auc:0.88621
[450]	train-auc:0.93788	valid-auc:0.88727
[500]	train-auc:0.94181	valid-auc:0.88828
[550]	train-auc:0.94557	valid-auc:0.88893
[600]	train-auc:0.94920	valid-auc:0.88937
[650]	train-auc:0.95228	valid-auc:0.88990
[700]	train-auc:0.95512	valid-auc:0.89018
[750]	train-auc:0.95791	valid-auc:0.89025
[800]	train-auc:0.96039	valid-auc:0.89058
[850]

KeyboardInterrupt: 

In [10]:
df_train # apriori, analise de fator
df_train # apriori, analise de fator

Unnamed: 0,id,var1,var2,var3,var4,var5,var6,var7,var8,var10,...,var51_0,var51_1,var53_0,var53_1,var53_2,var53_3,var54_0,var54_1,var54_2,var54_3
0,1,18,19,2853,29442,1386,2435,35,-999,63,...,1,0,0,1,0,0,0,1,0,0
1,8,4,110,1986,13684,7189,-999,-999,17,63,...,1,0,0,1,0,0,0,1,0,0
2,30,0,39,1019,10232,678,791,16,-999,63,...,1,0,0,1,0,0,0,1,0,0
3,43,20,39,1751,2689,8235,1042,13,10,14,...,1,0,0,1,0,0,0,1,0,0
4,46,7,44,2262,29428,6031,304,16,-999,63,...,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14118,35295,4,39,2511,28766,1109,2094,31,24,-999,...,1,0,0,0,1,0,0,0,1,0
14119,35296,19,129,1114,-999,6376,-999,-999,27,-999,...,1,0,1,0,0,0,1,0,0,0
14120,35301,27,44,1786,23761,9048,623,35,27,14,...,1,0,0,0,1,0,0,0,1,0
14121,35304,4,89,210,19593,3634,2453,35,27,63,...,0,1,0,0,1,0,0,0,1,0


In [9]:
print(err/5) #print(err/4) #6711266
print(err2/5) #print(err/4) #6711266
print(err3/5) #print(err/4) #6711266
print(err4/5) #print(err/4) #6711266
print(err5/5) #print(err/4) #6711266
print(errLL/5) #print(err/4) #6711266

0.6711176137096642
0.6711176137096642
0.6711176137096642
0.6711176137096642
0.6711176137096642
0.3075985331784782


In [7]:
preds=[]
for i in range(len(xgb_preds[0])):
    sum=0
    for j in range(K):
        sum+=xgb_preds[j][i]
    preds.append(sum / K)

output = pd.DataFrame({'id': id_test, 'predicted': preds})

output.to_csv("../data/output/proba/{}-foldCV_avg_sub_dummy_dist_cutpoint{}_error{}_logloss{}.csv".format(K,cutPoint, err/5, errLL/5), index=False)   
output['predicted'][output['predicted'] > cutPoint] = 1
output['predicted'][output['predicted'] != 1] = 0
output['predicted'] = output['predicted'].astype(int)
output.to_csv("../data/output/{}-foldCV_avg_sub_dummy_dist_cutpoint{}_error{}_logloss{}.csv".format(K,cutPoint, err/5, errLL/5), index=False)   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['predicted'][output['predicted'] > cutPoint] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['predicted'][output['predicted'] != 1] = 0


In [99]:
sub = pd.read_csv("./5-foldCV_avg_sub_36_dummy.csv")

In [100]:
sub["v"] = output["predicted"]

In [101]:
sub["v2"] = sub["v"] - sub["predicted"]

In [102]:
sub[sub["v2"] != 0]

Unnamed: 0,id,predicted,v,v2


In [84]:
119

152