In [1]:
import pandas as pd
import numpy as np
import os
from xgboost import XGBClassifier, cv, DMatrix 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [2]:
data_path = "."

In [3]:
train = pd.read_csv(os.path.join(data_path, "train_sample.csv"))  
test = pd.read_csv(os.path.join(data_path, "test_sample.csv"))


In [4]:
train

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,class
0,0.851929,0.087019,0.811998,0.075511,0.002587,0.721975,0.518255,0
1,0.160300,0.917404,0.979958,0.991313,0.735917,0.548838,0.807111,1
2,0.987793,0.301259,0.748637,0.697606,0.150468,0.290844,0.289762,0
3,0.815916,0.446977,0.126875,0.807137,0.118084,0.549412,0.904486,0
4,0.914623,0.844809,0.334767,0.591941,0.302508,0.769384,0.086291,0
...,...,...,...,...,...,...,...,...
495,0.712277,0.209379,0.629417,0.708424,0.282171,0.705648,0.535430,0
496,0.619331,0.714697,0.281979,0.051874,0.881958,0.937741,0.883560,1
497,0.697910,0.547412,0.690421,0.658377,0.071095,0.692943,0.773117,0
498,0.892910,0.028138,0.952461,0.256364,0.264237,0.258471,0.316102,0


In [5]:
test

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7
0,0.950009,0.217793,0.050342,0.607236,0.124179,0.982474,0.001162
1,0.701074,0.532975,0.211688,0.760293,0.941510,0.101267,0.389918
2,0.472061,0.032279,0.896741,0.720505,0.635669,0.136306,0.958429
3,0.859937,0.723871,0.667949,0.422503,0.055984,0.400132,0.519728
4,0.240543,0.953446,0.141465,0.383435,0.555707,0.737015,0.977681
...,...,...,...,...,...,...,...
495,0.321121,0.732845,0.153304,0.132940,0.941357,0.927523,0.381182
496,0.171769,0.122194,0.342345,0.404135,0.160246,0.307318,0.406209
497,0.270209,0.860014,0.774145,0.278722,0.565839,0.618619,0.880629
498,0.163362,0.543294,0.419754,0.543645,0.539789,0.103252,0.006019


In [6]:
X = train.iloc[:, :-1]  
y = train['class']  


In [7]:
params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
dtrain = DMatrix(X, label=y) 
cv_results = cv(params, dtrain, num_boost_round=100, nfold=5, seed=0, as_pandas=True)


In [8]:
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.975202,0.004012,0.913971,0.023565
1,0.984678,0.006251,0.933345,0.030036
2,0.990312,0.003343,0.934128,0.026678
3,0.993200,0.001631,0.936106,0.027287
4,0.994778,0.001704,0.941898,0.020598
...,...,...,...,...
95,1.000000,0.000000,0.948398,0.020226
96,1.000000,0.000000,0.948473,0.020290
97,1.000000,0.000000,0.948394,0.020452
98,1.000000,0.000000,0.948642,0.020503


In [9]:
best_nrounds = cv_results['test-auc-mean'].idxmax() 
best_nrounds


31

In [10]:
model_xgb = XGBClassifier(objective='binary:logistic', n_estimators=best_nrounds, random_state=0)
model_xgb.fit(X, y)

In [11]:
X_test = test
prediction = model_xgb.predict_proba(X_test)[:,1]
prediction


array([0.6981805 , 0.9963875 , 0.989884  , 0.11137533, 0.97391814,
       0.9963643 , 0.99621165, 0.00665019, 0.0109528 , 0.07321531,
       0.9962131 , 0.990575  , 0.14312339, 0.00182987, 0.99164563,
       0.00230474, 0.9865513 , 0.99288356, 0.99876535, 0.7394337 ,
       0.29451787, 0.01685771, 0.02848066, 0.97988313, 0.01596009,
       0.9910869 , 0.99589074, 0.9695369 , 0.9981949 , 0.00785965,
       0.9977836 , 0.01851079, 0.8606671 , 0.01030572, 0.98636276,
       0.03637189, 0.98856   , 0.9971154 , 0.96792495, 0.9971342 ,
       0.9973585 , 0.00387598, 0.00375636, 0.966217  , 0.06569888,
       0.01871832, 0.33027306, 0.95782286, 0.13481998, 0.9961815 ,
       0.03426437, 0.9966815 , 0.9341949 , 0.00714927, 0.9991473 ,
       0.8022879 , 0.9977881 , 0.9948573 , 0.99299484, 0.97541237,
       0.01413733, 0.97777224, 0.9768336 , 0.9978417 , 0.9963617 ,
       0.01182617, 0.9846978 , 0.11193582, 0.99366426, 0.48094895,
       0.9967609 , 0.00609517, 0.06248945, 0.9928041 , 0.98705

In [12]:
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X, y)

In [13]:
importances = rf_model.feature_importances_
rf_most_important = np.argsort(importances)[-1] + 1
rf_most_important

np.int64(5)

In [14]:
np.savez('submission.npz', rf_most_important=rf_most_important, prediction=prediction)


In [15]:
npzfile = np.load('submission.npz')
print(npzfile['rf_most_important'], npzfile['prediction'].shape)

5 (500,)
