In [25]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy import sparse
from sklearn.model_selection import KFold
from sklearn import cross_validation, metrics
from sklearn.metrics import roc_auc_score
import scipy.special as special
from sklearn.externals import joblib
import gc, os
import datetime
import pickle

In [26]:
OPTIMIZE_ROUNDS = True
EARLY_STOPPING_ROUNDS = 30
raw_data_path = '../data/raw_data/'

In [27]:
train_x = sparse.load_npz(raw_data_path + 'train_x_20.npz')
test_x =  sparse.load_npz(raw_data_path + 'test_x_20.npz')
train_y = pd.read_csv('../data/train_y_519.csv', header=None)
test = pd.read_csv('../data/test2.csv')
res=test[['aid','uid']]

In [None]:
train_y.shape, train_x.shape, test_x.shape

((8798814, 1), (8798814, 10044), (2265879, 10044))

In [None]:
y_valid_pred = 0*train_y
y_test_pred = 0

print("XGB test")
model = xgb.XGBClassifier(boosting_type='gbdt', colsample_bytree=0.7,
                         learning_rate=0.05, max_depth=9, metric='auc',
                         gamma=0.3, min_child_weight=32, 
                         n_estimators=1, n_jobs=-1,
                         random_state=2020, reg_alpha=1, reg_lambda=0.4, 
                         subsample=0.9)

K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(2019)

test_x_csr = test_x.tocsr()
train_x_csr = train_x.tocsr()

XGB test


In [21]:
for i, (train_index, test_index) in enumerate(kf.split(train_x)):

    # Create data for this fold
    y_train, y_valid = train_y.iloc[train_index].copy(), train_y.iloc[test_index].copy()
    X_train, X_valid = train_x_csr[train_index.tolist(), :].copy(), train_x_csr[test_index.tolist(), :].copy()
    X_test = test_x_csr.copy()
    print ("\nFold ", i)
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    if OPTIMIZE_ROUNDS:
        eval_set = [(X_valid, y_valid)]
        fit_model = model.fit(X_train, y_train,
                              eval_set=eval_set,
                              eval_metric='auc',
                              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                              verbose=False
                              )
        print ("  Best N trees = ", model.best_ntree_limit)
        print ("  Best AUC = ", model.best_score)
    else:
        fit_model = model.fit(X_train, y_train)

    pred = fit_model.predict_proba(X_valid)[:, 1]
    y_valid_pred.iloc[test_index] = pred.reshape(-1,1)

    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:, 1]

    del X_test, X_train, X_valid, y_train


Fold  0
2018-05-22 10:55:38


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  Best N trees =  1
  Best AUC =  1.0

Fold  1
2018-05-22 10:57:41


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  Best N trees =  1
  Best AUC =  1.0

Fold  2
2018-05-22 10:59:09


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  Best N trees =  1
  Best AUC =  1.0

Fold  3
2018-05-22 11:00:32


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  Best N trees =  1
  Best AUC =  1.0

Fold  4
2018-05-22 11:01:56


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  Best N trees =  1
  Best AUC =  1.0


In [24]:
y_test_pred /= K  # Average test set predictions
res['score'] = y_test_pred
res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
res.to_csv('../data/result/submission_22.csv', index=False)
os.system('zip ../data/result/baseline_xgb_22.zip ../data/result/submission_22.csv')
print( "\nAUC for full training set:" )
print(roc_auc_score(train_y, y_valid_pred))
joblib.dump(model, '../data/model/xgb_submit_22.model')
print('-----------model saved----------')



AUC for full training set:
1.0
-----------model saved----------
