In [1]:
import lightgbm as lgbm
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
# import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# import missingno as msno
import pickle
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from scipy import stats
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn import metrics
# from fancyimpute import *

from utils import *
%matplotlib inline

# Restore two previously saved objects by name via load_obj (not defined in
# this notebook; presumably supplied by `from utils import *`).
# `dtype` is later handed to pd.read_csv as the column dtype specification;
# `my_dict` is not referenced again in the visible cells.
dtype, my_dict = load_obj('dict_dtype'), load_obj('my_dict')

In [2]:
# Both CSV files are read with the same options: the `date` column is parsed
# as a datetime and column dtypes come from the `dtype` object loaded earlier.
csv_options = {'parse_dates': ['date'], 'dtype': dtype}
train = pd.read_csv("atec_anti_fraud_train.csv", **csv_options)
test = pd.read_csv("atec_anti_fraud_test_a.csv", **csv_options)

In [3]:
# Number of cross-validation folds; the training cell also divides the summed
# per-fold test predictions by this value.
NFOLDS = 5

# Use the alias this notebook actually imports (`StratifiedKFold as SKF`).
# The bare name `StratifiedKFold` is only defined if the star-import from
# utils happens to export it, which makes the cell fragile on a fresh kernel.
# A fixed random_state keeps the shuffled fold assignment reproducible.
kfold = SKF(n_splits=NFOLDS, shuffle=True, random_state=666)

In [4]:
# Drop rows whose label is -1 (presumably unlabelled samples -- confirm) and
# order the remainder by date. This is one chained expression that returns a
# new frame, instead of an in-place sort on a filtered slice, which pandas may
# flag with SettingWithCopyWarning.
train = train[train['label'] != -1].sort_values('date')

# Identifier columns are kept aside; test_id is needed for the submission file.
train_id = train['id']
test_id = test['id']

# Target vector and feature matrices as plain NumPy arrays. The id and date
# columns (and the label, for train) are excluded from the features.
y = train['label'].values
X = train.drop(['id', 'date', 'label'], axis=1).values
X_test = test.drop(['id', 'date'], axis=1).values

In [5]:
# Sanity check: shapes of the training features, the labels and the test features.
print(*(arr.shape for arr in (X, y, X_test)))

(990006, 297) (990006,) (491668, 297)


In [6]:
# Run configuration for the training cell:
#   cv_only   - the training loop only does any work when this is True
#   num_round - number of repeated CV rounds (the round index is used as the
#               LightGBM seed)
cv_only, num_round = True, 2

In [7]:
# Tunable hyperparameters, kept as notebook-level names.
learning_rate = 0.1
num_leaves = 15
# NOTE(review): min_data_in_leaf is never inserted into `params`; the dict
# sets "min_child_samples" to 10 instead, which LightGBM documents as an alias
# of min_data_in_leaf -- confirm which value is actually intended.
min_data_in_leaf = 2000
feature_fraction = 0.6
# Upper bound on boosting iterations passed to lgbm.train in the training cell.
num_boost_round = 10000

# LightGBM training parameters for a binary classification objective.
# NOTE(review): "drop_rate" and "max_drop" look like DART-specific settings
# and probably have no effect with boosting_type "gbdt" -- confirm.
# NOTE(review): "subsample" presumably needs a non-zero bagging frequency
# (subsample_freq / bagging_freq) to take effect -- confirm.
params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    max_bin=256,
    feature_fraction=feature_fraction,
    verbosity=0,
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    subsample=0.9,
)

In [8]:
# Repeated stratified K-fold training of LightGBM.
#
# For each of `num_round` rounds (the round index doubles as the LightGBM
# seed) one model is trained per fold with early stopping on the validation
# fold. Out-of-fold predictions on train and fold-averaged predictions on test
# are summed into final_cv_train / final_cv_pred across rounds; the consumer
# divides by num_round afterwards.
#
# my_score1 / my_score2 are not defined in this notebook (presumably supplied
# by `from utils import *`): my_score1 is called as metric(y_true, y_pred) and
# my_score2 is handed to LightGBM as the custom eval function.
x_score = []                              # one overall CV score per round
final_cv_train = np.zeros(len(y))         # sum over rounds of out-of-fold predictions
final_cv_pred = np.zeros(len(test_id))    # sum over rounds of fold-averaged test predictions

for s in range(num_round):
    # Per-round accumulators.
    cv_train = np.zeros(len(train_id))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    if cv_only:
        best_trees = []
        fold_scores = []

        for train_idx, val_idx in kfold.split(X, y):
            X_train, X_validate = X[train_idx, :], X[val_idx, :]
            label_train, label_validate = y[train_idx], y[val_idx]

            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=my_score2, verbose_eval=100,
                             early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)

            # Predict with the early-stopped iteration for BOTH test and
            # validation data. The original passed num_iteration only for the
            # test prediction; being explicit keeps the two consistent
            # regardless of the library's default for num_iteration.
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[val_idx] += bst.predict(X_validate, num_iteration=bst.best_iteration)

            fold_scores.append(my_score1(label_validate, cv_train[val_idx]))

        # Each fold contributed one full test prediction; average them.
        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        # Computed once and reused for both the log line and x_score
        # (assumes my_score1 is a pure metric function).
        round_score = my_score1(y, cv_train)
        print("cv score:", round_score)
        print("Average score so far:", my_score1(y, final_cv_train / (s + 1.)), "Round:", s+1)
        print(fold_scores)
        print("Best trees:", best_trees, "There average:", np.mean(best_trees))

        x_score.append(round_score)

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0244259	valid_0's score: 0.558062
[200]	valid_0's binary_logloss: 0.0227919	valid_0's score: 0.590474
[300]	valid_0's binary_logloss: 0.0221914	valid_0's score: 0.609485
[400]	valid_0's binary_logloss: 0.0218556	valid_0's score: 0.614763
[500]	valid_0's binary_logloss: 0.0215348	valid_0's score: 0.61934
[600]	valid_0's binary_logloss: 0.0213655	valid_0's score: 0.622722
[700]	valid_0's binary_logloss: 0.0212159	valid_0's score: 0.628247
[800]	valid_0's binary_logloss: 0.0210865	valid_0's score: 0.633691
[900]	valid_0's binary_logloss: 0.0209925	valid_0's score: 0.637113
[1000]	valid_0's binary_logloss: 0.0208943	valid_0's score: 0.636907
Early stopping, best iteration is:
[976]	valid_0's binary_logloss: 0.0209257	valid_0's score: 0.638722
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0245212	valid_0's score: 0.555794
[200]	valid_0's binary_

KeyboardInterrupt: 

In [11]:
# Average the accumulated test predictions over the CV rounds and write the
# submission file with columns `id` and `score`.
# NOTE(review): the recorded training output ends in KeyboardInterrupt; make
# sure the training cell ran to completion before trusting this file.
test_scores = final_cv_pred / float(num_round)
submission = pd.DataFrame({'id': test_id, 'score': test_scores})
submission.to_csv('submission/lgbm_test_0001.csv', index=False)
# Optional: the out-of-fold train predictions can be written the same way.
# pd.DataFrame({'id': train_id, 'score': final_cv_train / float(num_round)}).to_csv('submission/lgbm_train_0001.csv', index=False)