In [1]:
import lightgbm as lgbm
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from scipy import stats
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn import metrics
# from fancyimpute import *

from utils import *
%matplotlib inline

# Restore two saved objects through `load_obj` (not defined in this notebook;
# presumably supplied by the `from utils import *` above).
# `dtype` is forwarded to the `pd.read_csv(..., dtype=dtype)` calls below;
# `my_dict` is not referenced again in the visible cells.
dtype, my_dict = load_obj('dict_dtype'), load_obj('my_dict')

### Category

In [2]:
# Read the raw train/test tables. `date` is parsed as a datetime column and the
# other column dtypes come from the `dtype` mapping loaded earlier.
train = pd.read_csv(
    "atec_anti_fraud_train.csv",
    parse_dates=['date'],
    dtype=dtype,
)
test = pd.read_csv(
    "atec_anti_fraud_test_a.csv",
    parse_dates=['date'],
    dtype=dtype,
)

# `process_dates` is an external helper (not defined in this notebook); each
# frame is replaced by whatever it returns for the 'date' column.
train, test = process_dates(train, 'date'), process_dates(test, 'date')

In [3]:
# Category tables for the train and test ids (joined on `id`, and read via the
# `cate` column, in the cells below).
train_cate, test_cate = (
    pd.read_csv("obj/id_cate_train.csv"),
    pd.read_csv("obj/id_cate_test.csv"),
)

In [11]:
# Attach the category columns to the training rows. The inner join keeps only
# ids present in both frames.
# NOTE: this cell overwrites `train`, so it is not idempotent; reload `train`
# before running it a second time.
train = train.merge(
    train_cate,
    on='id',
    how='inner',
)
train.head()

Unnamed: 0,id,label,date,f1,f2,f3,f4,f5,f6,f7,...,f289,f290,f291,f292,f293,f294,f295,f296,f297,cate
0,f10eb20f31cf7063ee8bdbd1272214e4d7e0193c8dbce4...,0,2017-11-03,0,0,0,0,100807.0,0,5,...,312.0,328.0,85.0,302.0,201.0,203.0,203.0,61.0,201.0,0
1,d861929b67938d06538b910b9f6b85f5eb62b6ad7361ba...,0,2017-09-17,0,1,1,1,100805.0,1,5,...,324.0,391.0,13.0,302.0,160.0,160.0,161.0,8.0,160.0,0
2,1270cb8a85eedd57672b2c6297fa5633e36773a2c3a351...,0,2017-10-22,0,0,1,0,100102.0,0,6,...,,,,,,,,,,1
3,9fa009724ee7ff9d688ae321304fbc78f608cdabbfdd2b...,0,2017-10-29,0,0,0,1,100807.0,1,4,...,322.0,341.0,57.0,251.0,175.0,176.0,176.0,49.0,150.0,2
4,1da482485d7e8bcefae7e9d0d1167cec3ac111cfa71d8b...,0,2017-10-02,1,1,0,1,100805.0,1,5,...,301.0,301.0,74.0,302.0,182.0,181.0,182.0,51.0,181.0,0


In [14]:
# Number of distinct categories among rows with label == 1.
train.loc[train['label'] == 1, 'cate'].unique().shape

(21,)

In [21]:
# Row count per category among rows with label == 1.
train.loc[train['label'] == 1].groupby('cate')['id'].count()

cate
0     5152
1      173
2      443
3       35
4       13
5      596
6        1
7     5047
8      108
9      236
10      15
11     214
12       1
13      36
14      17
15       5
16       6
21       1
23       1
24       1
26      21
Name: id, dtype: int64

In [16]:
# Number of distinct categories among rows with label == 0.
train.loc[train['label'] == 0, 'cate'].unique().shape

(34,)

In [23]:
# Row count per category among rows with label == 0.
train.loc[train['label'] == 0].groupby('cate')['id'].count()

cate
0     568661
1      27374
2      51540
3      60495
4       2825
5      46956
6       4377
7     124926
8      64149
9      13722
10      2483
11      2976
12       214
13      1129
14      3770
15       195
16       774
17       240
18       188
19        38
20       551
21        25
22         8
23       128
24        83
25         3
26        23
27        15
28         6
29         3
30         2
31         2
32         2
33         1
Name: id, dtype: int64

In [18]:
# Number of distinct categories present in the test ids.
test_cate['cate'].unique().shape

(28,)

In [None]:
# Hand-made grouping of the category ids 0-33 into 8 buckets.
# NOTE(review): id 32 is listed in both the first and the fourth bucket, while
# every other id appears exactly once. Confirm which bucket 32 belongs to.
# NOTE(review): `groups` is not referenced by any other visible cell.
groups = [
    [0, 4, 9, 12, 13, 32, 33],
    [1, 5],
    [18, 31],
    [3, 8, 15, 17, 19, 20, 29, 32],
    [16, 22, 30],
    [7, 26],
    [11, 21, 23, 24, 25, 27],
    [2, 6, 10, 14, 28],
]

In [24]:
# Row count per category in the test ids.
test_cate.groupby('cate')['id'].count()

cate
0     136719
1      10667
2     166345
3      15405
4        973
5      26572
6      10208
7      81925
8      18847
9       5133
10      6652
11      1027
12       167
13       637
14      7496
15        85
16      1671
17       113
18       664
19         3
20       102
22        11
23        22
24         4
26        38
27        14
28       167
33         1
Name: id, dtype: int64

### Cross validation

In [2]:
# Reload the raw tables for the plain cross-validation run. No `process_dates`
# step is applied in this section.
train = pd.read_csv(
    "atec_anti_fraud_train.csv",
    parse_dates=['date'],
    dtype=dtype,
)
test = pd.read_csv(
    "atec_anti_fraud_test_a.csv",
    parse_dates=['date'],
    dtype=dtype,
)

In [3]:
# Stratified 5-fold splitter. Shuffling is seeded, so every call to
# `kfold.split` yields the same folds.
NFOLDS = 5
kfold = SKF(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=666,
)

In [4]:
# Build the model matrices for the plain CV run.
# Rows with label == -1 are dropped (presumably unlabeled; confirm) and the rest
# are ordered by date. Filtering and sorting are chained instead of sorting
# in place: an in-place sort on a boolean-filtered frame can raise
# SettingWithCopyWarning, and the chained form is safe to re-run.
train = train[train['label'] != -1].sort_values('date')

# Ids are kept as Series so predictions can be written back next to them.
train_id = train['id']
test_id = test['id']

# Targets and features as plain NumPy arrays; id/date (and label) are not features.
y = train['label'].values
X = train.drop(['id', 'date', 'label'], axis=1).values
X_test = test.drop(['id', 'date'], axis=1).values

In [5]:
# Sanity check: train features, train labels and test features, in that order.
print(*(arr.shape for arr in (X, y, X_test)))

(990006, 297) (990006,) (491668, 297)


In [6]:
# --- run control ---
cv_only = True   # the CV cell only does work when this is truthy
num_round = 2    # number of CV repetitions (one LightGBM seed per repetition)

# --- LightGBM settings ---
learning_rate = 0.1
num_leaves = 15
# NOTE(review): `min_data_in_leaf` is assigned but never placed in `params`;
# "min_child_samples": 10 is what is actually passed (per the LightGBM docs the
# two names are aliases). Confirm which value is intended.
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000  # upper bound; the CV cell trains with early stopping

params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    max_bin=256,
    feature_fraction=feature_fraction,
    verbosity=0,
    # NOTE(review): per the LightGBM docs `drop_rate` and `max_drop` are only used
    # by boosting_type="dart", so they look inert with "gbdt". Confirm.
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    # NOTE(review): per the LightGBM docs `subsample` (bagging_fraction) only takes
    # effect when a bagging frequency > 0 is set; none is set here. Confirm.
    subsample=0.9,
)

In [8]:
# Repeated stratified K-fold cross-validation for LightGBM.
#
# For each of `num_round` repetitions (LightGBM `seed` = repetition index):
#   * one booster is trained per fold, early-stopped on the held-out fold;
#   * held-out (out-of-fold, OOF) predictions are stored in `cv_train`;
#   * per-fold test predictions are averaged into `cv_pred`.
# `final_cv_train` / `final_cv_pred` hold SUMS over the completed repetitions;
# divide by the number of completed repetitions (== len(x_score)) to use them.
# `my_score1` / `my_score2` are external helpers (not defined in this notebook).
#
# NOTE(review): a built-in metric is logged next to `my_score2`; per the LightGBM
# docs early stopping checks every metric. Confirm that is the intent.
x_score = []  # overall OOF score of each completed repetition
final_cv_train = np.zeros(len(y))
final_cv_pred = np.zeros(len(test_id))

for s in range(num_round):
    cv_train = np.zeros(len(train_id))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    if cv_only:
        best_trees = []
        fold_scores = []

        for train_idx, val_idx in kfold.split(X, y):
            X_train, X_validate = X[train_idx, :], X[val_idx, :]
            label_train, label_validate = y[train_idx], y[val_idx]

            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid,
                             feval=my_score2, verbose_eval=100,
                             early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)

            # Request the early-stopped iteration explicitly for BOTH predictions so
            # the OOF scores describe exactly the model used for the test
            # predictions, independent of the library's default for `num_iteration`.
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[val_idx] = bst.predict(X_validate, num_iteration=bst.best_iteration)

            fold_scores.append(my_score1(label_validate, cv_train[val_idx]))

        # Average over the folds that actually ran (equals NFOLDS for this splitter).
        cv_pred /= len(best_trees)
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        # Score the full OOF vector once and reuse it for logging and bookkeeping.
        round_score = my_score1(y, cv_train)
        x_score.append(round_score)

        print("cv score:", round_score)
        print("Average score so far:", my_score1(y, final_cv_train / (s + 1.)), "Round:", s+1)
        print(fold_scores)
        print("Best trees:", best_trees, "There average:", np.mean(best_trees))

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0244259	valid_0's score: 0.558062
[200]	valid_0's binary_logloss: 0.0227919	valid_0's score: 0.590474
[300]	valid_0's binary_logloss: 0.0221914	valid_0's score: 0.609485
[400]	valid_0's binary_logloss: 0.0218556	valid_0's score: 0.614763
[500]	valid_0's binary_logloss: 0.0215348	valid_0's score: 0.61934
[600]	valid_0's binary_logloss: 0.0213655	valid_0's score: 0.622722
[700]	valid_0's binary_logloss: 0.0212159	valid_0's score: 0.628247
[800]	valid_0's binary_logloss: 0.0210865	valid_0's score: 0.633691
[900]	valid_0's binary_logloss: 0.0209925	valid_0's score: 0.637113
[1000]	valid_0's binary_logloss: 0.0208943	valid_0's score: 0.636907
Early stopping, best iteration is:
[976]	valid_0's binary_logloss: 0.0209257	valid_0's score: 0.638722
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0245212	valid_0's score: 0.555794
[200]	valid_0's binary_

KeyboardInterrupt: 

In [11]:
# Write the averaged test predictions as the submission file.
# `final_cv_pred` is a SUM over the CV repetitions that completed, and `x_score`
# gets one entry per completed repetition. Dividing by that count rather than by
# the configured `num_round` keeps the average correct when the CV cell was
# interrupted. With a full run the two are equal.
completed_rounds = len(x_score)
if completed_rounds == 0:
    raise RuntimeError(
        "No CV repetition completed: final_cv_pred is still all zeros, "
        "refusing to write a submission."
    )

(
    pd.DataFrame({'id': test_id, 'score': final_cv_pred / float(completed_rounds)})
    .to_csv('submission/lgbm_test_0001.csv', index=False)
)
# The OOF train predictions (`final_cv_train`, keyed by `train_id`) can be saved
# the same way if they are needed for stacking.

### Cross validation with partial data

In [2]:
# Reload the raw tables for the weighted / partial-data run. `date` is parsed as
# a datetime column and the other dtypes come from the `dtype` mapping.
train = pd.read_csv(
    "atec_anti_fraud_train.csv",
    parse_dates=['date'],
    dtype=dtype,
)
test = pd.read_csv(
    "atec_anti_fraud_test_a.csv",
    parse_dates=['date'],
    dtype=dtype,
)

# `process_dates` is an external helper (not defined in this notebook); each
# frame is replaced by whatever it returns for the 'date' column.
train, test = process_dates(train, 'date'), process_dates(test, 'date')

In [3]:
# Drop rows with label == -1, then attach the per-id sample weights.
# The saved 'weights' object is used as a frame with `id`, `label` and `weight`
# columns; its `label` column is dropped so the merge does not duplicate it.
train = (
    train.loc[train['label'] != -1]
    .merge(load_obj('weights').drop('label', axis=1), how='inner', on='id')
)
# Weights as a Series aligned row-for-row with the merged `train`.
weights = train['weight']

In [4]:
# Keep the half of the rows with the largest weights. `argsort` is ascending, so
# slicing from the midpoint selects the upper half, and the kept rows end up
# ordered by increasing weight.
# NOTE: overwrites `train`, so this cell is not idempotent.
train = (
    train
    .reset_index(drop=True)
    .iloc[weights.values.argsort()[len(weights) // 2:], :]
)

In [5]:
# Re-derive the weights so they line up with the reduced, reordered `train`.
weights = train['weight']

In [6]:
# Stratified 5-fold splitter without shuffling (the seeded shuffle used in the
# plain CV run is deliberately switched off here).
# NOTE(review): `train` was reordered by ascending weight earlier, so with
# shuffling off the fold membership follows that ordering. Confirm this is intended.
NFOLDS = 5
kfold = SKF(n_splits=NFOLDS, shuffle=False)

In [7]:
# Ids are kept as Series so predictions can be written back next to them.
train_id, test_id = train['id'], test['id']

# Targets and features as NumPy arrays. id/date/label and the sample weight are
# excluded from the training features; id/date from the test features.
y = train['label'].values
X = train.drop(columns=['id', 'date', 'label', 'weight']).values
X_test = test.drop(columns=['id', 'date']).values

In [8]:
# Sanity check: train features, train labels and test features, in that order.
print(*(arr.shape for arr in (X, y, X_test)))

(495003, 301) (495003,) (491668, 301)


In [11]:
# --- run control ---
cv_only = True   # the CV cell only does work when this is truthy
num_round = 1    # number of CV repetitions (one LightGBM seed per repetition)

# --- LightGBM settings ---
learning_rate = 0.1
num_leaves = 15
# NOTE(review): `min_data_in_leaf` is assigned but never placed in `params`;
# "min_child_samples": 10 is what is actually passed (per the LightGBM docs the
# two names are aliases). Confirm which value is intended.
min_data_in_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000  # upper bound; the CV cell trains with early stopping

params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    max_bin=256,
    feature_fraction=feature_fraction,
    verbosity=0,
    # NOTE(review): per the LightGBM docs `drop_rate` and `max_drop` are only used
    # by boosting_type="dart", so they look inert with "gbdt". Confirm.
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    # NOTE(review): per the LightGBM docs `subsample` (bagging_fraction) only takes
    # effect when a bagging frequency > 0 is set; none is set here. Confirm.
    subsample=0.9,
)

In [12]:
# Repeated stratified K-fold cross-validation for LightGBM with sample weights.
#
# For each of `num_round` repetitions (LightGBM `seed` = repetition index):
#   * one booster is trained per fold on weighted rows, early-stopped on the
#     (also weighted) held-out fold;
#   * held-out (out-of-fold, OOF) predictions are stored in `cv_train`;
#   * per-fold test predictions are averaged into `cv_pred`.
# `final_cv_train` / `final_cv_pred` hold SUMS over the completed repetitions;
# divide by the number of completed repetitions (== len(x_score)) to use them.
# `my_score1` / `my_score2` are external helpers (not defined in this notebook);
# `my_score1` is called without the sample weights here.
#
# NOTE(review): a built-in metric is logged next to `my_score2`; per the LightGBM
# docs early stopping checks every metric. Confirm that is the intent.
x_score = []  # overall OOF score of each completed repetition
final_cv_train = np.zeros(len(y))
final_cv_pred = np.zeros(len(test_id))

# Positional weight array, indexed with the same fold indices as X and y.
weight_values = weights.values

for s in range(num_round):
    cv_train = np.zeros(len(train_id))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    if cv_only:
        best_trees = []
        fold_scores = []

        for train_idx, val_idx in kfold.split(X, y):
            X_train, X_validate = X[train_idx, :], X[val_idx, :]
            label_train, label_validate = y[train_idx], y[val_idx]
            train_weights = weight_values[train_idx]
            valid_weights = weight_values[val_idx]

            dtrain = lgbm.Dataset(X_train, label_train, weight=train_weights)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain, weight=valid_weights)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid,
                             feval=my_score2, verbose_eval=100,
                             early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)

            # Request the early-stopped iteration explicitly for BOTH predictions so
            # the OOF scores describe exactly the model used for the test
            # predictions, independent of the library's default for `num_iteration`.
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[val_idx] = bst.predict(X_validate, num_iteration=bst.best_iteration)

            fold_scores.append(my_score1(label_validate, cv_train[val_idx]))

        # Average over the folds that actually ran (equals NFOLDS for this splitter).
        cv_pred /= len(best_trees)
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        # Score the full OOF vector once and reuse it for logging and bookkeeping.
        round_score = my_score1(y, cv_train)
        x_score.append(round_score)

        print("cv score:", round_score)
        print("Average score so far:", my_score1(y, final_cv_train / (s + 1.)), "Round:", s+1)
        print(fold_scores)
        print("Best trees:", best_trees, "There average:", np.mean(best_trees))

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.030281	valid_0's score: 0.599034
[200]	valid_0's binary_logloss: 0.0283135	valid_0's score: 0.624082
[300]	valid_0's binary_logloss: 0.0277794	valid_0's score: 0.623632
[400]	valid_0's binary_logloss: 0.0273515	valid_0's score: 0.639279
[500]	valid_0's binary_logloss: 0.0272308	valid_0's score: 0.639601
[600]	valid_0's binary_logloss: 0.0271221	valid_0's score: 0.640567
Early stopping, best iteration is:
[521]	valid_0's binary_logloss: 0.0272182	valid_0's score: 0.644108
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0334041	valid_0's score: 0.541146
[200]	valid_0's binary_logloss: 0.0308192	valid_0's score: 0.569414
[300]	valid_0's binary_logloss: 0.0298392	valid_0's score: 0.586542
[400]	valid_0's binary_logloss: 0.0292516	valid_0's score: 0.592981
[500]	valid_0's binary_logloss: 0.0289462	valid_0's score: 0.607147
Early stopping, best ite

In [13]:
# Write the averaged test predictions of the weighted run as the submission file.
# `final_cv_pred` is a SUM over the CV repetitions that completed, and `x_score`
# gets one entry per completed repetition. Dividing by that count rather than by
# the configured `num_round` keeps the average correct when the CV cell was
# interrupted. With a full run the two are equal.
completed_rounds = len(x_score)
if completed_rounds == 0:
    raise RuntimeError(
        "No CV repetition completed: final_cv_pred is still all zeros, "
        "refusing to write a submission."
    )

(
    pd.DataFrame({'id': test_id, 'score': final_cv_pred / float(completed_rounds)})
    .to_csv('submission/lgbm_test_0002.csv', index=False)
)
# The OOF train predictions (`final_cv_train`, keyed by `train_id`) can be saved
# the same way if they are needed for stacking.

In [19]:
# The leftover interactive help lookup for `lgbm.Dataset` was removed so that
# Restart & Run All does not open a pager. See the LightGBM Python API docs for
# the `Dataset` arguments used above (`weight`, `reference`).