In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# import missingno as msno
import pickle
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from scipy import stats
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn import metrics
# from fancyimpute import *

%matplotlib inline

In [2]:
def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
def display_all(df):
    with pd.option_context("display.max_rows", 1000):
        with pd.option_context("display.max_columns", 1000):
            display(df)

def batch_save(train_x, train_y, valid_x, valid_y, test, postfix):
    train_x.reset_index().to_feather("tmp/train_x_{}".format(postfix))
    train_y.reset_index().to_feather("tmp/train_y_{}".format(postfix))
    valid_x.reset_index().to_feather("tmp/valid_x_{}".format(postfix))
    valid_y.reset_index().to_feather("tmp/valid_y_{}".format(postfix))
    test.reset_index().to_feather("tmp/test_{}".format(postfix))
    
def batch_load(postfix):
    train_x = pd.read_feather("tmp/train_x_{}".format(postfix))
    train_y = pd.read_feather("tmp/train_y_{}".format(postfix))
    valid_x = pd.read_feather("tmp/valid_x_{}".format(postfix))
    valid_y = pd.read_feather("tmp/valid_y_{}".format(postfix))
    return train_x, train_y, valid_x, valid_y

def my_roc(y_true, y_prob):
    if isinstance(y_true,pd.core.series.Series):
        y_true = np.array(y_true.tolist())
    if isinstance(y_true,list):
        y_true = np.array(y_true)
    sort_index = np.argsort(y_prob)[::-1]
    y_prob = y_prob[sort_index]
    y_true = y_true[sort_index]
    num_p = y_true.sum()
    num_n = len(y_true) - num_p
    fp = 0
    tp = 0
    fps = []
    tps = []
    prob_prev = -99
    i = 0
    while i < len(y_true):
        if y_prob[i]!=prob_prev:
            fps.append(fp/num_n)
            tps.append(tp/num_p)
            prob_prev=y_prob[i]
        if y_true[i]==1:
            tp+=1
        else:
            fp+=1
        i+=1
    fps.append(fp/num_n)
    tps.append(tp/num_p)
    return np.array(fps), np.array(tps)

def my_score3(predictions, xtrain): ##Adapted from SKlearn, conservative (actual should be higher)
    ground_truth = xtrain.get_label()
    fpr,tpr = my_roc(ground_truth, predictions)
#     plt.scatter(fpr, tpr)
#     plt.show()
    tpr1 = tpr[(fpr>=0.001).argmax()-1]
    tpr2 = tpr[(fpr>=0.005).argmax()-1] 
    tpr3 = tpr[(fpr>=0.01).argmax()-1]
    return 'score', 0.4 * tpr1 + 0.3 * tpr2 + 0.3* tpr3

def get_ratio(predictions, xtrain):
    ratio_predict = (predictions>0.5).sum()/predictions.shape[0]*100
    # ratio_true = xtrain.get_label().sum()/xtrain.get_label().shape[0]*100
    return 'score', ratio_predict

def norm_standardize(df, start=0):
    for col in df.columns[start:]:
#         avg = df[col].mean()
#         std = df[col].std(ddof=0)
#         if std != 0:
#             df[col] = (df[col]-avg)/std
#         else:
#             print(col)
        a = df[col]
        z = a
        z[~np.isnan(a)] = zscore(a[~np.isnan(a)])
        df[col] = z
            
def norm_maxmin(df, start=0):
    for col in df.columns[start:]:
        df[col]=(df[col]-df[col].min())/(df[col].max()-df[col].min())

In [3]:
dtype = load_obj('dict_dtype')

my_dict = load_obj('my_dict')

**** Load the training data and test data ****

In [4]:
data = pd.read_csv("atec_anti_fraud_train.csv",parse_dates=['date'], dtype = dtype)
test = pd.read_csv("atec_anti_fraud_test_a.csv",parse_dates=['date'], dtype = dtype)

In [None]:
df_missing_ratio = load_obj('df_missing_ratio')
# display_all(df_missing_ratio)

selected_cols = ['f'+str(item) for item in df_missing_ratio[df_missing_ratio['positive_missing_ratio']<0.1].index.tolist()]
all_nan_cols = ['f'+str(item) for item in df_missing_ratio[df_missing_ratio['positive_missing_ratio']>0.9].index.tolist()]

# # use the columns with no or few missing values
# data = data.drop(all_nan_cols, axis=1)
# test = test.drop(all_nan_cols, axis=1)

**** Perform one hot encoding for the columns with no more than 10 unique values ****

In [None]:
# one hot encoding for the columns with no more than 10 unique values
for col in data.columns[3:]:
    data_unique = data[col].unique()
    test_unique = test[col].unique()
    if data_unique[~np.isnan(data_unique)].min() == test_unique[~np.isnan(test_unique)].min() and \
    data_unique[~np.isnan(data_unique)].max() == test_unique[~np.isnan(test_unique)].max() and \
    data_unique.shape[0] == test_unique.shape[0] and \
    data_unique.shape[0] <= 10:
        data[col].fillna(-1.0)
        test[col].fillna(-1.0)
        for num in data[col].unique():
            new_col = '{}={}'.format(col, num)
            data[new_col] = data[col].apply(lambda x: np.isnan(x) if np.isnan(num) else x==num)
            test[new_col] = test[col].apply(lambda x: np.isnan(x) if np.isnan(num) else x==num)
            data.drop([col], axis=1)
            test.drop([col], axis=1)

**** save the file locally to save the time of preprocessing ****

In [None]:
save_obj(data, 'train_onehot')
save_obj(test, 'test_onehot')

**** load the onehot encoding preprocessed dataset ****

In [None]:
# load the training set and test set
data = load_obj('train_onehot')
test = load_obj('test_onehot')

**** Perform normalization ****

In [None]:
# normalization with maxmin
norm_maxmin(data, 3)
norm_maxmin(test, 2)

In [None]:
# normalization with zscore
norm_standardize(data, 3)
norm_standardize(test, 2)

**** Sort the training data and remove the unlabeled data ****

In [None]:
# temporially ignore the rows without labels
data.sort_values('date',inplace=True)
# unlabeled = data[data['label']==-1]

In [None]:
# with the prediction on the unlabeled training data, set the labels for unlabeled data
data.loc[data['label']==-1,'label'] = pd.Series((pred_xgunlabeled>0.5).astype(int), name='label', index=data.loc[data['label']==-1,'label'].index)

In [5]:
data = data[data['label']!=-1]
data.reset_index(drop=True, inplace=True)

**** calculate the weight of each row with trained RF model ****

In [None]:
rf_model = load_obj('train_test_shift')
weights = rf_model.predict_proba(data.fillna(-1).iloc[:,3:].values)[:,1:]

print(weights.shape, data.shape)

In [6]:
weights = load_obj('weights')

weights.weight.values.argsort()

array([     0, 605741, 605743, ..., 707109, 162794, 879508], dtype=int64)

In [None]:
train = data.iloc[weights.weight.values.argsort()[weights.shape[0]//10-weights.shape[0]:],:]
valid = data.iloc[weights.weight.values.argsort()[:weights.shape[0]//10],:]

In [None]:
np.count_nonzero(weights)

In [None]:
weights.shape

In [None]:
weights = 1./weights - 1

**** incrementally train model ****

In [None]:
num_fraction = 10
fraction_size = data.shape[0]//10

x_batches = []
y_batches = []
unlabeled = []
for i in range(num_fraction):
    if i!=num_fraction-1:
        data_portion = data.iloc[i*fraction_size:(i+1)*fraction_size,:]
    else:
        data_portion = data.iloc[i*fraction_size:,:]
    unlabeled.append(data_portion[data_portion['label']==-1].iloc[:,3:])
    x_batches.append(data_portion[data_portion['label']!=-1].iloc[:,3:])
    y_batches.append(data_portion[data_portion['label']!=-1]['label'])

In [None]:
# initial model
# set up the parameters
params = {'max_depth': 8, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
params['nthread'] = 4
params['eval_metric'] = ['error', 'auc']
params["scale_pos_weight"] = 5
num_rounds = 50
early_stopping_rounds = 20

# set up the random seed for testing
params["seed"] = 10

xgtrain = xgb.DMatrix(x_batches[0].values, label=y_batches[0].values)
xgb_model = xgb.train(params, xgtrain,\
                      num_rounds,\
                      [(xgtrain, 'current'),\
                       (xgb.DMatrix(x_batches[1].values, label=y_batches[1].values), 'next')],\
                      # feval=get_ratio,\
                      early_stopping_rounds=early_stopping_rounds)

In [None]:
# predict the unlabeled data of the first fraction
pred_unlabeled = xgb_model.predict(xgb.DMatrix(unlabeled[0].values))
unlabeled[0]['label'] = pd.Series((pred_unlabeled>0.5).astype(int), name='label',\
                                      index = unlabeled[0].index)

In [None]:
# incrementally train the model
for fraction_id in range(1,num_fraction):
    print('=========================================================================')
    print('=====       '+str(num_fraction)+' fractions in total, training on the fraction {}'.format(fraction_id+1)+'       =====')
    print('=========================================================================')
#     xgtrain = xgb.DMatrix(x_batches[fraction_id].append(unlabeled[fraction_id-1].iloc[:,:-1]).values,\
#                           label=y_batches[fraction_id].append(unlabeled[fraction_id-1]['label']).values)
    xgtrain = xgb.DMatrix(x_batches[fraction_id].values,\
                          label=y_batches[fraction_id].values)
    if fraction_id != num_fraction-1:
        xgb_model = xgb.train(params, xgtrain,\
            num_rounds,\
            [(xgtrain, 'current'),\
            (xgb.DMatrix(x_batches[fraction_id+1].values, label=y_batches[fraction_id+1].values), 'next')],\
            # feval=my_score3,\
            early_stopping_rounds=early_stopping_rounds,\
            xgb_model = xgb_model)
#         pred_unlabeled = xgb_model.predict(xgb.DMatrix(unlabeled[fraction_id].values))
#         unlabeled[fraction_id]['label'] = pd.Series((pred_unlabeled>0.5).astype(int), name='label',\
#                                                     index = unlabeled[fraction_id].index)
    else:
        num_rounds = 50
        xgb_model = xgb.train(params, xgtrain,\
            num_rounds,\
            [(xgtrain, 'current')],\
            # feval=my_score3,\
            early_stopping_rounds=early_stopping_rounds,\
            xgb_model = xgb_model,)

**** Create validation set and training set ****

In [9]:
train = data.iloc[:len(data) * 8 // 10]
valid = data.iloc[len(data) * 8 // 10:]

In [10]:
train_weights = weights.weight.values[:len(data) * 8 // 10]
train_weights /= np.mean(train_weights) # Normalizing the weights

In [11]:
train_y = train[['label']]
train_x = train.iloc[:,3:]
valid_y = valid[['label']]
valid_x = valid.iloc[:,3:]

In [12]:
xgtrain = xgb.DMatrix(train_x.values, weight=train_weights, label=train_y.values)
xgval_1 = xgb.DMatrix(valid_x.iloc[:valid_x.shape[0] // 2,:].values,\
                      label=valid_y.iloc[:valid_x.shape[0] // 2,:].values)
xgval_2 = xgb.DMatrix(valid_x.iloc[valid_x.shape[0] // 2:,:].values,\
                      label=valid_y.iloc[valid_x.shape[0] // 2:,:].values)
xgval = xgb.DMatrix(valid_x.values,\
                      label=valid_y.iloc[:,:].values)
evallist = [(xgtrain, 'train'), (xgval_1, 'val_1'), (xgval_2, 'val_2'), (xgval, 'val')]

In [None]:
# normal xgtrain and xgval
xgtrain = xgb.DMatrix(train_x.values, label=train_y.values)
xgval_1 = xgb.DMatrix(valid_x.iloc[:valid_x.shape[0] // 2,:].values,\
                      label=valid_y.iloc[:valid_x.shape[0] // 2,:].values)
xgval_2 = xgb.DMatrix(valid_x.iloc[valid_x.shape[0] // 2:,:].values,\
                      label=valid_y.iloc[valid_x.shape[0] // 2:,:].values)
xgval = xgb.DMatrix(valid_x.values, label=valid_y.iloc[:,:].values)
evallist = [(xgtrain, 'train'), (xgval_1, 'val_1'), (xgval_2, 'val_2'), (xgval, 'val')]

**** Train the model ****

In [13]:
# set up the parameters
params = {'max_depth': 6, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
params['nthread'] = 4
params['eval_metric'] = ['logloss', 'auc']
params["colsample_bytree "] = 0.5
params["scale_pos_weight"] = 2
num_rounds = 300
early_stopping_rounds = 1000

# set up the random seed for testing
#params["seed"] = 8

In [None]:
%time xgb_model = xgb.train(params, xgtrain, num_rounds, evallist,\
    feval=my_score3, early_stopping_rounds=early_stopping_rounds)#, xgb_model = xgb_model)

[0]	train-logloss:0.602631	train-auc:0.92045	val_1-logloss:0.602057	val_1-auc:0.930811	val_2-logloss:0.602043	val_2-auc:0.926628	val-logloss:0.60205	val-auc:0.928731	train-score:0.398215	val_1-score:0.387134	val_2-score:0.415199	val-score:0.401028
Multiple eval metrics have been passed: 'val-score' will be used for early stopping.

Will train until val-score hasn't improved in 1000 rounds.
[1]	train-logloss:0.528425	train-auc:0.93055	val_1-logloss:0.527373	val_1-auc:0.937868	val_2-logloss:0.527334	val_2-auc:0.935917	val-logloss:0.527354	val-auc:0.936889	train-score:0.428844	val_1-score:0.414658	val_2-score:0.435797	val-score:0.431373
[2]	train-logloss:0.466518	train-auc:0.931526	val_1-logloss:0.465084	val_1-auc:0.937906	val_2-logloss:0.465028	val_2-auc:0.936388	val-logloss:0.465056	val-auc:0.937143	train-score:0.432281	val_1-score:0.412296	val_2-score:0.437209	val-score:0.429071
[3]	train-logloss:0.414196	train-auc:0.932817	val_1-logloss:0.412417	val_1-auc:0.938687	val_2-logloss:0.4124

**** save or load the model ****

In [None]:
# save the model
xgb_model.save_model('model_log/0018.model')
# dump model with feature map
xgb_model.dump_model('model_log/dumpraw0018.txt')

In [None]:
# load the model
xgb_model = xgb.Booster({'nthread': 4})  # init model
xgb_model.load_model('model_log/0017.model')  # load model

**** train the existing model on the validation set ****

In [None]:
xgval = xgb.DMatrix(valid_x.values, label=valid_y.values)

In [None]:
%time xgb_model = xgb.train(params, xgval, num_rounds, [(xgval, 'validation')], feval=my_score3, early_stopping_rounds=early_stopping_rounds, xgb_model = xgb_model)

**** predict on the testset ****

In [None]:
# load test data
test_whole = pd.read_feather("tmp/test_native")

In [None]:
xgtest = xgb.DMatrix(test.iloc[:,2:].values)

# make predictions
preds = xgb_model.predict(xgtest)#, ntree_limit=xgb_model.best_ntree_limit)

res = pd.concat([test.id, pd.Series(list(preds), name='score')], axis=1)

res.to_csv("submission/0018.csv", index=False)

In [None]:
# the positive ratio of the test data
print('Ratio of positive label in unlabeled data: {}%'.format((preds>0.5).sum()/preds.shape[0]*100))

**** predict on the unlabeled training set ****

In [None]:
xgunlabeled = xgb.DMatrix(unlabeled.iloc[:,3:].values)

In [None]:
unlabeled.reset_index(drop=True, inplace=True)

In [None]:
pred_xgunlabeled = xgb_model.predict(xgunlabeled)

In [None]:
# set the labels for the unlabeled training data
unlabeled['label'] = pd.Series((pred_xgunlabeled>0.5).astype(int), name='label')

In [None]:
res = pd.concat([unlabeled.id, pd.Series(list(pred_xgunlabeled), name='score')], axis=1)
res.to_csv("Yabin_unlabeled0011.csv", index=False)

In [None]:
# the positive ratio of the unlabeled data
print('Ratio of positive label in unlabeled data: {}%'.format((pred_xgunlabeled>0.5).sum()/res.shape[0]*100))