In [7]:
# coding: utf-8
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
from sklearn import metrics
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.model_selection import train_test_split
import gc
import os
import multiprocessing
warnings.filterwarnings('ignore')
os.chdir(r"/content/drive/My Drive/Colab Notebooks/Datacastle/code")
print(multiprocessing.cpu_count())


4


In [8]:
def load_dataset(DATA_PATH):
    train_label = pd.read_csv(DATA_PATH+'train_label.csv')
    train_base = pd.read_csv(DATA_PATH+'train_base.csv')
    test_base = pd.read_csv(DATA_PATH+'test_a_base.csv')

    train_op = pd.read_csv(DATA_PATH+'train_op.csv')
    train_trans = pd.read_csv(DATA_PATH+'train_trans.csv')
    test_op = pd.read_csv(DATA_PATH+'test_a_op.csv')
    test_trans = pd.read_csv(DATA_PATH+'test_a_trans.csv')

    return train_label, train_base, test_base, train_op, train_trans, test_op, test_trans

In [9]:
def transform_time(x):
    day = int(x.split(' ')[0])
    hour = int(x.split(' ')[2].split('.')[0].split(':')[0])
    minute = int(x.split(' ')[2].split('.')[0].split(':')[1])
    second = int(x.split(' ')[2].split('.')[0].split(':')[2])
    return 86400*day+3600*hour+60*minute+second


In [10]:
def data_preprocess(DATA_PATH):
    train_label, train_base, test_base, train_op, train_trans, test_op, test_trans = load_dataset(DATA_PATH=DATA_PATH)
    # 拼接数据
    train_df = train_base.copy()
    test_df = test_base.copy()
    train_df = train_label.merge(train_df, on=['user'], how='left')
    del train_base, test_base

    op_df = pd.concat([train_op, test_op], axis=0, ignore_index=True)
    trans_df = pd.concat([train_trans, test_trans], axis=0, ignore_index=True)
    data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    del train_op, test_op, train_df, test_df
    # 时间维度的处理
    op_df['days_diff'] = op_df['tm_diff'].apply(lambda x: int(x.split(' ')[0]))
    trans_df['days_diff'] = trans_df['tm_diff'].apply(lambda x: int(x.split(' ')[0]))
    op_df['timestamp'] = op_df['tm_diff'].apply(lambda x: transform_time(x))
    trans_df['timestamp'] = trans_df['tm_diff'].apply(lambda x: transform_time(x))
    op_df['hour'] = op_df['tm_diff'].apply(lambda x: int(x.split(' ')[2].split('.')[0].split(':')[0]))
    trans_df['hour'] = trans_df['tm_diff'].apply(lambda x: int(x.split(' ')[2].split('.')[0].split(':')[0]))
    trans_df['week'] = trans_df['days_diff'].apply(lambda x: x % 7)
    # 排序
    trans_df = trans_df.sort_values(by=['user', 'timestamp'])
    op_df = op_df.sort_values(by=['user', 'timestamp'])
    trans_df.reset_index(inplace=True, drop=True)
    op_df.reset_index(inplace=True, drop=True)

    gc.collect()
    return data, op_df, trans_df


In [11]:
#用户单特征的统计特征
def gen_user_single_features(df,col):
    group_df = df.groupby(['user']).agg(
         mean=(col,'mean'),
         std=(col,'std'),
         max=(col,'max'),
         min=(col,'min'),
         sum=(col,'sum'),
         median=(col,'median'),
         count=(col,'count'),
         skew=(col,'skew'),
         var=(col,'var'),
        ).reset_index().rename(columns={"mean":'user_{}_mean'.format(col),
                        "std":'user_{}_std'.format(col),
                        "max":'user_{}_max'.format(col),
                        "min":'user_{}_min'.format(window),
                        "sum":'user_{}_sum'.format(col),
                        "median":'user_{}_median'.format(col),
                        "count":'user_{}_count'.format(col),
                        "skew":'user_{}_skew'.format(col),
                        "var":'user_{}_var'.format(col),
                        })
    return group_df

In [12]:
#trans#用户交易消费的特征
def gen_user_amount_features(df):
    group_df = df.groupby(['user']).agg(
         user_amount_mean=('amount','mean'),
         user_amount_std=('amount','std'),
         user_amount_max=('amount','max'),
         user_amount_min=('amount','min'),
         user_amount_sum=('amount','sum'),
         user_amount_med=('amount','median'),
         user_amount_cnt=('amount','count'),
         user_amount_skew=('amount','skew'),
         user_amount_var=('amount','var'),
        ).reset_index()
    return group_df


In [13]:
#trans 用户在某平台或者用某ip的消费额特征
def gen_user_group_amount_features(df, value):
    group_df = df.pivot_table(index='user',
                              columns=value,
                              values='amount',
                              dropna=False,
                              aggfunc=['count','sum','median',"skew",'var','mean','std','max','min']).fillna(0)
    group_df.columns = ['user_{}_{}_amount_{}'.format(value, f[1], f[0]) for f in group_df.columns]
    group_df.reset_index(inplace=True)

    return group_df


In [14]:
#用户在某个时间段内的消费额特征 大于天数的窗口
def gen_user_window_amount_features(df, window):
    group_df = df[df['days_diff']>window].groupby('user').agg(
        mean=('amount','mean'),
        std=('amount','std'),
        max=('amount','max'),
        min=('amount','min'),
        median=('amount','median'),
        count=('amount','count'),
        skew=('amount','skew'),
        var=('amount','var'),
        ).reset_index().rename(columns={"mean":'user_amount_mean_{}d'.format(window),
                        "std":'user_amount_std_{}d'.format(window),
                        "max":'user_amount_max_{}d'.format(window),
                        "min":'user_amount_min_{}d'.format(window),
                        "sum":'user_amount_sum_{}d'.format(window),
                        "median":'user_amount_median_{}d'.format(window),
                        "count":'user_amount_count_{}d'.format(window),
                        "skew":'user_amount_skew_{}d'.format(window),
                        "var":'user_amount_var_{}d'.format(window),
                        })
    return group_df


In [15]:
#用户在某个时间段内的消费额特征 大于小时的窗口
def gen_user_hourwindow_amount_features(df, window):
    group_df = df[df['hour']>=window].groupby('user').agg(
        mean=('amount','mean'),
        count=('amount','count'),
        ).reset_index().rename(columns={"mean":'user_amount_mean_{}h'.format(window),
                        "count":'user_amount_count_{}h'.format(window),
                        })
    return group_df

In [16]:
#trans用户  ['days_diff', 'platform', 'tunnel_in', 'tunnel_out', 'type1', 'type2', 'ip', 'ip_3']交易用到各字段的类数
def gen_user_nunique_features(df, value, prefix):
    group_df = df.groupby(['user']).agg(
        nunique=(value,'nunique')
    ).reset_index().rename(columns={"nunique":'user_{}_{}_nuniq'.format(prefix, value)})
    return group_df


In [17]:
#trans用户无IP 这个特征贼奇怪 怎么会交易没有ip呢
def gen_user_null_features(df, value, prefix):
    df['is_null'] = 0
    df.loc[df[value].isnull(), 'is_null'] = 1

    group_df = df.groupby(['user'])['is_null'].agg(sum=('is_null','sum'),
                            mean=('is_null','mean')).reset_index().rename(columns={"sum":'user_{}_{}_null_cnt'.format(prefix, value),
                                                        "mean":'user_{}_{}_null_ratio'.format(prefix, value)})
    return group_df


In [18]:
# op op_mode op_tyep op_device net_type channel
def gen_user_tfidf_features(df, value):
    #填充缺失值
    df[value].replace(' ', np.nan, inplace=True)
    df[value] = df[value].astype(str)
    df[value].fillna('-1', inplace=True)

    #
    group_df = df.groupby(['user']).apply(lambda x: x[value].tolist()).reset_index()
    group_df.columns = ['user', 'list']
    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))
    enc_vec = TfidfVectorizer()
    tfidf_vec = enc_vec.fit_transform(group_df['list'])
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2020)
    vec_svd = svd_enc.fit_transform(tfidf_vec)
    vec_svd = pd.DataFrame(vec_svd)
    vec_svd.columns = ['svd_tfidf_{}_{}'.format(value, i) for i in range(10)]
    group_df = pd.concat([group_df, vec_svd], axis=1)
    del group_df['list']
    return group_df


In [19]:
# op op_mode op_tyep op_device net_type channel
def gen_user_countvec_features(df, value):
    #填充缺失值
    df[value].replace(' ', np.nan, inplace=True)
    df[value] = df[value].astype(str)
    df[value].fillna('-1', inplace=True)
    #
    group_df = df.groupby(['user']).apply(lambda x: x[value].tolist()).reset_index()
    group_df.columns = ['user', 'list']
    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))
    enc_vec = CountVectorizer()
    tfidf_vec = enc_vec.fit_transform(group_df['list'])
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2020)
    vec_svd = svd_enc.fit_transform(tfidf_vec)
    vec_svd = pd.DataFrame(vec_svd)
    vec_svd.columns = ['svd_countvec_{}_{}'.format(value, i) for i in range(10)]
    group_df = pd.concat([group_df, vec_svd], axis=1)
    del group_df['list']
    return group_df


In [20]:
#target_encode_cols = ['province', 'city', 'city_level', 'city_balance_avg']
#target_encode
def kfold_stats_feature(train, test, feats, k):
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)  # 这里最好和后面模型的K折交叉验证保持一致

    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
        train.loc[val_idx, 'fold'] = fold_

    kfold_features = []
    for feat in feats:
        nums_columns = ['label']
        #只有一个还用for?
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()

                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)
                
                # fillna
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)

        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # fillna
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test


In [21]:

def gen_features(df, op, trans):
    #df.drop(['service3_level'], axis=1, inplace=True)

    # base
    df['product7_fail_ratio'] = df['product7_fail_cnt'] / df['product7_cnt']
    df['city_count'] = df.groupby(['city'])['user'].transform('count')
    df['province_count'] = df.groupby(['province'])['user'].transform('count')
    df['op_cnt_avg'] = (df["op1_cnt"]+df["op2_cnt"])/df["using_time"]
    df['ip_cnt_avg'] = df["ip_cnt"]/df["using_time"]
    # trans
    df = df.merge(gen_user_amount_features(trans), on=['user'], how='left')
    for col in tqdm(['days_diff','platform', 'tunnel_in', 'tunnel_out', 'type1', 'type2', 'ip', 'ip_3']):
        df = df.merge(gen_user_nunique_features(df=trans, value=col, prefix='trans'), on=['user'], how='left')
    df['user_amount_per_days'] = df['user_amount_sum'] / df['user_trans_days_diff_nuniq']
    df['user_amount_per_cnt'] = df['user_amount_sum'] / df['user_amount_cnt']
    print("trans group_amount_features") 
    df = df.merge(gen_user_group_amount_features(df=trans, value='platform'), on=['user'], how='left')
    df = df.merge(gen_user_group_amount_features(df=trans, value='type1'), on=['user'], how='left')
    df = df.merge(gen_user_group_amount_features(df=trans, value='type2'), on=['user'], how='left')
    df = df.merge(gen_user_group_amount_features(df=trans, value='tunnel_in'), on=['user'], how='left')
    df = df.merge(gen_user_group_amount_features(df=trans, value='tunnel_out'), on=['user'], how='left')
    print("trans window") 
    df = df.merge(gen_user_window_amount_features(df=trans, window=27), on=['user'], how='left')
    df = df.merge(gen_user_window_amount_features(df=trans, window=23), on=['user'], how='left')
    df = df.merge(gen_user_window_amount_features(df=trans, window=15), on=['user'], how='left')
    df = df.merge(gen_user_hourwindow_amount_features(df=trans, window=20), on=['user'], how='left')
    df = df.merge(gen_user_hourwindow_amount_features(df=trans, window=15), on=['user'], how='left')
    df = df.merge(gen_user_hourwindow_amount_features(df=trans, window=10), on=['user'], how='left')
    print("trans null_features")
    df = df.merge(gen_user_null_features(df=trans, value='ip', prefix='trans'), on=['user'], how='left')
    # df = df.merge(gen_user_null_features(df=trans, value='ip_3', prefix='trans'), on=['user'], how='left')
    df = df.merge(gen_user_null_features(df=trans, value='tunnel_in', prefix='trans'), on=['user'], how='left')
    df = df.merge(gen_user_null_features(df=trans, value='tunnel_out', prefix='trans'), on=['user'], how='left')

    print("trans type")
    group_df = trans[trans['type1']=='45a1168437c708ff'].groupby(['user']).agg(user_type1_45a1_min_day=('days_diff','min')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')
    group_df = trans[trans['type1']=='45a1168437c708ff'].groupby(['user']).agg(user_type1_45a1_skew_day=('days_diff','skew')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')

    group_df = trans[trans['type1']=='fc9b75cf62ba8b8f'].groupby(['user']).agg(user_type1_fc9b_skew_day=('days_diff','skew')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')
    group_df = trans[trans['type1']=='fc9b75cf62ba8b8f'].groupby(['user']).agg(user_type1_fc9b_mean_day=('days_diff','mean')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')

    group_df = trans[trans['type2']=='2ee592ab06090eb5'].groupby(['user']).agg(user_type1_2ee5_skew_day=('days_diff','skew')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')
    group_df = trans[trans['type2']=='2ee592ab06090eb5'].groupby(['user']).agg(user_type1_2ee5_mean_day=('days_diff','mean')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')

    group_df = trans[trans['type2']=='2bf61669e40ef6b8'].groupby(['user']).agg(user_type1_2bf6_skew_day=('days_diff','skew')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')
    group_df = trans[trans['type2']=='2bf61669e40ef6b8'].groupby(['user']).agg(user_type1_2bf6_mean_day=('days_diff','mean')).reset_index()
    df = df.merge(group_df, on=['user'], how='left')

    print("op")
    # op
    df = df.merge(gen_user_null_features(df=op, value="net_type", prefix='op'), on=['user'], how='left')
    df=df.merge(op.groupby("user").agg(user_op_days_diff_mean=('days_diff','mean')),on=['user'], how='left')
    df=df.merge(op.groupby("user").agg(user_op_hour_mean=('hour','mean')),on=['user'], how='left')

    for col in tqdm(['days_diff','hour', 'op_mode', 'op_type', 'op_device', 'channel', 'ip']):
      df = df.merge(gen_user_nunique_features(df=op, value=col, prefix='op'), on=['user'], how='left')
    df = df.merge(gen_user_tfidf_features(df=op, value='op_mode'), on=['user'], how='left')
    df = df.merge(gen_user_tfidf_features(df=op, value='op_type'), on=['user'], how='left')
    df = df.merge(gen_user_tfidf_features(df=op, value='op_device'), on=['user'], how='left')
    df = df.merge(gen_user_tfidf_features(df=op, value='ip_3'), on=['user'], how='left')

    df = df.merge(gen_user_countvec_features(df=op, value='op_mode'), on=['user'], how='left')
    df = df.merge(gen_user_countvec_features(df=op, value='op_type'), on=['user'], how='left')
    df = df.merge(gen_user_countvec_features(df=op, value='op_device'), on=['user'], how='left')
    df = df.merge(gen_user_countvec_features(df=op, value='ip_3'), on=['user'], how='left')

    # LabelEncoder
    print("LabelEncoder")
    cat_cols = []
    for col in tqdm([f for f in df.select_dtypes('object').columns if f not in ['user']]):
        le = LabelEncoder()
        df[col].fillna('-1', inplace=True)
        df[col] = le.fit_transform(df[col])
        cat_cols.append(col)

    return df


In [22]:
#特别牛的归一化
import numpy as np
from collections import Counter, OrderedDict

class RGN:
    '''Rank Gaussian Normalization'''
    def __init__(self, data=None, precision=np.float32):
        #data: 1D array or list
        self._data = data
        self.precision = precision        
        self._output = None
        if self._data is None:
            self._trafo_map = None
        else:
            self.fit_transform(self._data)

    @property
    def data(self):
        return self._data

    @property
    def output(self):
        return self._output

    @property
    def precision(self):
        return self._precision

    @precision.setter
    def precision(self, p):
        if not isinstance(p, type):
            raise ValueError('precision must be a data type, e.g.: np.float64')
        self._precision = p

    def _RationalApproximation(self, t:float)->float: 
        c = [2.515517, 0.802853, 0.010328]
        d = [1.432788, 0.189269, 0.001308]
        return t - ((c[2]*t + c[1])*t + c[0]) / (((d[2]*t + d[1])*t + d[0])*t + 1.0)

    def _NormalCDFInverse(self, p:float) -> float:

        if (p <= 0.0 or p >= 1.0):
            raise Exception('0<p<1. The value of p was: {}'.format(p))
        if (p < 0.5):
            return -self._RationalApproximation(np.sqrt(-2.0*np.log(p)) )
        return self._RationalApproximation( np.sqrt(-2.0*np.log(1-p)) )

    def _vdErfInvSingle01(self, x:float) -> float:
        if x == 0:
            return 0
        elif x < 0:
            return -self._NormalCDFInverse(-x)*0.7
        else:
            return self._NormalCDFInverse(x)*0.7

    def fit_transform(self, dataIn:list) -> dict:
        self.fit(dataIn)
        return self.transform(dataIn)

    def fit(self, dataIn:list):
        self._data = dataIn
        trafoMap = OrderedDict()
        hist = Counter(dataIn)
        if len(hist) == 0:
            pass
        elif len(hist) == 1:
            key = list(hist.keys())[0]
            trafoMap[key] = 0.0
        elif len(hist) == 2:
            keys = sorted(list(hist.keys()))
            trafoMap[keys[0]] = 0.0
            trafoMap[keys[1]] = 1.0
        else:
            N = cnt = 0
            for it in hist:
                N += hist[it]
            assert (N == len(dataIn))
            mean = 0.0
            for it in sorted(list(hist.keys())):
                rankV = cnt / N
                rankV = rankV * 0.998 + 1e-3
                rankV = self._vdErfInvSingle01(rankV)
                assert(rankV >= -3.0 and rankV <= 3.0)
                mean += hist[it] * rankV
                trafoMap[it] = rankV
                cnt += hist[it]
            mean /= N
            for it in trafoMap:
                trafoMap[it] -= mean
        self._trafo_map = trafoMap
        return 

    def _binary_search(self, keys, val):
        start, end = 0, len(keys)-1
        while start+1 < end:
            mid = (start + end) // 2
            if val < keys[mid]:
                end = mid
            else:
                start = mid
        return keys[start], keys[end]

    def transform(self, dataIn:list) -> dict:
        dataOut = []
        trafoMap = self._trafo_map
        keys = list(trafoMap.keys())
        if len(keys) == 0:
            raise Exception('No transfermation map')
        for i in range(len(dataIn)):
            val = dataIn[i]
            trafoVal = 0.0
            if val <= keys[0]:
                trafoVal = trafoMap[keys[0]]
            elif val >= keys[-1]:
                trafoVal = trafoMap[keys[-1]]
            elif val in trafoMap:
                trafoVal = trafoMap[val]
            else:
                lower_key, upper_key = self._binary_search(keys, val)
                x1, y1 = lower_key, trafoMap[lower_key]
                x2, y2 = upper_key, trafoMap[upper_key]

                trafoVal = y1 + (val - x1) * (y2 - y1) / (x2 - x1)
            dataOut.append(trafoVal)
        dataOut = np.asarray(dataOut, dtype=self.precision)
        self._output = dataOut
        return self._output 

# if __name__ == '__main__':
#     data = [-19.9378,10.5341,-32.4515,33.0969,24.3530,-1.1830,-1.4106,-4.9431,
#         14.2153,26.3700,-7.6760,60.3346,36.2992,-126.8806,14.2488,-5.0821,
#         1.6958,-21.2168,-49.1075,-8.3084,-1.5748,3.7900,-2.1561,4.0756,
#         -9.0289,-13.9533,-9.8466,79.5876,-13.3332,-111.9568,-24.2531,120.1174]
#     rgn = RGN(data)
#     print(rgn.output)

In [23]:
def lgb_model(train, target, test, k):
    feats = [f for f in train.columns if f not in ['user', 'label']]
    print('Current num of features:', len(feats))
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)
    oof_probs = np.zeros(train.shape[0])
    output_preds = 0
    offline_score = []
    feature_importance_df = pd.DataFrame()
    parameters = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 50,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'min_data_in_leaf': 20,
        'reg_alpha':10,
        'reg_lambda':8,
        'verbose': -1,
        'nthread': 8,
        'colsample_bytree':0.77,
        'min_child_weight':4,
        'min_child_samples':10,
        'min_split_gain':0,
        'lambda_l1': 0.8,
    }

    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]

        dtrain = lgb.Dataset(train_X,
                             label=train_y)
        dval = lgb.Dataset(test_X,
                           label=test_y)
        lgb_model = lgb.train(
                parameters,
                dtrain,
                num_boost_round=5000,
                valid_sets=[dval],
                early_stopping_rounds=100,
                verbose_eval=100,
        )
        oof_probs[test_index] = lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration)
        offline_score.append(lgb_model.best_score['valid_0']['auc'])
        output_preds += lgb_model.predict(test[feats], num_iteration=lgb_model.best_iteration)/folds.n_splits
        print(offline_score)
        # feature importance
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(50))
    feature_importance_df.to_csv("../data/feature_importance.csv")
    return output_preds, oof_probs, np.mean(offline_score),feature_importance_df

In [24]:
DATA_PATH = '../data/'
print('读取数据...')
data, op_df, trans_df = data_preprocess(DATA_PATH=DATA_PATH)

print('开始特征工程...')
data = gen_features(data, op_df, trans_df)
data['city_level'] = data['city'].map(str) + '_' + data['level'].map(str)
data['city_product1_amount'] = data['city'].map(str) + '_' + data['product1_amount'].map(str)
data['city_balance1_avg'] = data['city'].map(str) + '_' + data['balance1_avg'].map(str)
data['city_balance2_avg'] = data['city'].map(str) + '_' + data['balance2_avg'].map(str)
data['city_balance'] = data['city'].map(str) + '_' + data['balance'].map(str)

train = data[~data['label'].isnull()].copy()
target = train['label']
test = data[data['label'].isnull()].copy()

target_encode_cols = ['province','city',"city_level",'city_balance1_avg','city_balance2_avg',
                      "city_product1_amount",'city_balance']
train, test = kfold_stats_feature(train, test, target_encode_cols, 10)
train.drop(['province', "city","service3_level","city_level",'city_balance1_avg','city_balance2_avg',
                      "city_product1_amount",'city_balance'], axis=1, inplace=True)
test.drop(['province',"city","service3_level","city_level",'city_balance1_avg','city_balance2_avg',
                      "city_product1_amount",'city_balance'], axis=1, inplace=True)

# print("开始数据归一化...")
# for feat in tqdm([f for f in train.columns if f not in ['user', 'label']]):
#   rgn = RGN(train[feat].fillna(-1).values)
#   train[feat]=rgn.output
#   rgn = RGN(test[feat].fillna(-1).values)
#   test[feat]=rgn.output



读取数据...
开始特征工程...


100%|██████████| 8/8 [00:03<00:00,  2.22it/s]


trans group_amount_features
trans window
trans null_features
trans type
op


100%|██████████| 7/7 [00:11<00:00,  1.70s/it]


LabelEncoder


100%|██████████| 25/25 [00:00<00:00, 38.16it/s]


In [25]:
print('开始模型训练...')
# param = {
#         'learning_rate': 0.05,
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'metric': 'auc',
#         'num_leaves': 50,
#         'feature_fraction': 0.8,
#         'bagging_fraction': 0.8,
#         'min_data_in_leaf': 20,
#         'reg_alpha':10,
#         'reg_lambda':8,
#         'verbose': -1,
#         'nthread': 8,
#         'colsample_bytree':0.77,
#         'min_child_weight':4,
#         'min_child_samples':10,
#         'min_split_gain':0,
#         'lambda_l1': 0.8,
#     }
lgb_preds, lgb_oof, lgb_score,feature_importance_df = lgb_model(train=train, target=target, test=test,k=10)
auc_score = roc_auc_score(target.values, lgb_oof)
print("train auc:",auc_score)

sub_df = test[['user']].copy()
sub_df['prob'] = lgb_preds
sub_df.to_csv('../submission/sub_lgb{%.5f}.csv'%(lgb_score,), index=False)

开始模型训练...
Current num of features: 649
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.733582
[200]	valid_0's auc: 0.738989
[300]	valid_0's auc: 0.739266
Early stopping, best iteration is:
[262]	valid_0's auc: 0.739823
[0.7398232616953695]
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.743729
[200]	valid_0's auc: 0.749188
[300]	valid_0's auc: 0.751161
[400]	valid_0's auc: 0.75079
Early stopping, best iteration is:
[334]	valid_0's auc: 0.7514
[0.7398232616953695, 0.7513999365390391]
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.729819
[200]	valid_0's auc: 0.73312
[300]	valid_0's auc: 0.733816
[400]	valid_0's auc: 0.733493
Early stopping, best iteration is:
[326]	valid_0's auc: 0.734034
[0.7398232616953695, 0.7513999365390391, 0.7340341445010375]
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.734024
[200]	valid_0's auc: 0.739857
[300]

In [129]:

#coding=utf-8

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
'''单变量特征选取'''
from sklearn.feature_selection import SelectKBest, chi2
'''去除方差小的特征'''
from sklearn.feature_selection import VarianceThreshold
'''循环特征选取'''
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
'''RFE_CV'''
from sklearn.ensemble import ExtraTreesClassifier


class FeatureSelection(object):
    def __init__(self,train, label, test, feature_list,feature_num):
        self.feature_name = feature_list     # feature name #
        self.feature_num = feature_num
        self.train, self.label, self.test = train.fillna(-1), label, test.fillna(-1)   # features #
        #归一化方便进行特征筛选
        standard=MinMaxScaler()
        self.train=standard.fit_transform(self.train[self.feature_name])
    def variance_threshold(self):
        sel = VarianceThreshold()
        sel.fit_transform(self.train)
        feature_var = list(sel.variances_)    # feature variance #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        print(features)   # 100 cols #
        return set(features)   # return set type #

    def select_k_best(self):
        ch2 = SelectKBest(chi2, k=self.feature_num)
        ch2.fit(self.train, self.label)
        feature_var = list(ch2.scores_)  # feature scores #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        print(features)     # 100 cols #
        return set(features)    # return set type #

    def svc_select(self):
        svc = SVC(kernel='rbf', C=1, random_state=2020,verbose=1)    # linear #
        rfe = RFE(estimator=svc, n_features_to_select=self.feature_num, step=1,verbose=1)
        rfe.fit(self.train, self.label.ravel())
        print(rfe.ranking_)
        return rfe.ranking_

    def tree_select(self):
        clf = ExtraTreesClassifier(n_estimators=300, max_depth=7, n_jobs=2,verbose=1)
        clf.fit(self.train, self.label)
        feature_var = list(clf.feature_importances_)  # feature scores #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        print(features)     # 100 cols #
        return set(features)  # return set type #

    def return_feature_set(self, variance_threshold=False, select_k_best=False, svc_select=False, tree_select=False):
        names = set([])
        if variance_threshold is True:
            name_one = self.variance_threshold()
            names = names.union(name_one)
        if select_k_best is True:
            name_two = self.select_k_best()
            names = names.intersection(name_two)
        if svc_select is True:
            name_three = self.svc_select()
            names = names.intersection(name_three)
        if tree_select is True:
            name_four = self.tree_select()
            names = names.intersection(name_four)

        # print(len(names))
        print(names)
        return list(names)

feats = [f for f in train.columns if f not in ['user', 'label']]
selection = FeatureSelection(train,target,test,feats,510)
selection_feature=selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True)
len(selection_feature)

['user_type2_2dd805cd09533f85_amount_mean', 'user_platform_71b24e4fd9a658ee_amount_std', 'acc_count', 'user_tunnel_in_162a612b764132df_amount_mean', 'user_type2_2bf61669e40ef6b8_amount_var', 'user_type1_8adb3dcfea9dcf5e_amount_sum', 'user_type1_674e8d5860bc033d_amount_var', 'user_type2_b26bc49195bd79cf_amount_min', 'user_amount_max_27d', 'user_type2_2dd805cd09533f85_amount_count', 'user_amount_std_27d', 'user_platform_fe8686492bb72dd4_amount_min', 'user_platform_80f6d63f26a56315_amount_median', 'user_tunnel_in_8aa6e57420e0299a_amount_median', 'user_platform_cc79bc9b7f4885de_amount_sum', 'user_type2_b26bc49195bd79cf_amount_std', 'user_type1_cd31880f9fa923ea_amount_min', 'user_type1_b26bc49195bd79cf_amount_std', 'user_type1_71b24e4fd9a658ee_amount_min', 'user_type1_cd31880f9fa923ea_amount_median', 'user_type1_e5cd41dfa47665b1_amount_var', 'user_type1_fc9b_mean_day', 'user_tunnel_in_162a612b764132df_amount_std', 'user_type1_cd31880f9fa923ea_amount_mean', 'user_type2_b26bc49195bd79cf_amoun

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    6.4s


['user_type1_0f99a10a1331ce14_amount_max', 'user_type1_81abaafd1ae512dd_amount_sum', 'agreement1', 'user_type2_2bf61669e40ef6b8_amount_sum', 'user_type2_cd31880f9fa923ea_amount_min', 'user_platform_71b24e4fd9a658ee_amount_var', 'user_type2_2dd805cd09533f85_amount_skew', 'user_type1_2bf6_mean_day', 'user_platform_fe8686492bb72dd4_amount_max', 'user_tunnel_out_b131ac74aa38a121_amount_max', 'user_type1_81abaafd1ae512dd_amount_std', 'user_type1_e5cd41dfa47665b1_amount_max', 'user_type1_443b0fd0860c21b6_amount_var', 'user_type1_fc9b75cf62ba8b8f_amount_var', 'user_type2_2bf61669e40ef6b8_amount_max', 'product5_amount', 'user_type2_2bf61669e40ef6b8_amount_median', 'user_type1_fc4eca960f6c690c_amount_max', 'user_type2_a906ce5a502c748e_amount_median', 'user_platform_71b24e4fd9a658ee_amount_mean', 'user_platform_80f6d63f26a56315_amount_std', 'user_type1_81abaafd1ae512dd_amount_min', 'user_type2_a906ce5a502c748e_amount_max', 'user_type1_2a2edd435db5ac70_amount_var', 'user_type1_33e9d4cef01499e1_am

[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed:    9.8s finished


389

In [None]:
def train_xgb_module(train,target,test,features_name, k,store_result=False):
    '''训练模型'''
    print('Current num of features:', len(features_name))
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)
    oof_probs = np.zeros(train.shape[0])
    output_preds = 0
    offline_score = []
    feature_importance_df = pd.DataFrame()
    print('读取数据完毕。。。')

    print('开始训练xgboost模型。。。')
    '''xgboost分类器'''
    params = {
        'booster': 'gbtree',
        'max_depth': 7,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'eta': 0.03,
        'silent': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'min_child_weight': 4,
        'scale_pos_weight': 1,
        'seed': 2020,
        'reg_alpha': 0.5,
        'reg_lambda':3,
    }
    '''训练集'''
    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
      train_y, test_y = target[train_index], target[test_index]
      train_X, test_X = train[features_name].iloc[train_index, :], train[features_name].iloc[test_index, :]

      dtrain = xgb.DMatrix(train_X,
                             label=train_y)
      dval = xgb.DMatrix(test_X,
                           label=test_y)
      watchlist = [(dtrain,'train'),(dval,'val')]
      module = xgb.train(params, dtrain,                 
                num_boost_round=5000,
                evals=watchlist,
                early_stopping_rounds=100,
                verbose_eval=100)
      oof_probs[test_index] = module.predict(xgb.DMatrix(train[features_name].iloc[test_index]), ntree_limit=module.best_iteration)
      offline_score.append(module.best_score)
      output_preds += module.predict(xgb.DMatrix(test[features_name]), ntree_limit=module.best_iteration)/folds.n_splits
      print(offline_score)
      # feature importance
      features=module.get_fscore()
      features=list(dict(sorted(features.items(), key=lambda d: d[1])).keys())
      fold_importance_df = pd.DataFrame(features)
      print(fold_importance_df)
      fold_importance_df["fold"] = i + 1

    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    return output_preds, oof_probs, np.mean(offline_score),feature_importance_df

feats = [f for f in train.columns if f not in ['user', 'label']]

xgb_preds, xgb_oof, xgb_score,feature_importance_df = train_xgb_module(train,target,test,selection_feature, 5,store_result=False)
auc_score = roc_auc_score(target.values, xgb_oof)
print("train auc:",auc_score)

sub_df = test[['user']].copy()
sub_df['prob'] = xgb_preds
sub_df.to_csv('../submission/sub_xgb{%.5f}.csv'%(xgb_score,), index=False)

Current num of features: 392
读取数据完毕。。。
开始训练xgboost模型。。。
[0]	train-auc:0.690244	val-auc:0.679654
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[100]	train-auc:0.756208	val-auc:0.727463
[200]	train-auc:0.785253	val-auc:0.736046
[300]	train-auc:0.809562	val-auc:0.737996
[400]	train-auc:0.827875	val-auc:0.738851
[500]	train-auc:0.845246	val-auc:0.739344
[600]	train-auc:0.860092	val-auc:0.739008
Stopping. Best iteration:
[504]	train-auc:0.846085	val-auc:0.739372

[0.739372]
                                               0
0         user_type2_2a2edd435db5ac70_amount_max
1    user_tunnel_out_4c8524fb01d8b204_amount_std
2        user_type1_33e9d4cef01499e1_amount_skew
3        user_type2_b5a8be737a50b171_amount_skew
4         user_type2_81abaafd1ae512dd_amount_min
..                                           ...
248        city_product1_amount_label_kfold_mean
249                          svd_tfidf_o

In [None]:
def train_lr_module(train,test,feat, store_result=False):
    train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
    validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
    test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
    train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
    print('读取数据完毕。。。')

    validate_label = validate_feature[['target']]
    train_label = train_feature[['target']]
    train_test_label = train_test_feature[['target']]

    train_feature = train_feature[features_name]
    test_feature = test_feature[features_name]
    validate_feature = validate_feature[features_name]
    train_test_feature = train_test_feature[features_name]

    print('开始训练logisticRegression模型。。。')
    module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)  # , solver='sag'

    '''训练集'''
    module.fit(train_feature, train_label)

    if store_result is True:
        '''测试训练集'''
        module_two = LogisticRegression(
            penalty='l2',
            solver='sag',
            max_iter=500,
            random_state=42,
            n_jobs=4
        )
        module_two.fit(train_test_feature, train_test_label)

        result = module_two.predict_proba(test_feature)[:, 1]
        result = pd.DataFrame(result)
        result.columns = ['predicted_score']
        test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
        sample = test_list[['id']]
        sample['predicted_score'] = [index for index in result['predicted_score']]
        sample.columns = ['ID', 'PROB']
        sample.to_csv(r'lr_sample.csv', index=None)
        # sample.to_csv(r'lgb_sample.csv', index=None)
        print(sample)
        print('结果已更新。。。')

    print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1]))
    print('特征维数：', len(features_name))


In [None]:
#stacking and bagging model
class SBBTree():
	"""Stacking,Bootstap,Bagging----SBBTree"""
	def __init__(self, params, stacking_num, bagging_num, bagging_test_size, num_boost_round, early_stopping_rounds):
		"""
		Initializes the SBBTree.
        Args:
          params : lgb params
          stacking_num : k_flod stacking.
          bagging_num : bootstrap num.
          bagging_test_size : bootstrap sample rate.
          num_boost_round : boost num.
		  early_stopping_rounds : early_stopping_rounds.
        """
		self.params = params
		self.stacking_num = stacking_num
		self.bagging_num = bagging_num
		self.bagging_test_size = bagging_test_size
		self.num_boost_round = num_boost_round
		self.early_stopping_rounds = early_stopping_rounds

		self.model = lgb
		self.stacking_model = []
		self.bagging_model = []
		self.offline_score=[]
	def fit(self, X, y):
		""" fit model. """
		if self.stacking_num > 1:
			layer_train = np.zeros((X.shape[0], 2))
			self.SK = StratifiedKFold(n_splits=self.stacking_num, shuffle=True, random_state=2020)
			for k,(train_index, test_index) in enumerate(self.SK.split(X, y)):
				X_train = X[train_index]
				y_train = y[train_index]
				X_test = X[test_index]
				y_test = y[test_index]

				lgb_train = lgb.Dataset(X_train, y_train)
				lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

				gbm = lgb.train(self.params,
							lgb_train,
							num_boost_round=self.num_boost_round,
							valid_sets=lgb_eval,
							early_stopping_rounds=self.early_stopping_rounds,
              verbose_eval=100)

				self.stacking_model.append(gbm)

				pred_y = gbm.predict(X_test, num_iteration=gbm.best_iteration)
				layer_train[test_index, 1] = pred_y
				self.offline_score.append(gbm.best_score['valid_0']['auc'])
				print(self.offline_score)
			X = np.hstack((X, layer_train[:,1].reshape((-1,1)))) 
		else:
			pass
		
		bagging_params = {
		'task': 'train',
		'boosting_type': 'gbdt',
		'objective': 'binary',
		'metric': 'auc',
		'num_leaves': 10,
		'learning_rate': 0.03,
		'feature_fraction': 0.9,
		'bagging_fraction': 0.8,
		'bagging_freq': 5,
		'min_data': 20,
		'min_hessian': 1,
		'verbose': -1,
		}
		for bn in range(self.bagging_num):
			X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.bagging_test_size, random_state=bn)
	
			lgb_train = lgb.Dataset(X_train, y_train)
			lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

			gbm = lgb.train(bagging_params,
						lgb_train,
						num_boost_round=10000,
						valid_sets=lgb_eval,
						early_stopping_rounds=200,
            verbose_eval=100)

			self.bagging_model.append(gbm)
	 
		print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(self.offline_score), np.std(self.offline_score)))
	
	def predict(self, X_pred):
		""" predict test data. """
		if self.stacking_num > 1:
			test_pred = np.zeros((X_pred.shape[0], self.stacking_num))
			for sn,gbm in enumerate(self.stacking_model):
				pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
				test_pred[:, sn] = pred
			X_pred = np.hstack((X_pred, test_pred.mean(axis=1).reshape((-1,1))))  
		else:
			pass 
		for bn,gbm in enumerate(self.bagging_model):
			pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
			if bn == 0:
				pred_out=pred
			else:
				pred_out+=pred
		return pred_out/self.bagging_num

params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 50,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'min_data_in_leaf': 20,
        'reg_alpha':10,
        'reg_lambda':8,
        'verbose': -1,
        'nthread': 8,
        'colsample_bytree':0.77,
        'min_child_weight':4,
        'min_child_samples':10,
        'min_split_gain':0,
        'lambda_l1': 0.8,
    }

parameters = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'min_data_in_leaf': 20,
        'verbose': -1,
        'nthread': 8,
    }
feats = [f for f in train.columns if f not in ['user', 'label']]
model = SBBTree(parameters, stacking_num=5, bagging_num=3, bagging_test_size=0.20, num_boost_round=10000, early_stopping_rounds=100)
model.fit(train[feats].values, target.values)
pred=model.predict(test[feats].values) 
pred1=model.predict(train[feats].values) 
auc_score = roc_auc_score(target.values, pred1)
print('auc: ',auc_score) 
sub_df = test[['user']].copy()
sub_df['prob'] = pred
sub_df.to_csv('submission/stacking_sub{%.5f}.csv'%(auc_score,), index=False)
"""
# test 1
print("test1 ing...")
model = SBBTree(params, stacking_num=5, bagging_num=3, bagging_test_size=0.20, num_boost_round=10000, early_stopping_rounds=200)
model.fit(X_train,y_train)
pred1=model.predict(X_test)


fpr, tpr, thresholds = metrics.roc_curve(y_test+1, pred1, pos_label=2)
print('auc: ',metrics.auc(fpr, tpr))

"""

In [None]:
# selection features 特征数太多要花的时间太久了 特征不好筛选
feats = [f for f in train.columns if f not in ['user', 'label']]
print("features selection")
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
clf =RandomForestClassifier(n_estimators=300,n_jobs=multiprocessing.cpu_count(),verbose=1)
selector = RFECV(estimator=clf, step=1, cv=StratifiedKFold(3), scoring='roc_auc',verbose=1)
selector.fit(train[feats].fillna(-1).values, target.fillna(-1).values)
 
print(' Optimal number of features: %d' % selector.n_features_)
sel_features = [f for f, s in zip(features_col, selector.support_) if s]
sel_features

In [None]:
from gensim.models import Word2Vec
import multiprocessing
def w2v_feat(df, feat, mode):
    data_frame=df.copy()
    for i in feat:
        if data_frame[i].dtype != 'object':
            data_frame[i] = data_frame[i].astype(str)
    data_frame.fillna('nan', inplace=True)

    print(f'Start {mode} word2vec ...')
    model = Word2Vec(data_frame[feat].values.tolist(), size=10, window=2, min_count=1,
                     workers=multiprocessing.cpu_count(), iter=10)
    stat_list = ['min', 'max', 'mean', 'std']
    new_all = pd.DataFrame()
    for m, t in enumerate(feat):
        print(f'Start gen feat of {t} ...')
        tmp = []
        for i in data_frame[t].unique():
            tmp_v = [i]
            tmp_v.extend(model[i])
            tmp.append(tmp_v)
        tmp_df = pd.DataFrame(tmp)
        w2c_list = [f'w2c_{t}_{n}' for n in range(10)]
        tmp_df.columns = [t] + w2c_list
        tmp_df = data_frame[['user', t]].merge(tmp_df, on=t)
        tmp_df = tmp_df.drop_duplicates().groupby('user').agg(stat_list).reset_index()
        tmp_df.columns = ['user'] + [f'{p}_{q}' for p in w2c_list for q in stat_list]
        if m == 0:
            new_all = pd.concat([new_all, tmp_df], axis=1)
        else:
            new_all = pd.merge(new_all, tmp_df, how='left', on='user')
    return new_all
