In [4]:
# coding: utf-8
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
from sklearn import metrics
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.model_selection import train_test_split
import gc
from gensim.models import Word2Vec, FastText
import os
import itertools
import multiprocessing
warnings.filterwarnings('ignore')
print(multiprocessing.cpu_count())
os.chdir(r"/content/drive/My Drive/Colab Notebooks/Datacastle/code")


2


In [12]:
def load_dataset(DATA_PATH):
    train_label = pd.read_csv(DATA_PATH+'train_label.csv')
    train_base = pd.read_csv(DATA_PATH+'train_base.csv')
    test_base = pd.read_csv(DATA_PATH+'test_a_base.csv')

    train_op = pd.read_csv(DATA_PATH+'train_op.csv')
    train_trans = pd.read_csv(DATA_PATH+'train_trans.csv')
    test_op = pd.read_csv(DATA_PATH+'test_a_op.csv')
    test_trans = pd.read_csv(DATA_PATH+'test_a_trans.csv')

    return train_label, train_base, test_base, train_op, train_trans, test_op, test_trans


def transform_time(x):
    day = int(x.split(' ')[0])
    hour = int(x.split(' ')[2].split('.')[0].split(':')[0])
    minute = int(x.split(' ')[2].split('.')[0].split(':')[1])
    second = int(x.split(' ')[2].split('.')[0].split(':')[2])
    return 86400*day+3600*hour+60*minute+second


def data_preprocess(DATA_PATH):
    train_label, train_base, test_base, train_op, train_trans, test_op, test_trans = load_dataset(DATA_PATH=DATA_PATH)
    # 拼接数据
    train_df = train_base.copy()
    test_df = test_base.copy()
    train_df = train_label.merge(train_df, on=['user'], how='left')
    del train_base, test_base

    op_df = pd.concat([train_op, test_op], axis=0, ignore_index=True)
    trans_df = pd.concat([train_trans, test_trans], axis=0, ignore_index=True)
    data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    del train_op, test_op, train_df, test_df
    # 时间维度的处理
    op_df['days_diff'] = op_df['tm_diff'].apply(lambda x: int(x.split(' ')[0]))
    trans_df['days_diff'] = trans_df['tm_diff'].apply(lambda x: int(x.split(' ')[0]))
    op_df['timestamp'] = op_df['tm_diff'].apply(lambda x: transform_time(x))
    trans_df['timestamp'] = trans_df['tm_diff'].apply(lambda x: transform_time(x))
    op_df['hour'] = op_df['tm_diff'].apply(lambda x: int(x.split(' ')[2].split('.')[0].split(':')[0]))
    trans_df['hour'] = trans_df['tm_diff'].apply(lambda x: int(x.split(' ')[2].split('.')[0].split(':')[0]))
    trans_df['week'] = trans_df['days_diff'].apply(lambda x: x % 7)
    # 排序
    trans_df = trans_df.sort_values(by=['user', 'timestamp'])
    op_df = op_df.sort_values(by=['user', 'timestamp'])
    trans_df.reset_index(inplace=True, drop=True)
    op_df.reset_index(inplace=True, drop=True)

    gc.collect()
    return data, op_df, trans_df

#用户其他特征的统计量
def gen_user_status_features(df,value):
    group_df = df.groupby(['user'])[value].agg(
        a='mean',
        b= 'std',
        c='max',
        d='min',
        e='sum',
        f='median',
        g='count',
        h='median',
        i='skew',
        ).reset_index()
    group_df['j']=group_df['b'] / group_df['a']
    group_df.rename(columns={
                    'a':'user_{}_mea'.format(value),
                    'b':'user_{}_std'.format(value),
                    'c':'user_{}_max'.format(value),
                    'd':'user_{}_min'.format(value),
                    'e':'user_{}_sum'.format(value),
                    'f':'user_{}_med'.format(value),
                    'g':'user_{}_cnt'.format(value),
                    'h':'user_{}_median'.format(value),
                    'i':'user_{}_skew'.format(value),
                    'j':'user_{}_CV'.format(value)
                    },
                   inplace=True
                   )
    return group_df
#用户所在城市的其他衍生特征
def gen_city_status_features(df,value):
    group_df = df.groupby(['city'])[value].agg(
        a='mean',
        b= 'std',
        c='max',
        d='min',
        e='sum',
        f='median',
        g='count',
        h='median',
        i='skew',
        ).reset_index()
    group_df['j']=group_df['b'] / group_df['a']
    group_df.rename(columns={
                    'a':'city_{}_mea'.format(value),
                    'b':'city_{}_std'.format(value),
                    'c':'city_{}_max'.format(value),
                    'd':'city_{}_min'.format(value),
                    'e':'city_{}_sum'.format(value),
                    'f':'city_{}_med'.format(value),
                    'g':'city_{}_cnt'.format(value),
                    'h':'city_{}_median'.format(value),
                    'i':'city_{}_skew'.format(value),
                    'j':'city_{}_CV'.format(value)
                    },
                   inplace=True
                   )
    return group_df


#trans#用户交易消费的特征
def gen_user_amount_group_features(df,col,val):
    df = df[ df[col]==val ]
    l = ['mean','std','max','min','sum','median','count','skew','var']
    group_df = df.groupby('user')['amount'].agg(l).reset_index()
    group_df[ 'user_'+col+'_'+val+'_amount_jicha'] = group_df['max']-group_df['min']
    rename_col = {}
    for i in l:
        rename_col[i] = 'user_'+col+'_'+val+'_amount_'+i
    group_df = group_df.rename(columns=rename_col)
    return group_df

#trans#用户交易消费的特征
def gen_user_amount_features(df):
    group_df = df.groupby(['user']).agg(
         user_amount_mean=('amount','mean'),
         user_amount_std=('amount','std'),
         user_amount_max=('amount','max'),
         user_amount_min=('amount','min'),
         user_amount_med=('amount','median'),
         user_amount_cnt=('amount','count'),
         user_amount_skew=('amount','skew'),
         user_amount_var=('amount','var'),
        ).reset_index()
    group_df['user_amount_jicha'] = group_df['user_amount_max'] - group_df['user_amount_min']
    return group_df

def gen_user_days_diff_group_features(df,col,val):
    df = df[ df[col]==val ]
    l = ['mean','std','max','min','sum','median','count','skew','var']
    group_df = df.groupby('user')['days_diff'].agg(l).reset_index()
    group_df[ 'user_'+col+'_'+val+'_days_diff_jicha'] = group_df['max']-group_df['min']
    rename_col = {}
    for i in l:
        rename_col[i] = 'user_'+col+'_'+val+'_days_diff_'+i
    group_df = group_df.rename(columns=rename_col)
    return group_df

#trans 用户在某平台或者用某ip的消费额特征
def gen_user_group_amount_features(df, value):
    group_df = df.pivot_table(index='user',
                              columns=value,
                              values='amount',
                              dropna=False,
                              aggfunc=['count','sum','median',"skew",'var','mean','std','max','min']).fillna(0)
    group_df.columns = ['user_{}_{}_amount_{}'.format(value, f[1], f[0]) for f in group_df.columns]
    group_df.reset_index(inplace=True)

    return group_df


#用户在某个时间段内的消费额特征 大于天数的窗口
def gen_user_window_amount_features(df, window):
    group_df = df[df['days_diff']>window].groupby('user').agg(
        mean=('amount','mean'),
        std=('amount','std'),
        max=('amount','max'),
        min=('amount','min'),
        median=('amount','median'),
        count=('amount','count'),
        skew=('amount','skew'),
        var=('amount','var'),
        ).reset_index().rename(columns={"mean":'user_amount_mean_{}d'.format(window),
                        "std":'user_amount_std_{}d'.format(window),
                        "max":'user_amount_max_{}d'.format(window),
                        "min":'user_amount_min_{}d'.format(window),
                        "sum":'user_amount_sum_{}d'.format(window),
                        "median":'user_amount_median_{}d'.format(window),
                        "count":'user_amount_count_{}d'.format(window),
                        "skew":'user_amount_skew_{}d'.format(window),
                        "var":'user_amount_var_{}d'.format(window),
                        })
    return group_df


#用户在某个时间段内的消费额特征 大于小时的窗口
def gen_user_hourwindow_amount_features(df, window):
    group_df = df[df['hour']>=window].groupby('user').agg(
        mean=('amount','mean'),
        count=('amount','count'),
        ).reset_index().rename(columns={"mean":'user_amount_mean_{}h'.format(window),
                        "count":'user_amount_count_{}h'.format(window),
                        })
    return group_df


#trans用户  ['days_diff', 'platform', 'tunnel_in', 'tunnel_out', 'type1', 'type2', 'ip', 'ip_3']交易用到各字段的类数
def gen_user_nunique_features(df, value, prefix):
    group_df = df.groupby(['user'])[value].agg(
        ['nunique','count']
    ).reset_index().rename(columns={"nunique":'user_{}_{}_nuniq'.format(prefix, value),
                                   'count':'user_{}_{}_cnt'.format(prefix, value)})
    return group_df


#trans用户无IP 这个特征贼奇怪 怎么会交易没有ip呢
def gen_user_null_features(df, value, prefix):
    df['is_null'] = 0
    df.loc[df[value].isnull(), 'is_null'] = 1

    group_df = df.groupby(['user'])['is_null'].agg(sum='sum',
                            mean='mean').reset_index().rename(columns={"sum":'user_{}_{}_null_cnt'.format(prefix, value),
                                                        "mean":'user_{}_{}_null_ratio'.format(prefix, value)})
    return group_df


# op op_mode op_tyep op_device net_type channel
def gen_user_tfidf_features(df, value):
    #填充缺失值
    df[value].replace(' ', np.nan, inplace=True)
    df[value] = df[value].astype(str)
    df[value].fillna('-1', inplace=True)

    #
    group_df = df.groupby(['user']).apply(lambda x: x[value].tolist()).reset_index()
    group_df.columns = ['user', 'list']
    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))
    enc_vec = TfidfVectorizer()
    tfidf_vec = enc_vec.fit_transform(group_df['list'])
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2020)
    vec_svd = svd_enc.fit_transform(tfidf_vec)
    vec_svd = pd.DataFrame(vec_svd)
    vec_svd.columns = ['svd_tfidf_{}_{}'.format(value, i) for i in range(10)]
    group_df = pd.concat([group_df, vec_svd], axis=1)
    del group_df['list']
    return group_df


# op op_mode op_tyep op_device net_type channel
def gen_user_countvec_features(df, value):
    #填充缺失值
    df[value].replace(' ', np.nan, inplace=True)
    df[value] = df[value].astype(str)
    df[value].fillna('-1', inplace=True)
    #
    group_df = df.groupby(['user']).apply(lambda x: x[value].tolist()).reset_index()
    group_df.columns = ['user', 'list']
    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))
    enc_vec = CountVectorizer()
    tfidf_vec = enc_vec.fit_transform(group_df['list'])
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2020)
    vec_svd = svd_enc.fit_transform(tfidf_vec)
    vec_svd = pd.DataFrame(vec_svd)
    vec_svd.columns = ['svd_countvec_{}_{}'.format(value, i) for i in range(10)]
    group_df = pd.concat([group_df, vec_svd], axis=1)
    del group_df['list']
    return group_df


#target_encode_cols = ['province', 'city', 'city_level', 'city_balance_avg']
#target_encode
def kfold_stats_feature(train, test, feats, k):
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)  # 这里最好和后面模型的K折交叉验证保持一致

    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
        train.loc[val_idx, 'fold'] = fold_

    kfold_features = []
    for feat in feats:
        nums_columns = ['label']
        #只有一个还用for?
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()

                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)#其实就是用kfold方式对该特征的label标签mean化，然后建个新列
                
                # fillna 平均值填充
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)

        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # fillna
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test


def w2v_feat(df, feat, mode):
    data_frame=df.copy()
    # 转化为str
    for i in feat:
        if data_frame[i].dtype != 'object':
            data_frame[i] = data_frame[i].astype(str)
    data_frame.fillna('nan', inplace=True) # 甜宠

    print(f'Start {mode} word2vec ...')
    model = Word2Vec(data_frame[feat].values.tolist(), size=10, window=2, min_count=1,
                     workers=multiprocessing.cpu_count(), iter=10)
    stat_list = ['min', 'max', 'mean', 'std']
    new_all = pd.DataFrame()
    for m, t in enumerate(feat):
        print(f'Start gen feat of {t} ...')
        tmp = []
        for i in data_frame[t].unique():
            tmp_v = [i]
            tmp_v.extend(model[i])
            tmp.append(tmp_v)
        tmp_df = pd.DataFrame(tmp)
        w2c_list = [f'w2c_{t}_{n}' for n in range(10)]
        tmp_df.columns = [t] + w2c_list
        tmp_df = data_frame[['user', t]].merge(tmp_df, on=t)
        tmp_df = tmp_df.drop_duplicates().groupby('user').agg(stat_list).reset_index()
        tmp_df.columns = ['user'] + [f'{p}_{q}' for p in w2c_list for q in stat_list]
        if m == 0:
            new_all = pd.concat([new_all, tmp_df], axis=1)
        else:
            new_all = pd.merge(new_all, tmp_df, how='left', on='user')
    return new_all


def fasttext(df, feat, mode):
    data_frame=df.copy()
    # 转化为str
    for i in feat:
        if data_frame[i].dtype != 'object':
            data_frame[i] = data_frame[i].astype(str)
    data_frame.fillna('nan', inplace=True) # 甜宠

    print(f'Start {mode} FastText ...')
    model = FastText(data_frame[feat].values.tolist(),  size=10, window=3, min_count=1, 
                     workers=multiprocessing.cpu_count(), iter=10,min_n = 3 , max_n = 6,word_ngrams = 0,seed=1)
    stat_list = ['min', 'max', 'mean', 'std']
    new_all = pd.DataFrame()
    for m, t in enumerate(feat):
        print(f'Start gen feat of {t} ...')
        tmp = []
        for i in data_frame[t].unique():
            tmp_v = [i]
            tmp_v.extend(model[i])
            tmp.append(tmp_v)
        tmp_df = pd.DataFrame(tmp)
        w2c_list = [f'fast_text_{t}_{n}' for n in range(10)]
        tmp_df.columns = [t] + w2c_list
        tmp_df = data_frame[['user', t]].merge(tmp_df, on=t)
        tmp_df = tmp_df.drop_duplicates().groupby('user').agg(stat_list).reset_index()
        tmp_df.columns = ['user'] + [f'{p}_{q}' for p in w2c_list for q in stat_list]
        if m == 0:
            new_all = pd.concat([new_all, tmp_df], axis=1)
        else:
            new_all = pd.merge(new_all, tmp_df, how='left', on='user')
    return new_all

def gen_features(df, op, trans):

    # base
    print("base")
    df['product7_fail_ratio'] = df['product7_fail_cnt'] / df['product7_cnt']
    df['city_count'] = df.groupby(['city'])['user'].transform('count')
    df['province_count'] = df.groupby(['province'])['user'].transform('count')
    df['op_cnt_avg'] = (df["op1_cnt"]+df["op2_cnt"])/df["using_time"]
    df['ip_cnt_avg'] = df["ip_cnt"]/df["using_time"]
    df['card_a_cnt_avg'] = df["card_a_cnt"]/df["using_time"]
    df['card_b_cnt_avg'] = df["card_b_cnt"]/df["using_time"]
    df['card_c_cnt_avg'] = df["card_c_cnt"]/df["using_time"]
    df['card_d_cnt_avg'] = df["card_d_cnt"]/df["using_time"]
    df['login_cnt_didive'] = df["login_cnt_period1"]/df["login_cnt_period2"]
    df['ip_per_day_cnt'] = df["ip_cnt"]/df["login_days_cnt"]
    df['acc_count_per_time'] = df["acc_count"]/df["using_time"]
    df['agreement_total_per_using_time'] = df["agreement_total"]/df["using_time"]
    df['login_cnt'] = df["using_time"]*df["login_cnt_avg"]
    df['agreement_total_per_account_cnt'] = df["agreement_total"]/df["acc_count"]
    df['ip_cnt_per_acc_count'] = df["ip_cnt"]/df["acc_count"]
    df['service1_amt_per_using_time'] = df["service1_amt"]/df["using_time"]
    df["service_cnt_per_time"]=(df["service1_cnt"]+df["service1_cnt"])/df["using_time"]
    
    df['login_cnt_period_var']=df[['login_cnt_period1','login_cnt_period2']].std(axis=1)
    df['login_cnt_period_CV']=df['login_cnt_period_var'] / df[['login_cnt_period1','login_cnt_period2']].mean(axis=1)
    df['card_var']=df[['card_a_cnt','card_b_cnt','card_c_cnt','card_d_cnt']].std(axis=1)
    df['card_CV']=df['card_var'] / df[['card_a_cnt','card_b_cnt','card_c_cnt','card_d_cnt']].mean(axis=1)
    df['op_cnt_var']=df[['op1_cnt','op2_cnt']].std(axis=1)
    df['op_cnt_CV']=df['op_cnt_var'] / df[['op1_cnt','op2_cnt']].mean(axis=1)
    df['login_var']=df[['login_cnt_period1','login_cnt_period2','login_cnt_avg']].std(axis=1)
    df['login_CV']=df['login_var'] / df[['login_cnt_period1','login_cnt_period2','login_cnt_avg']].mean(axis=1)
    df['ip_cnt_per_login_cnt_avg'] = df['ip_cnt']/df[['login_cnt_period1','login_cnt_period2','login_cnt_avg']].mean(axis=1)
    df['op_cnt_per_login_cnt_avg'] = (df["op1_cnt"]+df["op2_cnt"])/df['login_cnt_avg']

    # trans
    print("trans nunique & amount feature & user_status_features ")
    df = df.merge(gen_user_status_features(trans,'amount'), on=['user'], how='left')
    df = df.merge(gen_user_status_features(trans,'days_diff'), on=['user'], how='left')
    df = df.merge(gen_user_status_features(trans,'hour'), on=['user'], how='left')

    for col in tqdm(['days_diff','platform', 'tunnel_in', 'tunnel_out', 'type1', 'type2', 'ip', 'ip_3']):
        df = df.merge(gen_user_nunique_features(df=trans, value=col, prefix='trans'), on=['user'], how='left')
    df['user_amount_per_days'] = df['user_amount_sum'] / df['user_trans_days_diff_nuniq']
    df['user_amount_per_cnt'] = df['user_amount_sum'] / df['user_amount_cnt']
    df = df.merge(gen_user_amount_features(trans), on=['user'], how='left')

    temp=trans[trans['ip'].isnull()].groupby(['user'])['amount'].agg(user_null_ip_amount_sum='sum').reset_index()
    df=df.merge(temp,on=['user'],how='left')
    df['user_null_ip_amount_sum'].fillna(0,inplace=True)
    df['user_null_ip_amount_percent'] = df['user_null_ip_amount_sum'] / df['user_amount_sum']
    df['user_ip_amount_percent'] = 1-df['user_null_ip_amount_percent'] 
    df['user_amount_per_tunnel_in'] = df['user_amount_sum'] / df['user_trans_tunnel_in_nuniq']
    df['user_amount_per_platform'] = df['user_amount_sum'] / df['user_trans_platform_nuniq']

    # df=df.merge(df.groupby(['city'])['user_amount_sum'].agg(city_amount_mean='mean',city_amount_sum='sum',city_amount_std='std').reset_index(),on=['city'],how='left')
    # df=df.merge(gen_city_status_features(df,'user_trans_ip_null_cnt'),on=['city'],how='left')
    # df=df.merge(gen_city_status_features(df,'user_trans_ip_count'),on=['city'],how='left')
    # df['city_ipcnt_vs_nullipcnt']=df['city_user_trans_ip_count_sum'] / df['city_user_trans_ip_null_cnt_sum']
    # df=df.merge(gen_city_status_features(df,'user_null_ip_amount_sum'),on=['city'],how='left')
    # df['city_null_ip_amount_percent'] = df['city_user_null_ip_amount_sum_sum'] / df['city_amount_sum'] 

    print("trans window") 
    df = df.merge(gen_user_window_amount_features(df=trans, window=27), on=['user'], how='left')
    df = df.merge(gen_user_window_amount_features(df=trans, window=23), on=['user'], how='left')
    # df = df.merge(gen_user_window_amount_features(df=trans, window=19), on=['user'], how='left')
    df = df.merge(gen_user_window_amount_features(df=trans, window=15), on=['user'], how='left')
    df = df.merge(gen_user_hourwindow_amount_features(df=trans, window=20), on=['user'], how='left')
    df = df.merge(gen_user_hourwindow_amount_features(df=trans, window=15), on=['user'], how='left')
    df = df.merge(gen_user_hourwindow_amount_features(df=trans, window=10), on=['user'], how='left')
    print("trans null_features")
    df = df.merge(gen_user_null_features(df=trans, value='type2', prefix='trans'), on=['user'], how='left')
    df = df.merge(gen_user_null_features(df=trans, value='ip_3', prefix='trans'), on=['user'], how='left')
    df = df.merge(gen_user_null_features(df=trans, value='tunnel_in', prefix='trans'), on=['user'], how='left')
    df = df.merge(gen_user_null_features(df=trans, value='tunnel_out', prefix='trans'), on=['user'], how='left')

    print("trans group_amount_features")
    print("type1")
    for i in ["674e8d5860bc033d",'443b0fd0860c21b6','45a1168437c708ff','f67d4b5a05a1352a','443b0fd0860c21b6',
              '3146295fbf43c0cb','8adb3dcfea9dcf5e','33e9d4cef01499e1','19d44f1a51919482','0a3cf8dac7dca9d1']:
        df = df.merge(gen_user_days_diff_group_features(trans,'type1',i), on=['user'], how='left')
        df = df.merge(gen_user_amount_group_features(trans,'type1',i), on=['user'], how='left')
    
    print("type2")
    for i in ['11a213398ee0c623','2ee592ab06090eb5','b5a8be737a50b171','2ee592ab06090eb5']:
        df = df.merge(gen_user_days_diff_group_features(trans,'type1',i), on=['user'], how='left')
        df = df.merge(gen_user_amount_group_features(trans,'type1',i), on=['user'], how='left')
        
    print("tunnel_in")
    for i in ['b2e7fa260df4998d']:
        df = df.merge(gen_user_days_diff_group_features(trans,'type1',i), on=['user'], how='left')
        df = df.merge(gen_user_amount_group_features(trans,'type1',i), on=['user'], how='left')
        
    print("tunnel_out")
    for i in ['6ee790756007e69a','4c8524fb01d8b204']:
        df = df.merge(gen_user_days_diff_group_features(trans,'type1',i), on=['user'], how='left')
        df = df.merge(gen_user_amount_group_features(trans,'type1',i), on=['user'], how='left')
    
    df = df.merge(gen_user_tfidf_features(df=trans, value='amount'), on=['user'], how='left')
    df = df.merge(gen_user_countvec_features(df=trans, value='amount'), on=['user'], how='left')
    
    df=df.merge(fasttext(trans,['amount',"type1","type2"],"trans"),on=['user'],how='left')

    print("op")
    # op
    print("op null features")
    df = df.merge(gen_user_null_features(df=op, value="net_type", prefix='op'), on=['user'], how='left')
    df = df.merge(gen_user_null_features(df=op, value="op_device", prefix='op'), on=['user'], how='left')
    df = df.merge(gen_user_null_features(df=op, value="ip", prefix='op'), on=['user'], how='left') 

    df=df.merge(op.groupby("user").agg(user_op_days_diff_mean=('days_diff','mean')),on=['user'], how='left')
    df=df.merge(op.groupby("user").agg(user_op_hour_mean=('hour','mean')),on=['user'], how='left')

    df=df.merge(op.groupby("user").agg(user_op_tm_diff_count=('tm_diff','count')),on=['user'], how='left')

    print("op nunique features")
    for col in tqdm(['days_diff','hour', 'op_mode', 'op_type', 'op_device',"ip",'channel',"ip_3",]):
        df = df.merge(gen_user_nunique_features(df=op, value=col, prefix='op'), on=['user'], how='left')

    print("op tfidf countvec encoder features")
    df = df.merge(gen_user_tfidf_features(df=op, value='op_mode'), on=['user'], how='left')
    df = df.merge(gen_user_tfidf_features(df=op, value='op_type'), on=['user'], how='left')

    df = df.merge(gen_user_countvec_features(df=op, value='op_mode'), on=['user'], how='left')
    df = df.merge(gen_user_countvec_features(df=op, value='op_type'), on=['user'], how='left')

    print("op fasttext encoder features")
    df=df.merge(fasttext(op,["op_mode","op_type","op_device","channel",'ip_3',"net_type","ip"],"op"),on=['user'],how='left')

    print("LabelEncoder")
    for col in tqdm([f for f in df.select_dtypes('object').columns if f not in ['user']]):
        le = LabelEncoder()
        df[col] = df[col].apply(lambda x:str(x))
        df[col].fillna('-1', inplace=True)
        df[col] = le.fit_transform(df[col])

    return df


def lgb_model(train, target, test, k):
    feats = [f for f in train.columns if f not in ['user', 'label']]
    print('Current num of features:', len(feats))
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)
    oof_probs = np.zeros(train.shape[0])
    output_preds = 0
    offline_score = []
    feature_importance_df = pd.DataFrame()
    parameters = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 50,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'min_data_in_leaf': 20,
        'reg_alpha':10,
        'reg_lambda':8,
        'verbose': -1,
        'nthread': 8,
        'colsample_bytree':0.77,
        'min_child_weight':4,
        'min_child_samples':10,
        'min_split_gain':0,
        'lambda_l1': 0.8,
    }

    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]

        dtrain = lgb.Dataset(train_X,
                             label=train_y)
        dval = lgb.Dataset(test_X,
                           label=test_y)
        lgb_model = lgb.train(
                parameters,
                dtrain,
                num_boost_round=5000,
                valid_sets=[dval],
                early_stopping_rounds=100,
                verbose_eval=100,
        )
        oof_probs[test_index] = lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration)
        offline_score.append(lgb_model.best_score['valid_0']['auc'])
        output_preds += lgb_model.predict(test[feats], num_iteration=lgb_model.best_iteration)/folds.n_splits
        print(offline_score)
        # feature importance
        fold_importance_df = pd.DataFrame()
        print(fold_importance_df.shape)
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(60))
    feature_importance_df.groupby(['feature'])['importance'].mean().to_csv("../submission/feature_importance.csv")
    return output_preds, oof_probs, np.mean(offline_score),feature_importance_df



In [13]:
%%time
DATA_PATH = '../data/'
print('读取数据...')
data, op_df, trans_df = data_preprocess(DATA_PATH=DATA_PATH)

print('开始特征工程...')
data = gen_features(data, op_df, trans_df)


读取数据...
开始特征工程...
base
trans nunique & amount feature & user_status_features 


100%|██████████| 8/8 [00:02<00:00,  2.73it/s]


trans window
trans null_features
trans group_amount_features
type1
type2
tunnel_in
tunnel_out
Start trans FastText ...
Start gen feat of amount ...
Start gen feat of type1 ...
Start gen feat of type2 ...
op
op null features


  0%|          | 0/8 [00:00<?, ?it/s]

op nunique features


100%|██████████| 8/8 [00:12<00:00,  1.60s/it]


op tfidf countvec encoder features
op fasttext encoder features
Start op FastText ...
Start gen feat of op_mode ...
Start gen feat of op_type ...
Start gen feat of op_device ...
Start gen feat of channel ...
Start gen feat of ip_3 ...
Start gen feat of net_type ...
Start gen feat of ip ...
LabelEncoder


100%|██████████| 25/25 [00:01<00:00, 24.67it/s]

CPU times: user 45min 8s, sys: 26.1 s, total: 45min 34s
Wall time: 28min 15s





In [14]:

data['city_level'] = data['city'].map(str) + '_' + data['level'].map(str)
data['city_product1_amount'] = data['city'].map(str) + '_' + data['product1_amount'].map(str)
data['city_balance1_avg'] = data['city'].map(str) + '_' + data['balance1_avg'].map(str)
data['city_balance2_avg'] = data['city'].map(str) + '_' + data['balance2_avg'].map(str)
data['city_balance'] = data['city'].map(str) + '_' + data['balance'].map(str)
data['city_card_a_cnt'] = data['city'].map(str) + '_' + data['card_a_cnt'].map(str)


train = data[~data['label'].isnull()].copy()
target = train['label']
test = data[data['label'].isnull()].copy()

print("target encoder...")
  
target_encode_cols = ['province','city',"city_level",'city_balance1_avg',
                      'city_balance2_avg',"city_product1_amount",'city_balance','city_card_a_cnt']

train, test = kfold_stats_feature(train, test, target_encode_cols, 10)
train.drop(['province', "city","service3_level","city_level",'city_balance1_avg','city_balance2_avg',
              "city_product1_amount",'city_balance','city_card_a_cnt'], axis=1, inplace=True)
test.drop(['province',"city","service3_level","city_level",'city_balance1_avg','city_balance2_avg',
              "city_product1_amount",'city_balance','city_card_a_cnt'], axis=1, inplace=True)

target encoder...


In [15]:
%%time
print('开始模型训练...')

lgb_preds, lgb_oof, lgb_score,feature_importance_df = lgb_model(train=train, target=target, test=test,k=10)
auc_score = roc_auc_score(target.values, lgb_oof)
print("train auc:",auc_score)

sub_df = test[['user']].copy()
sub_df['prob'] = lgb_preds
sub_df.to_csv('../submission/sub_lgb{%.5f}.csv'%(lgb_score,), index=False)
# 0.740557 (no_day 23 27) 0.70053
#val_auc0.740481 on_auc0.700902
#0.740219 0.70***6

开始模型训练...
Current num of features: 1003
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.74395
[200]	valid_0's auc: 0.74777
[300]	valid_0's auc: 0.748324
Early stopping, best iteration is:
[299]	valid_0's auc: 0.748353
[0.7483525374247579]
(0, 0)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.747535
[200]	valid_0's auc: 0.749378
[300]	valid_0's auc: 0.751296
[400]	valid_0's auc: 0.751794
[500]	valid_0's auc: 0.751316
Early stopping, best iteration is:
[404]	valid_0's auc: 0.752082
[0.7483525374247579, 0.7520821654426307]
(0, 0)
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.735997
[200]	valid_0's auc: 0.740904
[300]	valid_0's auc: 0.741653
[400]	valid_0's auc: 0.742894
[500]	valid_0's auc: 0.742944
Early stopping, best iteration is:
[486]	valid_0's auc: 0.743111
[0.7483525374247579, 0.7520821654426307, 0.7431114412375024]
(0, 0)
Training until validation scores don't impro

In [16]:

#coding=utf-8

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
'''单变量特征选取'''
from sklearn.feature_selection import SelectKBest, chi2
'''去除方差小的特征'''
from sklearn.feature_selection import VarianceThreshold
'''循环特征选取'''
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
'''RFE_CV'''
from sklearn.ensemble import ExtraTreesClassifier


class FeatureSelection(object):
    def __init__(self,train, label, test, feature_list,feature_num):
        self.feature_name = feature_list     # feature name #
        self.feature_num = feature_num
        self.train, self.label, self.test = train.fillna(-1), label, test.fillna(-1)   # features #
        #归一化方便进行特征筛选
        standard=MinMaxScaler()
        self.train=standard.fit_transform(self.train[self.feature_name])
    def variance_threshold(self):
        sel = VarianceThreshold()
        sel.fit_transform(self.train)
        feature_var = list(sel.variances_)    # feature variance #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        print(features)   # 100 cols #
        return set(features)   # return set type #

    def select_k_best(self):
        ch2 = SelectKBest(chi2, k=self.feature_num)
        ch2.fit(self.train, self.label)
        feature_var = list(ch2.scores_)  # feature scores #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        print(features)     # 100 cols #
        return set(features)    # return set type #

    def svc_select(self):
        svc = SVC(kernel='rbf', C=1, random_state=2020,verbose=1)    # linear #
        rfe = RFE(estimator=svc, n_features_to_select=self.feature_num, step=1,verbose=1)
        rfe.fit(self.train, self.label.ravel())
        print(rfe.ranking_)
        return rfe.ranking_

    def tree_select(self):
        clf = ExtraTreesClassifier(n_estimators=300, max_depth=7, n_jobs=2,verbose=1)
        clf.fit(self.train, self.label)
        feature_var = list(clf.feature_importances_)  # feature scores #
        features = dict(zip(self.feature_name, feature_var))
        features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
        print(features)     # 100 cols #
        return set(features)  # return set type #

    def return_feature_set(self, variance_threshold=False, select_k_best=False, svc_select=False, tree_select=False):
        names = set([])
        if variance_threshold is True:
            name_one = self.variance_threshold()
            names = names.union(name_one)
        if select_k_best is True:
            name_two = self.select_k_best()
            names = names.intersection(name_two)
        if svc_select is True:
            name_three = self.svc_select()
            names = names.intersection(name_three)
        if tree_select is True:
            name_four = self.tree_select()
            names = names.intersection(name_four)

        # print(len(names))
        print(names)
        return list(names)

feats = [f for f in train.columns if f not in ['user', 'label']]
selection = FeatureSelection(train,target,test,feats,900)
selection_feature=selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True)
len(selection_feature)

ValueError: ignored