In [None]:
import gc
import os
import time

import lightgbm as lgb
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import plot, show
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import roc_auc_score,f1_score,accuracy_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Set the random seeds (NumPy's global generator and Python's `random` module)
import random
np.random.seed(42)  
random.seed(42)  

In [None]:
def cal_F2(y_true, y_pred, thes=0.5):
    """Binarise labels and scores, print precision/recall/F2 and return F2.

    Args:
        y_true: array-like of ground-truth labels; every non-zero value is
            treated as the positive class.
        y_pred: array-like of scores; values below ``thes`` become 0 and
            everything else (including NaN) becomes 1.
        thes: decision threshold applied to ``y_pred``.

    Returns:
        ``5*p*r / (4*p + r)`` where ``p``/``r`` are precision/recall, or 0.0
        when both are zero.
    """
    # np.asarray accepts lists and ndarrays as well as pandas Series.
    true_bin = (np.asarray(y_true) != 0).astype(int)
    # "not below the threshold" (instead of ">=") keeps NaN scores positive,
    # which is what the element-wise rule `0 if x < thes else 1` does.
    pred_bin = (~(np.asarray(y_pred) < thes)).astype(int)

    r = recall_score(true_bin, pred_bin)
    p = precision_score(true_bin, pred_bin)

    denom = 4 * p + r
    # Without any true positive p == r == 0: report 0.0 rather than dividing by zero.
    f2 = 5 * p * r / denom if denom > 0 else 0.0
    print(f'precision_score:{p}, recall_score:{r}, F2:{f2}')
    return f2

def cal_mape(y_true, y_pred):
    """Return the MAPE of ``y_pred`` against ``y_true``.

    Both arguments are handed to the ``mape`` metric unchanged; no
    binarisation of labels or predictions is applied.
    """
    return mape(y_true, y_pred)

# Downcast numeric columns to save memory (adapted from a Kaggle snippet)
def reduce_mem(df, cols, use_float16=True):
    """Downcast the numeric columns in ``cols`` to the narrowest fitting dtype.

    Signed-integer columns are narrowed to int8/int16/int32/int64 and floating
    columns to float16/float32 (falling back to float64) when the column's
    min and max lie strictly inside the candidate's range. Every other dtype
    (object, bool, unsigned, datetime, category, pandas extension dtypes) is
    left untouched, so such columns are neither turned into floats nor cause
    comparison errors.

    Args:
        df: DataFrame to compress; it is modified in place.
        cols: iterable of column names to consider.
        use_float16: when True (default) float columns may be narrowed to
            float16, which keeps only about three significant digits; pass
            False to stop at float32.

    Returns:
        The same ``df`` object, after printing the before/after size in MB
        and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in tqdm(cols):
        col_type = df[col].dtype
        # Only plain NumPy dtypes are handled; extension dtypes are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        if np.issubdtype(col_type, np.signedinteger):
            candidates, limits_of, fallback = int_candidates, np.iinfo, None
        elif np.issubdtype(col_type, np.floating):
            candidates, limits_of, fallback = float_candidates, np.finfo, np.float64
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            limits = limits_of(candidate)
            # Strict bounds: a value exactly at a type limit moves to the next wider type.
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # Nothing fitted (e.g. all-NaN floats): floats go to float64, ints stay as they are.
            if fallback is not None:
                df[col] = df[col].astype(fallback)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-sized frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, saved_pct))
    gc.collect()
    return df

# Plot the browsing-activity timeline of randomly sampled users
def plot_user_act(K=5, LIST=None):
    """Draw a day-vs-hour scatter of page views for ``K`` randomly chosen users.

    Args:
        K: number of users to sample; one figure is shown per user.
        LIST: collection of ``cust_wid`` values passed to ``np.random.choice``;
            it must be provided even though it defaults to None.

    Reads the module-level ``train_view`` frame (columns ``cust_wid`` and
    ``acs_tm``), which is not defined inside this function.
    """
    # One colour shared by the points and the legend patch so they agree.
    point_color = 'g'

    for _ in range(K):
        u = np.random.choice(LIST)
        # Copy so the helper columns go onto a private frame, not a slice of train_view.
        tmp = train_view.loc[train_view.cust_wid==u].copy()
        acs_time = pd.to_datetime(tmp['acs_tm'])  # parsed once, used for both columns
        tmp['day'] = acs_time.dt.day
        tmp['hour'] = acs_time.dt.hour

        plt.figure(figsize=(20,5))
        # Random jitter so events in the same day/hour cell do not overlap.
        xx = np.random.uniform(-0.3,0.3,len(tmp))
        yy = np.random.uniform(-0.5,0.5,len(tmp))
        plt.scatter(tmp.day.values+xx, tmp.hour.values+yy, s=25, c=point_color)
        plt.ylim((0,24))
        plt.xlim((0,30))
        c1 = mpatches.Patch(color=point_color, label='Click page')
        # Horizontal guides at 05:30 and 21:30.
        plt.plot([0,30],[6-0.5,6-0.5],'--',color='gray')
        plt.plot([0,30],[21+0.5,21+0.5],'--',color='gray')
        # Vertical guides: grey between days, black every seventh day.
        for day in range(0,30):
            plt.plot([day+0.5,day+0.5],[0,24],'--',color='gray')
        for week in range(1,5):
            plt.plot([7*week+0.5,7*week+0.5],[0,24],'--',color='black')
        plt.legend(handles=[c1])
        plt.xlabel('Day of August 2022',size=16)
        plt.xticks([1,5,10,15,20,25,29],['Mon\nAug 1st','Fri\nAug 5th','Wed\nAug 10th','Mon\nAug 15th','Sat\nAug 20th','Thr\nAug 25th','Mon\nAug 29th'])
        plt.ylabel('Hour of Day',size=16)
        plt.yticks([0,4,8,12,16,20,24],['midnight','4am','8am','noon','4pm','8pm','midnight'])
        plt.show()
        print('\n\n')
        

# Per-customer statistical features from the view table
def get_view_feats(df, view):
    """Left-join per-customer aggregates of ``view`` onto ``df``.

    Args:
        df: frame with a ``cust_wid`` column; its rows and order are kept.
        view: browsing log with columns ``cust_wid``, ``page_id``, ``acs_day``,
            ``acs_day_count`` and ``acs_timestamp_diff``.

    Returns:
        A new frame: ``df`` plus one column per aggregate. Customers that do
        not appear in ``view`` (or in a filtered subset of it) get NaN.
    """
    key = 'cust_wid'

    def _stat(frame, column, how, suffix):
        # One aggregate per customer, named "<column><suffix>".
        series = frame.groupby(key)[column].agg(how)
        return series.rename(series.name + suffix)

    # One row per (customer, day): acs_day_count is constant inside such a group.
    per_day = view.drop_duplicates(subset=[key, 'acs_day'])
    # Views that come more than one hour (3600 s) after the customer's previous one.
    spaced = view[view.acs_timestamp_diff > 3600]
    # Views on day-of-month 16 and later.
    recent = view[view.acs_day > 15]

    pieces = [
        _stat(view, key, 'count', '_count_view'),
        _stat(view, 'page_id', 'nunique', '_nunique_view'),
        # number of distinct days on which the customer has a view record
        _stat(view, 'acs_day', 'nunique', '_nunique_view'),
    ]
    # average / peak / spread of the daily view count
    pieces += [_stat(per_day, 'acs_day_count', how, f'_{how}_view')
               for how in ('mean', 'max', 'std')]
    pieces += [
        _stat(spaced, key, 'count', '_count_view_great_1h'),
        _stat(spaced, 'page_id', 'nunique', '_nunique_view_great_1h'),
        _stat(recent, key, 'count', '_count_view_resent_15day'),
        _stat(recent, 'page_id', 'nunique', '_nunique_view_resent_15day'),
    ]
    # statistics of the gap between consecutive views
    pieces += [_stat(view, 'acs_timestamp_diff', how, f'_{how}_view')
               for how in ('mean', 'max', 'std', 'skew')]

    stats = pd.concat(pieces, axis=1).reset_index()
    return df.merge(stats, on=key, how='left')


# Per-customer statistical features from the trx table
def get_trx_feats(df, trx):
    """Left-join per-customer aggregates of ``trx`` onto ``df``.

    Args:
        df: frame with a ``cust_wid`` column; its rows and order are kept.
        trx: transaction log with columns ``cust_wid``, ``trx_cd``,
            ``trx_amt`` and ``trx_timestamp_diff``.

    Returns:
        A new frame: ``df`` plus one column per aggregate; customers without
        matching transactions get NaN. Missing ``trx_amt`` values are not
        filled before aggregating.
    """
    key = 'cust_wid'

    def _stat(frame, column, how, suffix):
        # One aggregate per customer, named "<column><suffix>".
        series = frame.groupby(key)[column].agg(how)
        return series.rename(series.name + suffix)

    # Split by sign of the amount: non-negative vs. negative.
    non_negative = trx[trx.trx_amt >= 0]
    negative = trx[trx.trx_amt < 0]

    pieces = [
        _stat(trx, key, 'count', '_count_trx'),
        _stat(trx, 'trx_cd', 'nunique', '_nunique_trx'),
    ]
    pieces += [_stat(trx, 'trx_amt', how, f'_{how}_trx')
               for how in ('sum', 'mean', 'min', 'max', 'std')]
    pieces += [_stat(trx, 'trx_timestamp_diff', how, f'_{how}_trx')
               for how in ('std', 'skew')]
    pieces += [
        _stat(non_negative, 'trx_amt', 'sum', '_sum_trx1'),
        _stat(negative, 'trx_amt', 'sum', '_sum_trx2'),
    ]

    stats = pd.concat(pieces, axis=1).reset_index()
    return df.merge(stats, on=key, how='left')

# Train LightGBM models with stratified K-fold cross-validation
def train_model(df_train=None, stage=1, test_mode=False, aug_train=None):
    """Fit one LightGBM model per fold and collect out-of-fold predictions.

    Args:
        df_train: training frame holding the feature columns and the target
            column (``'bilabel'`` for stage 1, ``'new_label'`` for stage 2).
        stage: 1 trains a binary model scored by AUC; 2 trains a 14-class
            model scored by multi-logloss.
        test_mode: when True, every fold model also predicts the test frame
            and the fold predictions are averaged.
        aug_train: optional extra rows appended to the training part of every
            fold (never to the validation part).

    Returns:
        ``(df_train, df_test, importance)`` when ``test_mode`` is True,
        otherwise ``(df_train, importance, oof)``. ``importance`` is the
        gain importance averaged over all trained models.

    Side effects:
        Reads the module-level ``feats`` list and, in test mode, the
        module-level ``df_test`` frame; writes an ``'oof'`` column to
        ``df_train`` (and to ``df_test`` in test mode); saves every fold
        model under ``tmp/``.
    """
    obj = {1:'binary', 2:'multiclass'}
    metr = {1:'auc', 2:'multi_logloss'}
    LABEL = 'bilabel' if stage==1 else 'new_label'
    print(LABEL)

    num_class = 14  # number of classes used by the stage-2 objective
    params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': obj[stage],
        'metric': metr[stage],
        'num_leaves': 32,
        'verbose': -1,
        'seed': 2222,
        'n_jobs': -1,

        'min_child_weight': 5,
        'max_depth':6,
        'lambda_l1':0.2,
        'lambda_l2':0.2,

        'feature_fraction': 0.2,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
    }
    if stage==2:
        params['num_class'] = num_class

    fold_num = 5
    seeds = [2222]
    n_models = fold_num * len(seeds)
    oof = np.zeros(len(df_train)) if stage==1 else np.zeros([len(df_train), num_class])

    # The fold models are dumped here; create the folder instead of failing on a fresh checkout.
    os.makedirs('tmp', exist_ok=True)

    importance = 0
    pred_y = pd.DataFrame()
    for seed in seeds:
        kf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
        for fold, (train_idx, val_idx) in enumerate(kf.split(df_train[feats], df_train[LABEL])):
            print('-----------', fold)

            # The splitter yields positions, so select by position (.iloc), not by index label.
            trn = df_train.iloc[train_idx]
            val = df_train.iloc[val_idx]
            if aug_train is not None:
                trn = pd.concat([trn, aug_train])
            trn = trn.reset_index(drop=True)

            train = lgb.Dataset(trn[feats],
                                trn[LABEL])
            valid = lgb.Dataset(val[feats],
                                val[LABEL])
            model = lgb.train(params, train, valid_sets=[train,valid],
                              valid_names=('train', 'valid'),
                              num_boost_round=3000,
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)])
            # NOTE(review): joblib is not imported in the cells visible here; it must be
            # in scope before this function runs.
            if stage==1:
                joblib.dump(model, f'tmp/lgbm_{fold}.pkl')
            else:
                joblib.dump(model, f'tmp/stage2_lgbm_{fold}.pkl')

            oof[val_idx] += model.predict(val[feats], num_iteration=model.best_iteration) / len(seeds)
            if test_mode:
                # Same iteration choice as the out-of-fold prediction above.
                pred_y['fold_%d_seed_%d' % (fold, seed)] = model.predict(
                    df_test[feats], num_iteration=model.best_iteration)
            # Average over every trained model (folds x seeds), not over folds only.
            importance += model.feature_importance(importance_type='gain') / n_models

    # NOTE(review): in stage 2 `oof` is 2-D (rows x num_class); storing it in a single
    # column (and the per-fold test predictions in `pred_y`) looks unsupported — confirm
    # the intended handling before using stage 2.
    df_train['oof'] = oof
    if test_mode:
        df_test['oof'] = pred_y.mean(axis=1).values
        return df_train, df_test, importance
    return df_train, importance, oof

In [None]:
# Base customer tables for the training set and test set B
# (loading of the test-A base table is disabled).
train = pd.read_csv('../data/train_base.csv')
testb = pd.read_csv('../data/testb_base.csv')

# Behaviour logs stored as feather files.
view = pd.read_feather('../view.feather')  # app page-view records
trx = pd.read_feather('../trx.feather')  # income/expense transaction records

In [None]:
# Stack test B on top of train so both are encoded with the same label mapping.
df = pd.concat([testb, train], ignore_index=True)

# Columns that get integer codes from a LabelEncoder.
cat_cols = ['gdr_cd', 'cty_cd']
for f in cat_cols:
    le = LabelEncoder()
    df[f] = le.fit_transform(df[f])

# Order rows by customer id and renumber the index from 0.
df = df.sort_values(by=['cust_wid']).reset_index(drop=True)

In [None]:
# Parse the "YYYY-MM-DD HH:MM" strings once, vectorised. The timestamps are seconds since
# 1970-01-01 with the strings read as timezone-naive wall-clock time, so the values no
# longer depend on the timezone of the machine running the notebook.
TIME_FORMAT = "%Y-%m-%d %H:%M"
acs_time = pd.to_datetime(view['acs_tm'], format=TIME_FORMAT)
trx_time = pd.to_datetime(trx['trx_tm'], format=TIME_FORMAT)
epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)

view['acs_timestamp'] = (acs_time - epoch) // one_second
trx['trx_timestamp'] = (trx_time - epoch) // one_second

# Seconds since the same customer's previous record (NaN for the first one).
# NOTE(review): diff() follows the stored row order — assumes both logs are already
# sorted by time within each customer; confirm.
view['acs_timestamp_diff'] = view.groupby('cust_wid')['acs_timestamp'].diff()
trx['trx_timestamp_diff'] = trx.groupby('cust_wid')['trx_timestamp'].diff()

view['acs_day'] = acs_time.dt.day
view['acs_hour'] = acs_time.dt.hour
# Number of views the customer made on that day, repeated on every row of the day.
view['acs_day_count'] = view.groupby(['cust_wid','acs_day'])['acs_day'].transform('count')

# The parsed datetime columns are only needed above; free them before compressing.
del acs_time, trx_time

view = reduce_mem(view, view.columns)
trx = reduce_mem(trx, trx.columns)

In [None]:
# Attach the per-customer view aggregates first, then the transaction aggregates.
df = df.pipe(get_view_feats, view).pipe(get_trx_feats, trx)

In [None]:
############################################Word2Vec##########################################
# Mean Word2Vec embedding of each customer's page-id sequence.
# NOTE(review): Word2Vec is not imported in the cells visible here; it must be in scope
# before this cell runs.
size_dict = {'page_id_list':64}
for f in tqdm(['page_id_list']):
    # One page-id sequence per customer, indexed by cust_wid.
    page_seqs = view.groupby('cust_wid')['page_id'].agg(list)
    # Page ids are turned into string tokens before training.
    sentences = [[str(x) for x in seq] for seq in page_seqs.to_list()]
    emb_size = size_dict[f]
    # sg selects the training algorithm: 1 = skip-gram, otherwise CBOW (sg=0 here).
    model = Word2Vec(sentences, vector_size=emb_size, window=5, min_count=2, sg=0, hs=0, workers=1, seed=1, epochs=30)
    emb_matrix = []
    for seq in tqdm(sentences):
        vec = [model.wv.get_vector(w) for w in seq if w in model.wv]
        # A sequence without any in-vocabulary token gets an all-zero vector.
        emb_matrix.append(np.mean(vec, axis=0) if len(vec) > 0 else [0] * emb_size)
    emb_matrix = np.array(emb_matrix)

    # Align by customer id rather than by row position: this no longer assumes that
    # `view` and `df` hold exactly the same customers in the same order, nor a fixed
    # row count. Customers without any view record get NaN.
    w2v_cols = [f'{f}_w2v_{i}' for i in range(emb_size)]
    emb_frame = pd.DataFrame(emb_matrix, index=page_seqs.index, columns=w2v_cols)
    df[w2v_cols] = emb_frame.reindex(df['cust_wid']).to_numpy()

In [None]:
# Feature construction: TF-IDF over each customer's page-id sequence, reduced with SVD
size_dict = {'page_id_list':16}
TfidfVectorizer_feats = []
for f in tqdm(['page_id_list']):
    svd_cols = [f+f'_tfidf_svd_{i}' for i in range(size_dict[f])]

    # One space-separated "document" of page ids per customer, indexed by cust_wid.
    page_docs = view.groupby('cust_wid')['page_id'].agg(list).apply(lambda x: ' '.join([str(i) for i in x]))

    tfidf = TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (1,5))#word、char_wb
    tf = tfidf.fit_transform(page_docs.values)
    decom = TruncatedSVD(n_components=size_dict[f], n_iter = 50, random_state=42)
    decom_fea = pd.DataFrame(decom.fit_transform(tf), index=page_docs.index, columns=svd_cols)

    TfidfVectorizer_feats += svd_cols
    # Align by customer id rather than by row position, so a customer missing from
    # `view` yields NaN instead of a length error or silently shifted rows.
    df[svd_cols] = decom_fea.reindex(df['cust_wid']).to_numpy()
    del decom_fea
gc.collect()

In [None]:
LABEL = 'bilabel'

# Rows that carry a label form the training set; rows without one form the test set.
has_label = df['label'].notna()
df_train = df[has_label].reset_index(drop=True)
df_test = df[~has_label].reset_index(drop=True)

# Binary target: label 0 stays 0, every other label value becomes 1.
df_train['bilabel'] = df_train['label'].ne(0).astype('int64')

# Every column except the id, the targets and the prediction column is a model feature.
non_features = {'cust_wid', LABEL, 'label', 'oof'}
feats = [col for col in df_train.columns if col not in non_features]
len(feats)

In [None]:
# Stage 1 (binary 'bilabel' target) with test prediction enabled: returns the train frame,
# the test frame and the averaged feature importances.
df_train, df_test, imp = train_model(df_train=df_train, stage=1, test_mode=True)

In [None]:
# Inspect feature importance: print the 40 features with the highest value.
feats_importance = pd.DataFrame({'col_name': feats, 'importance': imp})
print(feats_importance.sort_values('importance', ascending=False).head(40))