In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
from sklearn.metrics import cohen_kappa_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
import os
warnings.filterwarnings('ignore')

### 读取数据

这里依次读取主办方提供的数据。为了后面做特征方便，增加了mon和season字段；读取测试集时对mon做了特别处理（月份加12），保证测试集的时间排在训练集之后，season则保持文件名中的原始季度编号不变。

In [2]:
# Quarterly label tables (Q3 and Q4) supplied with the training data.
y_Q3_3, y_Q4_3 = map(pd.read_csv, ('y_train_3/y_Q3_3.csv', 'y_train_3/y_Q4_3.csv'))

In [3]:
# Monthly AUM snapshots. Files are named aum_m<month>.csv; files from the test
# folder get mon+12 so that they sort after every training month.
# Each folder is walked explicitly (instead of guessing the folder from the
# month number), the listing is sorted because os.listdir order is arbitrary,
# and non-CSV entries such as .ipynb_checkpoints are skipped.
aum_fils = []
aum = []
for folder, mon_offset in (('x_train/aum_train/', 0), ('x_test/aum_test/', 12)):
    for f in sorted(os.listdir(folder)):
        if not f.endswith('.csv'):
            continue
        print(f)
        mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))
        tmp = pd.read_csv(folder + f)
        tmp['mon'] = mon + mon_offset
        aum.append(tmp)
        aum_fils.append(f)
aum = pd.concat(aum, axis=0, ignore_index=True)

aum_m10.csv
aum_m11.csv
aum_m12.csv
aum_m7.csv
aum_m8.csv
aum_m9.csv
aum_m1.csv
aum_m2.csv
aum_m3.csv


In [4]:
# Monthly behavior snapshots. Files are named behavior_m<month>.csv; files from
# the test folder get mon+12 so that they sort after every training month.
# Each folder is walked explicitly (instead of guessing the folder from the
# month number), the listing is sorted because os.listdir order is arbitrary,
# and non-CSV entries such as .ipynb_checkpoints are skipped.
behavior_fils = []
behavior = []
for folder, mon_offset in (('x_train/behavior_train/', 0), ('x_test/behavior_test/', 12)):
    for f in sorted(os.listdir(folder)):
        if not f.endswith('.csv'):
            continue
        print(f)
        mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))
        tmp = pd.read_csv(folder + f)
        tmp['mon'] = mon + mon_offset
        behavior.append(tmp)
        behavior_fils.append(f)
behavior = pd.concat(behavior, axis=0, ignore_index=True)

behavior_m10.csv
behavior_m11.csv
behavior_m12.csv
behavior_m7.csv
behavior_m8.csv
behavior_m9.csv
behavior_m1.csv
behavior_m2.csv
behavior_m3.csv


In [5]:
# Quarterly big-event tables. Files are named big_event_Q<season>.csv; the
# season number from the file name is stored unchanged.
# Each folder is walked explicitly (instead of guessing the folder from the
# season number), the listing is sorted because os.listdir order is arbitrary,
# and non-CSV entries such as .ipynb_checkpoints are skipped.
event_fils = []
event = []
for folder in ('x_train/big_event_train/', 'x_test/big_event_test/'):
    for f in sorted(os.listdir(folder)):
        if not f.endswith('.csv'):
            continue
        print(f)
        season = int((f.split('.')[0]).split('_')[-1].replace('Q', ''))
        tmp = pd.read_csv(folder + f)
        tmp['season'] = season
        event.append(tmp)
        event_fils.append(f)
event = pd.concat(event, axis=0, ignore_index=True)

big_event_Q3.csv
big_event_Q4.csv
big_event_Q1.csv


In [6]:
# Monthly deposit (cunkuan) snapshots. Files are named cunkuan_m<month>.csv;
# files from the test folder get mon+12 so they sort after every training month.
# Each folder is walked explicitly (instead of guessing the folder from the
# month number), the listing is sorted because os.listdir order is arbitrary,
# and non-CSV entries such as .ipynb_checkpoints are skipped.
cunkuan_fils = []
cunkuan = []
for folder, mon_offset in (('x_train/cunkuan_train/', 0), ('x_test/cunkuan_test/', 12)):
    for f in sorted(os.listdir(folder)):
        if not f.endswith('.csv'):
            continue
        print(f)
        mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))
        tmp = pd.read_csv(folder + f)
        tmp['mon'] = mon + mon_offset
        cunkuan.append(tmp)
        cunkuan_fils.append(f)
cunkuan = pd.concat(cunkuan, axis=0, ignore_index=True)

cunkuan_m10.csv
cunkuan_m11.csv
cunkuan_m12.csv
cunkuan_m7.csv
cunkuan_m8.csv
cunkuan_m9.csv
cunkuan_m1.csv
cunkuan_m2.csv
cunkuan_m3.csv


In [7]:
# Customer tables for the training quarters (Q3, Q4).
cust_avli_Q3, cust_avli_Q4, cust_info_Q3, cust_info_Q4 = [
    pd.read_csv(path)
    for path in (
        'x_train/cust_avli_Q3.csv',
        'x_train/cust_avli_Q4.csv',
        'x_train/cust_info_Q3.csv',
        'x_train/cust_info_Q4.csv',
    )
]

# Customer tables for the test quarter (Q1).
cust_avli_Q1, cust_info_Q1 = [
    pd.read_csv(path)
    for path in ('x_test/cust_avli_Q1.csv', 'x_test/cust_info_Q1.csv')
]

### 特征工程

In [8]:
# Training rows come from the Q4 label table, test rows from the Q1 customer list.
train, test = y_Q4_3.copy(), cust_avli_Q1.copy()
(train.shape, test.shape)

((76170, 2), (76722, 1))

第一组特征很自然的想到用户历史的label，例如在预测季度4的用户时，使用用户在季度3的label作为特征。可以简单看到这个特征的kappa值可以达到0.238+。

In [9]:
# Previous-quarter label as a feature: Q3 labels for the Q4 training rows,
# Q4 labels for the Q1 test rows.
# The rename is applied to a temporary copy rather than rebinding y_Q3_3 /
# y_Q4_3, so the label tables keep their 'label' column and the cell that
# builds `train` from y_Q4_3 can still be re-run. Any existing 'bef_label'
# is dropped first so that re-running this cell does not create
# bef_label_x / bef_label_y duplicates.
prev_label_train = y_Q3_3.rename(columns={'label': 'bef_label'})
train = train.drop(columns=['bef_label'], errors='ignore')
train = train.merge(prev_label_train, on=['cust_no'], how='left')

prev_label_test = y_Q4_3.rename(columns={'label': 'bef_label'})
test = test.drop(columns=['bef_label'], errors='ignore')
test = test.merge(prev_label_test, on=['cust_no'], how='left')

In [10]:
# Agreement between the current label and the previous-quarter label
# (customers without a previous label are filled with 1); both sides shifted by +1.
cohen_kappa_score(train['label'].add(1), train['bef_label'].fillna(1).add(1))

0.23896181609901146

接下来可以拼接下用户的基础特征，这里我只是对一些类别变量做了LabelEncoder。

In [11]:
# Attach the customer profile of the matching quarter (Q4 for train, Q1 for test).
train = pd.merge(train, cust_info_Q4, how='left', on=['cust_no'])
test = pd.merge(test, cust_info_Q1, how='left', on=['cust_no'])

In [12]:
# Label-encode every object-typed column (except the key and the label) with
# one encoder fitted on train and test together, so both share the same codes.
cat_cols = [f for f in train.select_dtypes('object').columns if f not in ['label', 'cust_no']]
for col in cat_cols:
    # Assign the result back: `train[col].fillna(..., inplace=True)` acts on a
    # temporary selection and is a no-op under pandas copy-on-write.
    train[col] = train[col].fillna('-1')
    test[col] = test[col].fillna('-1')
    le = LabelEncoder()
    # LabelEncoder expects a 1-D array, so concatenate Series rather than frames.
    le.fit(pd.concat([train[col], test[col]], axis=0, ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [13]:
# Row/column counts after the profile merge and encoding.
tuple(frame.shape for frame in (train, test))

((76170, 23), (76722, 22))

这题最重要的应该是用户行为相关的数据，下面我们开始做一些简单的操作：
1. 用户当季度存款（cunkuan）的mean、max、min、std、sum、last的统计
2. 用户当季度最后一个月的aum数据
3. 用户当季度最后一个月的behavior数据
4. 用户当季度的event的特征，这里大多数都是时间，所以用该季度月末的后一天做时间差特征

In [14]:
# Ratio feature, then order rows by customer and month so that the 'last'
# aggregate picks the latest month inside each window.
cunkuan['C3'] = cunkuan['C1'].div(cunkuan['C2'])
cunkuan = cunkuan.sort_values(by=['cust_no', 'mon']).reset_index(drop=True)

# Per-column statistics; the order of each list fixes the output column order.
agg_stat = {
    'C1': ['mean', 'max', 'min', 'std', 'sum', 'last'],
    'C2': ['mean', 'sum', 'min', 'max', 'std', 'last'],
    'C3': ['mean', 'max', 'min', 'std', 'sum', 'last'],
}

# Train window: months 10..12.
train_window = cunkuan['mon'].between(10, 12)
group_df = cunkuan[train_window].groupby(['cust_no']).agg(agg_stat)
group_df.columns = ['_'.join(pair) for pair in group_df.columns]
group_df = group_df.reset_index()
train = pd.merge(train, group_df, how='left', on=['cust_no'])

# Test window: months 13..15 (test months were shifted by +12 when loading).
test_window = cunkuan['mon'].between(13, 15)
group_df = cunkuan[test_window].groupby(['cust_no']).agg(agg_stat)
group_df.columns = ['_'.join(pair) for pair in group_df.columns]
group_df = group_df.reset_index()
test = pd.merge(test, group_df, how='left', on=['cust_no'])

In [15]:
# Row-wise totals over the raw X* columns of the AUM table.
# The derived X_sum / X_num columns also start with 'X'; exclude them so that
# re-running this cell does not fold them back into the sum and the count.
X_cols = [f for f in aum.columns if f.startswith('X') and f not in ('X_sum', 'X_num')]
aum['X_sum'] = aum[X_cols].sum(axis=1)
aum['X_num'] = (aum[X_cols]>0).sum(axis=1)

In [16]:
X_cols = ['X%d' % idx for idx in range(1, 9)]

# Train: AUM snapshot of month 12 (last month of the training quarter).
tmp = aum.loc[aum['mon'] == 12].drop(columns=['mon'])
train = pd.merge(train, tmp, how='left', on=['cust_no'])

# Test: AUM snapshot of month 15 (last month of the test quarter).
tmp = aum.loc[aum['mon'] == 15].drop(columns=['mon'])
test = pd.merge(test, tmp, how='left', on=['cust_no'])

In [17]:
# Difference feature between B5 and B3.
behavior['B5-B3'] = behavior['B5'].sub(behavior['B3'])

# Train: behavior snapshot of month 12.
tmp = behavior.loc[behavior['mon'] == 12].drop(columns=['mon'])
train = pd.merge(train, tmp, how='left', on=['cust_no'])

# Test: behavior snapshot of month 15.
tmp = behavior.loc[behavior['mon'] == 15].drop(columns=['mon'])
test = pd.merge(test, tmp, how='left', on=['cust_no'])

In [18]:
# Seconds between the B6 timestamp and the first instant after the quarter ends.
for frame, quarter_end in ((train, '2020-01-01 00:00:00'), (test, '2020-04-01 00:00:00')):
    elapsed = pd.to_datetime(quarter_end) - pd.to_datetime(frame['B6'])
    frame['B6_gap'] = elapsed.dt.total_seconds()

In [19]:
# Hour of day of the B6 timestamp.
for frame in (train, test):
    frame['B6_hour'] = pd.to_datetime(frame['B6']).dt.hour

In [20]:
# Number of non-missing E* fields per event row.
E_cols = [name for name in event.columns if name.startswith('E')]
event['event_num'] = event[E_cols].notnull().sum(axis=1)

# Train: events of season 4.
tmp = event.loc[event['season'] == 4].drop(columns=['season'])
train = pd.merge(train, tmp, how='left', on=['cust_no'])

# Test: events of season 1.
tmp = event.loc[event['season'] == 1].drop(columns=['season'])
test = pd.merge(test, tmp, how='left', on=['cust_no'])

In [21]:
# Turn every E* column except E15 and E17 into "days before the day after the
# quarter ends".
date_cols = [col for col in E_cols if col not in ['E15', 'E17']]
for frame, quarter_end in ((train, '2020-01-01 00:00:00'), (test, '2020-04-01 00:00:00')):
    for col in date_cols:
        # A numeric column has already been converted by an earlier run of this
        # cell (or holds no values at all). Parsing day counts with to_datetime
        # would read them as epoch nanoseconds and silently corrupt the feature.
        if pd.api.types.is_numeric_dtype(frame[col]):
            continue
        frame[col] = (pd.to_datetime(quarter_end) - pd.to_datetime(frame[col])).dt.days

### 模型训练

以上就构成了我们baseline的基础特征，下面开始训练模型。这里采用LightGBM做5折交叉验证的多分类，早停指标直接使用kappa值。因为多分类训练要求目标值从0开始，所以我们对原始label做了+1处理（记得提交的时候要减回去）。

In [22]:
def kappa(preds, train_data):
    """Custom LightGBM evaluation metric: Cohen's kappa for a multiclass model.

    Args:
        preds: Scores passed to ``feval``. Either a flat 1-D array grouped by
            class first (all rows for class 0, then class 1, ...) or a 2-D
            array of shape (n_rows, n_classes).
        train_data: Dataset being evaluated; its ``label`` holds the true classes.

    Returns:
        ``('kappa', score, True)``; the final ``True`` marks higher as better.
    """
    y_true = train_data.label
    preds = np.asarray(preds)
    if preds.ndim == 2:
        # Already one row per sample: pick the best class per row.
        pred_labels = np.argmax(preds, axis=1)
    else:
        # Flat class-major layout: one row per class, one column per sample.
        # The class count is derived from the number of labels instead of
        # being hard-coded.
        pred_labels = np.argmax(preds.reshape(-1, len(y_true)), axis=0)
    score = cohen_kappa_score(y_true, pred_labels)
    return 'kappa', score, True

def LGB_classfication_model(train, target, test, k):
    """Train a k-fold LightGBM multiclass model and predict the test frame.

    Args:
        train: Training frame. Every column except 'cust_no', 'label', 'I7',
            'I9' and 'B6' is used as a feature.
        target: Integer class labels starting at 0, aligned row by row with
            ``train``.
        test: Frame to predict; must contain the same feature columns.
        k: Number of stratified folds.

    Returns:
        Tuple ``(output_preds, oof_probs, mean_kappa)``:
        ``output_preds`` is a list with one test probability matrix per fold,
        ``oof_probs`` is the out-of-fold probability matrix for ``train``,
        ``mean_kappa`` is the mean of each fold's best validation kappa.
    """
    feats = [f for f in train.columns if f not in ['cust_no', 'label', 'I7', 'I9', 'B6']]
    print('Current num of features:', len(feats))
    # Fold indices are positional; use a plain array so they stay correct even
    # when ``target`` is a Series whose index is not 0..n-1.
    target_values = np.asarray(target)
    num_class = int(target_values.max()) + 1
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)
    oof_preds = np.zeros(train.shape[0])
    oof_probs = np.zeros((train.shape[0], num_class))
    output_preds = []
    fold_importances = []
    offline_score = []
    parameters = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        # Built-in metrics are disabled; early stopping relies on the kappa feval.
        'metric': 'None',
        'num_leaves': 63,
        'num_class': num_class,
        'feature_fraction': 0.8,
        # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq
        # is non-zero; it is left unset here to keep the results unchanged.
        'bagging_fraction': 0.8,
        'min_data_in_leaf': 20,
        'verbose': -1,
        'nthread': 12
    }
    for i, (train_index, test_index) in enumerate(folds.split(train, target_values)):
        train_y, test_y = target_values[train_index], target_values[test_index]
        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]
        dtrain = lgb.Dataset(train_X, label=train_y)
        dval = lgb.Dataset(test_X, label=test_y)
        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; newer ones expect callbacks.
        lgb_model = lgb.train(
            dict(parameters),
            dtrain,
            num_boost_round=5000,
            valid_sets=[dval],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=kappa,
        )
        # Predict the validation fold once and reuse it for probabilities and labels.
        fold_probs = lgb_model.predict(test_X, num_iteration=lgb_model.best_iteration)
        oof_probs[test_index] = fold_probs
        oof_preds[test_index] = np.argmax(fold_probs, axis=1)
        offline_score.append(lgb_model.best_score['valid_0']['kappa'])
        output_preds.append(lgb_model.predict(test[feats], num_iteration=lgb_model.best_iteration))
        # Feature importance of this fold; all folds are concatenated once below.
        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": lgb_model.feature_importance(importance_type='gain'),
            "fold": i + 1,
        }))
    feature_importance_df = pd.concat(fold_importances, axis=0)
    print('OOF-MEAN-KAPPA score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(15))
    print('confusion matrix:')
    print(confusion_matrix(target_values, oof_preds))
    print('classfication report:')
    print(classification_report(target_values, oof_preds))

    return output_preds, oof_probs, np.mean(offline_score)

In [23]:
# Shift the labels by +1 so the smallest class becomes 0, then run 5-fold training.
target = train['label'].add(1)
lgb_preds, lgb_oof, lgb_score = LGB_classfication_model(train=train, target=target, test=test, k=5)

Current num of features: 75
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.400189
[200]	valid_0's kappa: 0.408557
[300]	valid_0's kappa: 0.41303
[400]	valid_0's kappa: 0.414392
[500]	valid_0's kappa: 0.419225
[600]	valid_0's kappa: 0.416689
Early stopping, best iteration is:
[504]	valid_0's kappa: 0.421187
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.389094
[200]	valid_0's kappa: 0.399255
[300]	valid_0's kappa: 0.400023
[400]	valid_0's kappa: 0.403373
Early stopping, best iteration is:
[375]	valid_0's kappa: 0.40341
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.396125
[200]	valid_0's kappa: 0.408138
[300]	valid_0's kappa: 0.409113
[400]	valid_0's kappa: 0.41285
[500]	valid_0's kappa: 0.417589
Early stopping, best iteration is:
[499]	valid_0's kappa: 0.417908
Training until validation scores don't improve for 100 rounds
[100]	valid_0's kappa: 0.406317
[200]	valid_0'

### 线上提交

In [24]:
# Average the per-fold test probabilities, take the most likely class and
# undo the +1 label shift applied before training.
mean_probs = np.mean(lgb_preds, axis=0)
sub_df = test[['cust_no']].copy()
sub_df['label'] = mean_probs.argmax(axis=1) - 1
sub_df['label'].value_counts(normalize=True)

 1    0.754542
-1    0.123107
 0    0.122351
Name: label, dtype: float64

In [25]:
# Write the submission file without the row index.
sub_df.to_csv(path_or_buf='baseline_sub.csv', index=False)

In [26]:
# Preview the first five submission rows.
sub_df.iloc[:5]

Unnamed: 0,cust_no,label
0,0x3b9b4615,0
1,0x3b9ae61b,1
2,0x3b9add69,0
3,0x3b9b3601,1
4,0x3b9b2599,0
