In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from tqdm import tqdm

# Silence pandas' SettingWithCopyWarning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Show at most 100 rows (this is a cap, not "all rows").
pd.set_option('display.max_rows', 100)
# Allow up to 100 characters per displayed cell value (pandas' default is 50).
# The fully qualified option name is used instead of the abbreviated pattern
# 'max_colwidth', which only works through pandas' partial option matching.
pd.set_option('display.max_colwidth', 100)

In [3]:
# Load the raw competition tables: one train / test_a pair for each of the
# base, op and trans tables, plus the training labels.
train_base, test_a_base = map(pd.read_csv, ('data/train_base.csv', 'data/test_a_base.csv'))

train_op, test_a_op = map(pd.read_csv, ('data/train_op.csv', 'data/test_a_op.csv'))

train_trans, test_a_trans = map(pd.read_csv, ('data/train_trans.csv', 'data/test_a_trans.csv'))

train_label = pd.read_csv('data/train_label.csv')

In [127]:
# Re-read the base table and attach each user's label. Reading from disk again
# means re-running this cell never merges the label column a second time.
train_base = pd.read_csv('data/train_base.csv').merge(train_label, on='user', how='left')

# base

### 填充nan，去除user列

In [128]:
# Remove `service3_level` from the base table before any further processing.
train_base_no_nan = train_base.drop(columns=['service3_level'])

# Fill missing categorical values with fixed labels (the original author's note
# says these are the column modes; they are hard-coded, not computed here).
# A single DataFrame-level fillna is used instead of
# `frame[col].fillna(..., inplace=True)`: the chained in-place form operates on
# a temporary Series and no longer updates the frame under pandas Copy-on-Write.
train_base_no_nan = train_base_no_nan.fillna({
    'sex': 'category 0',
    'balance_avg': 'level 1',
    'balance1_avg': 'level 1',
    'balance2_avg': 'level 1',
})

# Keep the user ids aside (re-attached when the feature frame is assembled)
# and drop the id column from the modelling frame.
user = train_base_no_nan['user'].values
train_base_no_user = train_base_no_nan.drop(columns=['user'])

In [137]:
# From here on `train_base` is an alias of the user-less frame (same object,
# not a copy); `df_base` starts out empty and is rebuilt further down.
train_base, df_base = train_base_no_user, pd.DataFrame()

### one-hot

In [138]:
# Bin ages by their observed risk ratio: the share of label == 1 among the
# rows of each age (sum of labels divided by the number of labelled rows).
# NOTE(review): the bins are derived from the training labels themselves, so
# this feature carries target information -- confirm this is intended.
label_by_age = train_base.groupby('age')['label']
age_ratio = label_by_age.sum() / label_by_age.count()

# Each bin is closed on its lower bound and open on its upper bound, so a
# ratio of exactly 0.4 / 0.3 / 0.2 / 0.1 lands in a bin. With strict
# comparisons on both sides, those boundary values matched no bin at all.
age_1 = age_ratio[age_ratio >= 0.4].index.values

age_2 = age_ratio[(age_ratio >= 0.3) & (age_ratio < 0.4)].index.values

age_3 = age_ratio[(age_ratio >= 0.2) & (age_ratio < 0.3)].index.values

age_4 = age_ratio[(age_ratio >= 0.1) & (age_ratio < 0.2)].index.values

age_5 = age_ratio[age_ratio < 0.1].index.values

def age_class(age):
    """Return the risk-bin label for ``age`` as a string.

    The module-level arrays ``age_1`` .. ``age_5`` are checked in order and
    the zero-based position of the first array containing ``age`` is returned
    ('0' .. '4'). An age found in none of them yields '5'.
    """
    for bucket, ages in enumerate((age_1, age_2, age_3, age_4, age_5)):
        if age in ages:
            return str(bucket)
    return '5'


# Attach the bin label to every row of the base table.
train_base['age_class'] = train_base['age'].map(age_class)

In [139]:
# Categorical columns of the base table that get one-hot encoded.
category_columns = [
    'sex', 'age_class', 'provider', 'level', 'verified', 'regist_type',
    'agreement1', 'agreement2', 'agreement3', 'agreement4',
    'service3',
    'product1_amount', 'product2_amount', 'product3_amount',
    'product4_amount', 'product5_amount', 'product6_amount',
    'balance', 'balance_avg', 'balance1', 'balance1_avg', 'balance2', 'balance2_avg',
]
# One indicator column per (column, category) pair.
one_hot_df = pd.get_dummies(train_base[category_columns])

In [140]:
# Flag users whose four agreement columns are all 'category 0' ...
one_hot_df['agreement_all_0'] = np.bitwise_and.reduce([
    one_hot_df[indicator].values
    for indicator in ('agreement1_category 0', 'agreement2_category 0',
                      'agreement3_category 0', 'agreement4_category 0')
])
# ... and users whose four agreement columns are all 'category 1'.
one_hot_df['agreement_all_1'] = np.bitwise_and.reduce([
    one_hot_df[indicator].values
    for indicator in ('agreement1_category 1', 'agreement2_category 1',
                      'agreement3_category 1', 'agreement4_category 1')
])

In [141]:
# Flag users who sit on one specific "low" level combination across the six
# product amount columns.
one_hot_df['product_all_low'] = np.bitwise_and.reduce([
    one_hot_df[indicator].values
    for indicator in ('product1_amount_level 1', 'product2_amount_level 1',
                      'product3_amount_level 1', 'product4_amount_level 0',
                      'product5_amount_level 0', 'product6_amount_level 1')
])

# Flag users who sit on one specific "high" level combination.
# NOTE(review): the column is called `agreement_all_high` although it is built
# from product columns -- the name is kept so downstream column names do not
# change. Also confirm the chosen levels (e.g. product5 'level 0' and
# product1 'level 6') really are the intended "high" values.
one_hot_df['agreement_all_high'] = np.bitwise_and.reduce([
    one_hot_df[indicator].values
    for indicator in ('product1_amount_level 6', 'product2_amount_level 21',
                      'product3_amount_level 3', 'product4_amount_level 1',
                      'product5_amount_level 0', 'product6_amount_level 21')
])

In [148]:
# Flag users whose three balance columns are all at 'level 1' ...
one_hot_df['balance_low'] = np.bitwise_and.reduce([
    one_hot_df[indicator].values
    for indicator in ('balance_level 1', 'balance1_level 1', 'balance2_level 1')
])

# ... and users whose three balance columns are all at 'level 21'.
one_hot_df['balance_high'] = np.bitwise_and.reduce([
    one_hot_df[indicator].values
    for indicator in ('balance_level 21', 'balance1_level 21', 'balance2_level 21')
])

In [151]:
# Sanity check: (rows, columns) of the one-hot frame including the combination flags.
one_hot_df.shape

(47782, 203)

In [None]:
# Load two pre-computed artefacts and keep the first element of each.
# NOTE(review): neither variable is used in the visible part of the notebook,
# and the files are not produced here -- confirm they are still needed.
# joblib.load unpickles its input, so only load files from a trusted source.
city_mean, province_mean = (
    joblib.load(artefact)[0] for artefact in ('city_mean', 'province_mean')
)

In [None]:
def magic_feature(df, f1, f2):
    """Add the four arithmetic combinations of columns ``f1`` and ``f2`` to ``df``.

    The frame is modified in place and nothing is returned. New columns are
    named ``{f1}_{f2}_a`` (sum), ``{f1}_{f2}_s`` (difference),
    ``{f1}_{f2}_m`` (product) and ``{f1}_{f2}_d`` (quotient).
    The quotient is a plain division, so zero entries in ``f2`` are not handled.
    """
    left, right = df[f1], df[f2]
    stem = f'{f1}_{f2}'
    df[stem + '_a'] = left + right
    df[stem + '_s'] = left - right
    df[stem + '_m'] = left * right
    df[stem + '_d'] = left / right

### 连续变量的处理

In [152]:
# Keep only the int64 columns as the numeric feature block. `label` is one of
# them; it stays in the frame until the train/validation split drops it.
df_value = train_base_no_user.select_dtypes(include='int64')

In [153]:
# Derived totals and per-day averages built from the raw integer columns.
# The ratios are plain divisions, so a zero denominator is not handled here.
login_days = df_value['login_days_cnt']
login_period_total = df_value['login_cnt_period1'] + df_value['login_cnt_period2']

df_value = df_value.assign(
    # product7 attempts that did not fail
    product7_success_cnt=df_value['product7_cnt'] - df_value['product7_fail_cnt'],
    # total number of cards over the four card types
    card_cnt=(df_value['card_a_cnt'] + df_value['card_b_cnt']
              + df_value['card_c_cnt'] + df_value['card_d_cnt']),
    # per-login-day averages
    ip_cnt_avg=df_value['ip_cnt'] / login_days,
    login_cnt_period1_avg=df_value['login_cnt_period1'] / login_days,
    login_cnt_period2_avg=df_value['login_cnt_period2'] / login_days,
    login_cnt_period=login_period_total,
    login_cnt_period_avg=login_period_total / login_days,
    # service usage: total count and mean amount per service1 use
    service_cnt=df_value['service1_cnt'] + df_value['service2_cnt'],
    service_avg1_amt=df_value['service1_amt'] / df_value['service1_cnt'],
    # total operations over both op types
    op_cnt=df_value['op1_cnt'] + df_value['op2_cnt'],
)

In [156]:
# Min-max normalisation of every numeric column to the [0, 1] range.
# A column with a single distinct value has max == min and becomes NaN (0 / 0).
value_min = df_value.min()
value_span = df_value.max() - value_min
df_value = (df_value - value_min) / value_span

# Assemble the base feature frame: user id, one-hot block, numeric block.
df_base = pd.concat([pd.DataFrame({'user': user}), one_hot_df, df_value], axis=1)

In [157]:
# Sanity check: size and first rows of the assembled base feature frame.
print(df_base.shape)
df_base.head()

(47782, 235)


Unnamed: 0,user,sex_category 0,sex_category 1,age_class_0,age_class_1,age_class_2,age_class_3,age_class_4,provider_category 0,provider_category 1,provider_category 2,level_category 0,level_category 1,level_category 2,verified_category 0,verified_category 1,regist_type_category 0,regist_type_category 1,regist_type_category 2,regist_type_category 3,regist_type_category 4,regist_type_category 5,regist_type_category 6,regist_type_category 7,agreement1_category 0,agreement1_category 1,agreement2_category 0,agreement2_category 1,agreement3_category 0,agreement3_category 1,agreement4_category 0,agreement4_category 1,service3_category 0,service3_category 1,product1_amount_level 1,product1_amount_level 2,product1_amount_level 3,product1_amount_level 4,product1_amount_level 5,product1_amount_level 6,product1_amount_level 7,product2_amount_level 1,product2_amount_level 10,product2_amount_level 11,product2_amount_level 12,product2_amount_level 13,product2_amount_level 14,product2_amount_level 15,product2_amount_level 16,product2_amount_level 17,product2_amount_level 18,product2_amount_level 19,product2_amount_level 2,product2_amount_level 20,product2_amount_level 21,product2_amount_level 3,product2_amount_level 4,product2_amount_level 5,product2_amount_level 6,product2_amount_level 7,product2_amount_level 8,product2_amount_level 9,product3_amount_level 1,product3_amount_level 2,product3_amount_level 3,product4_amount_level 0,product4_amount_level 1,product5_amount_level 0,product5_amount_level 1,product6_amount_level 1,product6_amount_level 10,product6_amount_level 11,product6_amount_level 12,product6_amount_level 13,product6_amount_level 14,product6_amount_level 15,product6_amount_level 16,product6_amount_level 17,product6_amount_level 18,product6_amount_level 19,product6_amount_level 2,product6_amount_level 20,product6_amount_level 21,product6_amount_level 3,product6_amount_level 4,product6_amount_level 5,product6_amount_level 6,product6_amount_level 7,product6_amount_level 
8,product6_amount_level 9,balance_level 1,balance_level 11,balance_level 12,balance_level 13,balance_level 14,balance_level 15,balance_level 16,balance_level 17,balance_level 18,balance_level 19,balance_level 2,balance_level 20,balance_level 21,balance_level 3,balance_level 4,balance_level 5,balance_level 6,balance_level 7,balance_level 8,balance_level 9,balance_avg_level 1,balance_avg_level 10,balance_avg_level 11,balance_avg_level 12,balance_avg_level 13,balance_avg_level 14,balance_avg_level 15,balance_avg_level 16,balance_avg_level 17,balance_avg_level 18,balance_avg_level 19,balance_avg_level 2,balance_avg_level 20,balance_avg_level 21,balance_avg_level 3,balance_avg_level 4,balance_avg_level 5,balance_avg_level 6,balance_avg_level 7,balance_avg_level 8,balance_avg_level 9,balance1_level 1,balance1_level 10,balance1_level 11,balance1_level 12,balance1_level 13,balance1_level 14,balance1_level 15,balance1_level 16,balance1_level 17,balance1_level 18,balance1_level 19,balance1_level 2,balance1_level 20,balance1_level 21,balance1_level 3,balance1_level 4,balance1_level 5,balance1_level 6,balance1_level 7,balance1_level 8,balance1_level 9,balance1_avg_level 1,balance1_avg_level 10,balance1_avg_level 11,balance1_avg_level 12,balance1_avg_level 13,balance1_avg_level 14,balance1_avg_level 15,balance1_avg_level 16,balance1_avg_level 17,balance1_avg_level 18,balance1_avg_level 19,balance1_avg_level 2,balance1_avg_level 20,balance1_avg_level 21,balance1_avg_level 3,balance1_avg_level 4,balance1_avg_level 5,balance1_avg_level 6,balance1_avg_level 7,balance1_avg_level 8,balance1_avg_level 9,balance2_level 1,balance2_level 10,balance2_level 12,balance2_level 13,balance2_level 14,balance2_level 15,balance2_level 16,balance2_level 17,balance2_level 18,balance2_level 19,balance2_level 2,balance2_level 20,balance2_level 21,balance2_level 3,balance2_level 4,balance2_level 5,balance2_level 6,balance2_level 7,balance2_level 8,balance2_level 9,balance2_avg_level 
1,balance2_avg_level 2,balance2_avg_level 3,balance2_avg_level 4,balance2_avg_level 5,agreement_all_0,agreement_all_1,product_all_low,agreement_all_high,balance_low,balance_high,age,using_time,card_a_cnt,card_b_cnt,card_c_cnt,op1_cnt,op2_cnt,card_d_cnt,agreement_total,service1_cnt,service1_amt,service2_cnt,acc_count,login_cnt_period1,login_cnt_period2,ip_cnt,login_cnt_avg,login_days_cnt,product7_cnt,product7_fail_cnt,label,product7_success_cnt,card_cnt,ip_cnt_avg,login_cnt_period1_avg,login_cnt_period2_avg,login_cnt_period,login_cnt_period_avg,service_cnt,service_avg1_amt,op_cnt
0,Train_06800,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0.235862,0.415094,0.057692,0.098361,0.0,0.091346,0.027397,0.0,0.203252,0.0,0.0,0.0,0.003762,0.004402,0.003142,0.01015,0.021058,0.078467,0.0,0.0,0.0,0.0,0.057692,0.225496,0.004135,0.002801,0.00381,0.003509,0.0,1.7e-05,0.110132
1,Train_23487,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0.26069,0.245283,0.125,0.213115,0.0,0.0625,0.027397,0.0,0.252033,0.0,0.0,0.0,0.001736,0.004759,0.004061,0.045406,0.012243,0.167883,0.04878,0.0,1.0,0.857143,0.125,0.235239,0.004076,0.003265,0.004431,0.003695,0.0,1.7e-05,0.0837
2,Train_36880,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0.202759,0.320755,0.125,0.213115,0.0,0.028846,0.027397,0.0,0.349593,0.0,0.0,0.0,0.005498,0.019068,0.010166,0.123932,0.018119,0.322993,0.04878,0.0,1.0,0.857143,0.125,0.270565,0.017896,0.008655,0.014887,0.013557,0.0,1.7e-05,0.052863
3,Train_35392,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0.32,0.018868,0.057692,0.098361,0.0,0.0,0.0,0.0,0.203252,0.0,0.0,0.0,0.005498,0.003347,0.003779,0.049145,0.012243,0.155109,0.04878,0.0,0.0,0.857143,0.057692,0.243241,0.002696,0.003046,0.00355,0.00286,0.0,1.7e-05,0.0
4,Train_35057,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0.344828,0.415094,0.125,0.098361,0.070588,0.028846,0.027397,0.0,0.349593,0.0,0.0,0.0,0.001736,0.003646,0.001435,0.032585,0.002938,0.155109,0.04878,0.0,1.0,0.857143,0.120192,0.225519,0.003001,0.000651,0.002607,0.001898,0.0,1.7e-05,0.052863


## OP处理

In [158]:
# One row per user that appears in the op table; feature columns are added below.
group = train_op.groupby(['user']).count()
op_df = pd.DataFrame({'user': group.index})

In [159]:
# Peek at the raw operation log.
train_op.head()

Unnamed: 0,user,op_type,op_mode,op_device,ip,net_type,channel,ip_3,tm_diff
0,Train_36517,b26bc49195bd79cf,87ee0bdf333a54da,92dc8b3f9a9ea13b,bbb0af60b941370b,116a2503b987ea81,4e1ff124e1e6adc8,1e46c177cd9d539a,11 days 09:38:22.000000000
1,Train_36517,b26bc49195bd79cf,87ee0bdf333a54da,92dc8b3f9a9ea13b,bbb0af60b941370b,116a2503b987ea81,4e1ff124e1e6adc8,1e46c177cd9d539a,11 days 09:38:21.000000000
2,Train_36517,b26bc49195bd79cf,87ee0bdf333a54da,92dc8b3f9a9ea13b,bbb0af60b941370b,116a2503b987ea81,4e1ff124e1e6adc8,1e46c177cd9d539a,11 days 09:38:23.000000000
3,Train_36517,b26bc49195bd79cf,87ee0bdf333a54da,92dc8b3f9a9ea13b,bbb0af60b941370b,116a2503b987ea81,4e1ff124e1e6adc8,1e46c177cd9d539a,11 days 09:38:26.000000000
4,Train_36517,b26bc49195bd79cf,87ee0bdf333a54da,92dc8b3f9a9ea13b,bbb0af60b941370b,116a2503b987ea81,4e1ff124e1e6adc8,1e46c177cd9d539a,11 days 09:38:41.000000000


In [160]:
# Categorical op columns to summarise per user. They are chosen by excluding
# the non-categorical columns by name rather than by position
# (`columns[1:-1]`): once `time_diff_sec` has been appended to `train_op`, the
# positional slice would also pick up the raw `tm_diff` strings on a re-run.
count_column = train_op.columns.drop(['user', 'tm_diff', 'time_diff_sec'], errors='ignore')
print(count_column)

# Group once and reuse it for every column. The groupby output is sorted by
# user, which is the same order as the `user` column of `op_df`.
op_by_user = train_op.groupby('user')
for column in count_column:
    # number of non-null entries and number of distinct values per user
    op_df['op_'+column+'_count'] = op_by_user[column].count().values
    op_df['op_'+column+'_nunique'] = op_by_user[column].nunique().values

Index(['op_type', 'op_mode', 'op_device', 'ip', 'net_type', 'channel', 'ip_3'], dtype='object')


In [161]:
from datetime import timedelta
def timedelta2sec(delta):
    """Convert a string such as '11 days 09:38:22.000000000' to seconds.

    The text before 'days' is read as an integer day count and the text after
    it as 'H:M:S', where the seconds part may carry a fraction. Returns the
    total duration in seconds as a float.
    """
    pieces = delta.split('days')
    day_text = pieces[0].strip()
    hours, minutes, seconds = pieces[1].strip().split(':')
    span = timedelta(
        days=int(day_text),
        hours=int(hours),
        minutes=int(minutes),
        seconds=float(seconds),
    )
    return span.total_seconds()

# Numeric version of `tm_diff`, in seconds.
train_op['time_diff_sec'] = train_op['tm_diff'].apply(timedelta2sec)

# Per-user summary statistics of the operation time offsets.
op_time_by_user = train_op.groupby('user')['time_diff_sec']
for stat in ('max', 'min', 'mean', 'median', 'std'):
    op_df['op_time_' + stat] = op_time_by_user.agg(stat).values

In [162]:
# Replace every remaining NaN in the op features with 0, then inspect the result.
op_df = op_df.fillna(0)
print(op_df.shape)
op_df.head()

(41892, 20)


Unnamed: 0,user,op_op_type_count,op_op_type_nunique,op_op_mode_count,op_op_mode_nunique,op_op_device_count,op_op_device_nunique,op_ip_count,op_ip_nunique,op_net_type_count,op_net_type_nunique,op_channel_count,op_channel_nunique,op_ip_3_count,op_ip_3_nunique,op_time_max,op_time_min,op_time_mean,op_time_median,op_time_std
0,Train_00000,102,11,102,12,91,2,90,3,30,3,102,4,90,3,683286.0,219874.0,305801.4,263361.0,125051.535485
1,Train_00001,18,3,18,3,18,1,18,2,12,2,18,3,18,2,1263768.0,409809.0,659804.8,438553.5,385614.886428
2,Train_00002,8,3,8,3,8,1,8,1,7,2,8,3,8,1,1283043.0,1282741.0,1282845.0,1282751.5,139.05369
3,Train_00003,108,8,108,8,93,2,93,18,50,2,108,3,93,5,1278388.0,199014.0,940869.0,957412.0,247893.797669
4,Train_00004,5,2,5,2,5,1,5,1,4,1,5,2,5,1,631021.0,630973.0,630986.6,630975.0,20.659138


## 交易处理

In [163]:
# One row per user that appears in the transaction table; features are added below.
group = train_trans.groupby(['user']).count()
trans_df = pd.DataFrame({'user': group.index})

### 离散列，全部统计有几个类别

In [164]:
# Categorical transaction columns, picked by position: columns 1-3 and 5-8.
count_column = [*train_trans.columns[1:4], *train_trans.columns[5:9]]

trans_by_user = train_trans.groupby('user')
for column in count_column:
    print(column)
    per_user = trans_by_user[column]
    # number of non-null entries and number of distinct values per user
    trans_df['trans_'+column+'_count'] = per_user.count().values
    trans_df['trans_'+column+'_nunique'] = per_user.nunique().values
# An earlier variant based on `describe()` (count / unique / freq) was tried
# here and is no longer active.

platform
tunnel_in
tunnel_out
type1
ip
type2
ip_3


### 处理amount

In [165]:
# Per-user summary statistics of the transaction amount.
amount_by_user = train_trans.groupby(['user'])['amount']
for stat in ('max', 'min', 'mean', 'median', 'std'):
    trans_df['trans_amount_' + stat] = amount_by_user.agg(stat).values

### 处理时间

In [166]:
# Numeric version of `tm_diff`, in seconds.
train_trans['time_diff_sec'] = train_trans['tm_diff'].apply(timedelta2sec)

# Per-user summary statistics of the transaction time offsets.
trans_time_by_user = train_trans.groupby('user')['time_diff_sec']
for stat in ('max', 'min', 'mean', 'median', 'std'):
    trans_df['trans_time_' + stat] = trans_time_by_user.agg(stat).values

### 标准差等统计量有空值（例如只有一条记录的用户），用0填充

In [167]:
# Replace every remaining NaN in the transaction features with 0.
trans_df = trans_df.fillna(0)

In [168]:
# Peek at the per-user transaction features.
trans_df.head()

Unnamed: 0,user,trans_platform_count,trans_platform_nunique,trans_tunnel_in_count,trans_tunnel_in_nunique,trans_tunnel_out_count,trans_tunnel_out_nunique,trans_type1_count,trans_type1_nunique,trans_ip_count,trans_ip_nunique,trans_type2_count,trans_type2_nunique,trans_ip_3_count,trans_ip_3_nunique,trans_amount_max,trans_amount_min,trans_amount_mean,trans_amount_median,trans_amount_std,trans_time_max,trans_time_min,trans_time_mean,trans_time_median,trans_time_std
0,Train_00000,13,2,7,1,13,1,13,4,5,2,7,1,5,2,267542,24798,53330.307692,30746.0,65274.138488,2478067.0,1169773.0,1553383.0,1627246.0,397032.0
1,Train_00001,2,2,1,1,2,1,2,2,1,1,1,1,1,1,36098,36098,36098.0,36098.0,0.0,1821027.0,289554.0,1055290.0,1055290.5,1082915.0
2,Train_00002,12,2,7,1,12,1,12,3,6,3,7,1,6,3,162423,24980,57329.583333,34541.5,49227.699521,2665430.0,575019.0,1454428.0,625491.0,1069315.0
3,Train_00003,11,2,9,1,11,1,11,5,8,2,9,1,8,2,188880,25315,61652.454545,36689.0,62929.509668,2563665.0,830115.0,2243363.0,2562619.0,697684.9
4,Train_00004,1,1,0,0,1,1,1,1,0,0,0,0,0,0,36689,36689,36689.0,36689.0,0.0,2014022.0,2014022.0,2014022.0,2014022.0,0.0


In [169]:
# Sanity check: (users with transactions, number of feature columns incl. user).
trans_df.shape

(41560, 25)

## 数据合并

In [170]:
# Left-join the op and transaction features onto the base features, so users
# without op / transaction rows are kept (their new columns are NaN).
# (Using `df_base` alone was an earlier variant.)
train_df = (
    df_base
    .merge(op_df, on='user', how='left')
    .merge(trans_df, on='user', how='left')
)

train = train_df

In [171]:
# Sanity check: (rows, columns) of the merged training frame.
train.shape

(47782, 278)

In [172]:
# Positional 90 / 10 split: the first 90% of rows train the model, the rest
# validate it. No shuffling is done here.
length = len(train)
train_length = int(0.9*length)

features = train.drop(columns=['user', 'label'])
labels = train['label'].values

# Missing feature values are filled with 0.
x = features.iloc[:train_length].fillna(0)
y = labels[:train_length]
valid_x = features.iloc[train_length:].fillna(0)
valid_y = labels[train_length:]

In [180]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# model = LogisticRegression()
# model.fit(x,y)

# Gradient-boosted trees, early-stopped on the validation set.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# `subsample_freq` is > 0 -- confirm whether `subsample=0.8` is active here.
# NOTE(review): passing `early_stopping_rounds` to `fit` depends on the
# installed LightGBM version; newer releases expect an early-stopping callback.
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.045,
    subsample=0.8,
    colsample_bytree=0.8,
)
model = LGBMClassifier(**lgbm_params)
model.fit(x, y, eval_set=(valid_x, valid_y), early_stopping_rounds=5)

# Validation AUC from the predicted probability of the positive class.
valid_proba = model.predict_proba(valid_x)
prediction = valid_proba[:, 1]
auc = roc_auc_score(valid_y, prediction)
auc

[1]	valid_0's binary_logloss: 0.562218
Training until validation scores don't improve for 5 rounds
[2]	valid_0's binary_logloss: 0.55987
[3]	valid_0's binary_logloss: 0.55772
[4]	valid_0's binary_logloss: 0.55623
[5]	valid_0's binary_logloss: 0.554549
[6]	valid_0's binary_logloss: 0.552874
[7]	valid_0's binary_logloss: 0.55134
[8]	valid_0's binary_logloss: 0.550059
[9]	valid_0's binary_logloss: 0.548946
[10]	valid_0's binary_logloss: 0.547779
[11]	valid_0's binary_logloss: 0.54668
[12]	valid_0's binary_logloss: 0.545885
[13]	valid_0's binary_logloss: 0.544938
[14]	valid_0's binary_logloss: 0.544092
[15]	valid_0's binary_logloss: 0.543386
[16]	valid_0's binary_logloss: 0.542758
[17]	valid_0's binary_logloss: 0.542278
[18]	valid_0's binary_logloss: 0.541726
[19]	valid_0's binary_logloss: 0.541274
[20]	valid_0's binary_logloss: 0.540793
[21]	valid_0's binary_logloss: 0.540191
[22]	valid_0's binary_logloss: 0.539727
[23]	valid_0's binary_logloss: 0.539341
[24]	valid_0's binary_logloss: 0.5

0.6768534596844231

In [None]:
# Persist the fitted model to 'model.pkl' in the current working directory.
joblib.dump(model,'model.pkl')

In [None]:
# Top-100 features by absolute coefficient. This only applies to models that
# expose `coef_` (the LogisticRegression alternative). The LGBMClassifier
# fitted above has no `coef_`, so the unguarded expression raised
# AttributeError on a full re-run; in that case the cell now displays nothing.
(
    pd.DataFrame({'column': x.columns, 'importance': np.abs(model.coef_[0])})
    .sort_values(by='importance')[-100:]
    if hasattr(model, 'coef_')
    else None
)

In [None]:
# The 100 features with the highest model importance, in ascending order.
(
    pd.DataFrame({'column': x.columns, 'importance': model.feature_importances_})
    .sort_values(by='importance')
    .tail(100)
)

In [None]:
# select_feature=importance_df['column']

# x = x[select_feature]
# valid_x = valid_x[select_feature]

In [182]:
# from catboost import CatBoostClassifier


# model = CatBoostClassifier(
#     iterations=500,
#     random_seed=42,
#     logging_level='Silent'
# )

# model.fit(
#     x.values, y,
#     eval_set=(valid_x.values, valid_y),
# #     logging_level='Verbose',  # you can uncomment this for text output
#     plot=True
# )

# prediction = model.predict_proba(valid_x)[:,1]
# auc = roc_auc_score(valid_y,prediction)
# auc

baseline:
0.6769352059269422

添加trans amount的扩展变量：
0.6782946345739088

添加trans ip3的扩展变量：
0.6791192236191647

添加trans type直接数值化：
0.6796788342378327

添加所有的des特征，并修改了学习率：
0.6823851869318595

添加avg特征：
0.6831416556804824