In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from tqdm import tqdm

pd.options.mode.chained_assignment = None

In [2]:
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', 500)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',500)

In [3]:
train_base = pd.read_csv('data/train_base.csv')
test_a_base = pd.read_csv('data/test_a_base.csv')

train_op = pd.read_csv('data/train_op.csv')
test_a_op = pd.read_csv('data/test_a_op.csv')

train_trans = pd.read_csv('data/train_trans.csv')
test_a_trans = pd.read_csv('data/test_a_trans.csv')

train_label = pd.read_csv('data/train_label.csv')

In [4]:
train_base = pd.read_csv('data/train_base.csv')
train_base = pd.merge(train_base, train_label, on='user', how='left')

# base

In [5]:
# 填充nan，去除user列
train_base_no_nan = train_base.drop(columns=['service3_level'],axis=1)
test_a_base_no_nan = test_a_base.drop(columns=['service3_level'],axis=1)
# 离散值填充众数
train_base_no_nan['sex'].fillna('category 0',inplace=True)
train_base_no_nan['balance_avg'].fillna('level 1',inplace=True)
train_base_no_nan['balance1_avg'].fillna('level 1',inplace=True)
train_base_no_nan['balance2_avg'].fillna('level 1',inplace=True)

test_a_base_no_nan['sex'].fillna('category 0',inplace=True)
test_a_base_no_nan['balance_avg'].fillna('level 1',inplace=True)
test_a_base_no_nan['balance1_avg'].fillna('level 1',inplace=True)
test_a_base_no_nan['balance2_avg'].fillna('level 1',inplace=True)

user = train_base_no_nan['user'].values
test_a_user = test_a_base_no_nan['user'].values

train_base_no_user = train_base_no_nan.drop(columns=['user'],axis=1)
test_a_base_no_user = test_a_base_no_nan.drop(columns=['user'],axis=1)

In [6]:
train_base = train_base_no_user
df_base = pd.DataFrame()

test_a_base = test_a_base_no_user
df_test_a_base = pd.DataFrame()

In [7]:
# 年龄根据风险比例分箱
age_ratio = train_base.groupby('age')['label'].sum()/train_base.groupby('age')['label'].count()

age_1 = age_ratio[age_ratio>0.4].index.values

tmp_age = age_ratio[age_ratio<0.4]
age_2 = tmp_age[tmp_age>0.3].index.values

tmp_age = age_ratio[age_ratio<0.3]
age_3 = tmp_age[tmp_age>0.2].index.values

tmp_age = age_ratio[age_ratio<0.2]
age_4 = tmp_age[tmp_age>0.1].index.values

age_5 = age_ratio[age_ratio<0.1].index.values

def age_class(age):
    if age in age_1:
        return '0'
    elif age in age_2:
        return '1'
    elif age in age_3:
        return '2'
    elif age in age_4:
        return '3'
    elif age in age_5:
        return '4'
    else:
        return '4'
    
train_base['age_class']=train_base['age'].apply(age_class)
test_a_base['age_class']=test_a_base['age'].apply(age_class)

In [8]:
# 城市根据风险比例分箱
city_ratio = train_base.groupby('city')['label'].sum()/train_base.groupby('city')['label'].count()

city_1 = city_ratio[city_ratio>0.4].index.values

tmp_city = city_ratio[city_ratio<0.4]
city_2 = tmp_city[tmp_city>0.3].index.values

tmp_city = city_ratio[city_ratio<0.3]
city_3 = tmp_city[tmp_city>0.2].index.values

tmp_city = city_ratio[city_ratio<0.2]
city_4 = tmp_city[tmp_city>0.1].index.values

city_5 = city_ratio[city_ratio<0.1].index.values

def city_class(city):
    if city in city_1:
        return '0'
    elif city in city_2:
        return '1'
    elif city in city_3:
        return '2'
    elif city in city_4:
        return '3'
    elif city in city_5:
        return '4'
    else:
        return '5'

train_base['city_class']=train_base['city'].apply(city_class)
test_a_base['city_class']=test_a_base['city'].apply(city_class)

In [9]:
# # balance 分享
# balance_ratio = train_base.groupby('balance')['label'].sum()/train_base.groupby('balance')['label'].count()

# balance_1 = balance_ratio[balance_ratio>0.29].index.values

# tmp_balance = balance_ratio[balance_ratio<0.29]
# balance_2 = tmp_balance[tmp_balance>0.19].index.values

# balance_3 = balance_ratio[balance_ratio<0.19]

# def balance_class(balance):
#     if balance in balance_1:
#         return '0'
#     elif balance in balance_2:
#         return '1'
#     elif balance in balance_3:
#         return '2'
    
# train_base['balance_class']=train_base['balance'].apply(balance_class)
# test_a_base['balance_class']=test_a_base['balance'].apply(balance_class)

# # balance1
# balance_ratio = train_base.groupby('balance1')['label'].sum()/train_base.groupby('balance1')['label'].count()
# balance_1 = balance_ratio[balance_ratio>0.27].index.values

# tmp_balance = balance_ratio[balance_ratio<0.27]
# balance_2 = tmp_balance[tmp_balance>0.15].index.values

# balance_3 = balance_ratio[balance_ratio<0.1]

# def balance_class(balance):
#     if balance in balance_1:
#         return '0'
#     elif balance in balance_2:
#         return '1'
#     elif balance in balance_3:
#         return '2'
    
# train_base['balance1_class']=train_base['balance1'].apply(balance_class)
# test_a_base['balance1_class']=test_a_base['balance1'].apply(balance_class)

# # balance2
# balance_ratio = train_base.groupby('balance2')['label'].sum()/train_base.groupby('balance2')['label'].count()
# balance_1 = balance_ratio[balance_ratio>0.24].index.values

# tmp_balance = balance_ratio[balance_ratio<0.24]
# balance_2 = tmp_balance[tmp_balance>0.21].index.values

# balance_3 = balance_ratio[balance_ratio<0.21]

# def balance_class(balance):
#     if balance in balance_1:
#         return '0'
#     elif balance in balance_2:
#         return '1'
#     elif balance in balance_3:
#         return '2'
    
# train_base['balance2_class']=train_base['balance2'].apply(balance_class)
# test_a_base['balance2_class']=test_a_base['balance2'].apply(balance_class)

# # balance_avg
# balance_ratio = train_base.groupby('balance_avg')['label'].sum()/train_base.groupby('balance_avg')['label'].count()

# balance_1 = balance_ratio[balance_ratio>0.31].index.values

# tmp_balance = balance_ratio[balance_ratio<0.31]
# balance_2 = tmp_balance[tmp_balance>0.3].index.values

# tmp_balance = balance_ratio[balance_ratio<0.3]
# balance_3 = tmp_balance[balance_ratio>0.18]

# balance_4 = tmp_balance[balance_ratio<0.18]

# def balance_class(balance):
#     if balance in balance_1:
#         return '0'
#     elif balance in balance_2:
#         return '1'
#     elif balance in balance_3:
#         return '2'
#     elif balance in balance_4:
#         return '3'

# train_base['balance_avg_class']=train_base['balance_avg'].apply(balance_class)
# test_a_base['balance_avg_class']=test_a_base['balance_avg'].apply(balance_class)

# # balance1_avg
# balance_ratio = train_base.groupby('balance1_avg')['label'].sum()/train_base.groupby('balance1_avg')['label'].count()

# balance_1 = balance_ratio[balance_ratio>0.28].index.values

# tmp_balance = balance_ratio[balance_ratio<0.28]
# balance_2 = tmp_balance[tmp_balance>0.1].index.values

# balance_3 = balance_ratio[balance_ratio<0.1]

# def balance_class(balance):
#     if balance in balance_1:
#         return '0'
#     elif balance in balance_2:
#         return '1'
#     elif balance in balance_3:
#         return '2'

# train_base['balance1_avg_class']=train_base['balance1_avg'].apply(balance_class)
# test_a_base['balance1_avg_class']=test_a_base['balance1_avg'].apply(balance_class)

In [10]:
# # product_amount 分箱
# # product2_amount
# ratio = train_base.groupby('product2_amount')['label'].sum()/train_base.groupby('product2_amount')['label'].count()

# fold1 = ratio[ratio>0.2].index.values

# tmp_fold = ratio[ratio<0.2]
# fold2 = tmp_fold[tmp_fold>0.1].index.values

# fold3 = ratio[ratio<0.1]

# def fold_class(fold):
#     if fold in fold1:
#         return '0'
#     elif fold in fold_2:
#         return '1'
#     elif fold in fold_3:
#         return '2'

# train_base['product2_amount_class']=train_base['product2_amount'].apply(balance_class)
# test_a_base['product2_amount_class']=test_a_base['product2_amount'].apply(balance_class)

# # product6_amount
# ratio = train_base.groupby('product6_amount')['label'].sum()/train_base.groupby('product6_amount')['label'].count()

# fold1 = ratio[ratio>0.28].index.values

# tmp_fold = ratio[ratio<0.28]
# fold2 = tmp_fold[tmp_fold>0.2].index.values

# tmp_fold = ratio[ratio<0.2]
# fold3 = tmp_fold[tmp_fold>0.1].index.values

# fold4 = ratio[ratio<0.1]

# def fold_class(fold):
#     if fold in fold1:
#         return '0'
#     elif fold in fold_2:
#         return '1'
#     elif fold in fold_3:
#         return '2'
#     elif fold in fold_4:
#         return '3'

# train_base['product6_amount_class']=train_base['product6_amount'].apply(balance_class)
# test_a_base['product6_amount_class']=test_a_base['product6_amount'].apply(balance_class)

In [11]:
# 省份选top5
a = train_base.groupby('province')['label'].sum()/train_base.groupby('province')['label'].count()
a = a.sort_values(ascending=False)[0:5]
top_province = a.index.tolist()

train_base['top_province'] = train_base['province'].apply(lambda x:str(top_province.index(x)) if x in top_province else '-1')
test_a_base['top_province'] = test_a_base['province'].apply(lambda x:str(top_province.index(x)) if x in top_province else '-1')

In [12]:
one_hot_df = pd.DataFrame()
category_columns = ['sex','age_class','provider','level','verified','regist_type',
                    'agreement1','agreement2','agreement3','agreement4',
                    'service3',
#                     'product1_amount','product3_amount',
#                     'product4_amount','product5_amount',
#                     'product2_amount','product6_amount',
#                     'product2_amount_class','product6_amount_class',
                    
#                     'balance_class','balance1_class','balance2_class',
                    # ---
#                     'balance','balance1','balance2',
#                     'balance_avg','balance1_avg',
#                     'balance_avg_class','balance1_avg_class',
                    # ---
                    'balance2_avg',
                    'city_class','top_province']
one_hot_df = pd.get_dummies(train_base[category_columns])
test_a_one_hot_df = pd.get_dummies(test_a_base[category_columns])

In [13]:
# 交叉特征
# one_hot_df['agreement_all_0'] = one_hot_df['agreement1_category 0'].values & one_hot_df['agreement2_category 0'].values & \
#                                  one_hot_df['agreement3_category 0'].values & one_hot_df['agreement4_category 0'].values
# one_hot_df['agreement_all_1'] = one_hot_df['agreement1_category 1'].values & one_hot_df['agreement2_category 1'].values & \
#                                  one_hot_df['agreement3_category 1'].values & one_hot_df['agreement4_category 1'].values

# one_hot_df['product_all_low'] = one_hot_df['product1_amount_level 1'].values & one_hot_df['product2_amount_level 1'].values & \
#                                  one_hot_df['product3_amount_level 1'].values & one_hot_df['product4_amount_level 0'].values & \
#                                 one_hot_df['product5_amount_level 0'].values & one_hot_df['product6_amount_level 1'].values
# one_hot_df['agreement_all_high'] = one_hot_df['product1_amount_level 6'].values & one_hot_df['product2_amount_level 21'].values & \
#                                  one_hot_df['product3_amount_level 3'].values & one_hot_df['product4_amount_level 1'].values & \
#                                     one_hot_df['product5_amount_level 0'].values & one_hot_df['product6_amount_level 21'].values

one_hot_df['agg_city_high'] = (one_hot_df['age_class_0'].values | one_hot_df['age_class_1'].values) & \
                              (one_hot_df['city_class_0'].values | one_hot_df['city_class_1'].values)

one_hot_df['city_province_high'] = (one_hot_df['top_province_0'].values | one_hot_df['top_province_1'].values) & \
                                    one_hot_df['city_class_0'].values

In [14]:
# 交叉特征
# test_a_one_hot_df['agreement_all_0'] = test_a_one_hot_df['agreement1_category 0'].values & test_a_one_hot_df['agreement2_category 0'].values & \
#                                  test_a_one_hot_df['agreement3_category 0'].values & test_a_one_hot_df['agreement4_category 0'].values
# test_a_one_hot_df['agreement_all_1'] = test_a_one_hot_df['agreement1_category 1'].values & test_a_one_hot_df['agreement2_category 1'].values & \
#                                  test_a_one_hot_df['agreement3_category 1'].values & test_a_one_hot_df['agreement4_category 1'].values

# test_a_one_hot_df['product_all_low'] = test_a_one_hot_df['product1_amount_level 1'].values & test_a_one_hot_df['product2_amount_level 1'].values & \
#                                  test_a_one_hot_df['product3_amount_level 1'].values & test_a_one_hot_df['product4_amount_level 0'].values & \
#                                 test_a_one_hot_df['product5_amount_level 0'].values & test_a_one_hot_df['product6_amount_level 1'].values
# test_a_one_hot_df['agreement_all_high'] = test_a_one_hot_df['product1_amount_level 6'].values & test_a_one_hot_df['product2_amount_level 21'].values & \
#                                  test_a_one_hot_df['product3_amount_level 3'].values & test_a_one_hot_df['product4_amount_level 1'].values & \
#                                     test_a_one_hot_df['product5_amount_level 0'].values & test_a_one_hot_df['product6_amount_level 21'].values

test_a_one_hot_df['agg_city_high'] = (test_a_one_hot_df['age_class_0'].values | test_a_one_hot_df['age_class_1'].values) & \
                              (test_a_one_hot_df['city_class_0'].values | test_a_one_hot_df['city_class_1'].values)

test_a_one_hot_df['city_province_high'] = (test_a_one_hot_df['top_province_0'].values | test_a_one_hot_df['top_province_1'].values) & \
                                    test_a_one_hot_df['city_class_0'].values

In [15]:
def magic_feature(df, f1, f2):
    df[f'{f1}_{f2}_a'] = df[f1]+df[f2]
    df[f'{f1}_{f2}_s'] = df[f1]-df[f2]
    df[f'{f1}_{f2}_m'] = df[f1]*df[f2]
    df[f'{f1}_{f2}_d'] = df[f1]/df[f2]

### 连续变量的处理

In [16]:
df_value = train_base_no_user.select_dtypes('int64')
test_a_df_value = test_a_base_no_user.select_dtypes('int64')

In [17]:
df_value['product7_success_cnt'] = df_value['product7_cnt']-df_value['product7_fail_cnt']

df_value['card_cnt'] = df_value['card_a_cnt']+df_value['card_b_cnt']+df_value['card_c_cnt']+df_value['card_d_cnt']

df_value['ip_cnt_avg'] = df_value['ip_cnt']/df_value['login_days_cnt']

df_value['login_cnt_period1_avg']=df_value['login_cnt_period1']/df_value['login_days_cnt']
df_value['login_cnt_period2_avg']=df_value['login_cnt_period2']/df_value['login_days_cnt']
df_value['login_cnt_period']=df_value['login_cnt_period1']+df_value['login_cnt_period2']
df_value['login_cnt_period_avg']=df_value['login_cnt_period']/df_value['login_days_cnt']

df_value['service_cnt']=df_value['service1_cnt']+df_value['service2_cnt']
df_value['service_avg1_amt']=df_value['service1_amt']/df_value['service1_cnt']

df_value['op_cnt']=df_value['op1_cnt']+df_value['op2_cnt']

In [18]:
test_a_df_value['product7_success_cnt'] = test_a_df_value['product7_cnt']-test_a_df_value['product7_fail_cnt']

test_a_df_value['card_cnt'] = test_a_df_value['card_a_cnt']+test_a_df_value['card_b_cnt']+test_a_df_value['card_c_cnt']+test_a_df_value['card_d_cnt']

test_a_df_value['ip_cnt_avg'] = test_a_df_value['ip_cnt']/test_a_df_value['login_days_cnt']

test_a_df_value['login_cnt_period1_avg']=test_a_df_value['login_cnt_period1']/test_a_df_value['login_days_cnt']
test_a_df_value['login_cnt_period2_avg']=test_a_df_value['login_cnt_period2']/test_a_df_value['login_days_cnt']
test_a_df_value['login_cnt_period']=test_a_df_value['login_cnt_period1']+test_a_df_value['login_cnt_period2']
test_a_df_value['login_cnt_period_avg']=test_a_df_value['login_cnt_period']/test_a_df_value['login_days_cnt']

test_a_df_value['service_cnt']=test_a_df_value['service1_cnt']+test_a_df_value['service2_cnt']
test_a_df_value['service_avg1_amt']=test_a_df_value['service1_amt']/test_a_df_value['service1_cnt']

test_a_df_value['op_cnt']=test_a_df_value['op1_cnt']+test_a_df_value['op2_cnt']

In [19]:
# 归一化
df_value = (df_value-df_value.min())/(df_value.max()-df_value.min())
test_a_df_value = (test_a_df_value-df_value.min())/(test_a_df_value.max()-test_a_df_value.min())

df_base = pd.DataFrame()
df_base['user'] = user
df_base = pd.concat([df_base, one_hot_df, df_value],axis=1)

test_a_df_base = pd.DataFrame()
test_a_df_base['user'] = test_a_user
test_a_df_base = pd.concat([test_a_df_base, test_a_one_hot_df, test_a_df_value],axis=1)

df_base.shape

(47782, 84)

In [20]:
test_a_df_base.shape

(24315, 84)

## OP处理

In [21]:
from collections import Counter

def filter_nan(num):
    num = num.dropna()
    return num

def most(num):
    num=filter_nan(num)
    if num.empty:
        return np.nan
    return Counter(num).most_common(1)[0][1]

def most_item(num):
    num=filter_nan(num)
    if num.empty:
        return np.nan
    return Counter(num).most_common(1)[0][0]

def least(num):
    num=filter_nan(num)
    if num.empty:
        return np.nan
    return Counter(num).most_common()[:-2:-1][0][1]

def least_item(num):
    num=filter_nan(num)
    if num.empty:
        return np.nan
    return Counter(num).most_common()[:-2:-1][0][0]

def mean_item(num):
    num=filter_nan(num)
    if num.empty:
        return np.nan
    else:
        nums = list(Counter(num).values())
        return np.mean(nums)

def std_item(num):
    num=filter_nan(num)
    if num.empty:
        return np.nan
    else:
        nums = list(Counter(num).values())
        return np.std(nums)

In [22]:
op_df = pd.DataFrame()
group = train_op.groupby(['user']).count()
op_df['user'] = group.index

In [23]:
test_a_op_df = pd.DataFrame()
group = test_a_op.groupby(['user']).count()
test_a_op_df['user'] = group.index

In [24]:
value = train_op['op_type'].unique().tolist()
value.extend(test_a_op['op_type'].unique().tolist())
value = list(set(value))
op_type = {v:i for i,v in enumerate(value)}

value = train_op['op_mode'].unique().tolist()
value.extend(test_a_op['op_mode'].unique().tolist())
value = list(set(value))
op_mode = {v:i for i,v in enumerate(value)}

value = train_op['op_device'].unique().tolist()
value.extend(test_a_op['op_device'].unique().tolist())
value = list(set(value))
op_device = {v:i for i,v in enumerate(value)}

value = train_op['net_type'].unique().tolist()
value.extend(test_a_op['net_type'].unique().tolist())
value = list(set(value))
net_type = {v:i for i,v in enumerate(value)}

value = train_op['channel'].unique().tolist()
value.extend(test_a_op['channel'].unique().tolist())
value = list(set(value))
channel = {v:i for i,v in enumerate(value)}

In [25]:
def mutil_hot(column, column_dic, data):
    op_list = data.groupby('user')[column].apply(list)
    vector = np.zeros(shape=(len(op_list),len(column_dic)))
    for row,op in enumerate(op_list.values):
        for i in op:
            index=column_dic[i]
            vector[row][index]=1
    columns = [f'{column}_{i}' for i in range(len(column_dic))]
    tmp_df = pd.DataFrame(vector,columns=columns)
    return tmp_df

In [26]:
op_type_df = mutil_hot('op_type', op_type, train_op)
op_mode_df = mutil_hot('op_mode', op_mode, train_op)
op_device_df = mutil_hot('op_device', op_device, train_op)
net_type_df = mutil_hot('net_type', net_type, train_op)
channel_df = mutil_hot('channel', channel, train_op)

op_mutil_hot = pd.concat([op_type_df,op_mode_df,op_device_df,net_type_df,channel_df],axis=1)

In [27]:
op_type_df = mutil_hot('op_type', op_type, test_a_op)
op_mode_df = mutil_hot('op_mode', op_mode, test_a_op)
op_device_df = mutil_hot('op_device', op_device, test_a_op)
net_type_df = mutil_hot('net_type', net_type, test_a_op)
channel_df = mutil_hot('channel', channel, test_a_op)

test_a_op_mutil_hot = pd.concat([op_type_df,op_mode_df,op_device_df,net_type_df,channel_df],axis=1)

In [28]:
ip_value_df = train_op.groupby('user')['ip','ip_3'].agg([most,least,mean_item,std_item])
ip_value_df.fillna(0,inplace=True)
ip_value_df.index=list(range(41892))

ip_value_df.columns = [f'{name[0]}_{name[1]}' for name in ip_value_df.columns]

In [29]:
test_a_ip_value_df = test_a_op.groupby('user')['ip','ip_3'].agg([most,least,mean_item,std_item])
test_a_ip_value_df.fillna(0,inplace=True)
test_a_ip_value_df.index=list(range(20458))

test_a_ip_value_df.columns = [f'{name[0]}_{name[1]}' for name in test_a_ip_value_df.columns]

In [30]:
count_column=train_op.columns[1:-1]
print(count_column)
for column in count_column:
    op_df['op_'+column+'_count'] = train_op.groupby('user')[column].count().values
    op_df['op_'+column+'_nunique'] = train_op.groupby('user')[column].nunique().values

Index(['op_type', 'op_mode', 'op_device', 'ip', 'net_type', 'channel', 'ip_3'], dtype='object')


In [31]:
count_column=test_a_op.columns[1:]
print(count_column)
for column in count_column:
    test_a_op_df['op_'+column+'_count'] = test_a_op.groupby('user')[column].count().values
    test_a_op_df['op_'+column+'_nunique'] = test_a_op.groupby('user')[column].nunique().values

Index(['op_type', 'op_mode', 'op_device', 'ip', 'net_type', 'channel', 'ip_3',
       'tm_diff'],
      dtype='object')


In [32]:
from datetime import timedelta
def timedelta2sec(delta):
    day = delta.split('days')[0].strip()
    h,m,s = delta.split('days')[1].strip().split(':')
    sec=timedelta(days=int(day),hours=int(h),minutes=int(m),seconds=float(s)).total_seconds()
    return sec

train_op['time_diff_sec'] = train_op['tm_diff'].apply(timedelta2sec)
test_a_op['time_diff_sec'] = test_a_op['tm_diff'].apply(timedelta2sec)

for operate in ['max','min','mean','median','std']:
    op_df['op_time_'+operate]=train_op.groupby('user')['time_diff_sec'].agg(operate).values
    test_a_op_df['op_time_'+operate]=test_a_op.groupby('user')['time_diff_sec'].agg(operate).values

In [33]:
op_df = pd.concat([op_df,op_mutil_hot, ip_value_df],axis=1)
test_a_op_df = pd.concat([test_a_op_df,test_a_op_mutil_hot, test_a_ip_value_df],axis=1)

In [34]:
op_df.fillna(0,inplace=True)
op_df.shape

(41892, 1493)

In [35]:
test_a_op_df.fillna(0,inplace=True)
test_a_op_df.shape

(20458, 1495)

## 交易处理

In [36]:
trans_df = pd.DataFrame()
group = train_trans.groupby(['user']).count()
trans_df['user'] = group.index

In [37]:
test_a_trans_df = pd.DataFrame()
group = test_a_trans.groupby(['user']).count()
test_a_trans_df['user'] = group.index

In [38]:
#  离散列，全部统计有几个类别
count_column = list(train_trans.columns[1:4])+list(train_trans.columns[5:9]) 
print(count_column)
for column in count_column:
    trans_df['trans_'+column+'_count'] = train_trans.groupby('user')[column].count().values
    trans_df['trans_'+column+'_nunique'] = train_trans.groupby('user')[column].nunique().values

['platform', 'tunnel_in', 'tunnel_out', 'type1', 'ip', 'type2', 'ip_3']


In [39]:
#  离散列，全部统计有几个类别
count_column = list(test_a_trans.columns[1:4])+list(test_a_trans.columns[5:9]) 
print(count_column)
for column in count_column:
    test_a_trans_df['trans_'+column+'_count'] = test_a_trans.groupby('user')[column].count().values
    test_a_trans_df['trans_'+column+'_nunique'] = test_a_trans.groupby('user')[column].nunique().values

['platform', 'tunnel_in', 'tunnel_out', 'type1', 'ip', 'type2', 'ip_3']


In [40]:
value = train_trans['platform'].unique().tolist()
value.extend(test_a_trans['platform'].unique().tolist())
value = list(set(value))
platform = {v:i for i,v in enumerate(value)}

value = train_trans['tunnel_in'].unique().tolist()
value.extend(test_a_trans['tunnel_in'].unique().tolist())
value = list(set(value))
tunnel_in = {v:i for i,v in enumerate(value)}

value = train_trans['tunnel_out'].unique().tolist()
value.extend(test_a_trans['tunnel_out'].unique().tolist())
value = list(set(value))
tunnel_out = {v:i for i,v in enumerate(value)}

value = train_trans['type1'].unique().tolist()
value.extend(test_a_trans['type1'].unique().tolist())
value = list(set(value))
type1 = {v:i for i,v in enumerate(value)}

value = train_trans['type2'].unique().tolist()
value.extend(test_a_trans['type2'].unique().tolist())
value = list(set(value))
type2 = {v:i for i,v in enumerate(value)}

In [41]:
def mutil_hot(column, column_dic, data):
    op_list = data.groupby('user')[column].apply(list)
    vector = np.zeros(shape=(len(op_list),len(column_dic)))
    for row,op in enumerate(op_list.values):
        for i in op:
            index=column_dic[i]
            vector[row][index]=1
    columns = [f'{column}_{i}' for i in range(len(column_dic))]
    tmp_df = pd.DataFrame(vector,columns=columns)
    return tmp_df

In [42]:
platform_df = mutil_hot('platform', platform, train_trans)
tunnel_in_df = mutil_hot('tunnel_in', tunnel_in, train_trans)
tunnel_out_df = mutil_hot('tunnel_out', tunnel_out, train_trans)
type1_df = mutil_hot('type1', type1, train_trans)
type2_df = mutil_hot('type2', type2, train_trans)

trans_mutil_hot = pd.concat([platform_df,tunnel_in_df,tunnel_out_df,type1_df,type2_df],axis=1)

In [43]:
platform_df = mutil_hot('platform', platform, test_a_trans)
tunnel_in_df = mutil_hot('tunnel_in', tunnel_in, test_a_trans)
tunnel_out_df = mutil_hot('tunnel_out', tunnel_out, test_a_trans)
type1_df = mutil_hot('type1', type1, test_a_trans)
type2_df = mutil_hot('type2', type2, test_a_trans)

test_a_trans_mutil_hot = pd.concat([platform_df,tunnel_in_df,tunnel_out_df,type1_df,type2_df],axis=1)

In [44]:
# count_column
for operate in ['max','min','mean','median','std']:
    trans_df['trans_amount_'+operate]=train_trans.groupby(['user'])['amount'].agg(operate).values
    
for operate in ['max','min','mean','median','std']:
    test_a_trans_df['trans_amount_'+operate]=test_a_trans.groupby(['user'])['amount'].agg(operate).values

In [45]:
# 处理时间
train_trans['time_diff_sec'] = train_trans['tm_diff'].apply(timedelta2sec)
test_a_trans['time_diff_sec'] = test_a_trans['tm_diff'].apply(timedelta2sec)

for operate in ['max','min','mean','median','std']:
    trans_df['trans_time_'+operate]=train_trans.groupby('user')['time_diff_sec'].agg(operate).values

for operate in ['max','min','mean','median','std']:
    test_a_trans_df['trans_time_'+operate]=test_a_trans.groupby('user')['time_diff_sec'].agg(operate).values

In [46]:
ip_value_df = train_trans.groupby('user')['ip','ip_3'].agg([most,least,mean_item,std_item])
ip_value_df.fillna(0,inplace=True)
ip_value_df.index=list(range(41560))

ip_value_df.columns = [f'{name[0]}_{name[1]}' for name in ip_value_df.columns]

In [47]:
test_a_ip_value_df = test_a_trans.groupby('user')['ip','ip_3'].agg([most,least,mean_item,std_item])
test_a_ip_value_df.fillna(0,inplace=True)
test_a_ip_value_df.index=list(range(18813 ))

test_a_ip_value_df.columns = [f'{name[0]}_{name[1]}' for name in test_a_ip_value_df.columns]

In [48]:
trans_df = pd.concat([trans_df,ip_value_df,trans_mutil_hot],axis=1)
test_a_trans_df = pd.concat([test_a_trans_df,test_a_ip_value_df,test_a_trans_mutil_hot],axis=1)

In [49]:
# ip_value_df
trans_df.fillna(0,inplace=True)
trans_df.shape

(41560, 85)

In [50]:
test_a_trans_df.fillna(0,inplace=True)
test_a_trans_df.shape

(18813, 85)

## 数据合并

In [115]:
# train_df = df_base
train_df = pd.merge(df_base, op_df, on='user', how='left')
train_df = pd.merge(train_df, trans_df, on='user', how='left')
train_df.shape

(47782, 1660)

In [116]:
test_a_df = pd.merge(test_a_df_base, test_a_op_df, on='user', how='left')
test_a_df = pd.merge(test_a_df, test_a_trans_df, on='user', how='left')
test_a_df.shape

(24315, 1662)

In [117]:
train=train_df

In [118]:
train = train.sample(frac=1)

In [113]:
x = train.drop(columns=['user','label'],axis=1)
x = x.fillna(0)
y = train['label']

In [119]:
test_x = test_a_df.drop(columns=['user','label'],axis=1)
test_x = test_x.fillna(0)

In [120]:
length = len(train)
train_length = int(0.8*length)

x = train.drop(columns=['user','label'],axis=1)[0:train_length]
y = train['label'].values[0:train_length]
valid_x = train.drop(columns=['user','label'],axis=1)[train_length:]
valid_y = train['label'].values[train_length:]

x = x.fillna(0)
valid_x = valid_x.fillna(0)

In [127]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# model = LogisticRegression()
# model.fit(x,y)

model = LGBMClassifier(n_estimators=10, 
                       learning_rate=0.1,
                       subsample=0.6,
                       colsample_bytree=0.6,
                       importance_type='gain')
model.fit(x, y,
          eval_set=(valid_x, valid_y),
          early_stopping_rounds=5
          )

prediction = model.predict_proba(valid_x)[:,1]
auc = roc_auc_score(valid_y,prediction)
print(auc)

prediction = model.predict_proba(x)[:,1]
auc = roc_auc_score(y,prediction)
print(auc)

[1]	valid_0's binary_logloss: 0.547101
Training until validation scores don't improve for 5 rounds
[2]	valid_0's binary_logloss: 0.540479
[3]	valid_0's binary_logloss: 0.534695
[4]	valid_0's binary_logloss: 0.530298
[5]	valid_0's binary_logloss: 0.526441
[6]	valid_0's binary_logloss: 0.522841
[7]	valid_0's binary_logloss: 0.519587
[8]	valid_0's binary_logloss: 0.516542
[9]	valid_0's binary_logloss: 0.514254
[10]	valid_0's binary_logloss: 0.512034
Did not meet early stopping. Best iteration is:
[10]	valid_0's binary_logloss: 0.512034
0.7172814652218439
0.7397624864606708


In [109]:
kfold= KFold(n_splits=5,random_state =None)

model = LGBMClassifier(n_estimators=20, 
                   learning_rate=0.1,
                   subsample=0.8,
                   colsample_bytree=0.8)
auc=0
i=0
for train_index,test_index in kfold.split(x,y):
    model.fit(x.iloc[train_index], y.iloc[train_index],
              eval_set=(x.iloc[test_index], y.iloc[test_index]),
              )
    prediction = model.predict_proba(x.iloc[test_index])[:,1]
    auc += roc_auc_score(y.iloc[test_index],prediction)
    i+=1
    joblib.dump(model,f'model_{i}.pkl')
print(auc/5)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
joblib.dump(model,'model.pkl')

In [None]:
pd.DataFrame({'column': x.columns,'importance': np.abs(model.coef_[0])}).sort_values(by='importance')[-100:]

In [96]:
feature_df = pd.DataFrame({'column': x.columns,'importance': model.feature_importances_}).sort_values(by='importance', ascending=False)

In [97]:
feature_df[0:100]

Unnamed: 0,column,importance
1586,trans_ip_3_count,4156.209309
1582,trans_ip_count,2603.879028
72,product7_success_cnt,1278.944992
1574,trans_platform_count,759.605564
1591,trans_amount_median,605.355995
189,op_type_88,394.354897
71,product7_fail_cnt,266.352471
41,city_class_3,256.085608
1600,ip_mean_item_y,252.860542
1589,trans_amount_min,206.781958


In [98]:
select_feature=feature_df['column'][0:50]

x = x[select_feature]
valid_x = valid_x[select_feature]

In [99]:
x.shape

(38225, 50)

In [259]:
# from catboost import CatBoostClassifier


# model = CatBoostClassifier(
#     iterations=500,
#     random_seed=42,
#     logging_level='Silent'
# )

# model.fit(
#     x.values, y,
#     eval_set=(valid_x.values, valid_y),
# #     logging_level='Verbose',  # you can uncomment this for text output
#     plot=True
# )

# prediction = model.predict_proba(valid_x)[:,1]
# auc = roc_auc_score(valid_y,prediction)
# auc

In [68]:
test_x = test_x[select_feature]

In [557]:
prob = 0
for i in range(1,6):
    model = joblib.load(f'model_{i}.pkl')
    prob += model.predict_proba(test_x)[:,1]
prob = prob/5

In [69]:
prob = model.predict_proba(test_x)[:,1]

In [70]:
prob.mean()

0.25390859349709244

In [621]:
prob.shape

(24315,)

In [622]:
df_res = pd.DataFrame()
df_res['user'] = test_a_df['user'].values
df_res['prob'] = prob

df_res.to_csv('result1.csv', index=False)