In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from tqdm import tqdm

pd.options.mode.chained_assignment = None

In [2]:
# Notebook display settings for pandas.
for option_name, option_value in (
    ('display.max_columns', None),  # no limit on the number of columns shown
    ('display.max_rows', 100),      # show at most 100 rows
    ('max_colwidth', 100),          # cell text width of 100 (the original note says the default is 50)
):
    pd.set_option(option_name, option_value)

In [3]:
# Read every raw CSV from the relative data/ directory.
# Each table kind (base / op / trans) comes as a train file and a test_a file;
# the label file only exists for the training set.
train_base, test_a_base = pd.read_csv('data/train_base.csv'), pd.read_csv('data/test_a_base.csv')

train_op, test_a_op = pd.read_csv('data/train_op.csv'), pd.read_csv('data/test_a_op.csv')

train_trans, test_a_trans = pd.read_csv('data/train_trans.csv'), pd.read_csv('data/test_a_trans.csv')

train_label = pd.read_csv('data/train_label.csv')

In [4]:
# Re-read the base table before merging so that re-running this cell never
# merges the label columns a second time.
train_base = pd.read_csv('data/train_base.csv').merge(train_label, on='user', how='left')

# base

### 填充nan，去除user列

In [5]:
# Drop the service3_level column, then fill the missing categorical values.
train_base_no_nan = train_base.drop(columns=['service3_level'])

# Fill discrete columns with a fixed level (the original note says these are
# the modes -- presumably computed offline; verify against the data).
# A single dict-based fillna that reassigns the frame is used instead of
# `df[col].fillna(..., inplace=True)`: that chained in-place form is deprecated
# and does not modify the parent frame under pandas Copy-on-Write.
train_base_no_nan = train_base_no_nan.fillna({
    'sex': 'category 0',
    'balance_avg': 'level 1',
    'balance1_avg': 'level 1',
    'balance2_avg': 'level 1',
})

# Keep the user ids aside and remove them from the feature frame.
user = train_base_no_nan['user'].values
train_base_no_user = train_base_no_nan.drop(columns=['user'])

### one-hot

In [6]:
# Columns that are one-hot encoded with pandas.get_dummies.
category_columns_name = [
    'sex', 'provider', 'level', 'verified',
    'agreement1', 'agreement2', 'agreement3', 'agreement4',
    'service3',
    'product3_amount', 'product4_amount', 'product5_amount',
]

one_hot_df = pd.get_dummies(train_base_no_user[category_columns_name])

# Extra indicators: every agreement dummy is "category 0" (agreement_all_0),
# respectively every agreement dummy is "category 1" (agreement_all_1).
agreement_columns = ['agreement1', 'agreement2', 'agreement3', 'agreement4']
for category in ('0', '1'):
    indicator = one_hot_df[f'{agreement_columns[0]}_category {category}'].values
    for agreement in agreement_columns[1:]:
        indicator = indicator & one_hot_df[f'{agreement}_category {category}'].values
    one_hot_df[f'agreement_all_{category}'] = indicator

### mean encoding

In [7]:
def _load_mean_encoding(artifact_name):
    """Load a joblib artifact by relative file name and return its first element."""
    # NOTE: joblib.load unpickles the file; only load artifacts from a trusted source.
    return joblib.load(artifact_name)[0]


# Mean-encoded columns are read from files in the current working directory.
# The saved output of this cell shows a FileNotFoundError for 'city_mean', so
# these artifacts must be produced (or copied here) before running the notebook.
city_mean = _load_mean_encoding('city_mean')
province_mean = _load_mean_encoding('province_mean')
regist_type_mean = _load_mean_encoding('regist_type_mean')
balance_mean = _load_mean_encoding('balance_mean')
balance_avg_mean = _load_mean_encoding('balance_avg_mean')
balance1_mean = _load_mean_encoding('balance1_mean')
balance1_avg_mean = _load_mean_encoding('balance1_avg_mean')
balance2_mean = _load_mean_encoding('balance2_mean')
balance2_avg_mean = _load_mean_encoding('balance2_avg_mean')
product1_amount_mean = _load_mean_encoding('product1_amount_mean')
product2_amount_mean = _load_mean_encoding('product2_amount_mean')
product6_amount_mean = _load_mean_encoding('product6_amount_mean')

FileNotFoundError: [Errno 2] No such file or directory: 'city_mean'

In [None]:
# Collect the loaded mean encodings into one frame, one column per source feature.
train_base_categoty = pd.DataFrame()
for encoded_name, encoded_values in (
    ('city', city_mean),
    ('province', province_mean),
    ('regist_type', regist_type_mean),
    ('balance', balance_mean),
    ('balance_avg', balance_avg_mean),
    ('balance1', balance1_mean),
    ('balance1_avg', balance1_avg_mean),
    ('balance2', balance2_mean),
    ('balance2_avg', balance2_avg_mean),
    ('product1_amount', product1_amount_mean),
    ('product2_amount', product2_amount_mean),
    ('product6_amount', product6_amount_mean),
):
    train_base_categoty[encoded_name] = encoded_values

In [None]:
def magic_feature(df, f1, f2):
    """Add pairwise arithmetic interaction columns for ``f1`` and ``f2`` to ``df`` in place.

    Four columns are written, named ``{f1}_{f2}_a`` (sum), ``{f1}_{f2}_s``
    (difference), ``{f1}_{f2}_m`` (product) and ``{f1}_{f2}_d`` (ratio).

    Args:
        df: DataFrame holding both source columns; it is modified in place.
        f1: Name of the left-hand column.
        f2: Name of the right-hand column (the divisor of the ratio feature).

    Returns:
        None. The new columns are added to ``df``.
    """
    left, right = df[f1], df[f2]
    df[f'{f1}_{f2}_a'] = left + right
    df[f'{f1}_{f2}_s'] = left - right
    df[f'{f1}_{f2}_m'] = left * right
    # A zero divisor yields +/-inf, which breaks later scaling and linear
    # models; store it as NaN so it is treated like any other missing value.
    df[f'{f1}_{f2}_d'] = (left / right).replace([np.inf, -np.inf], np.nan)

In [None]:
# Build the sum / difference / product / ratio interactions for each column pair.
for left_feature, right_feature in (
    ('city', 'province'),
    ('balance1', 'balance2'),
    ('balance1_avg', 'balance2_avg'),
):
    magic_feature(train_base_categoty, left_feature, right_feature)

In [None]:
# Combine the three mean-encoded product amount columns into a sum and a product.
product1 = train_base_categoty['product1_amount']
product2 = train_base_categoty['product2_amount']
product6 = train_base_categoty['product6_amount']

train_base_categoty['product_amount_a'] = product1 + product2 + product6
train_base_categoty['product_amount_m'] = product1 * product2 * product6

### 连续变量的处理

In [None]:
# Keep only the int64 columns of the base table as the numeric feature frame.
df_value = train_base_no_user.select_dtypes(include='int64')

In [None]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, storing NaN instead of +/-inf for zero denominators."""
    return (numerator / denominator).replace([np.inf, -np.inf], np.nan)


# Derived count features. The ratio features go through _safe_ratio: a plain
# division yields +/-inf whenever the denominator is 0, and a single inf makes
# the column maximum infinite, which ruins the min-max scaling applied to
# df_value afterwards (finite rows collapse to 0, the inf rows become NaN).
df_value['product7_success_cnt'] = df_value['product7_cnt'] - df_value['product7_fail_cnt']

df_value['card_cnt'] = (df_value['card_a_cnt'] + df_value['card_b_cnt']
                        + df_value['card_c_cnt'] + df_value['card_d_cnt'])

df_value['ip_cnt_avg'] = _safe_ratio(df_value['ip_cnt'], df_value['login_days_cnt'])

df_value['login_cnt_period1_avg'] = _safe_ratio(df_value['login_cnt_period1'], df_value['login_days_cnt'])
df_value['login_cnt_period2_avg'] = _safe_ratio(df_value['login_cnt_period2'], df_value['login_days_cnt'])
df_value['login_cnt_period'] = df_value['login_cnt_period1'] + df_value['login_cnt_period2']
df_value['login_cnt_period_avg'] = _safe_ratio(df_value['login_cnt_period'], df_value['login_days_cnt'])

df_value['service_cnt'] = df_value['service1_cnt'] + df_value['service2_cnt']
df_value['service_avg1_amt'] = _safe_ratio(df_value['service1_amt'], df_value['service1_cnt'])

df_value['op_cnt'] = df_value['op1_cnt'] + df_value['op2_cnt']

In [None]:
# 归一化
df_value = (df_value-df_value.min())/(df_value.max()-df_value.min())

df_base = pd.DataFrame()
df_base['user'] = user
df_base = pd.concat([df_base, one_hot_df, train_base_categoty, df_value],axis=1)

In [None]:
# Sanity check: print the shape of the assembled base feature table and preview its first rows.
print(df_base.shape)
df_base.head()

## OP处理

In [None]:
# One row per user that appears in the operation table; the per-user
# aggregates computed below are attached to this frame column by column.
group = train_op.groupby(['user']).count()
op_df = pd.DataFrame({'user': group.index})

In [None]:
# Preview the first rows of the raw operation table.
train_op.head()

In [None]:
# Per-user non-null count and distinct-value count for every operation column
# except the first and the last one (selected by position).
count_column = train_op.columns[1:-1]
print(count_column)

op_by_user = train_op.groupby('user')
for op_column in count_column:
    # .values assigns by position; the rows line up with op_df['user'] because
    # both come from a groupby on 'user'.
    op_df[f'op_{op_column}_count'] = op_by_user[op_column].count().values
    op_df[f'op_{op_column}_nunique'] = op_by_user[op_column].nunique().values

In [None]:
from datetime import timedelta
def timedelta2sec(delta):
    """Convert a textual time delta such as ``'3 days 04:05:06.5'`` to seconds.

    Accepted layouts are ``'<d> days HH:MM:SS[.ffffff]'``, the
    ``str(datetime.timedelta)`` variants ``'<d> day, H:MM:SS'`` /
    ``'<d> days, H:MM:SS'``, and a bare ``'HH:MM:SS'`` with no day part.

    Args:
        delta: The time delta as a string, or a missing value (None / NaN).

    Returns:
        The duration in seconds as a float; NaN for a missing value.

    Raises:
        ValueError: If the text does not match one of the accepted layouts.
    """
    # Missing values carry no duration; propagate NaN instead of crashing.
    if delta is None or (isinstance(delta, float) and delta != delta):
        return float('nan')

    text = str(delta).strip()
    day_text, day_token, clock_text = text.partition('day')
    if day_token:
        # Drop the plural "s" and the comma used by str(datetime.timedelta).
        clock_text = clock_text.lstrip('s,').strip()
    else:
        # No day component at all: the whole text is the clock part.
        day_text, clock_text = '0', text

    hours, minutes, seconds = clock_text.split(':') if clock_text else ('0', '0', '0')
    return timedelta(days=int(day_text), hours=int(hours),
                     minutes=int(minutes), seconds=float(seconds)).total_seconds()

# Convert the textual tm_diff column to seconds, then aggregate it per user.
train_op['time_diff_sec'] = train_op['tm_diff'].apply(timedelta2sec)

op_time_by_user = train_op.groupby('user')['time_diff_sec']
for statistic in ('max', 'min', 'mean', 'median', 'std'):
    op_df[f'op_time_{statistic}'] = op_time_by_user.agg(statistic).values

In [None]:
# Replace missing aggregates with 0, then inspect the result.
op_df = op_df.fillna(0)
print(op_df.shape)
op_df.head()

## 交易处理

In [None]:
# One row per user that appears in the transaction table; the per-user
# aggregates computed below are attached to this frame column by column.
group = train_trans.groupby(['user']).count()
trans_df = pd.DataFrame({'user': group.index})

### 离散列，全部统计有几个类别

In [None]:
# Per-user non-null count and distinct-value count for the transaction columns
# at positions 1-3 and 5-8 (selected by position).
count_column = list(train_trans.columns[1:4]) + list(train_trans.columns[5:9])

trans_by_user = train_trans.groupby('user')
for trans_column in count_column:
    print(trans_column)
    trans_df[f'trans_{trans_column}_count'] = trans_by_user[trans_column].count().values
    trans_df[f'trans_{trans_column}_nunique'] = trans_by_user[trans_column].nunique().values
#     Disabled alternative based on describe():
#     des_df = train_trans.groupby('user')[column].describe()
#     trans_df['trans_'+column+'_count'] = des_df['count'].values
#     trans_df['trans_'+column+'_n'] = des_df['unique'].values
#     trans_df['trans_'+column+'_fre'] = des_df['freq'].values

### 处理amount

In [None]:
# Per-user summary statistics of the transaction amount.
amount_by_user = train_trans.groupby(['user'])['amount']
for statistic in ('max', 'min', 'mean', 'median', 'std'):
    trans_df[f'trans_amount_{statistic}'] = amount_by_user.agg(statistic).values

### 处理时间

In [None]:
# Convert the textual tm_diff column to seconds, then aggregate it per user.
train_trans['time_diff_sec'] = train_trans['tm_diff'].apply(timedelta2sec)

trans_time_by_user = train_trans.groupby('user')['time_diff_sec']
for statistic in ('max', 'min', 'mean', 'median', 'std'):
    trans_df[f'trans_time_{statistic}'] = trans_time_by_user.agg(statistic).values

### 标准差、freq有空值，用0填充

In [None]:
# Replace missing aggregates with 0.
trans_df = trans_df.fillna(0)

In [None]:
# Preview the first rows of the per-user transaction features.
trans_df.head()

In [None]:
# (rows, columns) of the per-user transaction feature table.
trans_df.shape

## 数据合并

In [None]:
# train_df = df_base
# Left-join the operation and transaction aggregates onto the base features;
# users missing from either table keep NaN in those columns.
train_df = (
    df_base
    .merge(op_df, on='user', how='left')
    .merge(trans_df, on='user', how='left')
)

train = train_df

In [None]:
# (rows, columns) of the merged training table.
train.shape

In [None]:
# Positional hold-out split: the first 90% of rows train the model and the
# remaining 10% are used for validation (no shuffling).
length = len(train)
train_length = int(0.9*length)

features = train.drop(columns=['user','label'],axis=1)
labels = train['label'].values

# Missing feature values are replaced with 0 in both parts.
x = features[0:train_length].fillna(0)
y = labels[0:train_length]
valid_x = features[train_length:].fillna(0)
valid_y = labels[train_length:]

In [None]:
from lightgbm import LGBMClassifier, early_stopping
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# model = LogisticRegression()
# model.fit(x,y)

# Gradient-boosted classifier with strong L1/L2 regularisation.
model = LGBMClassifier(n_estimators=1000,
                       learning_rate=0.045,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       reg_alpha=100,
                       reg_lambda=100)

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of fit() was removed in LightGBM 4.0, while the callback works on old and
# new releases alike. eval_set uses the documented list-of-tuples form.
# NOTE(review): the same hold-out set drives early stopping and the AUC
# reported below, so that AUC is likely optimistic.
model.fit(x, y,
          eval_set=[(valid_x, valid_y)],
          callbacks=[early_stopping(stopping_rounds=5)]
          )

# Validation AUC computed from the predicted probability of the positive class.
prediction = model.predict_proba(valid_x)[:,1]
auc = roc_auc_score(valid_y,prediction)
auc

In [None]:
# Persist the fitted model to model.pkl in the current working directory.
joblib.dump(model,'model.pkl')

In [None]:
# Top-100 features by absolute coefficient. Only linear models (for example the
# commented-out LogisticRegression) expose `coef_`; LGBMClassifier does not, so
# an empty table is shown instead of raising AttributeError on Run All.
coef_importance = (
    pd.DataFrame({'column': x.columns,'importance': np.abs(model.coef_[0])}).sort_values(by='importance')[-100:]
    if hasattr(model, 'coef_')
    else pd.DataFrame(columns=['column', 'importance'])
)
coef_importance

In [None]:
# The 100 features with the highest model importance, in ascending order.
importance_table = pd.DataFrame({'column': x.columns, 'importance': model.feature_importances_})
importance_table.sort_values(by='importance').tail(100)

In [None]:
# select_feature=importance_df['column']

# x = x[select_feature]
# valid_x = valid_x[select_feature]

In [None]:
# from catboost import CatBoostClassifier


# model = CatBoostClassifier(
#     iterations=500,
#     random_seed=42,
#     logging_level='Silent'
# )

# model.fit(
#     x.values, y,
#     eval_set=(valid_x.values, valid_y),
# #     logging_level='Verbose',  # you can uncomment this for text output
#     plot=True
# )

# prediction = model.predict_proba(valid_x)[:,1]
# auc = roc_auc_score(valid_y,prediction)
# auc

baseline:
0.6769352059269422

添加trans amount的扩展变量：
0.6782946345739088

添加trans ip3的扩展变量：
0.6791192236191647

添加trans type直接数值化：
0.6796788342378327

添加所有的des特征，并修改了学习率：
0.6823851869318595

添加avg特征：
0.6831416556804824