In [121]:
import gc
import pandas as pd

In [122]:
# 使用format1进行加载
# 加载小样本
user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('data_format1_small/train.csv')
submission = pd.read_csv('data_format1_small/test.csv')

In [123]:
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,89355,664924,1429,3136,579.0,1111,2
1,89355,131438,1505,780,516.0,1110,0
2,89355,673082,1429,3136,579.0,1110,0
3,89355,664924,1429,3136,579.0,1110,0
4,89355,183665,1505,780,516.0,1110,0


In [124]:
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,191719,4.0,0.0
1,391524,5.0,1.0
2,184971,4.0,0.0
3,396296,4.0,0.0
4,39031,4.0,0.0


In [125]:
train_data1.head()

Unnamed: 0,user_id,merchant_id,label
0,365952,1203,0
1,42624,946,0
2,240000,2278,0
3,177792,951,0
4,322944,1892,0


In [126]:
submission.head()

Unnamed: 0,user_id,merchant_id,prob
0,40320,4173,0
1,309120,4775,1
2,117120,3826,0
3,127104,4048,1
4,331392,1978,0


In [127]:
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)

In [128]:
print(matrix)

       user_id  merchant_id  label origin  prob
0       365952         1203    0.0  train   NaN
1        42624          946    0.0  train   NaN
2       240000         2278    0.0  train   NaN
3       177792          951    0.0  train   NaN
4       322944         1892    0.0  train   NaN
...        ...          ...    ...    ...   ...
23888    47231         1748    NaN   test   0.0
23889    59519          798    NaN   test   0.0
23890   263039          639    NaN   test   0.0
23891   263039         3954    NaN   test   0.0
23892   423551         2954    NaN   test   0.0

[23893 rows x 5 columns]


In [129]:
matrix.drop(['prob'], axis=1, inplace=True)
# 连接user_info表，通过user_id关联
matrix = matrix.merge(user_info, on='user_id', how='left')

In [130]:
# 使用merchant_id(原列名seller_id)
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
matrix['age_range'].fillna(0, inplace=True)    # 0 unknown
matrix['gender'].fillna(2, inplace=True)    # 2 unknown
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1
gc.collect()
print(matrix)

       user_id  merchant_id label origin  age_range  gender
0       365952         1203   0.0  train          0       1
1        42624          946   0.0  train          2       0
2       240000         2278   0.0  train          3       0
3       177792          951   0.0  train          0       1
4       322944         1892   0.0  train          7       0
...        ...          ...   ...    ...        ...     ...
23888    47231         1748   nan   test          0       0
23889    59519          798   nan   test          3       0
23890   263039          639   nan   test          2       1
23891   263039         3954   nan   test          2       1
23892   423551         2954   nan   test          4       0

[23893 rows x 6 columns]


In [131]:
# User 特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')

In [132]:
# 使用agg 基于列的聚合操作，统计唯一值的个数item_id, cat_id, merchant_id, brand_id
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id',how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id',how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id',how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id',how='left')

In [133]:
# matrix.head()
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,89355,664924,1429,3136,579,1900-01-01 11:11:00,2
1,89355,131438,1505,780,516,1900-01-01 11:10:00,0
2,89355,673082,1429,3136,579,1900-01-01 11:10:00,0
3,89355,664924,1429,3136,579,1900-01-01 11:10:00,0
4,89355,183665,1505,780,516,1900-01-01 11:10:00,0


In [134]:
# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time','min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time']-temp['F_time']).dt.seconds/3600   
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,u5,u6
0,365952,1203,0.0,train,0,1,46,29,12,16,16,4.933333
1,42624,946,0.0,train,2,0,365,198,46,46,45,5.866667
2,240000,2278,0.0,train,3,0,47,31,14,15,17,5.833333
3,177792,951,0.0,train,0,1,234,105,23,35,36,5.833333
4,322944,1892,0.0,train,7,0,186,106,34,40,39,5.866667


In [135]:
# 统计操作类型为0，1 ，2， 3的 个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
print(temp)

action_type  user_id     u7  u8    u9  u10
0                  9   79.0 NaN   4.0  4.0
1                 21  582.0 NaN  31.0  7.0
2                 22  463.0 NaN   8.0  NaN
3                 45   19.0 NaN   1.0  NaN
4                 48   30.0 NaN  10.0  7.0
...              ...    ...  ..   ...  ...
19107         424069   88.0 NaN  12.0  NaN
19108         424074   23.0 NaN   4.0  NaN
19109         424076   38.0 NaN   9.0  NaN
19110         424110   46.0 NaN   5.0  NaN
19111         424139   15.0 NaN   8.0  NaN

[19112 rows x 5 columns]


In [136]:
matrix = matrix.merge(temp, on='user_id', how='left')

In [137]:
# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 商家被交互的user_id, item_id, cat_id, brand_id 唯一值个数
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 
                                                                                                 'item_id':'m3',
                                                                                                 'cat_id':'m4', 
                                                                                                 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
print(matrix)


       user_id  merchant_id label origin  age_range  gender    u1   u2  u3  \
0       365952         1203   0.0  train          0       1    46   29  12   
1        42624          946   0.0  train          2       0   365  198  46   
2       240000         2278   0.0  train          3       0    47   31  14   
3       177792          951   0.0  train          0       1   234  105  23   
4       322944         1892   0.0  train          7       0   186  106  34   
...        ...          ...   ...    ...        ...     ...   ...  ...  ..   
23888    47231         1748   nan   test          0       0   128   97  28   
23889    59519          798   nan   test          3       0  1286  540  55   
23890   263039          639   nan   test          2       1     9    8   7   
23891   263039         3954   nan   test          2       1     9    8   7   
23892   423551         2954   nan   test          4       0   197   85  36   

       u4  ...        u6      u7  u8    u9   u10    m1   m2   m

In [138]:
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
print(matrix)

       user_id  merchant_id label origin  age_range  gender    u1   u2  u3  \
0       365952         1203   0.0  train          0       1    46   29  12   
1        42624          946   0.0  train          2       0   365  198  46   
2       240000         2278   0.0  train          3       0    47   31  14   
3       177792          951   0.0  train          0       1   234  105  23   
4       322944         1892   0.0  train          7       0   186  106  34   
...        ...          ...   ...    ...        ...     ...   ...  ...  ..   
23888    47231         1748   nan   test          0       0   128   97  28   
23889    59519          798   nan   test          3       0  1286  540  55   
23890   263039          639   nan   test          2       1     9    8   7   
23891   263039         3954   nan   test          2       1     9    8   7   
23892   423551         2954   nan   test          4       0   197   85  36   

       u4  ...   u10    m1   m2   m3  m4  m5      m6    m7     

In [139]:
# 按照merchant_id统计随机负采样的个数
train_data = pd.read_csv('./data_format2/train_format2.csv')
train_data.head()
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
print(matrix)

       user_id  merchant_id label origin  age_range  gender    u1   u2  u3  \
0       365952         1203   0.0  train          0       1    46   29  12   
1        42624          946   0.0  train          2       0   365  198  46   
2       240000         2278   0.0  train          3       0    47   31  14   
3       177792          951   0.0  train          0       1   234  105  23   
4       322944         1892   0.0  train          7       0   186  106  34   
...        ...          ...   ...    ...        ...     ...   ...  ...  ..   
23888    47231         1748   nan   test          0       0   128   97  28   
23889    59519          798   nan   test          3       0  1286  540  55   
23890   263039          639   nan   test          2       1     9    8   7   
23891   263039         3954   nan   test          2       1     9    8   7   
23892   423551         2954   nan   test          4       0   197   85  36   

       u4  ...    m1   m2   m3  m4  m5      m6    m7     m8    

In [140]:
# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
# 统计行为个数
temp = groups.size().reset_index().rename(columns={0:'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 
                                                                                       'cat_id':'um3', 
                                                                                      'brand_id':'um4'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 
                                                                                     2:'um7', 3:'um8'})
matrix = matrix.merge(temp, on=['user_id','merchant_id'], how='left')
# 统计时间间隔
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last']-temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,...,m10,um1,um2,um3,um4,um5,um6,um7,um8,um9
0,365952,1203,0.0,train,0,1,46,29,12,16,...,3518,8,4,2,1,7.0,,1.0,,0.016667
1,42624,946,0.0,train,2,0,365,198,46,46,...,2816,42,14,2,1,34.0,,7.0,1.0,0.016667
2,240000,2278,0.0,train,3,0,47,31,14,15,...,2604,2,1,1,1,1.0,,1.0,,0.0
3,177792,951,0.0,train,0,1,234,105,23,35,...,1932,11,4,1,1,10.0,,1.0,,0.016667
4,322944,1892,0.0,train,7,0,186,106,34,40,...,5471,19,10,6,1,10.0,,9.0,,0.016667


In [141]:
# 用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7']
# 商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6']
# 不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)
print(matrix)

       user_id  merchant_id label origin  age_range  gender    u1   u2  u3  \
0       365952         1203   0.0  train          0       1    46   29  12   
1        42624          946   0.0  train          2       0   365  198  46   
2       240000         2278   0.0  train          3       0    47   31  14   
3       177792          951   0.0  train          0       1   234  105  23   
4       322944         1892   0.0  train          7       0   186  106  34   
...        ...          ...   ...    ...        ...     ...   ...  ...  ..   
23888    47231         1748   nan   test          0       0   128   97  28   
23889    59519          798   nan   test          3       0  1286  540  55   
23890   263039          639   nan   test          2       1     9    8   7   
23891   263039         3954   nan   test          2       1     9    8   7   
23892   423551         2954   nan   test          4       0   197   85  36   

       u4  ...  um3  um4   um5  um6  um7  um8       um9        

In [142]:
# 修改age_range字段名称为age_0, age_1, age_2, ....,age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)
print(matrix)

       user_id  merchant_id label origin    u1   u2  u3  u4  u5        u6  \
0       365952         1203   0.0  train    46   29  12  16  16  4.933333   
1        42624          946   0.0  train   365  198  46  46  45  5.866667   
2       240000         2278   0.0  train    47   31  14  15  17  5.833333   
3       177792          951   0.0  train   234  105  23  35  36  5.833333   
4       322944         1892   0.0  train   186  106  34  40  39  5.866667   
...        ...          ...   ...    ...   ...  ...  ..  ..  ..       ...   
23888    47231         1748   nan   test   128   97  28  39  40  5.816667   
23889    59519          798   nan   test  1286  540  55  93  96  6.000000   
23890   263039          639   nan   test     9    8   7   7   7  5.783333   
23891   263039         3954   nan   test     9    8   7   7   7  5.783333   
23892   423551         2954   nan   test   197   85  36  39  40  5.916667   

       ...  age_2  age_3  age_4  age_5  age_6  age_7  age_8  g_0  g_1  g_2 

In [143]:
# 分割训练数据和测试数据
train_data = matrix[matrix['origin']=='train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin']=='test'].drop(['label', 'origin'],axis=1)
x_train, y_train = train_data.drop(['label'], axis=1), train_data['label']
# del temp, matrix
# gc.collect()

In [144]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import xgboost as xgb


In [145]:
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)
# 使用XGBoost
model = xgb.XGBClassifier(max_depth=8,
                         n_estimators=1000, 
                         min_child_weight=300, 
                         colsample_bylevel=0.8, 
                         subsample=0.8,
                         eta=0.3,
                         seed=42)
model.fit(x_train, y_train, eval_metric='auc', eval_set=[(x_train, y_train), (x_test, y_test)], verbose=True,
         # 早停法，如果auc在10epoch没有进步就stop
         early_stopping_rounds=10)
model.fit(x_train, y_train)
model.score(x_test, y_test)
# prob = model.predict_proba(test_data)

[0]	validation_0-auc:0.587132	validation_1-auc:0.544609
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.601003	validation_1-auc:0.56358
[2]	validation_0-auc:0.618314	validation_1-auc:0.577322
[3]	validation_0-auc:0.629705	validation_1-auc:0.587584
[4]	validation_0-auc:0.634931	validation_1-auc:0.596646
[5]	validation_0-auc:0.636111	validation_1-auc:0.59682
[6]	validation_0-auc:0.637842	validation_1-auc:0.593663
[7]	validation_0-auc:0.637049	validation_1-auc:0.588446
[8]	validation_0-auc:0.637302	validation_1-auc:0.590855
[9]	validation_0-auc:0.637393	validation_1-auc:0.590859
[10]	validation_0-auc:0.637371	validation_1-auc:0.591827
[11]	validation_0-auc:0.637946	validation_1-auc:0.594021
[12]	validation_0-auc:0.636454	validation_1-auc:0.594042
[13]	validation_0-auc:0.638106	validation_1-auc:0.595595
[14]	validation_0-auc:0.63971	validation_1-auc:0.595194
[

0.9386210762331838

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14269 entries, 14563 to 5984
Data columns (total 45 columns):
user_id        14269 non-null int32
merchant_id    14269 non-null int32
u1             14269 non-null int64
u2             14269 non-null int64
u3             14269 non-null int64
u4             14269 non-null int64
u5             14269 non-null int64
u6             14269 non-null float64
u7             14269 non-null float64
u8             14269 non-null float64
u9             14269 non-null float64
u10            14269 non-null float64
m1             14269 non-null int64
m2             14269 non-null int64
m3             14269 non-null int64
m4             14269 non-null int64
m5             14269 non-null int64
m6             14269 non-null float64
m7             14269 non-null float64
m8             14269 non-null float64
m9             14269 non-null float64
m10            14269 non-null int64
um1            14269 non-null int64
um2            14269 non-null int64
um3   

In [149]:
y_train = pd.to_numeric(y_train)
y_train
y_train.value_counts()
y_test = pd.to_numeric(y_test)

In [156]:
# 使用LightGBM
import lightgbm as lgb

param = {'boosting_type':'gbdt', 
         'objective':'binary',
         'metric':'auc',
         'learning_rate':0.01, 
         'max_depth':15,
         'feature_fraction':0.8,
         'bagging_fraction':0.9,
         'bagging_freq':8,
         'lambda_l1':0.6,
         'lambda_l2':0,
    
}
# params = {
#     "objective": "binary",
#     "boosting": "gbdt",
#     "num_leaves": 1280,
#     "learning_rate": 0.05,
#     "feature_fraction": 0.85,
#     "reg_lambda": 2,
#     "metric": "auc",
#     "random_seed":10
# }
train_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
model = lgb.train(params, train_data, num_boost_round=10000, early_stopping_rounds=100, 
                  valid_sets=[train_data, val_data])
predict = model.predict(x_test)
print(predict)


[1]	training's auc: 0.891205	valid_1's auc: 0.549435
Training until validation scores don't improve for 100 rounds
[2]	training's auc: 0.913985	valid_1's auc: 0.547499
[3]	training's auc: 0.941568	valid_1's auc: 0.545963
[4]	training's auc: 0.955734	valid_1's auc: 0.534824
[5]	training's auc: 0.966434	valid_1's auc: 0.547309
[6]	training's auc: 0.972228	valid_1's auc: 0.543393
[7]	training's auc: 0.975524	valid_1's auc: 0.544305
[8]	training's auc: 0.980473	valid_1's auc: 0.542575
[9]	training's auc: 0.982676	valid_1's auc: 0.543269
[10]	training's auc: 0.986026	valid_1's auc: 0.546308
[11]	training's auc: 0.987854	valid_1's auc: 0.551168
[12]	training's auc: 0.98945	valid_1's auc: 0.55299
[13]	training's auc: 0.99057	valid_1's auc: 0.558291
[14]	training's auc: 0.991805	valid_1's auc: 0.563003
[15]	training's auc: 0.992691	valid_1's auc: 0.561006
[16]	training's auc: 0.993624	valid_1's auc: 0.561107
[17]	training's auc: 0.994213	valid_1's auc: 0.564311
[18]	training's auc: 0.994758	va

[0.01953173 0.02296858 0.0939187  ... 0.07748268 0.01432783 0.03037769]
