In [1]:
import gc
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the full (unsampled) data set.
# The behaviour log, user profiles, labelled training pairs and the pairs to
# score come from the "format1" files; `train_data` comes from the "format2"
# training file.
user_log = pd.read_csv(
    './user_log_format1.csv',
    dtype={'time_stamp': 'str'},  # keep the raw stamp as text; it is parsed in a later cell
)
user_info = pd.read_csv('./user_info_format1.csv')
train_data1, submission, train_data = (
    pd.read_csv(path)
    for path in ('./train_format1.csv', './test_format1.csv', './train_format2.csv')
)

In [3]:
# Tag every row with the split it came from, then stack both splits into a
# single frame so that features are computed once for train and test together.
for frame, tag in ((train_data1, 'train'), (submission, 'test')):
    frame['origin'] = tag
matrix = pd.concat((train_data1, submission), ignore_index=True, sort=False)

In [4]:
# Use `merchant_id` as the column name (the raw log calls it `seller_id`).
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)

# Missing brands become 0. Assign the result back instead of calling
# `fillna(..., inplace=True)` on the selected column: the chained in-place form
# does not update `user_log` under pandas Copy-on-Write, which would leave NaN
# in the column and make the int32 cast below fail.
user_log['brand_id'] = user_log['brand_id'].fillna(0)

# Downcast the id columns to int32.
for column in ('user_id', 'merchant_id', 'item_id', 'cat_id', 'brand_id'):
    user_log[column] = user_log[column].astype('int32')

# NOTE(review): the stamp is parsed as hour/minute. Confirm the raw column
# really is HHMM; if it is a month/day code (mmdd) the format should be
# '%m%d', and the `.dt.seconds`-based duration features computed in later
# cells would have to switch to whole-day differences at the same time.
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')


In [5]:
# Label-encode the merchant ids.
# The value 0 is added to the fitted vocabulary in front of the ids that occur
# in the log, so 0 is always a known class of the encoder.
lbe_merchant_id = LabelEncoder().fit(np.r_[0, user_log['merchant_id'].values])

# Apply the same encoder to every frame that carries a merchant id.
for frame in (user_log, matrix):
    frame['merchant_id'] = lbe_merchant_id.transform(frame['merchant_id'])


In [6]:
# User ids: fit on the behaviour log, then re-use the same encoder for the
# profile table and the train/test matrix so all three share one id space.
lbe_user_id = LabelEncoder().fit(user_log['user_id'])
for frame in (user_log, user_info, matrix):
    frame['user_id'] = lbe_user_id.transform(frame['user_id'])

# Item, category and brand ids only live in the behaviour log; each gets its
# own encoder.
lbe_item_id, lbe_cat_id, lbe_brand_id = LabelEncoder(), LabelEncoder(), LabelEncoder()
for column, encoder in (
    ('item_id', lbe_item_id),
    ('cat_id', lbe_cat_id),
    ('brand_id', lbe_brand_id),
):
    user_log[column] = encoder.fit_transform(user_log[column])

# Largest encoded merchant/user id. Not the last expression of the cell, so
# the value is computed but never displayed.
user_log['merchant_id'].max(), user_log['user_id'].max()

# Attach the user profile columns to every (user, merchant) pair.
matrix = matrix.merge(user_info, how='left', on='user_id')

In [7]:
# age_range codes: 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34];
# 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown.
# gender codes: 0 female, 1 male, 2 unknown.
#
# Missing values are filled by assigning the result back. The previous
# `matrix[col].fillna(..., inplace=True)` form is a chained in-place call on a
# column selection, which does not update `matrix` under pandas Copy-on-Write
# and would make the int8 casts fail on the remaining NaN.
matrix['age_range'] = matrix['age_range'].fillna(0).astype('int8')
matrix['gender'] = matrix['gender'].fillna(2).astype('int8')

# The label is stored as text; rows without a label become the string 'nan'.
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

# These frames are not referenced again in this cell; free their memory.
del user_info, train_data1
gc.collect()
print(matrix)

        user_id  merchant_id label origin  prob  age_range  gender
0         34175         3906   0.0  train   NaN          6       0
1         34175          121   0.0  train   NaN          6       0
2         34175         4356   1.0  train   NaN          6       0
3         34175         2217   0.0  train   NaN          6       0
4        230783         4818   0.0  train   NaN          0       0
...         ...          ...   ...    ...   ...        ...     ...
522336   228478         3111   nan   test   NaN          6       0
522337    97918         2341   nan   test   NaN          8       1
522338    97918         3971   nan   test   NaN          8       1
522339    32638         3536   nan   test   NaN          0       0
522340    32638         3319   nan   test   NaN          0       0

[522341 rows x 7 columns]


In [8]:
# User-level features, computed over the whole behaviour log.
groups = user_log.groupby(['user_id'])

# u1: number of log rows (interactions) per user.
temp = groups.size().reset_index(name='u1')
matrix = matrix.merge(temp, how='left', on='user_id')

# u2..u5: number of distinct items / categories / merchants / brands the user
# interacted with. One aggregation and one merge per feature, in this order.
for feature, source in (
    ('u2', 'item_id'),
    ('u3', 'cat_id'),
    ('u4', 'merchant_id'),
    ('u5', 'brand_id'),
):
    temp = groups[source].agg([(feature, 'nunique')]).reset_index()
    matrix = matrix.merge(temp, how='left', on='user_id')


In [9]:
# u6: span between a user's first and last log entry, expressed in hours.
# `groups` is the per-user grouping of `user_log` built in the previous cell.
# NOTE(review): `.dt.seconds` only returns the seconds component of the
# difference; that equals the full span only while all stamps fall on the
# same day, which holds for stamps parsed with format='%H%M'.
temp = (
    groups['time_stamp']
    .agg([('F_time', 'min'), ('L_time', 'max')])
    .reset_index()
    .assign(u6=lambda span: (span['L_time'] - span['F_time']).dt.seconds / 3600)
)
matrix = matrix.merge(temp[['user_id', 'u6']], how='left', on='user_id')

# u7..u10: number of log rows per action_type 0, 1, 2, 3. Counts are left as
# produced by unstack(), i.e. a missing action type stays NaN (no zero fill here).
action_columns = {0: 'u7', 1: 'u8', 2: 'u9', 3: 'u10'}
temp = (
    groups['action_type']
    .value_counts()
    .unstack()
    .reset_index()
    .rename(columns=action_columns)
)
matrix = matrix.merge(temp, how='left', on='user_id')
print(matrix)

        user_id  merchant_id label origin  prob  age_range  gender    u1  \
0         34175         3906   0.0  train   NaN          6       0   451   
1         34175          121   0.0  train   NaN          6       0   451   
2         34175         4356   1.0  train   NaN          6       0   451   
3         34175         2217   0.0  train   NaN          6       0   451   
4        230783         4818   0.0  train   NaN          0       0    54   
...         ...          ...   ...    ...   ...        ...     ...   ...   
522336   228478         3111   nan   test   NaN          6       0  2004   
522337    97918         2341   nan   test   NaN          8       1    55   
522338    97918         3971   nan   test   NaN          8       1    55   
522339    32638         3536   nan   test   NaN          0       0    72   
522340    32638         3319   nan   test   NaN          0       0    72   

          u2  u3   u4   u5        u6      u7   u8    u9    u10  
0        256  45  109 

In [10]:
# Merchant-level features, computed over the whole behaviour log.
groups = user_log.groupby(['merchant_id'])

# m1: number of log rows (interactions) per merchant.
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# m2..m5: number of distinct users / items / categories / brands per merchant.
# The columns are selected with a list: selecting several columns from a
# GroupBy with a bare tuple of keys is deprecated and rejected by pandas >= 2.0.
temp = groups[['user_id', 'item_id', 'cat_id', 'brand_id']].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# m6..m9: number of log rows per action_type 0, 1, 2, 3 for each merchant.
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# m10: number of rows labelled -1 in the format2 training file, per merchant.
# `train_data` still holds raw merchant ids while `matrix` holds ids encoded by
# `lbe_merchant_id`, so the ids are encoded before the join. Ids unknown to the
# encoder are dropped first because `transform` raises on unseen values.
negatives = train_data.loc[train_data['label'] == -1, ['merchant_id']]
negatives = negatives[negatives['merchant_id'].isin(lbe_merchant_id.classes_)].copy()
negatives['merchant_id'] = lbe_merchant_id.transform(negatives['merchant_id'])
temp = negatives.groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
print(matrix)

  import sys


        user_id  merchant_id label origin  prob  age_range  gender    u1  \
0         34175         3906   0.0  train   NaN          6       0   451   
1         34175          121   0.0  train   NaN          6       0   451   
2         34175         4356   1.0  train   NaN          6       0   451   
3         34175         2217   0.0  train   NaN          6       0   451   
4        230783         4818   0.0  train   NaN          0       0    54   
...         ...          ...   ...    ...   ...        ...     ...   ...   
522336   228478         3111   nan   test   NaN          6       0  2004   
522337    97918         2341   nan   test   NaN          8       1    55   
522338    97918         3971   nan   test   NaN          8       1    55   
522339    32638         3536   nan   test   NaN          0       0    72   
522340    32638         3319   nan   test   NaN          0       0    72   

          u2  u3  ...     m1     m2    m3   m4  m5       m6     m7      m8  \
0        

In [11]:
# Features of each (user, merchant) pair, computed over the whole behaviour log.
groups = user_log.groupby(['user_id', 'merchant_id'])

# um1: number of log rows for the pair.
temp = groups.size().reset_index().rename(columns={0:'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um2..um4: number of distinct items / categories / brands for the pair.
# The columns are selected with a list: selecting several columns from a
# GroupBy with a bare tuple of keys is deprecated and rejected by pandas >= 2.0.
temp = groups[['item_id', 'cat_id', 'brand_id']].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um5..um8: number of log rows per action_type 0, 1, 2, 3 for the pair.
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um9: span between the pair's first and last log entry, in hours.
# NOTE(review): `.dt.seconds` only returns the seconds component of the
# difference; that equals the full span only while all stamps fall on the
# same day, which holds for stamps parsed with format='%H%M'.
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
print(temp)
print('-'*100)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
print(matrix)

  """


          user_id  merchant_id  um9
0               0          471  0.0
1               0          739  0.0
2               0          925  0.0
3               0         1019  0.0
4               0         1156  0.0
...           ...          ...  ...
14058661   424169         1082  0.0
14058662   424169         3469  0.0
14058663   424169         3736  0.0
14058664   424169         4268  0.1
14058665   424169         4963  0.0

[14058666 rows x 3 columns]
----------------------------------------------------------------------------------------------------
        user_id  merchant_id label origin  prob  age_range  gender    u1  \
0         34175         3906   0.0  train   NaN          6       0   451   
1         34175          121   0.0  train   NaN          6       0   451   
2         34175         4356   1.0  train   NaN          6       0   451   
3         34175         2217   0.0  train   NaN          6       0   451   
4        230783         4818   0.0  train   NaN          0

In [12]:
# Purchase-to-click ratios (count of action_type 2 over count of action_type 0):
#   r1 per user, r2 per merchant, r3 per (user, merchant) pair.
for ratio, purchases, clicks in (
    ('r1', 'u9', 'u7'),
    ('r2', 'm8', 'm6'),
    ('r3', 'um7', 'um5'),
):
    matrix[ratio] = matrix[purchases] / matrix[clicks]

# Every value still missing at this point (absent counts, undefined ratios,
# ...) becomes 0.
matrix = matrix.fillna(0)

# One-hot encode age_range (columns age_0 ... age_8) and gender (g_0, g_1, g_2),
# append the indicator columns and drop the two source columns.
indicator_frames = [
    pd.get_dummies(matrix[column], prefix=prefix)
    for column, prefix in (('age_range', 'age'), ('gender', 'g'))
]
matrix = pd.concat([matrix] + indicator_frames, axis=1)
matrix = matrix.drop(['age_range', 'gender'], axis=1)
print(matrix)

        user_id  merchant_id label origin  prob    u1    u2  u3   u4   u5  \
0         34175         3906   0.0  train   0.0   451   256  45  109  108   
1         34175          121   0.0  train   0.0   451   256  45  109  108   
2         34175         4356   1.0  train   0.0   451   256  45  109  108   
3         34175         2217   0.0  train   0.0   451   256  45  109  108   
4        230783         4818   0.0  train   0.0    54    31  17   20   19   
...         ...          ...   ...    ...   ...   ...   ...  ..  ...  ...   
522336   228478         3111   nan   test   0.0  2004  1173  71  278  282   
522337    97918         2341   nan   test   0.0    55    29  14   17   17   
522338    97918         3971   nan   test   0.0    55    29  14   17   17   
522339    32638         3536   nan   test   0.0    72    46  24   33   35   
522340    32638         3319   nan   test   0.0    72    46  24   33   35   

        ...  age_2  age_3  age_4  age_5  age_6  age_7  age_8  g_0  g_1  g_2

In [13]:
# Shift the action codes up by one (0..3 -> 1..4).
# NOTE(review): this rewrites user_log['action_type'] in place, so running the
# cell a second time maps the already shifted codes again (4 becomes NaN).
lbe_action_type={0:1,1:2,2:3,3:4}
user_log['action_type']=user_log['action_type'].map(lbe_action_type)

# Per-user behaviour sequences: collect every merchant id and action type of a
# user, in log order, into one Python list per column.
# The columns are selected with a list: selecting several columns from a
# GroupBy with a bare tuple of keys is deprecated and rejected by pandas >= 2.0.
temp=user_log.groupby('user_id')[['merchant_id','action_type']].agg(lambda x:list(x))
# Rename the two list columns to hist_merchant_id and hist_action_type.
temp.columns=['hist_merchant_id','hist_action_type']

# Attach each user's history lists to all of that user's rows
# (`temp` is indexed by user_id).
matrix = matrix.merge(temp, on=['user_id'], how='left')
print(matrix)

  """


        user_id  merchant_id label origin  prob    u1    u2  u3   u4   u5  \
0         34175         3906   0.0  train   0.0   451   256  45  109  108   
1         34175          121   0.0  train   0.0   451   256  45  109  108   
2         34175         4356   1.0  train   0.0   451   256  45  109  108   
3         34175         2217   0.0  train   0.0   451   256  45  109  108   
4        230783         4818   0.0  train   0.0    54    31  17   20   19   
...         ...          ...   ...    ...   ...   ...   ...  ..  ...  ...   
522336   228478         3111   nan   test   0.0  2004  1173  71  278  282   
522337    97918         2341   nan   test   0.0    55    29  14   17   17   
522338    97918         3971   nan   test   0.0    55    29  14   17   17   
522339    32638         3536   nan   test   0.0    72    46  24   33   35   
522340    32638         3319   nan   test   0.0    72    46  24   33   35   

        ...  age_4  age_5  age_6  age_7  age_8  g_0  g_1  g_2  \
0       ..

In [14]:
# Bring every history list to exactly M entries: shorter lists are padded with
# trailing zeros, longer ones keep only their first M entries.
M=500


def _fit_to_length(sequence, length):
    """Return `sequence` (a list) as an array of exactly `length` items, zero-padded at the end."""
    padded = sequence + [0] * (length - len(sequence))
    return np.array(padded)[:length]


for feature in ('hist_merchant_id', 'hist_action_type'):
    matrix[feature] = matrix[feature].map(lambda history: _fit_to_length(history, M))

# Split the combined frame back into its train and test parts. The helper
# column `origin` is dropped from both; the test part also loses `label`.
is_train = matrix['origin'] == 'train'
is_test = matrix['origin'] == 'test'
train_data = matrix[is_train].drop(['origin'], axis=1)
test_data = matrix[is_test].drop(['label', 'origin'], axis=1)

# Separate the target column from the model inputs.
train_y = train_data['label']
train_X = train_data.drop(['label'], axis=1)
print(train_X)

        user_id  merchant_id  prob   u1   u2  u3   u4   u5        u6     u7  \
0         34175         3906   0.0  451  256  45  109  108  5.833333  410.0   
1         34175          121   0.0  451  256  45  109  108  5.833333  410.0   
2         34175         4356   0.0  451  256  45  109  108  5.833333  410.0   
3         34175         2217   0.0  451  256  45  109  108  5.833333  410.0   
4        230783         4818   0.0   54   31  17   20   19  5.166667   47.0   
...         ...          ...   ...  ...  ...  ..  ...  ...       ...    ...   
260859   359806         4325   0.0  117   49  25   33   32  1.850000  107.0   
260860   294526         3971   0.0  198   89  20   38   37  1.766667  162.0   
260861   294526          152   0.0  198   89  20   38   37  1.766667  162.0   
260862   294526         2537   0.0  198   89  20   38   37  1.766667  162.0   
260863   229246         4140   0.0  194  127  29   50   49  5.916667  181.0   

        ...  age_4  age_5  age_6  age_7  age_8  g_0

In [15]:
# 使用DIN模型
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat,get_feature_names
from deepctr.models import DIN, DIEN, DSIN
from sklearn.metrics import classification_report

In [16]:
# Every training row gets the constant action code 3, i.e. the shifted code of
# raw action_type 2 in `lbe_action_type`.
train_X['action_type'] = 3

# Columns treated as sparse (embedded) features, with their vocabulary sizes.
# Every other non-history column is fed to the model as a one-dimensional dense
# feature.
sparse_vocabulary = {
    'user_id': 424169+1,
    'merchant_id': 5007+1,
    'action_type': 4+1,
}

feature_columns = []
for column in train_X.columns:
    # The two history columns are declared separately as variable-length features.
    if column in ('hist_merchant_id', 'hist_action_type'):
        continue
    print(column)
    num = train_X[column].nunique()
    # Embedding width grows with the number of distinct values in the column.
    dim = 10 if num > 10000 else (8 if num > 1000 else 4)
    print(num)
    if column in sparse_vocabulary:
        feature_columns.append(SparseFeat(column, sparse_vocabulary[column], embedding_dim=dim))
    else:
        feature_columns.append(DenseFeat(column, 1))

print(train_X['hist_merchant_id'].shape)
len(train_X['hist_merchant_id'])

user_id
212062
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
merchant_id
1993
prob
1
u1
1793
u2
1117
u3
193
u4
425
u5
418
u6
186
u7
1704
u8
23
u9
133
u10
351
m1
1855
m2
1596
m3
562
m4
92
m5
53
m6
1822
m7
130
m8
1008
m9
879
m10
1360
um1
360
um2
184
um3
31
um4
19
um5
359
um6
12
um7
10
um8
52
um9
185
r1
12521
r2
1988
r3
1018
age_0
2
age_1
2
age_2
2
age_3
2
age_4
2
age_5
2
age_6
2
age_7
2
age_8
2
g_0
2
g_1
2
g_2
2
action_type
1
(260864,)


260864

In [17]:
# Show the history length the sequences were fitted to.
print('M=', M, sep=' ')

M= 500


In [18]:
from tqdm import tqdm


def _to_model_input(frame, names, sequence_features=('hist_merchant_id', 'hist_action_type')):
    """Convert `frame` into the dict-of-arrays input passed to the model.

    Args:
        frame: DataFrame holding every column listed in `names`.
        names: column names to export, one dict entry per name.
        sequence_features: columns whose cells are equal-length arrays; these
            are stacked into one 2-D array, because the history inputs must
            be two-dimensional.

    Returns:
        dict mapping each column name to a numpy array.
    """
    model_input = {name: frame[name].values for name in names}
    for fea in sequence_features:
        model_input[fea] = np.array([sequence for sequence in tqdm(model_input[fea])])
    return model_input


# History features: maxlen is the history length, vocabulary_size the size of
# the one-hot space. They share their embeddings with the `merchant_id` /
# `action_type` sparse features through `embedding_name`.
hist_columns = [VarLenSparseFeat(SparseFeat('hist_merchant_id',vocabulary_size=5007+1, embedding_dim=8, embedding_name='merchant_id'),maxlen=M),
                VarLenSparseFeat(SparseFeat('hist_action_type',  vocabulary_size=4+1, embedding_dim=4, embedding_name='action_type'),maxlen=M)]
# Drop history columns left over from an earlier run of this cell before adding
# them, so re-running the cell does not declare them twice.
feature_columns = [fc for fc in feature_columns if not isinstance(fc, VarLenSparseFeat)] + hist_columns
hist_features=['merchant_id','action_type']
print(feature_columns)

# DIN model, Adam optimizer, binary cross-entropy as loss and as metric.
model=DIN(feature_columns, hist_features)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])

# Training input: one array per column of train_X.
feature_names=list(train_X.columns)
train_model_input = _to_model_input(train_X, feature_names)
# Labels are stored as text, hence the conversion to float.
history = model.fit(train_model_input, train_y.map(float), verbose=True, epochs=10, validation_split=0.2,batch_size=512)

# Test input: same columns, same constant action code as the training rows.
test_data['action_type']=3
test_model_input = _to_model_input(test_data, feature_names)

# Predict and write the submission file.
prob = model.predict(test_model_input)
submission['prob'] = prob
# errors='ignore' keeps the cell re-runnable once `origin` has been removed.
submission.drop(['origin'], axis=1, inplace=True, errors='ignore')
# NOTE(review): the file name says "GIN" although the model is DIN; left
# unchanged in case something downstream expects this name.
submission.to_csv('prediction_GIN.csv', index=False)

[SparseFeat(name='user_id', vocabulary_size=424170, embedding_dim=10, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7f45232e5490>, embedding_name='user_id', group_name='default_group', trainable=True), SparseFeat(name='merchant_id', vocabulary_size=5008, embedding_dim=8, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x7f45225bd790>, embedding_name='merchant_id', group_name='default_group', trainable=True), DenseFeat(name='prob', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u1', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u2', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u3', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u4', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u5', dimension=1, dtype='float32', transform_fn=None), DenseFeat(name='u6', d

100%|██████████| 260864/260864 [00:00<00:00, 727902.93it/s]
100%|██████████| 260864/260864 [00:00<00:00, 1232405.23it/s]


Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


100%|██████████| 261477/261477 [00:00<00:00, 1116830.38it/s]
100%|██████████| 261477/261477 [00:00<00:00, 2426143.76it/s]
