# 天猫用户复购预测

**天池比赛地址**

https://tianchi.aliyun.com/competition/entrance/231576/introduction

# 数据探索

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 训练集

**user_id**: A unique id for the shopper.

**merchant_id**: A unique id for the merchant.

**label**: It is an enumerated type {0, 1}, 
* 1 : for repeat buyer
* 0 : for non-repeat buyer
* empty: for test data.

In [2]:
# Load the labelled (user_id, merchant_id) training pairs and preview the first rows.
train_raw = pd.read_csv('./train_format1.csv')
train_raw.head(5)

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


In [3]:
# Print column dtypes, non-null counts and memory usage of the training table.
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int64
 1   merchant_id  260864 non-null  int64
 2   label        260864 non-null  int64
dtypes: int64(3)
memory usage: 6.0 MB


In [4]:
# Count rows per label value to see how balanced the two classes are.
train_raw['label'].value_counts()

0    244912
1     15952
Name: label, dtype: int64

## 用户信息表

**user_id**：A unique id for the shopper.

**age_range**
* 1 for <18; 
* 2 for [18,24]; 
* 3 for [25,29]; 
* 4 for [30,34]; 
* 5 for [35,39]; 
* 6 for [40,49]; 
* 7 for >= 50 
* 8 for >= 50 
* NULL for unknown.
* 0 for unknown

**gender**
* 0 for female
* 1 for male
* 2 and NULL for unknown.

In [5]:
# Load the user profile table (age_range, gender) and preview the first rows.
user_info_raw = pd.read_csv('./user_info_format1.csv')
user_info_raw.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [6]:
# Print column dtypes, non-null counts and memory usage of the user profile table.
user_info_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int64  
 1   age_range  421953 non-null  float64
 2   gender     417734 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 9.7 MB


In [7]:
# Count missing values per column of the user profile table.
user_info_raw.isnull().sum()

user_id         0
age_range    2217
gender       6436
dtype: int64

## 用户行为表

**user_id**：A unique id for the shopper.

**item_id**：A unique id for the item.

**cat_id**：A unique id for the category that the item belongs to.

**seller_id**：A unique id for the merchant.

**brand_id**：A unique id for the brand of the item.

**time_stamp**：Date the action took place (format: mmdd)

**action_type**：It is an enumerated type {0, 1, 2, 3}
* 0 is for click
* 1 is for add-to-cart
* 2 is for purchase 
* 3 is for add-to-favourite.

In [8]:
# Load the behaviour log and rename seller_id to merchant_id so the column
# name matches the one used in the training and submission tables.
user_log_raw = pd.read_csv('./user_log_format1.csv')
user_log_raw.rename(columns={'seller_id': 'merchant_id'}, inplace=True)
user_log_raw.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [9]:
# Print column dtypes and memory usage of the behaviour log.
user_log_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   cat_id       int64  
 3   merchant_id  int64  
 4   brand_id     float64
 5   time_stamp   int64  
 6   action_type  int64  
dtypes: float64(1), int64(6)
memory usage: 2.9 GB


In [10]:
# Count missing values per column of the behaviour log.
user_log_raw.isnull().sum()

user_id            0
item_id            0
cat_id             0
merchant_id        0
brand_id       91015
time_stamp         0
action_type        0
dtype: int64

## 上传数据格式

In [11]:
# Submission file format: (user_id, merchant_id) pairs with an empty prob column.
sub = pd.read_csv('./test_format1.csv')
sub.head(5)

Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,
1,360576,1581,
2,98688,1964,
3,98688,3645,
4,295296,3361,


# 数据清洗

In [12]:
# Fill missing values with the documented "unknown" codes:
# age_range -> 0, gender -> 2; a missing brand_id becomes 0.0.
user_info_raw.fillna(value={'age_range': 0, 'gender': 2}, inplace=True)
user_log_raw.fillna(value={'brand_id': 0.0}, inplace=True)

# Parse the mmdd integer timestamps into datetimes.
user_log_raw['time_stamp'] = pd.to_datetime(user_log_raw['time_stamp'], format='%m%d')

# Tag the test rows with label 3 so they can be told apart from the
# training rows (labels 0/1) after both tables are concatenated.
sub['label'] = 3
sub_drop = sub.drop(labels='prob', axis=1)

# 机器学习模型

## 特征构造

In [13]:
def _nunique_frames(grouped, cols, prefix):
    """Yield one frame per column in *cols* holding its distinct-value count per group."""
    for col in cols:
        name = '{}_{}_nunique'.format(prefix, col)
        yield grouped[col].nunique().rename(name).reset_index()


def _action_type_counts(grouped, prefix):
    """Return one row per group with a count column for every action type.

    Columns are named ``<prefix>_action_type_<t>``. The four documented action
    types (0-3) are always present, even when one of them never occurs in the
    log, so the feature schema is stable and later ratio features cannot hit a
    missing column.
    """
    counts = grouped['action_type'].value_counts().unstack()
    known_types = range(4)
    all_types = sorted(set(known_types).union(counts.columns))
    counts = counts.reindex(columns=all_types).fillna(0)
    names = {t: '{}_action_type_{}'.format(prefix, t) for t in known_types}
    return counts.rename(columns=names).reset_index()


def build_features(train, test, user_info, activity_log):
    """Build user, merchant and user-merchant features for train and test rows.

    Parameters
    ----------
    train, test : DataFrame
        Frames with ``user_id``, ``merchant_id`` and ``label`` columns. Rows
        labelled 0/1 are returned as training rows, rows labelled 3 as test rows.
    user_info : DataFrame
        ``user_id``, ``age_range`` and ``gender``; the latter two are one-hot encoded.
    activity_log : DataFrame
        Behaviour log with ``user_id``, ``item_id``, ``cat_id``, ``merchant_id``,
        ``brand_id``, ``time_stamp`` (datetime) and ``action_type`` columns.

    Returns
    -------
    (train, label, test)
        Feature frame of the labelled rows, their labels, and the feature
        frame of the test rows. Neither feature frame contains ``label``.
    """
    # --- combined frame, processed as one so both splits share a schema -----
    df = pd.concat([train, test], axis=0).sort_values('user_id')

    # --- user features ----------------------------------------------------
    user_group = activity_log.groupby('user_id')

    # One-hot encode age range and gender.
    df = pd.merge(df, user_info, how='left', on='user_id')
    df = pd.get_dummies(df, columns=['age_range', 'gender'])

    # Total number of log rows per user.
    df_temp = user_group.size().reset_index(name='user_log_count')
    df = pd.merge(df, df_temp, how='left', on='user_id')

    # Distinct items / categories / merchants / brands / action types per user.
    user_cols = ['item_id', 'cat_id', 'merchant_id', 'brand_id', 'action_type']
    for df_temp in _nunique_frames(user_group, user_cols, 'user'):
        df = pd.merge(df, df_temp, how='left', on='user_id')

    # Number of log rows per action type.
    df = pd.merge(df, _action_type_counts(user_group, 'user'), how='left', on='user_id')

    # Days between a user's first and last log row. The column keeps the name
    # 'time_stamp' so the returned schema stays unchanged for existing callers.
    span = user_group['time_stamp'].agg(['min', 'max'])
    df_temp = (span['max'] - span['min']).dt.days.rename('time_stamp').reset_index()
    df = pd.merge(df, df_temp, how='left', on='user_id')

    # Purchases per click; a user with purchases but no clicks (x / 0 = inf) gets 1.
    # NOTE: 0 / 0 stays NaN here, unlike the merchant ratios below.
    ratio = df['user_action_type_2'] / df['user_action_type_0']
    df['user_buy_click_ratio'] = ratio.replace(np.inf, 1)

    # Log rows per active day. A user whose activity falls on a single day has
    # a span of 0 days; count that as one day instead of dividing by zero,
    # which produced inf and broke downstream scaling.
    df['user_activity_ratio'] = df['user_log_count'] / df['time_stamp'].clip(lower=1)

    # Loyalty: log rows per distinct item / category / merchant / brand
    # (a larger value means the activity is more concentrated).
    for col in ['item_id', 'cat_id', 'merchant_id', 'brand_id']:
        df['user_{}_activity_ratio'.format(col)] = (
            df['user_log_count'] / df['user_{}_nunique'.format(col)]
        )

    # --- merchant features -------------------------------------------------
    merchant_group = activity_log.groupby('merchant_id')

    # Total number of log rows per merchant.
    df_temp = merchant_group.size().reset_index(name='merchant_log_count')
    df = pd.merge(df, df_temp, how='left', on='merchant_id')

    # Distinct items / users / categories / brands / action types per merchant.
    merchant_cols = ['item_id', 'user_id', 'cat_id', 'brand_id', 'action_type']
    for df_temp in _nunique_frames(merchant_group, merchant_cols, 'merchant'):
        df = pd.merge(df, df_temp, how='left', on='merchant_id')

    # Number of log rows per action type.
    df = pd.merge(df, _action_type_counts(merchant_group, 'merchant'),
                  how='left', on='merchant_id')

    # Purchases per click, capped at 1; anything not <= 1 (including NaN) becomes 1.
    ratio = df['merchant_action_type_2'] / df['merchant_action_type_0']
    df['merchant_buy_click_ratio'] = ratio.where(ratio <= 1, 1)

    # Richness of each merchant relative to the largest value among the rows of df.
    for col in ['item_id', 'cat_id', 'user_id', 'brand_id']:
        nunique_col = 'merchant_{}_nunique'.format(col)
        df['merchant_{}_ratio'.format(col)] = df[nunique_col] / df[nunique_col].max()

    # --- user x merchant features -------------------------------------------
    pair_keys = ['user_id', 'merchant_id']
    user_merchant_group = activity_log.groupby(pair_keys)

    # Distinct items / categories / brands / action types of the user at this merchant.
    pair_cols = ['item_id', 'cat_id', 'brand_id', 'action_type']
    for df_temp in _nunique_frames(user_merchant_group, pair_cols, 'user_merchant'):
        df = pd.merge(df, df_temp, how='left', on=pair_keys)

    # Number of log rows of the user at this merchant.
    df_temp = user_merchant_group.size().reset_index(name='user_merchant_log_count')
    df = pd.merge(df, df_temp, how='left', on=pair_keys)

    # Number of log rows per action type of the user at this merchant.
    df = pd.merge(df, _action_type_counts(user_merchant_group, 'user_merchant'),
                  how='left', on=pair_keys)

    # Purchases per click of the user at this merchant, capped at 1 (NaN -> 1).
    ratio = df['user_merchant_action_type_2'] / df['user_merchant_action_type_0']
    df['user_merchant_buy_click_ratio'] = ratio.where(ratio <= 1, 1)

    # --- split back into train / label / test --------------------------------
    is_labelled = df['label'].isin([0, 1])
    train = df[is_labelled].drop('label', axis=1)
    label = df.loc[is_labelled, 'label']
    test = df[df['label'] == 3].drop('label', axis=1)

    return train, label, test

In [14]:
# Build the feature tables and report their shapes.
train_set, label_set, test_set = build_features(train_raw, sub_drop, user_info_raw, user_log_raw)
print('train_set shape', train_set.shape)
print('label_set shape', label_set.shape)
print('test_set shape', test_set.shape)

train_set shape (260864, 56)
label_set shape (260864,)
test_set shape (261477, 56)


## 数据规范化

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Use every column except the two identifier columns as model features.
features_col = list(train_set.columns.drop(['user_id', 'merchant_id']))
y_train = label_set

# Standardise the features with statistics fitted on the training rows only,
# then apply the same transformation to the test rows.
ss = StandardScaler().fit(train_set[features_col])
x_train = ss.transform(train_set[features_col])
x_test = ss.transform(test_set[features_col])

## 模型训练

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Fit a logistic regression (at most 2000 solver iterations) and score the test rows.
lr = LogisticRegression(max_iter=2000).fit(x_train, y_train)
predict_proba = lr.predict_proba(x_test)

In [19]:
# Attach the positive-class probability to a copy of the test features.
sub_set = test_set.assign(prob=predict_proba[:, 1])

# Left-join onto the submission template so rows follow its original order.
result = sub[['user_id', 'merchant_id']].merge(
    sub_set[['user_id', 'merchant_id', 'prob']],
    how='left',
    on=['user_id', 'merchant_id'],
)
result.to_csv('./lr_result.csv', index=False)

score: 0.6326483

In [20]:
from lightgbm import LGBMClassifier

In [21]:
# Fit a LightGBM classifier with 300 boosting rounds and score the test rows.
lgb = LGBMClassifier(n_estimators=300).fit(x_train, y_train)
predict_proba = lgb.predict_proba(x_test)

In [22]:
# Attach the positive-class probability to a copy of the test features.
sub_set = test_set.assign(prob=predict_proba[:, 1])

# Left-join onto the submission template so rows follow its original order.
result = sub[['user_id', 'merchant_id']].merge(
    sub_set[['user_id', 'merchant_id', 'prob']],
    how='left',
    on=['user_id', 'merchant_id'],
)
result.to_csv('./lgb_result.csv', index=False)

score: 0.6723456

In [23]:
from xgboost import XGBClassifier

In [24]:
# Fit an XGBoost classifier with default settings and score the test rows.
xgb = XGBClassifier().fit(x_train, y_train)
predict_proba = xgb.predict_proba(x_test)

In [25]:
# Attach the positive-class probability to a copy of the test features.
sub_set = test_set.assign(prob=predict_proba[:, 1])

# Left-join onto the submission template so rows follow its original order.
result = sub[['user_id', 'merchant_id']].merge(
    sub_set[['user_id', 'merchant_id', 'prob']],
    how='left',
    on=['user_id', 'merchant_id'],
)
result.to_csv('./xgb_result.csv', index=False)

score: 0.6673236

# 深度模型

## 特征构造
在机器学习特征数据基础上，增加 merchant 和 action_type 的 sequence

In [26]:
def add_din_features(train_set, test_set, activity_log):
    """Add per-user behaviour sequences to the train and test feature frames.

    For every user the merchant ids and action types of their log rows are
    collected in log order, truncated or zero-padded at the end to a fixed
    length of 500, and merged onto the feature frames as ``merchant_id_seq``
    and ``action_type_seq``. A constant ``action_type`` column with value 3
    is added as well.

    The input frames are not modified. The returned frames keep the index of
    the corresponding input frame, so rows removed from the training frame
    can be matched against externally held labels, e.g.
    ``labels.loc[train.index]``.

    Parameters
    ----------
    train_set, test_set : DataFrame
        Feature frames containing a ``user_id`` column.
    activity_log : DataFrame
        Behaviour log with ``user_id``, ``merchant_id`` and ``action_type``
        (values 0-3) columns.

    Returns
    -------
    (train, test)
        Copies of the inputs with the three extra columns. Training rows that
        contain any missing value (for instance users without log rows) are
        dropped; test rows are all kept.
    """
    seq_len = 500

    # Collect each user's merchants and action types as Python lists.
    seqs = activity_log.groupby('user_id')[['merchant_id', 'action_type']].agg(lambda x: list(x))
    seqs.columns = ['merchant_id_seq', 'action_type_seq']

    # Shift action types by one so that 0 stays free for padding.
    type_to_idx = {0: 1, 1: 2, 2: 3, 3: 4}
    seqs['action_type_seq'] = seqs['action_type_seq'].apply(
        lambda seq: [type_to_idx[a] for a in seq]
    )

    # Fix both sequences to seq_len entries, padding with 0 at the end.
    padding = [0] * seq_len
    for col in seqs.columns:
        seqs[col] = seqs[col].apply(lambda seq: (seq + padding)[:seq_len])
    seqs = seqs.reset_index()

    def _attach(frame):
        # assign() works on a copy, so the caller's frame is left untouched.
        merged = pd.merge(frame.assign(action_type=3), seqs, how='left', on='user_id')
        # seqs has one row per user, so the left merge keeps row count and
        # order; restoring the index keeps rows traceable to the input.
        merged.index = frame.index
        return merged

    train = _attach(train_set).dropna(axis=0)
    test = _attach(test_set)

    return train, test

In [27]:
# Build the DIN input frames and check their shapes.
din_train_set, din_test_set  = add_din_features(train_set, test_set, user_log_raw)

print('din_train_set shape', din_train_set.shape)
print('din_test_set shape', din_test_set.shape)

din_train_set shape (260864, 59)
din_test_set shape (261477, 59)


## DIN 模型构建
构建模型的语句 `model_din = DIN(features_col, sequence_features)` 在这一步一直报错，问题尚未解决。

报错信息为 `IndexError: tuple index out of range`，目前没有查出原因，故此部分没有完成。

In [28]:
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat,get_feature_names

In [29]:
# Stack train and test rows so vocabulary sizes can be derived from all ids.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
temp_set = pd.concat([din_train_set, din_test_set])
temp_set.shape

(522341, 59)

In [30]:
# Sparse (categorical) feature columns.
sparse_col =[
    SparseFeat('user_id', 424170+1, embedding_dim=10),
    SparseFeat('merchant_id', 49959+1, embedding_dim=10),
    SparseFeat('action_type', 4+1, embedding_dim=4),
]

# Dense (numeric) feature columns: every column that is neither an id nor a sequence.
dense_col = [DenseFeat(x, 1) for x in din_train_set.columns if x not in ['user_id', 'merchant_id', 'merchant_id_seq', 'action_type_seq', 'action_type']]

# Variable-length sequence feature columns (padded to 500 entries).
# NOTE(review): embedding_name points at the matching sparse feature, but the
# embedding_dim here (8) differs from that feature's (10 / 4) — confirm
# against the DeepCTR documentation whether the dimensions must agree.
varlen_col = [VarLenSparseFeat(
    SparseFeat('merchant_id_seq', vocabulary_size=49959+1, embedding_dim=8, embedding_name='merchant_id'), maxlen=500),
              VarLenSparseFeat(
    SparseFeat('action_type_seq', vocabulary_size=4+1, embedding_dim=8, embedding_name='action_type'), maxlen=500)
]

# Combine all feature columns (this rebinds features_col, which previously
# held the list of plain feature names).
features_col = sparse_col + dense_col + varlen_col

# Names of the sparse features whose behaviour sequences are passed to DIN.
sequence_features = ['action_type', 'merchant_id']

# Model definition.
# NOTE(review): the DeepCTR DIN examples name the sequence columns
# 'hist_' + <feature name> (e.g. 'hist_item_id'); the '*_seq' names used here
# may not be recognised as history features and could be the cause of the
# recorded IndexError — TODO confirm against the DeepCTR DIN documentation.
model_din = DIN(features_col, sequence_features)
# Compile the model that was just built; the name `model` is never defined.
model_din.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/kernel:0' shape=(40, 1) dtype=float32>
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/bias:0' shape=(1,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


IndexError: tuple index out of range

## DIN 模型训练

In [None]:
def _as_model_input(frame):
    """Turn a feature frame into the {column name: ndarray} dict passed to the model."""
    inputs = {}
    for col in frame.columns:
        if col in ('merchant_id_seq', 'action_type_seq'):
            # Each cell holds a fixed-length list; stack them into one 2-D array.
            # Assumes every row has a sequence (no missing values) — verify for the test rows.
            inputs[col] = np.array(frame[col].tolist())
        else:
            inputs[col] = frame[col].values
    return inputs


# Inputs are passed as a dict keyed by feature name. Use the DIN frames, which
# carry the sequence columns, rather than the plain feature frames.
train_input = _as_model_input(din_train_set)
test_input = _as_model_input(din_test_set)
label_input = label_set.values

# Labels are matched to rows by position, so the lengths must agree.
if len(label_input) != len(din_train_set):
    raise ValueError('label_set and din_train_set have different numbers of rows')

# Train the model.
history = model_din.fit(train_input, label_input)

# Predict on the test rows (the model object is model_din; `model` is never defined).
proba = model_din.predict(test_input)