In [1]:
import pandas as pd
import numpy as np

test_group_data = pd.read_csv('./data/test_format1.csv')
train_group_data = pd.read_csv('./data/train_format1.csv')
user_info_data = pd.read_csv('./data/user_info_format1.csv')
user_log_data = pd.read_csv('./data/user_log_format1.csv').rename(columns={'seller_id': 'merchant_id'})
user_log_data['brand_id'] = user_log_data['brand_id'].astype(int)

In [2]:
user_info_data.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [3]:
user_log_data.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [4]:
train_group_data.head()

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


## 特征工程
第一，将time_stamp字段补全为完整的时间格式，并基于这个时间戳转换成季度，月，周，日的信息。

In [5]:
import datetime
user_log_data['time_stamp'] = [datetime.date(2015, int(str(i)[:-2]), int(str(i)[-2:]))
                                    for i in user_log_data['time_stamp']]
user_log_data['weekday'] = [i.weekday() for i in user_log_data['time_stamp']]
user_log_data['month'] = [i.month for i in user_log_data['time_stamp']]
user_log_data['season'] = ['summer' if i in [5, 6, 7] else 'autumn' for i in user_log_data['month']]
user_log_data['day'] = [i.day for i in user_log_data['time_stamp']]
# 以2015年一月一号为起点，计算出当前时间戳距离这个起点的日数
user_log_data['dayCount'] = [(i - datetime.date(2015, 1, 1)).days for i in user_log_data['time_stamp']]

第二，构造出用户的行为特征。

In [27]:
# 生成最热门的商品品类列表，详见附录-2
def gene_most_buy_labels():
    most_100_labels_list = []
    for target_column in ['item_id', 'cat_id', 'merchant_id', 'brand_id']:
        target_data = user_log_data[user_log_data['action_type'] == 2]
        most_100_labels_1 = target_data.groupby(target_column)['user_id'].count().sort_values().tail(100).index
        most_100_labels_2 = target_data.groupby(target_column)['user_id'].unique().apply(lambda x: len(x)).sort_values().tail(100).index
        most_100_labels = set(list(most_100_labels_1) + list(most_100_labels_2))
        most_100_labels_list.append(most_100_labels)
    return most_100_labels_list
most_100_labels_list = gene_most_buy_labels()

In [116]:
from tqdm import tqdm
user_feature_record_map = {}
user_concern_cat_list = {}
user_concern_brand_list = {}
total_user_id_set = set(train_group_data['user_id']) | set(test_group_data['user_id'])

user_feature_name = [('用户年龄', 'i'), ('用户性别', 'i'), ('用户点击数量', 'f'), ('用户加购数量', 'f'), ('用户购买数量', 'f'), ('用户收藏数量', 'f'),
                        ('用户点击转化率', 'f'), ('用户加购转化率', 'f'), ('用户收藏转化率', 'f')] + \
                        [('用户周计数活跃数量-%d' % i, 'f') for i in range(7)] + [('用户周计数活跃比率-%d' % i, 'f') for i in range(7)] +\
                        [('用户月计数活跃数量-%d' % i, 'f') for i in range(5,12)] + [('用户月计数活跃比率-%d' % i, 'f') for i in range(5,12)] +\
                        [('用户双11期间活跃数量-%d' % i, 'f') for i in range(5,12)] + [('用户双11期间活跃比率-%d' % i, 'f') for i in range(5,12)] +\
                        [('用户涉及的商品数量','f'), ('用户涉及的品类数量', 'f'), ('用户涉及的商家数量', 'f'), ('用户涉及的品牌数量', 'f')] + \
                        [('用户复购-%s-%s' % (i,j), 'f') for i in ['商品', '品类', '店铺', '品牌'] for j in ['类数量', '总数量', '比例']] +\
                        [('用户双11期间购买量占比', 'f'), ('用户-商品交互平均间隔', 'f'), ('用户-品类交互平均间隔', 'f'), ('用户-商家交互平均间隔', 'f'),
                        ('用户-品牌交互平均间隔', 'f')] + [('用户-热门商品交互数量', 'f'), ('用户-热门品类交互数量', 'f'), ('用户-热门店铺交互数量', 'f'),
                        ('用户-热门品牌交互数量', 'f')]

# 暂时先处理200个用户作为案例，全部用户的处理需要大约7个小时
for user_id in tqdm(list(total_user_id_set)[:200]):
    #print('正在处理用户：', user_id)
    user_id_data = user_log_data[user_log_data['user_id'] == user_id]
    # 该用户的年龄和性别
    _, user_age, user_gender = user_info_data[user_info_data['user_id']==user_id].values[0]
    # 该用户的总点击行为，加购行为，购买行为和收藏行为数量
    action_type_count = user_id_data['action_type'].value_counts()
    user_action_click = action_type_count.get(0, 0)
    user_action_toCart = action_type_count.get(1, 0)
    user_action_purchase = action_type_count.get(2, 0)
    user_action_collect = action_type_count.get(3, 0)
    # 该用户的点击转化率，加购转化率和收藏转化率
    user_action_click_to_purchase_rate = user_action_click / user_action_purchase
    user_action_toCart_to_purchase_rate = user_action_toCart / user_action_purchase
    user_action_collect_to_purchase_rate = user_action_collect / user_action_purchase
    # 计算出该用户在一周7天的活跃次数以及比例情况，这其中可能反应用户的作息规律和职业特性
    weekday_active_count = user_id_data.groupby('weekday')['user_id'].count()
    user_active_count_in_week = [weekday_active_count.get(i, 0) for i in range(7)]
    user_active_ratio_in_week = [i / max(1, sum(user_active_count_in_week)) for i in user_active_count_in_week]
    # 计算出该用户在5,6,7,8,9,10,11月分别的活跃次数和活跃比率
    month_active_count = user_id_data.groupby('month')['user_id'].count()
    user_active_count_in_mouth = [month_active_count.get(i, 0) for i in [5, 6, 7, 8, 9, 10, 11]]
    user_active_ratio_in_mouth = [i / max(1, sum(user_active_count_in_mouth)) for i in user_active_count_in_mouth]
    # 计算出消费者在双十一活动前n天的活跃次数和活跃比率（这里的n需要根据业务经验来确定）
    # 你来完成这个指标的计算
    user_active_in_11_span = []
    for date in range(5, 12):
        date_active_num = user_id_data[user_id_data['time_stamp'] == datetime.date(2015, 11, date)]['user_id'].count()
        user_active_in_11_span.append(date_active_num)
    user_active_in_11_span_ratio = [i/max(1, sum(user_active_in_11_span)) for i in user_active_in_11_span]
    # 消费者行为涉及的具体商品个数（这里的消费者行为包括全部的action_type）
    user_touch_item_num = len(set(user_id_data['item_id'].values))
    # 消费者行为涉及的类别个数
    user_touch_cat_num = len(set(user_id_data['cat_id'].values))
    # 消费者行为涉及的商家个数
    user_touch_merchant_num = len(set(user_id_data['merchant_id'].values))
    # 消费者行为涉及的品牌个数
    user_touch_brand_num = len(set(user_id_data['brand_id'].values))
    
    
    # 消费者复购的商品个数，类别个数，商家个数，品牌个数
    # 首先把该用户的全部购买样本筛选出来
    user_id_purchase_data = user_id_data[user_id_data['action_type']==2]
    # 双11期间购买次数占总购买次数的比例
    user_buy_in_11_ratio = len(user_id_purchase_data[user_id_purchase_data['time_stamp'] > datetime.date(2015, 11, 10)])/len(user_id_purchase_data)

    def temp_assist_func(column_name):
        # 将该用户的全部购买样本按照某一个指标进行分组计数
        c_count_map = user_id_purchase_data.groupby(column_name)['user_id'].count()
        c_buy_2up_map = c_count_map[c_count_map>1]
        # 购买次数大于1次的品类数量
        c_num = len(c_buy_2up_map)
        # 各品类的总复购次数
        c_rebuy_total_num = sum(c_buy_2up_map) - c_num
        # 复购率
        c_rebuy_ratio = c_rebuy_total_num / max(1, sum(c_count_map))
        return c_num, c_rebuy_total_num, c_rebuy_ratio
    
    # 消费者重复购买商品的统计情况
    user_rebuy_item_cate_num, user_rebuy_item_total_num, user_rebuy_item_ratio = temp_assist_func('item_id')
    # 消费者重复购买品类的统计情况
    user_rebuy_cat_cate_num, user_rebuy_cat_total_num, user_rebuy_cat_ratio = temp_assist_func('cat_id')
    # 消费者重复购买店铺的统计情况
    user_rebuy_mer_cate_num, user_rebuy_mer_total_num, user_rebuy_mer_ratio = temp_assist_func('merchant_id')
    # 消费者重复购买品牌的统计情况
    user_rebuy_brand_cate_num, user_rebuy_brand_total_num, user_rebuy_brand_ratio = temp_assist_func('brand_id')
    # 消费者交互行为的间隔天数波动水平
    def temp_assist_func_2(column_name):
        # 该操作的说明见附录-1
        c_std_mean = user_id_purchase_data.groupby(column_name)['dayCount'].std().fillna(0).mean()
        return c_std_mean
    user_react_item_days_std = temp_assist_func_2('item_id')
    user_react_cat_days_std = temp_assist_func_2('cat_id')
    user_react_mer_days_std = temp_assist_func_2('merchant_id')
    user_react_brand_days_std = temp_assist_func_2('brand_id')
    
    # 消费者关注过热门商品
    user_concern_hot_item = len(set(user_id_data['item_id'].values) & most_100_labels_list[0])
    # 消费者关注过热门品类
    user_concern_hot_cat = len(set(user_id_data['cat_id'].values) & most_100_labels_list[1])
    # 消费者关注过热门店铺
    user_concern_hot_mer = len(set(user_id_data['merchant_id'].values) & most_100_labels_list[2])
    # 消费者关注过热门品牌
    user_concern_hot_brand = len(set(user_id_data['brand_id'].values) & most_100_labels_list[3])
    
    # 消费者关注的品类列表
    user_concern_cat_list[user_id] = list(user_id_data.groupby('cat_id')['user_id'].count().sort_values().index.values[-20:])
    # 消费者关注的品牌列表
    user_concern_brand_list[user_id] = list(user_id_data.groupby('brand_id')['user_id'].count().sort_values().index.values[-20:])
    
    
    user_feature = [user_age, user_gender, user_action_click, user_action_toCart, user_action_purchase, user_action_collect,
                   user_action_click_to_purchase_rate, user_action_toCart_to_purchase_rate, user_action_collect_to_purchase_rate,
                   ] + \
                   user_active_count_in_week + user_active_ratio_in_week + \
                   user_active_count_in_mouth + user_active_ratio_in_mouth + \
                   user_active_in_11_span + user_active_in_11_span_ratio + \
                   [user_touch_item_num, user_touch_cat_num, user_touch_merchant_num, user_touch_brand_num] + \
                   [user_rebuy_item_cate_num, user_rebuy_item_total_num, user_rebuy_item_ratio, 
                   user_rebuy_cat_cate_num, user_rebuy_cat_total_num, user_rebuy_cat_ratio,
                   user_rebuy_mer_cate_num, user_rebuy_mer_total_num, user_rebuy_mer_ratio,
                   user_rebuy_brand_cate_num, user_rebuy_brand_total_num, user_rebuy_brand_ratio] + \
                   [user_buy_in_11_ratio, user_react_item_days_std, user_react_cat_days_std, user_react_mer_days_std, user_react_brand_days_std] +\
                   [user_concern_hot_item, user_concern_hot_cat, user_concern_hot_mer, user_concern_hot_brand]
    
    user_feature_record_map[user_id] = user_feature

user_feature_dataframe = pd.DataFrame(user_feature_record_map.values(), columns=[i[0] for i in user_feature_name])

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:12<00:00, 15.61it/s]


In [112]:
user_feature_dataframe.head(20)

Unnamed: 0,用户年龄,用户性别,用户点击数量,用户加购数量,用户购买数量,用户收藏数量,用户点击转化率,用户加购转化率,用户收藏转化率,用户周计数活跃数量-0,...,用户复购-品牌-比例,用户总体复购比例,用户-商品交互平均间隔,用户-品类交互平均间隔,用户-商家交互平均间隔,用户-品牌交互平均间隔,用户-热门商品交互数量,用户-热门品类交互数量,用户-热门店铺交互数量,用户-热门品牌交互数量
0,3.0,1.0,27,0,6,0,4.5,0.0,0.0,0,...,0.5,0.666667,0.0,0.0,0.0,0.0,0,5,0,0
1,3.0,0.0,47,0,14,2,3.357143,0.0,0.142857,20,...,0.428571,0.5,0.0,0.0,0.0,0.0,1,12,2,3
2,3.0,0.0,63,0,4,1,15.75,0.0,0.25,27,...,0.25,0.25,0.0,0.0,0.0,0.0,0,12,1,2
3,0.0,0.0,49,0,1,0,49.0,0.0,0.0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0,13,1,2
4,5.0,0.0,150,0,13,10,11.538462,0.0,0.769231,11,...,0.166667,0.230769,0.0,0.0,0.0,0.0,2,35,10,14
5,4.0,0.0,217,0,17,15,12.764706,0.0,0.882353,65,...,0.235294,0.176471,1.414214,1.414214,1.532065,1.414214,3,29,8,11
6,4.0,0.0,6,0,8,0,0.75,0.0,0.0,0,...,0.625,1.0,0.0,0.0,0.235702,0.235702,0,3,1,2
7,4.0,0.0,61,0,23,7,2.652174,0.0,0.304348,16,...,0.6,0.043478,2.532358,4.693725,4.933384,1.781742,2,8,3,7
8,5.0,0.0,79,0,4,4,19.75,0.0,1.0,18,...,0.0,0.5,0.0,0.0,0.0,0.0,1,18,0,1
9,4.0,0.0,56,0,7,1,8.0,0.0,0.142857,1,...,0.714286,0.285714,0.0,0.0,0.0,0.0,1,12,1,1


第三，构建店铺的特征。

In [53]:
total_merchant_id_set = set(train_group_data['merchant_id']) | set(test_group_data['merchant_id'])
merchant_feature_record_map = {}
merchant_sell_cat_list = {}
merchant_sell_brand_list = {}
for merchant_id in tqdm(list(total_merchant_id_set)[:200]):
    merchant_id_data = user_log_data[user_log_data['merchant_id'] == merchant_id]
    
    merchant_action_count_map = merchant_id_data.groupby('action_id')['user_id'].count()
    # 店铺总体的点击数量
    merchant_action_click_num = merchant_action_count_map.get(0, 0)
    # 店铺总体的加购数量
    merchant_action_toCart_num = merchant_action_count_map.get(1, 0)
    # 店铺总体的购买数量
    merchant_action_purchase_num = merchant_action_count_map.get(2, 0)
    # 店铺总体的收藏数量
    merchant_action_collect_num = merchant_action_count_map.get(3, 0)
    
    # 店铺的购买比加购系数
    merchant_action_purchase_div_toCart = merchant_action_purchase_num / max(1, merchant_action_toCart_num)
    # 店铺的收藏比购买系数
    merchant_action_collect_div_purchase = merchant_action_collect_num / max(1, merchant_action_purchase_num)
    # 店铺的加购比点击系数
    merchant_action_toCart_div_click = merchant_action_toCart_num / max(1, merchant_action_click_num)
    
    # 店铺中的商品总数
    merchant_store_item_num = len(merchant_id_data['item_id'].unique())
    # 店铺中的品类总数
    merchant_store_cat_num = len(merchant_id_data['cat_id'].unique())
    # 店铺中的品牌总数
    merchant_store_brand_num = len(merchant_id_data['brand_id'].unique())
    # 店铺中的潜在消费者总数
    merchant_store_user_num = len(merchant_id_data['user_id'].unique())
    # 店铺中的实际消费者总数
    merchant_store_user_num = len(merchant_id_data[merchant_id_data['action_type'] == 2]['user_id'].unique())
    # 店铺中的商品上架混乱程度
    merchant_store_entropy_coef = merchant_store_item_num / np.sqrt(merchant_store_cat_num * merchant_store_brand_num)
    # 店铺中的商品关注混乱程度
    merchant_concern_entropy_coef_item = merchant_id_data.groupby('item_id')['user_id'].count().std()
    # 店铺中的品类关注混乱程度
    merchant_concern_entropy_coef_cat = merchant_id_data.groupby('cat_id')['user_id'].count().std()
    # 店铺中的品牌关注混乱程度
    merchant_concern_entropy_coef_brand = merchant_id_data.groupby('brand_id')['user_id'].count().std()
    
    # 店铺总的重复购买消费者人数
    merchant_id_purchase_data = merchant_id_data[merchant_id_data['action_type'] == 2]
    temp_a_data = merchant_id_purchase_data.groupby('user_id')['user_id'].count()
    merchant_rebuy_user_num = temp_a_data[temp_a_data > 1].count()
    merchant_rebuy_user_ratio = merchant_rebuy_user_num / temp_a_data.count()
    # 店铺中商品被重复查看的比例
    a = merchant_id_data[merchant_id_data['action_type'] == 0].groupby(['user_id', 'item_id'])['user_id'].count()
    b = a[a>1]
    merchant_reClick_item_ratio = b.count() / max(1, a.count())
    # 店铺中品类被重复查看的比例
    a = merchant_id_data[merchant_id_data['action_type'] == 0].groupby(['user_id', 'cat_id'])['user_id'].count()
    b = a[a>1]
    merchant_reClick_cat_ratio = b.count() / max(1, a.count())
    # 店铺中品牌被重复查看的比例
    a = merchant_id_data[merchant_id_data['action_type'] == 0].groupby(['user_id', 'brand_id'])['user_id'].count()
    b = a[a>1]
    merchant_reClick_brand_ratio = b.count() / max(1, a.count())
    
    # 店铺主要销售的品类(数量最大的三个,如果不够三个的话，就将最大的复制，补全3个)
    merchant_main_cat = merchant_id_data.groupby('cat_id')['user_id'].count().sort_values().index.values[-3:]
    merchant_main_cat = list(merchant_main_cat[::-1]) + [merchant_main_cat[0] for i in range(3 - len(merchant_main_cat))]
    # 店铺主要销售的品牌(数量最大的三个,如果不够三个的话，就将最大的复制，补全3个)
    merchant_main_brand = merchant_id_data.groupby('brand_id')['user_id'].count().sort_values().index.values[-3:]
    merchant_main_brand = list(merchant_main_brand[::-1]) + [merchant_main_brand[0] for i in range(3 - len(merchant_main_brand))]
    
    
    
    # 商家出售的主要品类列表
    merchant_sell_cat_list[user_id] = list(merchant_id_data.groupby('cat_id')['user_id'].count().sort_values().index.values[-30:])
    # 商家出售的主要品牌列表
    merchant_sell_brand_list[user_id] = list(merchant_id_data.groupby('brand_id')['user_id'].count().sort_values().index.values[-30:])
    break
    

1994

### 附录-1: 某一汇总指标下重复交互行为的间隔天数波动情况计算
这里的汇总指标主要是指item_id,cat_id,merchant_id,brand_id等，指消费者重复关注于某一商品，某一类别，某一商家或某一品牌的行为通常会有多长时间的间隔，这反映了消费者是不是有“留恋”习惯。当然，这里没有只使用购买行为，而包含了全部行为在内，是因为购买行为数量较少，样本不具有代表性。以cat_id为例，首先，就该用户的涉及的cat_id对样本进行分组，比如cat_id=737这一品类下共有如下样本：

In [8]:
user_id_data = user_log_data[user_log_data['user_id'] == 502]
user_id_data[user_id_data['cat_id'] == 737]

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type,weekday,month,season,day,dayCount
21730474,502,306823,737,3904,3876.0,2015-05-21,0,3,5,summer,21,140
21730482,502,895126,737,1844,8455.0,2015-05-14,2,3,5,summer,14,133
21730496,502,134678,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730497,502,468097,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730499,502,481791,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730500,502,275938,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730501,502,481483,737,2124,7666.0,2015-06-01,2,0,6,summer,1,151
21730502,502,1041982,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730505,502,363585,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730506,502,77939,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151


从该表格可以看出，该用户对该品类的关注在时间上还是很集中的。所以我们可以用dayCount上的标准差来衡量这种波动程度。即：

In [9]:
user_id_data[user_id_data['cat_id'] == 737]['dayCount'].std()

4.605203007125973

在这里例子上，平均4.6天会关注一次这个品类的商品。所以我们把该用户涉及的各个品类上的这一指标都计算出来，即：

In [10]:
user_id_data.groupby('cat_id')['dayCount'].std()

cat_id
177     0.000000
267     0.000000
276          NaN
389          NaN
662     3.130495
737     4.605203
821     0.000000
946     2.828427
1034         NaN
1095         NaN
1188    2.121320
1213         NaN
1280    0.000000
1397    0.000000
1429         NaN
1505    0.000000
Name: dayCount, dtype: float64

`NaN`的位置是说明该cat_id下只有一次交互，即一个样本，不能够计算标准差，其实等价于标准差为0。`0`的位置是说明这里有多次交互，但是在同一天，所以标准差为0。我们将不同cat_id的交互间隔标准差求一个平均值，作为对该用户“留恋行为”的刻画。

In [11]:
user_id_data.groupby('cat_id')['dayCount'].std().fillna(0).mean()

0.79284035274572

从结果来看，该用户对同一品类的平均留恋时间为1.27天左右，即该用户如果在1.27天内重新与该品类进行了交互，那么交互行为就可以继续持续，否则就不会再出现。

### 附录-2：用户是否参与过或购买过热门商品，热门品类，热门店铺或者热门品牌
这一指标可以反映出用户是否具有“跟风特征”。这里的核心难题在于如何判断一个商品或者一个店铺属于热门，有两种最直观的评价标准，第一种是销售量最大，第二种是涉及的用户数量最大。所以我们从这两个角度出发，筛选出销售量最大的一部分商品和受众最多的一部分商品。

In [24]:
# 以item_id为例子
target_column = 'item_id'
# 以用户的购买行为为评价基础
target_data = user_log_data[user_log_data['action_type'] == 2]
# 交互行为最多的100中商品
most_100_labels_1 = target_data.groupby(target_column)['user_id'].count().sort_values().tail(100).index
# 涉及人群最多的100中商品
most_100_labels_2 = target_data.groupby(target_column)['user_id'].unique().apply(lambda x: len(x)).sort_values().tail(100).index
most_100_labels = set(list(most_100_labels_1) + list(most_100_labels_2))

我们从这四个方面都筛选出最热门的，然后判断该用户是否购买过或者关注过这些热门商品或店铺。

### 附录-3：筛选正向品类和负向品类，正向品牌和负向品牌
一般来说，某一些品类或者品牌通常会对复购有明显的正面影响或者负面影响，如果商家在售卖这种品类或者品牌，那么就相应的影响。所以要利用这一点，就需要将正向品类与负向品类识别出来。第一步，将不同品类或品牌的竞争强度计算出来，即出售该品类或者品牌的商家数量就是竞争强度。

In [87]:
cat_compete_score_map = user_log_data.groupby('cat_id')['merchant_id'].unique().apply(lambda x: len(x))
brand_compete_score_map = user_log_data.groupby('brand_id')['merchant_id'].unique().apply(lambda x: len(x))

第二步，根据实际的复购情况，估算出每一个品类或者品牌的复购概率。具体来说，有两种复购概率，第一种是基于用户在双十一之前的历史数据去估计每一个品类的复购概率。第二种是基于用户在双十一之后的复购记录去评估该品类的复购概率。

先来实现第一种复购概率的估计。

In [108]:
# 抽取全部的用户购买记录
user_log_buy_data = user_log_data[user_log_data['action_type'] == 2]
# 按照cat_id和user_id进行分组并计数
a = user_log_buy_data.groupby(['cat_id', 'user_id'])['item_id'].count()
# 重新整理统计数据的格式
index_user_id = [i[1] for i in a.index.values]
index_cat_id = [i[0] for i in a.index.values]
b = pd.DataFrame({'user_id': index_user_id, 'number': a.values, 'cat_id': index_cat_id})
# 统计每一个cat_id下购买的人数
c = b.groupby('cat_id')['user_id'].count()
# 统计每一个cat_id下购买二次及以上的人数
d = b[b['number'] > 1].groupby('cat_id')['user_id'].count()
# 计算出每一个cat_id的复购率
cat_id_rebuy_record_map_1 = d/c

# 同样的计算逻辑，可以得到品牌的复购概率
# 按照brand_id和user_id进行分组并计数
a = user_log_buy_data.groupby(['brand_id', 'user_id'])['item_id'].count()
# 重新整理统计数据的格式
index_user_id = [i[1] for i in a.index.values]
index_brand_id = [i[0] for i in a.index.values]
b = pd.DataFrame({'user_id': index_user_id, 'number': a.values, 'brand_id': index_brand_id})
# 统计每一个cat_id下购买的人数
c = b.groupby('brand_id')['user_id'].count()
# 统计每一个cat_id下购买二次及以上的人数
d = b[b['number'] > 1].groupby('brand_id')['user_id'].count()
# 计算出每一个cat_id的复购率
brand_id_rebuy_record_map_1 = d/c

再来实现第二种复购概率的估计。结果数据中的所谓复购，即label=1指的是双11当天购买过之后，在随后6个月有再次购买的行为。从这一信息中首先可以推算出店铺本身的在未来6个月的复购概率，之前计算的是双11前6个月的复购概率，即：

In [129]:
after_11_merchant_rebuy_ratio = train_group_data.groupby('merchant_id')['label'].mean()
after_11_merchant_rebuy_ratio

merchant_id
2       0.033898
8       0.045455
9       0.032258
10      0.035088
13      0.082873
          ...   
4987    0.000000
4988    0.083333
4991    0.000000
4992    0.150206
4993    0.030534
Name: label, Length: 1987, dtype: float64

如何估计品牌或者品类在未来6个月的复购概率呢？一种符合直接的方法是通过猜测出结果数据集中消费者和商家的那次交互对于的品类和品牌，为了实现这一目标，（1）先推断出双11当天该消费者在该店铺购买的品牌或者品类；（2）根据品类或者品牌将结果数据进行分组，并统计出不同品类的平均复购率。

代码如下：