In [2]:
import pandas as pd
import numpy as np

test_group_data = pd.read_csv('./data/test_format1.csv')
train_group_data = pd.read_csv('./data/train_format1.csv')
user_info_data = pd.read_csv('./data/user_info_format1.csv')
user_log_data = pd.read_csv('./data/user_log_format1.csv').rename(columns={'seller_id': 'merchant_id'})
#user_log_data['brand_id'] = user_log_data['brand_id'].astype(int)

In [3]:
user_info_data.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [4]:
user_log_data.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


In [5]:
train_group_data.head()

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


## 特征工程
第一，将time_stamp字段补全为完整的时间格式，并基于这个时间戳转换成季度，月，周，日的信息。

In [6]:
import datetime
user_log_data['time_stamp'] = [datetime.date(2015, int(str(i)[:-2]), int(str(i)[-2:]))
                                    for i in user_log_data['time_stamp']]
user_log_data['weekday'] = [i.weekday() for i in user_log_data['time_stamp']]
user_log_data['month'] = [i.month for i in user_log_data['time_stamp']]
user_log_data['season'] = ['summer' if i in [5, 6, 7] else 'autumn' for i in user_log_data['month']]
user_log_data['day'] = [i.day for i in user_log_data['time_stamp']]
# 以2015年一月一号为起点，计算出当前时间戳距离这个起点的日数
user_log_data['dayCount'] = [(i - datetime.date(2015, 1, 1)).days for i in user_log_data['time_stamp']]

## 请务必将附录部分的代码全部运行之后再进行后续特征工程处理
第二，构造出用户的行为特征。

In [22]:
# 生成最热门的商品品类列表，详见附录-2
def gene_most_buy_labels():
    most_100_labels_list = []
    for target_column in ['item_id', 'cat_id', 'merchant_id', 'brand_id']:
        target_data = user_log_data[user_log_data['action_type'] == 2]
        most_100_labels_1 = target_data.groupby(target_column)['user_id'].count().sort_values().tail(100).index
        most_100_labels_2 = target_data.groupby(target_column)['user_id'].unique().apply(lambda x: len(x)).sort_values().tail(100).index
        most_100_labels = set(list(most_100_labels_1) + list(most_100_labels_2))
        most_100_labels_list.append(most_100_labels)
    return most_100_labels_list
most_100_labels_list = gene_most_buy_labels()

In [23]:
from tqdm import tqdm
user_feature_record_map = {}
user_concern_cat_list = {}
user_concern_brand_list = {}
total_user_id_set = set(train_group_data['user_id']) | set(test_group_data['user_id'])

user_feature_name = [('用户年龄', 'i'), ('用户性别', 'i'), ('用户点击数量', 'f'), ('用户加购数量', 'f'), ('用户购买数量', 'f'), ('用户收藏数量', 'f'),
                        ('用户点击转化率', 'f'), ('用户加购转化率', 'f'), ('用户收藏转化率', 'f')] + \
                        [('用户周计数活跃数量-%d' % i, 'f') for i in range(7)] + [('用户周计数活跃比率-%d' % i, 'f') for i in range(7)] +\
                        [('用户月计数活跃数量-%d' % i, 'f') for i in range(5,12)] + [('用户月计数活跃比率-%d' % i, 'f') for i in range(5,12)] +\
                        [('用户双11期间活跃数量-%d' % i, 'f') for i in range(5,12)] + [('用户双11期间活跃比率-%d' % i, 'f') for i in range(5,12)] +\
                        [('用户涉及的商品数量','f'), ('用户涉及的品类数量', 'f'), ('用户涉及的商家数量', 'f'), ('用户涉及的品牌数量', 'f')] + \
                        [('用户复购-%s-%s' % (i,j), 'f') for i in ['商品', '品类', '店铺', '品牌'] for j in ['类数量', '总数量', '比例']] +\
                        [('用户双11期间购买量占比', 'f'), ('用户-商品交互平均间隔', 'f'), ('用户-品类交互平均间隔', 'f'), ('用户-商家交互平均间隔', 'f'),
                        ('用户-品牌交互平均间隔', 'f')] + [('用户-热门商品交互数量', 'f'), ('用户-热门品类交互数量', 'f'), ('用户-热门店铺交互数量', 'f'),
                        ('用户-热门品牌交互数量', 'f')] +\
                        [('用户-双11点击占比', 'f'), ('用户-双11收藏占比', 'f')]

# 暂时先处理200个用户作为案例，全部用户的处理需要大约7个小时
for user_id in tqdm(list(total_user_id_set)):
    #print('正在处理用户：', user_id)
    user_id_data = user_log_data[user_log_data['user_id'] == user_id]
    # 该用户的年龄和性别
    _, user_age, user_gender = user_info_data[user_info_data['user_id']==user_id].values[0]
    # 该用户的总点击行为，加购行为，购买行为和收藏行为数量
    action_type_count = user_id_data['action_type'].value_counts()
    user_action_click = action_type_count.get(0, 0)
    user_action_toCart = action_type_count.get(1, 0)
    user_action_purchase = action_type_count.get(2, 0)
    user_action_collect = action_type_count.get(3, 0)
    # 该用户的点击转化率，加购转化率和收藏转化率
    user_action_click_to_purchase_rate = user_action_click / user_action_purchase
    user_action_toCart_to_purchase_rate = user_action_toCart / user_action_purchase
    user_action_collect_to_purchase_rate = user_action_collect / user_action_purchase
    # 计算出该用户在一周7天的活跃次数以及比例情况，这其中可能反应用户的作息规律和职业特性
    weekday_active_count = user_id_data.groupby('weekday')['user_id'].count()
    user_active_count_in_week = [weekday_active_count.get(i, 0) for i in range(7)]
    user_active_ratio_in_week = [i / max(1, sum(user_active_count_in_week)) for i in user_active_count_in_week]
    # 计算出该用户在5,6,7,8,9,10,11月分别的活跃次数和活跃比率
    month_active_count = user_id_data.groupby('month')['user_id'].count()
    user_active_count_in_mouth = [month_active_count.get(i, 0) for i in [5, 6, 7, 8, 9, 10, 11]]
    user_active_ratio_in_mouth = [i / max(1, sum(user_active_count_in_mouth)) for i in user_active_count_in_mouth]
    # 计算出消费者在双十一活动前n天的活跃次数和活跃比率（这里的n需要根据业务经验来确定）
    # 你来完成这个指标的计算
    user_active_in_11_span = []
    for date in range(5, 12):
        date_active_num = user_id_data[user_id_data['time_stamp'] == datetime.date(2015, 11, date)]['user_id'].count()
        user_active_in_11_span.append(date_active_num)
    user_active_in_11_span_ratio = [i/max(1, sum(user_active_in_11_span)) for i in user_active_in_11_span]
    # 消费者行为涉及的具体商品个数（这里的消费者行为包括全部的action_type）
    user_touch_item_num = len(set(user_id_data['item_id'].values))
    # 消费者行为涉及的类别个数
    user_touch_cat_num = len(set(user_id_data['cat_id'].values))
    # 消费者行为涉及的商家个数
    user_touch_merchant_num = len(set(user_id_data['merchant_id'].values))
    # 消费者行为涉及的品牌个数
    user_touch_brand_num = len(set(user_id_data['brand_id'].values))
    
    
    # 消费者复购的商品个数，类别个数，商家个数，品牌个数
    # 首先把该用户的全部购买样本筛选出来
    user_id_purchase_data = user_id_data[user_id_data['action_type']==2]
    # 双11期间购买次数占总购买次数的比例
    user_buy_in_11_ratio = len(user_id_purchase_data[user_id_purchase_data['time_stamp'] > datetime.date(2015, 11, 10)])/len(user_id_purchase_data)

    def temp_assist_func(column_name):
        # 将该用户的全部购买样本按照某一个指标进行分组计数
        c_count_map = user_id_purchase_data.groupby(column_name)['user_id'].count()
        c_buy_2up_map = c_count_map[c_count_map>1]
        # 购买次数大于1次的品类数量
        c_num = len(c_buy_2up_map)
        # 各品类的总复购次数
        c_rebuy_total_num = sum(c_buy_2up_map) - c_num
        # 复购率
        c_rebuy_ratio = c_rebuy_total_num / max(1, sum(c_count_map))
        return c_num, c_rebuy_total_num, c_rebuy_ratio
    
    # 消费者重复购买商品的统计情况
    user_rebuy_item_cate_num, user_rebuy_item_total_num, user_rebuy_item_ratio = temp_assist_func('item_id')
    # 消费者重复购买品类的统计情况
    user_rebuy_cat_cate_num, user_rebuy_cat_total_num, user_rebuy_cat_ratio = temp_assist_func('cat_id')
    # 消费者重复购买店铺的统计情况
    user_rebuy_mer_cate_num, user_rebuy_mer_total_num, user_rebuy_mer_ratio = temp_assist_func('merchant_id')
    # 消费者重复购买品牌的统计情况
    user_rebuy_brand_cate_num, user_rebuy_brand_total_num, user_rebuy_brand_ratio = temp_assist_func('brand_id')
    # 消费者交互行为的间隔天数波动水平
    def temp_assist_func_2(column_name):
        # 该操作的说明见附录-1
        c_std_mean = user_id_purchase_data.groupby(column_name)['dayCount'].std().fillna(0).mean()
        return c_std_mean
    user_react_item_days_std = temp_assist_func_2('item_id')
    user_react_cat_days_std = temp_assist_func_2('cat_id')
    user_react_mer_days_std = temp_assist_func_2('merchant_id')
    user_react_brand_days_std = temp_assist_func_2('brand_id')
    
    # 消费者关注过热门商品
    user_concern_hot_item = len(set(user_id_data['item_id'].values) & most_100_labels_list[0])
    # 消费者关注过热门品类
    user_concern_hot_cat = len(set(user_id_data['cat_id'].values) & most_100_labels_list[1])
    # 消费者关注过热门店铺
    user_concern_hot_mer = len(set(user_id_data['merchant_id'].values) & most_100_labels_list[2])
    # 消费者关注过热门品牌
    user_concern_hot_brand = len(set(user_id_data['brand_id'].values) & most_100_labels_list[3])
    
    # 消费者在双11当天的点击行为占比
    user_in_11_click_ratio = len(user_id_data[user_id_data['time_stamp'] == datetime.date(2015,11,11)].query('action_type==0'))\
                                / max(1, user_action_click)
    # 消费者在双11当天的收藏行为占比
    user_in_11_collect_ratio = len(user_id_data[user_id_data['time_stamp'] == datetime.date(2015,11,11)].query('action_type==3'))\
                                / max(1, user_action_collect)
    
    # 消费者关注的品类列表
    user_concern_cat_list[user_id] = list(user_id_data.groupby('cat_id')['user_id'].count().sort_values().index.values[-20:])
    # 消费者关注的品牌列表
    user_concern_brand_list[user_id] = list(user_id_data.groupby('brand_id')['user_id'].count().sort_values().index.values[-20:])
    
    
    user_feature = [user_age, user_gender, user_action_click, user_action_toCart, user_action_purchase, user_action_collect,
                   user_action_click_to_purchase_rate, user_action_toCart_to_purchase_rate, user_action_collect_to_purchase_rate,
                   ] + \
                   user_active_count_in_week + user_active_ratio_in_week + \
                   user_active_count_in_mouth + user_active_ratio_in_mouth + \
                   user_active_in_11_span + user_active_in_11_span_ratio + \
                   [user_touch_item_num, user_touch_cat_num, user_touch_merchant_num, user_touch_brand_num] + \
                   [user_rebuy_item_cate_num, user_rebuy_item_total_num, user_rebuy_item_ratio, 
                   user_rebuy_cat_cate_num, user_rebuy_cat_total_num, user_rebuy_cat_ratio,
                   user_rebuy_mer_cate_num, user_rebuy_mer_total_num, user_rebuy_mer_ratio,
                   user_rebuy_brand_cate_num, user_rebuy_brand_total_num, user_rebuy_brand_ratio] + \
                   [user_buy_in_11_ratio, user_react_item_days_std, user_react_cat_days_std, user_react_mer_days_std, user_react_brand_days_std] +\
                   [user_concern_hot_item, user_concern_hot_cat, user_concern_hot_mer, user_concern_hot_brand] + \
                   [user_in_11_click_ratio, user_in_11_collect_ratio]
    
    user_feature_record_map[user_id] = user_feature

user_feature_dataframe = pd.DataFrame(user_feature_record_map.values(), columns=[i[0] for i in user_feature_name], 
                                      index=user_feature_record_map.keys())
import pickle

with open('data/user_feature_dataframe.pk', 'wb') as f:
    pickle.dump(user_feature_dataframe, f)

100%|████████████████████████████████████████████████████████████████████████| 424170/424170 [7:59:49<00:00, 14.73it/s]


In [27]:
user_feature_dataframe.head(20)

Unnamed: 0,用户年龄,用户性别,用户点击数量,用户加购数量,用户购买数量,用户收藏数量,用户点击转化率,用户加购转化率,用户收藏转化率,用户周计数活跃数量-0,...,用户-商品交互平均间隔,用户-品类交互平均间隔,用户-商家交互平均间隔,用户-品牌交互平均间隔,用户-热门商品交互数量,用户-热门品类交互数量,用户-热门店铺交互数量,用户-热门品牌交互数量,用户-双11点击占比,用户-双11收藏占比
1,3.0,1.0,27,0,6,0,4.5,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0,5,0,0,0.481481,0.0
2,3.0,0.0,47,0,14,2,3.357143,0.0,0.142857,20,...,0.0,0.0,0.0,0.0,1,12,2,3,0.0,0.5
3,3.0,0.0,63,0,4,1,15.75,0.0,0.25,27,...,0.0,0.0,0.0,0.0,0,12,1,2,0.111111,0.0
4,0.0,0.0,49,0,1,0,49.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0,13,1,2,0.142857,0.0
5,5.0,0.0,150,0,13,10,11.538462,0.0,0.769231,11,...,0.0,0.0,0.0,0.0,2,35,10,14,0.186667,0.1
6,4.0,0.0,217,0,17,15,12.764706,0.0,0.882353,65,...,1.414214,1.414214,1.532065,1.414214,3,29,8,11,0.02765,0.0
7,4.0,0.0,6,0,8,0,0.75,0.0,0.0,0,...,0.0,0.0,0.235702,0.235702,0,3,1,2,0.0,0.0
8,4.0,0.0,61,0,23,7,2.652174,0.0,0.304348,16,...,2.532358,4.693725,4.933384,1.781742,2,8,3,7,0.065574,0.142857
9,5.0,0.0,79,0,4,4,19.75,0.0,1.0,18,...,0.0,0.0,0.0,0.0,1,18,0,1,0.063291,0.0
10,4.0,0.0,56,0,7,1,8.0,0.0,0.142857,1,...,0.0,0.0,0.0,0.0,1,12,1,1,0.160714,0.0


第三，构建店铺的特征。

In [32]:
import math
total_merchant_id_set = set(train_group_data['merchant_id']) | set(test_group_data['merchant_id'])
merchant_feature_record_map = {}
merchant_sell_cat_list = {}
merchant_sell_brand_list = {}
top_cat_or_brand_n = 5
merchant_feature_name = [('店铺点击总数', 'f'), ('店铺加购总数', 'f'), ('店铺购买总数', 'f'), ('店铺收藏总数', 'f'), ] + \
        [('店铺购买比加购系数', 'f'), ('店铺收藏比购买系数', 'f'), ('店铺加购比点击系数', 'f'), ] + \
        [('店铺商品总数量', 'f'), ('店铺品类总数量', 'f'), ('店铺品牌总数量', 'f'), ('店铺潜在用户总数量', 'f'), ('店铺实际用户总数量', 'f'), ]+\
        [('店铺-上架混乱系数', 'f'), ('店铺-商品关注混乱系数', 'f'), ('店铺-品类关注混乱系数', 'f'), ('店铺-品牌关注混乱系数', 'f'), ] + \
        [('店铺-复购总人数', 'f'), ('店铺-复购人群占比', 'f'), ('店铺-重复查看商品比例', 'f'), ('店铺-重复查看品类比例', 'f'), 
             ('店铺-重复查看品牌比例', 'f'), ]+ \
        [('店铺-主要销售品类-%d' % (i+1), 'i') for i in range(top_cat_or_brand_n)] + [('店铺-主要销售品牌-%d' % (i+1), 'i') for i in range(top_cat_or_brand_n)]+\
        [('店铺-品类竞争强度', 'f'), ('店铺-品牌竞争强度', 'f'), ('店铺-热门品类数量', 'f'), ('店铺-热门品牌数量', 'f'), ] + \
        [('店铺-正向品类数量', 'f'), ('店铺-正向品牌数量', 'f'), ('店铺-负向品类数量', 'f'), ('店铺-负向品牌数量', 'f'), ] + \
        [('店铺-品类市场占有率', 'f'), ('店铺-品牌市场占有率', 'f'), ('店铺-双11销售占比', 'f'), ('店铺-双11交互占比', 'f'), ] + \
        [('店铺-双11转化率', 'f'), ('店铺-双11收藏率', 'f'), ('店铺-节后复购数量', 'f'), ('店铺-节后复购比例', 'f'), ] + \
        [('店铺-是否为高复购店铺', 'f'), ] + [('店铺-月度销售量-%d' % i, 'f') for i in range(5, 11)]
for merchant_id in tqdm(list(total_merchant_id_set)):
    merchant_id_data = user_log_data[user_log_data['merchant_id'] == merchant_id]
    merchant_id_sell_data = merchant_id_data.query('action_type==2')
    
    merchant_action_count_map = merchant_id_data.groupby('action_type')['user_id'].count()
    # 店铺总体的点击数量
    merchant_action_click_num = merchant_action_count_map.get(0, 0)
    # 店铺总体的加购数量
    merchant_action_toCart_num = merchant_action_count_map.get(1, 0)
    # 店铺总体的购买数量
    merchant_action_purchase_num = merchant_action_count_map.get(2, 0)
    # 店铺总体的收藏数量
    merchant_action_collect_num = merchant_action_count_map.get(3, 0)
    
    # 店铺的购买比加购系数
    merchant_action_purchase_div_toCart = merchant_action_purchase_num / max(1, merchant_action_toCart_num)
    # 店铺的收藏比购买系数
    merchant_action_collect_div_purchase = merchant_action_collect_num / max(1, merchant_action_purchase_num)
    # 店铺的加购比点击系数
    merchant_action_toCart_div_click = merchant_action_toCart_num / max(1, merchant_action_click_num)
    
    # 店铺中的商品总数
    merchant_store_item_num = len(merchant_id_data['item_id'].unique())
    # 店铺中的品类总数
    merchant_store_cat_num = len(merchant_id_data['cat_id'].unique())
    # 店铺中的品牌总数
    merchant_store_brand_num = len(merchant_id_data['brand_id'].unique())
    # 店铺中的潜在消费者总数
    merchant_store_p_user_num = len(merchant_id_data['user_id'].unique())
    # 店铺中的实际消费者总数
    merchant_store_a_user_num = len(merchant_id_data[merchant_id_data['action_type'] == 2]['user_id'].unique())
    
    # 店铺中的商品上架混乱程度
    merchant_store_entropy_coef = merchant_store_item_num / np.sqrt(merchant_store_cat_num * merchant_store_brand_num)
    # 店铺中的商品关注混乱程度
    merchant_concern_entropy_coef_item = merchant_id_data.groupby('item_id')['user_id'].count().std()
    # 店铺中的品类关注混乱程度
    merchant_concern_entropy_coef_cat = merchant_id_data.groupby('cat_id')['user_id'].count().std()
    # 店铺中的品牌关注混乱程度
    merchant_concern_entropy_coef_brand = merchant_id_data.groupby('brand_id')['user_id'].count().std()
    
    # 店铺总的重复购买消费者人数
    merchant_id_purchase_data = merchant_id_data[merchant_id_data['action_type'] == 2]
    temp_a_data = merchant_id_purchase_data.groupby('user_id')['user_id'].count()
    merchant_rebuy_user_num = temp_a_data[temp_a_data > 1].count()
    merchant_rebuy_user_ratio = merchant_rebuy_user_num / temp_a_data.count()
    # 店铺中商品被重复查看的比例
    a = merchant_id_data[merchant_id_data['action_type'] == 0].groupby(['user_id', 'item_id'])['user_id'].count()
    b = a[a>1]
    merchant_reClick_item_ratio = b.count() / max(1, a.count())
    # 店铺中品类被重复查看的比例
    a = merchant_id_data[merchant_id_data['action_type'] == 0].groupby(['user_id', 'cat_id'])['user_id'].count()
    b = a[a>1]
    merchant_reClick_cat_ratio = b.count() / max(1, a.count())
    # 店铺中品牌被重复查看的比例
    a = merchant_id_data[merchant_id_data['action_type'] == 0].groupby(['user_id', 'brand_id'])['user_id'].count()
    b = a[a>1]
    merchant_reClick_brand_ratio = b.count() / max(1, a.count())
    
    # 店铺主要销售的品类(数量最大的三个,如果不够三个的话，就将最大的复制，补全3个)
    merchant_main_cat = merchant_id_sell_data.groupby('cat_id')['user_id'].count().sort_values().index.values[-top_cat_or_brand_n:]
    merchant_main_cat = list(merchant_main_cat[::-1]) + [merchant_main_cat[0] for i in range(top_cat_or_brand_n - len(merchant_main_cat))]
    # 店铺主要销售的品牌(数量最大的三个,如果不够三个的话，就将最大的复制，补全3个)
    merchant_main_brand = merchant_id_sell_data.groupby('brand_id')['user_id'].count().sort_values().index.values[-top_cat_or_brand_n:]
    merchant_main_brand = list(merchant_main_brand[::-1]) + [merchant_main_brand[0] for i in range(top_cat_or_brand_n - len(merchant_main_brand))]
    # 根据商家出售的主要品类以及该品类在市场上竞争者的多少，计算出一个平均竞争强度
    merchant_cat_compete_coef = np.mean([cat_compete_score_map[item_cat_id] for item_cat_id in merchant_main_cat])
    merchant_brand_compete_coef = np.mean([brand_compete_score_map[item_brand_id] for item_brand_id in merchant_main_brand])
    # 店铺销售的热门品类数量
    merchant_hot_cat_num = len(set(merchant_id_data['cat_id'].values) & most_100_labels_list[1])
    # 店铺销售的热门品牌数量
    merchant_hot_brand_num = len(set(merchant_id_data['brand_id'].values) & most_100_labels_list[3])
    # 店铺销售的正向品类数量为
    merchant_positive_cat_num = len(set(merchant_id_data['cat_id'].values) & best_200_cat_list)
    # 店铺销售的负向品类数量为
    merchant_negtive_cat_num =  len(set(merchant_id_data['cat_id'].values) & worst_200_cat_list)
    # 店铺销售的正向品牌数量为
    merchant_positive_brand_num = len(set(merchant_id_data['brand_id'].values) & best_200_brand_list)
    # 店铺销售的负向品牌数量为
    merchant_negtive_brand_num =  len(set(merchant_id_data['brand_id'].values) & worst_200_brand_list)
    # 店铺销售主要品类的平均市场占有率
    merchant_market_cat_ratio = [len(merchant_id_sell_data.query('cat_id==%s' % item_cat_id)) / cat_total_sell_amount_map[item_cat_id] 
                                 for item_cat_id in merchant_main_cat]
    merchant_market_cat_ratio = np.mean(merchant_market_cat_ratio)
    # 店铺销售主要品牌的平均市场占有率
    merchant_market_brand_ratio = [len(merchant_id_sell_data.query('brand_id==%s' % item_brand_id)) / brand_total_sell_amount_map[item_brand_id] 
                                 for item_brand_id in merchant_main_brand]
    merchant_market_brand_ratio = np.mean(merchant_market_brand_ratio)
    
    # 店铺双11当天的销售量占比
    merchant_11_sell_ratio = len(merchant_id_sell_data[merchant_id_sell_data['time_stamp'] == datetime.date(2015, 11, 11)])\
                                    / len(merchant_id_sell_data)
    # 店铺双11当天的互动量占比
    merchant_11_active_ratio = len(merchant_id_data[merchant_id_data['time_stamp'] == datetime.date(2015, 11, 11)])\
                                    / len(merchant_id_data)
    # 店铺双11当天的转化率（销售量/点击量）
    merchant_11_trans_ratio = len(merchant_id_sell_data[merchant_id_sell_data['time_stamp'] == datetime.date(2015, 11, 11)])\
                            / len(merchant_id_data[merchant_id_data['time_stamp'] == datetime.date(2015, 11, 11)].query('action_type==0'))
    # 店铺双11当天的收藏率（收藏量/购买量）
    merchant_11_collect_ratio = len(merchant_id_data[merchant_id_data['time_stamp'] == datetime.date(2015, 11, 11)].query('action_type==3'))\
                            / len(merchant_id_data[merchant_id_data['time_stamp'] == datetime.date(2015, 11, 11)].query('action_type==2'))
    
    # 店铺双11节后的复购比例
    merchant_after_11_rebuy_ratio = after_11_merchant_rebuy_ratio.get(merchant_id, 0.01)
    # 店铺双11节后的复购数量
    merchant_after_11_appance_count = after_11_merchant_rebuy_count.get(merchant_id, 0)
    merchant_after_11_rebuy_count = int(merchant_after_11_appance_count * merchant_after_11_rebuy_ratio)
    # 再次基于二项试验来检验这个店铺是否为明显偏离平均复购水平的店铺
    def bonulli_func(n, m, p=0.0612):
            return math.factorial(n) / (math.factorial(m) * math.factorial(n-m)) * p**m * (1-p)**(n-m)
    
    if merchant_after_11_appance_count < 1000:
        p = bonulli_func(merchant_after_11_appance_count, merchant_after_11_rebuy_count)
    else:
        p = 0.00001
    
    if p < 0.01:
        if merchant_after_11_rebuy_ratio<0.0612:
            #print('发现一个复购比例显著低于平均水平的店铺(%.4f)' % merchant_after_11_rebuy_ratio)
            merchant_is_positive = -1
        else:
            #print('发现一个复购比例显著高于平均水平的店铺(%.4f)' % merchant_after_11_rebuy_ratio)
            merchant_is_positive = 1
    else:
        merchant_is_positive = 0
    # 店铺在5,6,7,8,9,10月分别的销售量
    merchant_month_sell_amount = [merchant_id_sell_data.groupby('month')['user_id'].count().get(month, 0) for month in range(5, 11)]
    
    merchant_feature = [merchant_action_click_num, merchant_action_toCart_num, merchant_action_purchase_num, merchant_action_collect_num] + \
            [merchant_action_purchase_div_toCart, merchant_action_collect_div_purchase, merchant_action_toCart_div_click] +\
            [merchant_store_item_num, merchant_store_cat_num, merchant_store_brand_num, merchant_store_p_user_num, merchant_store_a_user_num] + \
            [merchant_store_entropy_coef, merchant_concern_entropy_coef_item, merchant_concern_entropy_coef_cat, merchant_concern_entropy_coef_brand] + \
            [merchant_rebuy_user_num, merchant_rebuy_user_ratio, merchant_reClick_item_ratio, merchant_reClick_cat_ratio, merchant_reClick_brand_ratio] + \
            merchant_main_cat + merchant_main_brand + \
            [merchant_cat_compete_coef, merchant_brand_compete_coef, merchant_hot_cat_num, merchant_hot_brand_num] +\
            [merchant_positive_cat_num, merchant_positive_brand_num, merchant_negtive_cat_num, merchant_negtive_brand_num] + \
            [merchant_market_cat_ratio, merchant_market_brand_ratio, merchant_11_sell_ratio, merchant_11_active_ratio] + \
            [merchant_11_trans_ratio, merchant_11_collect_ratio, merchant_after_11_rebuy_count, merchant_after_11_rebuy_ratio] + \
            [merchant_is_positive] + merchant_month_sell_amount
                            
    merchant_feature_record_map[merchant_id] = merchant_feature
    
    # 商家出售的主要品类列表
    merchant_sell_cat_list[merchant_id] = list(merchant_id_data.groupby('cat_id')['user_id'].count().sort_values().index.values[-30:])
    # 商家出售的主要品牌列表
    merchant_sell_brand_list[merchant_id] = list(merchant_id_data.groupby('brand_id')['user_id'].count().sort_values().index.values[-30:])
    
merchant_feature_dataframe = pd.DataFrame(merchant_feature_record_map.values(), columns=[i[0] for i in merchant_feature_name],
                                         index=merchant_feature_record_map.keys())

with open('data/merchant_feature_dataframe.pk', 'wb') as f:
    pickle.dump(merchant_feature_dataframe, f)

100%|██████████████████████████████████████████████████████████████████████████████| 1994/1994 [03:59<00:00,  8.32it/s]


In [35]:
merchant_feature_dataframe.head(20)

Unnamed: 0,店铺点击总数,店铺加购总数,店铺购买总数,店铺收藏总数,店铺购买比加购系数,店铺收藏比购买系数,店铺加购比点击系数,店铺商品总数量,店铺品类总数量,店铺品牌总数量,...,店铺-双11收藏率,店铺-节后复购数量,店铺-节后复购比例,店铺-是否为高复购店铺,店铺-月度销售量-5,店铺-月度销售量-6,店铺-月度销售量-7,店铺-月度销售量-8,店铺-月度销售量-9,店铺-月度销售量-10
2,2030,8,189,144,23.625,0.761905,0.003941,154,10,2,...,0.04,2,0.033898,0,9,6,4,5,22,13
8,3540,3,395,306,131.666667,0.774684,0.000847,96,13,2,...,0.035714,1,0.045455,0,25,48,60,51,49,49
9,2096,7,94,65,13.428571,0.691489,0.00334,271,17,22,...,0.116667,1,0.032258,0,6,0,4,2,5,7
10,19125,64,1133,866,17.703125,0.764342,0.003346,463,13,3,...,0.176166,16,0.035088,-1,23,24,1,14,29,45
13,6589,15,937,301,62.466667,0.321238,0.002277,45,10,2,...,0.061753,15,0.082873,0,15,78,17,6,27,192
14,5709,3,379,305,126.333333,0.804749,0.000525,224,13,2,...,0.091603,4,0.042553,0,6,13,13,20,25,29
15,1616,4,178,52,44.5,0.292135,0.002475,55,3,3,...,0.029586,3,0.078947,0,0,0,0,0,0,8
20,7509,15,480,342,32.0,0.7125,0.001998,556,25,2,...,0.03966,6,0.065421,0,22,38,12,14,19,22
22,1876,3,192,105,64.0,0.546875,0.001599,33,5,2,...,0.1,0,0.0,0,32,20,8,3,16,28
23,3789,4,209,289,52.25,1.382775,0.001056,404,4,2,...,0.282353,0,0.0,0,0,0,0,0,8,84


第四，构建消费者和店铺之间的交互特征。

In [37]:
# 后面需要对用户进行逐一处理，如果每一个用户都进行一次全局检索，效率会很低，
# 所以需要先进行一次汇总统计
def split_data_of_userid():
    user_id_data_map = {}
    user_log_data_index = list(user_log_data.index.values)
    user_log_data_len = len(user_log_data)
    for k in tqdm(user_log_data_index):
        i = user_log_data.loc[k]
        user_id = i[0]
        if user_id not in user_id_data_map:
            user_id_data_map[user_id] = [k]
        else:
            user_id_data_map[user_id].append(k)
    return user_id_data_map

user_id_data_map = split_data_of_userid()

100%|██████████████████████████████████████████████████████████████████| 54925330/54925330 [2:15:50<00:00, 6739.27it/s]


In [46]:
with open('data/user_id_data_map.pk', 'w') as f:
    for k, v in tqdm(user_id_data_map.items()):
        s = '%d:%s\n' % (k, ','.join([str(i) for i in v]))
        f.write(s)

100%|███████████████████████████████████████████████████████████████████████| 424170/424170 [00:28<00:00, 14885.83it/s]


In [47]:
interact_feature_name = [('交互-点击总数', 'f'), ('交互-加购总数', 'f'), ('交互-购买总数', 'f'), ('交互-收藏总数', 'f'), 
                            ('交互-双11点击总数', 'f'), ('交互-双11加购总数', 'f'), ('交互-双11购买总数', 'f'), ('交互-双11收藏总数', 'f'),
                            ('交互-商品数量', 'f'), ('交互-品类数量', 'f'), ('交互-品牌数量', 'f'), ('交互-品类重叠系数', 'f'), 
                            ('交互-品牌重叠系数', 'f'), ('交互-购买主品类标签', 'f'), ('交互-购买主品牌标签', 'f'), 
                            ('交互-初始交互时间差', 'f')]
def interact_feature_func(item_user_id, item_merchant_id):
    user_id_data = user_log_data.loc[user_id_data_map[item_user_id]]
    interact_data = user_id_data.query('merchant_id==%d' % item_merchant_id)
    # 消费者和商家的总交互数量
    int_action_map = interact_data.groupby('action_type')['user_id'].count()
    #   点击数量
    int_act_click_num = int_action_map.get(0, 0)
    #   加购数量
    int_act_toCart_num = int_action_map.get(1, 0)
    #   购买数量
    int_act_purchase_num = int_action_map.get(2, 0)
    #   收藏数量
    int_act_collect_num = int_action_map.get(3, 0)
    # 消费者和商家在双11当天的交互数量
    int_action_in_11_map = interact_data[interact_data['time_stamp'] == datetime.date(2015, 11, 11)]\
                                    .groupby('action_type')['user_id'].count()
    #   双11当天的点击数量
    int_act_11_click_num = int_action_in_11_map.get(0, 0)
    #   双11当天的加购数量
    int_act_11_toCart_num = int_action_in_11_map.get(1, 0)
    #   双11当天的购买数量
    int_act_11_purchase_num = int_action_in_11_map.get(2, 0)
    #   双11当天的收藏数量
    int_act_11_collect_num = int_action_in_11_map.get(3, 0)
    # 消费者在该店铺关注的商品数量
    int_item_num = len(interact_data['item_id'].unique())
    # 消费者在该店铺关注的品类数量
    int_cat_num = len(interact_data['cat_id'].unique())
    # 消费者在该店铺关注的品牌数量
    int_brand_num = len(interact_data['brand_id'].unique())
    
    # 消费者主要关注的品类与店铺主要销售的品类的重叠程度
    user_main_cat_group = set(user_concern_cat_list[item_user_id])
    merchant_main_cat_group = set(merchant_sell_cat_list[item_merchant_id])
    cat_group_inter = user_main_cat_group & merchant_main_cat_group
    cat_group_union = user_main_cat_group | merchant_main_cat_group
    cat_overlap_coef = len(cat_group_inter) / len(cat_group_union)
    # 消费者在双11购买的品类是否是消费者主要关注的品类
    in_11_buy_cat = interact_data[interact_data['time_stamp'] == datetime.date(2015, 11, 11)].query('action_type==2')['cat_id'].iloc[0]
    if in_11_buy_cat in user_main_cat_group:
        is_main_cat = 1
    else:
        is_main_cat = 0
    # 消费者主要关注的品类与店铺主要销售的品类的重叠程度
    user_main_brand_group = set(user_concern_brand_list[item_user_id])
    merchant_main_brand_group = set(merchant_sell_brand_list[item_merchant_id])
    brand_group_inter = user_main_brand_group & merchant_main_brand_group
    brand_group_union = user_main_brand_group | merchant_main_brand_group
    brand_overlap_coef = len(brand_group_inter) / len(brand_group_union)
    # 消费者在双11购买的品类是否是消费者主要关注的品类
    in_11_buy_brand = interact_data[interact_data['time_stamp'] == datetime.date(2015, 11, 11)].query('action_type==2')['brand_id'].iloc[0]
    if in_11_buy_brand in user_main_brand_group:
        is_main_brand = 1
    else:
        is_main_brand = 0
    # 消费者首次进入到该店铺所经历的时长
    delta_days = (datetime.date(2015, 11, 12) - interact_data['time_stamp'].iloc[0]).days
    
    interact_feature = [int_act_click_num, int_act_toCart_num, int_act_purchase_num, int_act_collect_num,
                       int_act_11_click_num, int_act_11_toCart_num, int_act_11_purchase_num, int_act_11_collect_num,
                       int_item_num, int_cat_num, int_brand_num, cat_overlap_coef, brand_overlap_coef, is_main_cat, is_main_brand,
                       delta_days]
    return interact_feature

In [56]:
def concat_feature_func(data_group):
    features = []
    for item_user_id, item_merchant_id in tqdm(data_group[['user_id', 'merchant_id']].values):
        item_user_feature = list(user_feature_dataframe.loc[item_user_id])
        item_merchant_feature = list(merchant_feature_dataframe.loc[item_merchant_id])
        item_interact_feature = interact_feature_func(item_user_id, item_merchant_id)
        features.append(item_user_feature+item_merchant_feature+item_interact_feature)
    return np.array(features)

concat_feature_names = user_feature_name + merchant_feature_name + interact_feature_name
concat_feature_names = [i[0] for i in concat_feature_names]
train_group_x = pd.DataFrame(concat_feature_func(train_group_data), columns=concat_feature_names)
train_group_y = train_group_data['label']
test_group_x = pd.DataFrame(concat_feature_func(test_group_data), columns=concat_feature_names)

100%|██████████████████████████████████████████████████████████████████████████| 260864/260864 [45:26<00:00, 95.68it/s]
100%|██████████████████████████████████████████████████████████████████████████| 261477/261477 [46:07<00:00, 94.50it/s]


In [59]:
concat_feature_names = user_feature_name + merchant_feature_name + interact_feature_name
with open('data/final_data.pk', 'wb') as f:
    pickle.dump([train_group_x, train_group_y, test_group_x, concat_feature_names], f)

### 附录-1: 某一汇总指标下重复交互行为的间隔天数波动情况计算
这里的汇总指标主要是指item_id,cat_id,merchant_id,brand_id等，指消费者重复关注于某一商品，某一类别，某一商家或某一品牌的行为通常会有多长时间的间隔，这反映了消费者是不是有“留恋”习惯。当然，这里没有只使用购买行为，而包含了全部行为在内，是因为购买行为数量较少，样本不具有代表性。以cat_id为例，首先，就该用户的涉及的cat_id对样本进行分组，比如cat_id=737这一品类下共有如下样本：

In [7]:
user_id_data = user_log_data[user_log_data['user_id'] == 502]
user_id_data[user_id_data['cat_id'] == 737]

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type,weekday,month,season,day,dayCount
21730474,502,306823,737,3904,3876.0,2015-05-21,0,3,5,summer,21,140
21730482,502,895126,737,1844,8455.0,2015-05-14,2,3,5,summer,14,133
21730496,502,134678,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730497,502,468097,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730499,502,481791,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730500,502,275938,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730501,502,481483,737,2124,7666.0,2015-06-01,2,0,6,summer,1,151
21730502,502,1041982,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730505,502,363585,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151
21730506,502,77939,737,2124,7666.0,2015-06-01,0,0,6,summer,1,151


从该表格可以看出，该用户对该品类的关注在时间上还是很集中的。所以我们可以用dayCount上的标准差来衡量这种波动程度。即：

In [8]:
user_id_data[user_id_data['cat_id'] == 737]['dayCount'].std()

4.605203007125973

在这里例子上，平均4.6天会关注一次这个品类的商品。所以我们把该用户涉及的各个品类上的这一指标都计算出来，即：

In [9]:
user_id_data.groupby('cat_id')['dayCount'].std()

cat_id
177     0.000000
267     0.000000
276          NaN
389          NaN
662     3.130495
737     4.605203
821     0.000000
946     2.828427
1034         NaN
1095         NaN
1188    2.121320
1213         NaN
1280    0.000000
1397    0.000000
1429         NaN
1505    0.000000
Name: dayCount, dtype: float64

`NaN`的位置是说明该cat_id下只有一次交互，即一个样本，不能够计算标准差，其实等价于标准差为0。`0`的位置是说明这里有多次交互，但是在同一天，所以标准差为0。我们将不同cat_id的交互间隔标准差求一个平均值，作为对该用户“留恋行为”的刻画。

In [10]:
user_id_data.groupby('cat_id')['dayCount'].std().fillna(0).mean()

0.79284035274572

从结果来看，该用户对同一品类的平均留恋时间为1.27天左右，即该用户如果在1.27天内重新与该品类进行了交互，那么交互行为就可以继续持续，否则就不会再出现。

### 附录-2：用户是否参与过或购买过热门商品，热门品类，热门店铺或者热门品牌
这一指标可以反映出用户是否具有“跟风特征”。这里的核心难题在于如何判断一个商品或者一个店铺属于热门，有两种最直观的评价标准，第一种是销售量最大，第二种是涉及的用户数量最大。所以我们从这两个角度出发，筛选出销售量最大的一部分商品和受众最多的一部分商品。

In [11]:
# 以item_id为例子
target_column = 'item_id'
# 以用户的购买行为为评价基础
target_data = user_log_data[user_log_data['action_type'] == 2]
# 交互行为最多的100中商品
most_100_labels_1 = target_data.groupby(target_column)['user_id'].count().sort_values().tail(100).index
# 涉及人群最多的100中商品
most_100_labels_2 = target_data.groupby(target_column)['user_id'].unique().apply(lambda x: len(x)).sort_values().tail(100).index
most_100_labels = set(list(most_100_labels_1) + list(most_100_labels_2))

我们从这四个方面都筛选出最热门的，然后判断该用户是否购买过或者关注过这些热门商品或店铺。

### 附录-3：筛选正向品类和负向品类，正向品牌和负向品牌
一般来说，某一些品类或者品牌通常会对复购有明显的正面影响或者负面影响，如果商家在售卖这种品类或者品牌，那么就相应的影响。所以要利用这一点，就需要将正向品类与负向品类识别出来。第一步，将不同品类或品牌的竞争强度计算出来，即出售该品类或者品牌的商家数量就是竞争强度。

In [12]:
cat_compete_score_map = user_log_data.groupby('cat_id')['merchant_id'].unique().apply(lambda x: len(x))
brand_compete_score_map = user_log_data.groupby('brand_id')['merchant_id'].unique().apply(lambda x: len(x))

第二步，根据实际的复购情况，估算出每一个品类或者品牌的复购概率。具体来说，有两种复购概率，第一种是基于用户在双十一之前的历史数据去估计每一个品类的复购概率。第二种是基于用户在双十一之后的复购记录去评估该品类的复购概率。

先来实现第一种复购概率的估计。

In [13]:
# 抽取全部的用户购买记录
user_log_buy_data = user_log_data[user_log_data['action_type'] == 2]
# 按照cat_id和user_id进行分组并计数
a = user_log_buy_data.groupby(['cat_id', 'user_id'])['item_id'].count()
# 重新整理统计数据的格式
index_user_id = [i[1] for i in a.index.values]
index_cat_id = [i[0] for i in a.index.values]
b = pd.DataFrame({'user_id': index_user_id, 'number': a.values, 'cat_id': index_cat_id})
# 统计每一个cat_id下购买的人数
c = b.groupby('cat_id')['user_id'].count()
# 统计每一个cat_id下购买二次及以上的人数
d = b[b['number'] > 1].groupby('cat_id')['user_id'].count()
# 计算出每一个cat_id的复购率
cat_id_rebuy_record_map_1 = (d/c).dropna()

# 同样的计算逻辑，可以得到品牌的复购概率
# 按照brand_id和user_id进行分组并计数
a = user_log_buy_data.groupby(['brand_id', 'user_id'])['item_id'].count()
# 重新整理统计数据的格式
index_user_id = [i[1] for i in a.index.values]
index_brand_id = [i[0] for i in a.index.values]
b = pd.DataFrame({'user_id': index_user_id, 'number': a.values, 'brand_id': index_brand_id})
# 统计每一个cat_id下购买的人数
c = b.groupby('brand_id')['user_id'].count()
# 统计每一个cat_id下购买二次及以上的人数
d = b[b['number'] > 1].groupby('brand_id')['user_id'].count()
# 计算出每一个cat_id的复购率
brand_id_rebuy_record_map_1 = (d/c).dropna()

再来实现第二种复购概率的估计。结果数据中的所谓复购，即label=1指的是双11当天购买过之后，在随后6个月有再次购买的行为。从这一信息中首先可以推算出店铺本身的在未来6个月的复购概率，之前计算的是双11前6个月的复购概率，即：

In [14]:
after_11_merchant_rebuy_ratio = train_group_data.groupby('merchant_id')['label'].mean()
after_11_merchant_rebuy_count = train_group_data.groupby('merchant_id')['label'].count()
after_11_merchant_rebuy_ratio

merchant_id
2       0.033898
8       0.045455
9       0.032258
10      0.035088
13      0.082873
          ...   
4987    0.000000
4988    0.083333
4991    0.000000
4992    0.150206
4993    0.030534
Name: label, Length: 1993, dtype: float64

如何估计品牌或者品类在未来6个月的复购概率呢？一种符合直接的方法是通过猜测出结果数据集中消费者和商家的那次交互对于的品类和品牌，为了实现这一目标，（1）先推断出双11当天该消费者在该店铺购买的品牌或者品类；（2）根据品类或者品牌将结果数据进行分组，并统计出不同品类的平均复购率。

代码如下：

In [16]:
from tqdm import tqdm
# 挑选出2015-11-11当天的全部购买订单
at_11_buy_data = user_log_data[user_log_data['action_type'] == 2][user_log_data['time_stamp'] == datetime.date(2015, 11, 11)]
# 根据train_group_data中的记录，循环进行处理
cat_id_rebuy_record_list_2 = {}
brand_id_rebuy_record_list_2 = {}
# 这里为了加快计算，只使用了十分之一的数据，在正式计算中会使用全部数据
for user_id, merchant_id, label in tqdm(train_group_data.values[::3]):
    target_item_data = at_11_buy_data.query('user_id==%d and merchant_id==%d' % (user_id, merchant_id))[['cat_id', 'brand_id']]
    for item_cat_id, item_brand_id in target_item_data.values:
        if item_cat_id not in cat_id_rebuy_record_list_2:
            cat_id_rebuy_record_list_2[item_cat_id] = []
        if item_brand_id not in brand_id_rebuy_record_list_2:
            brand_id_rebuy_record_list_2[item_brand_id] = []
        # 一般来说应该是只有一条购买记录的。如果有多条购买记录，那么这一次复购就被均分到几个类别或者品牌上。因为我们
        # 无法区分复购了哪一个品类或者品牌。
        cat_id_rebuy_record_list_2[item_cat_id].append(label/len(target_item_data))
        brand_id_rebuy_record_list_2[item_brand_id].append(label/len(target_item_data))
# 这里我们要求至少要有三次以上的出现次数，才认为估算出来的复购率比较可靠
cat_id_rebuy_record_map_2 = {k: np.mean(v) for k, v in cat_id_rebuy_record_list_2.items() if len(v) > 3}
brand_id_rebuy_record_map_2 = {k: np.mean(v) for k, v in brand_id_rebuy_record_list_2.items() if len(v) > 3}

  This is separate from the ipykernel package so we can avoid doing imports until
100%|████████████████████████████████████████████████████████████████████████████| 86955/86955 [17:50<00:00, 81.23it/s]


下面对比了两种复购率计算方法下的结果差异：

In [17]:
for item_cat_id in [2.0, 5.0, 96.0, 102.0, 500.0]:
    print('品类 %d 双11前复购率：%.2f %% --- 双11后复购率：%.2f %%' % 
          (item_cat_id, cat_id_rebuy_record_map_1[item_cat_id]*100, cat_id_rebuy_record_map_2[item_cat_id]*100))

品类 2 双11前复购率：5.24 % --- 双11后复购率：5.63 %
品类 5 双11前复购率：25.89 % --- 双11后复购率：4.02 %
品类 96 双11前复购率：60.00 % --- 双11后复购率：2.26 %
品类 102 双11前复购率：4.56 % --- 双11后复购率：11.90 %
品类 500 双11前复购率：7.93 % --- 双11后复购率：2.30 %


双11前各个品类和品牌的平均复购率为：

In [18]:
print('双11前不同品类的平均复购率为：%.2f %%' % (cat_id_rebuy_record_map_1.mean()*100))
print('双11前不同品牌的平均复购率为：%.2f %%' % (brand_id_rebuy_record_map_1.mean()*100))

双11前不同品类的平均复购率为：15.24 %
双11前不同品牌的平均复购率为：23.10 %


双11后各个品类和品牌的平均复购率为：

In [19]:
print('双11后不同品类的平均复购率为：%.2f %%' % (np.mean(list(cat_id_rebuy_record_map_2.values()))*100))
print('双11后不同品牌的平均复购率为：%.2f %%' % (np.mean(list(brand_id_rebuy_record_map_2.values()))*100))

双11后不同品类的平均复购率为：3.80 %
双11后不同品牌的平均复购率为：4.31 %


这里分别抽取复购率（分为双11之前和双11之后两种情况）最高的200个品类或品牌以及最低的200个品类或者品牌作为正向样本和负向样本，如果店铺销售的是复购率高的品类或者品牌，那么复购的希望就大一些，否则复购的希望就小一些。

In [20]:
worst_200_cat_list_1 = list(cat_id_rebuy_record_map_1.sort_values().index[:200])
worst_200_brand_list_1 = list(brand_id_rebuy_record_map_1.sort_values().index[:200])
best_200_cat_list_1 = list(cat_id_rebuy_record_map_1.sort_values().index[-200:])
best_200_brand_list_1 = list(brand_id_rebuy_record_map_1.sort_values().index[-200:])

temp_sorted_seq_cat = sorted(cat_id_rebuy_record_map_2, key=lambda x: cat_id_rebuy_record_map_2[x])
temp_sorted_seq_brand = sorted(brand_id_rebuy_record_map_2, key=lambda x: brand_id_rebuy_record_map_2[x])
worst_200_cat_list_2 = temp_sorted_seq_cat[:200]
worst_200_brand_list_2 = temp_sorted_seq_brand[:200]
best_200_cat_list_2 = temp_sorted_seq_cat[-200:]
best_200_brand_list_2 = temp_sorted_seq_brand[-200:]

worst_200_cat_list = set(worst_200_cat_list_1) | set(worst_200_cat_list_2)
worst_200_brand_list = set(worst_200_brand_list_1) | set(worst_200_brand_list_2)
best_200_cat_list = set(best_200_cat_list_1) | set(best_200_cat_list_2)
best_200_brand_list = set(best_200_brand_list_1) | set(best_200_brand_list_2)

### 附录-4：不同品类的总销售量
首先统计出不同品类的总销售量，然后就可以计算出每一个商家在该品类中的市场占有率，以判断该企业是否为行业龙头企业。

In [21]:
cat_total_sell_amount_map = user_log_buy_data.groupby('cat_id')['user_id'].count()
brand_total_sell_amount_map = user_log_buy_data.groupby('brand_id')['user_id'].count()
cat_total_sell_amount_map

cat_id
1         48
2       4442
3        214
4        418
5        839
        ... 
1665      87
1666     333
1668     675
1670      29
1671      32
Name: user_id, Length: 1324, dtype: int64