# 方案一：

In [1]:
import pandas as pd
import os
import warnings 

warnings.filterwarnings('ignore')

In [16]:
# Dummy data: one row per (dimension, element) pair, holding the sales amount
# observed before and after the activity.
data = pd.DataFrame(
    [
        ('渠道', 'A', 50, 78),
        ('渠道', 'B', 3, 23),
        ('渠道', 'C', 18, 45),
        ('渠道', 'D', 4, 4),
        ('新老客', '新客', 35, 60),
        ('新老客', '老客', 40, 90),
        ('产品', 'C001', 20, 40),
        ('产品', 'C002', 15, 38),
        ('产品', 'C003', 10, 15),
        ('产品', 'C004', 8, 20),
        ('产品', 'C005', 12, 17),
        ('产品', 'C006', 10, 20),
    ],
    columns=['维度', '元素', 'before', 'after'],
)
data

Unnamed: 0,维度,元素,before,after
0,渠道,A,50,78
1,渠道,B,3,23
2,渠道,C,18,45
3,渠道,D,4,4
4,新老客,新客,35,60
5,新老客,老客,40,90
6,产品,C001,20,40
7,产品,C002,15,38
8,产品,C003,10,15
9,产品,C004,8,20


In [3]:
# Calculate the total sales amount before and after the activity.
# The frame stacks several dimensions on top of each other, so the column sum
# is divided by the number of distinct dimensions (3 in the dummy data).
pre_sum, aft_sum = (
    data[column].sum() / len(data['维度'].unique())
    for column in ('before', 'after')
)

# p / q: each element's share of the total before / after the activity.
data['p'] = data['before'].div(pre_sum)
data['q'] = data['after'].div(aft_sum)

In [9]:
# Calculate the surprise (S) of every element from its p and q shares
import numpy as np
import scipy

def JS_divergence(row):
    """Return the Jensen-Shannon surprise term for a single element.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            ``'p'`` (share before) and ``'q'`` (share after).

    Returns:
        ``0.5 * p * log(p / M) + 0.5 * q * log(q / M)`` with
        ``M = (p + q) / 2``, as a Python float rounded to 6 decimals.
    """
    p, q = row['p'], row['q']
    M = (p + q) / 2

    def _term(x):
        # x * log(x / M) tends to 0 as x -> 0; evaluating it directly would
        # give 0 * -inf = NaN (or divide by zero when p == q == 0).
        if x == 0:
            return 0.0
        return x * np.log(x / M)

    js = 0.5 * _term(p) + 0.5 * _term(q)

    return round(float(js), 6)

# Score every row, then order the frame in place: dimensions in descending
# order and, inside each dimension, the most surprising element first.
data['surprise'] = data.apply(JS_divergence, axis='columns')
data.sort_values(by=['维度', 'surprise'], ascending=False, inplace=True)
data

Unnamed: 0,维度,元素,before,after,p,q,surprise
1,渠道,B,3,23,0.04,0.153333,0.017722
0,渠道,A,50,78,0.666667,0.52,0.004543
3,渠道,D,4,4,0.053333,0.026667,0.002265
2,渠道,C,18,45,0.24,0.3,0.00167
4,新老客,新客,35,60,0.466667,0.4,0.001283
5,新老客,老客,40,90,0.533333,0.6,0.000981
10,产品,C005,12,17,0.16,0.113333,0.002002
7,产品,C002,15,38,0.2,0.253333,0.001572
8,产品,C003,10,15,0.133333,0.1,0.001195
9,产品,C004,8,20,0.106667,0.133333,0.000742


In [11]:
# Total change in sales. All dimensions are stacked in one frame, so the
# difference of the column sums is divided by the number of dimensions.
sum_dif = (
    data['after'].sum() - data['before'].sum()
) / len(data['维度'].unique())

# EP of each row: the row's own change as a fraction of the total change.
data['EP'] = data['after'].sub(data['before']).div(sum_dif)

data.iloc[:6]

Unnamed: 0,维度,元素,before,after,p,q,surprise,EP
1,渠道,B,3,23,0.04,0.153333,0.017722,0.266667
0,渠道,A,50,78,0.666667,0.52,0.004543,0.373333
3,渠道,D,4,4,0.053333,0.026667,0.002265,0.0
2,渠道,C,18,45,0.24,0.3,0.00167,0.36
4,新老客,新客,35,60,0.466667,0.4,0.001283,0.333333
5,新老客,老客,40,90,0.533333,0.6,0.000981,0.666667


In [12]:
# Assume a per-element EP threshold teep = 0.2 and a cumulative EP threshold
# tep = 0.8.
teep = 0.2
tep = 0.8

# Keep the elements whose EP is at least the per-element threshold teep.
# .copy() makes the filtered frame independent of `data`, so adding a column
# below is an unambiguous write rather than an assignment on a slice.
data_fil = data.loc[data['EP'] >= teep, ['维度', '元素', 'surprise', 'EP']].copy()

# EP_sum: running total of EP inside each dimension (in the current row
# order); it is the helper column compared against the overall threshold tep.
data_fil['EP_sum'] = data_fil.groupby('维度')['EP'].cumsum()
data_fil

Unnamed: 0,维度,元素,surprise,EP,EP_sum
1,渠道,B,0.017722,0.266667,0.266667
0,渠道,A,0.004543,0.373333,0.64
2,渠道,C,0.00167,0.36,1.0
4,新老客,新客,0.001283,0.333333,0.333333
5,新老客,老客,0.000981,0.666667,1.0
7,产品,C002,0.001572,0.306667,0.306667
6,产品,C001,0.0,0.266667,0.573333


In [13]:
# Rows whose running EP total has reached the overall threshold tep.
bri = data_fil.loc[data_fil['EP_sum'] >= tep, :]

# Per dimension, the first running total that reaches tep becomes the cut-off
# used in the next filtering step.
bri_dim = bri.groupby('维度').head(1)[['维度', 'EP_sum']]

# Attach each dimension's cut-off to the rows that passed the per-element
# threshold. Columns are renamed by name (not by position) so the result does
# not depend on the merge's suffix/ordering conventions.
result = pd.merge(
    data_fil,
    bri_dim.rename(columns={'EP_sum': 'EP_thres'}),
    on='维度',
    how='left',
)
# Dimensions that never reach tep have no cut-off; fall back to tep itself.
# Only the cut-off column is filled, so missing values in other columns
# (e.g. a NaN surprise) are not silently replaced by the threshold.
result['EP_thres'] = result['EP_thres'].fillna(tep)
result = result.rename(columns={'surprise': 'S'})

# Drop rows beyond the cut-off, i.e. keep rows with EP_sum <= EP_thres.
result = result.loc[result['EP_sum'] <= result['EP_thres'], :]

result

Unnamed: 0,维度,元素,S,EP,EP_sum,EP_thres
0,渠道,B,0.017722,0.266667,0.266667,1.0
1,渠道,A,0.004543,0.373333,0.64,1.0
2,渠道,C,0.00167,0.36,1.0,1.0
3,新老客,新客,0.001283,0.333333,0.333333,1.0
4,新老客,老客,0.000981,0.666667,1.0,1.0
5,产品,C002,0.001572,0.306667,0.306667,0.8
6,产品,C001,0.0,0.266667,0.573333,0.8


In [14]:
# Total surprise of the retained elements, one row per dimension.
result_gp = result[['维度', 'S']].groupby('维度').sum().reset_index()
print(result_gp)

    维度         S
0   产品  0.001572
1  新老客  0.002264
2   渠道  0.023935


In [15]:
# Number of most influential dimensions to report.
n = 1

# Rank the dimensions by total surprise and keep the first n.
top_n = result_gp.sort_values(by='S', ascending=False).head(n)

# Show the element-level rows that belong to the selected dimensions.
result[result['维度'].isin(top_n['维度'])]

Unnamed: 0,维度,元素,S,EP,EP_sum,EP_thres
0,渠道,B,0.017722,0.266667,0.266667,1.0
1,渠道,A,0.004543,0.373333,0.64,1.0
2,渠道,C,0.00167,0.36,1.0,1.0


In [21]:
def JS_divergence(row):
    """Return the Jensen-Shannon surprise term for a single element.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            ``'p'`` (share before) and ``'q'`` (share after).

    Returns:
        ``0.5 * p * log(p / M) + 0.5 * q * log(q / M)`` with
        ``M = (p + q) / 2``, as a Python float rounded to 6 decimals.
    """
    p, q = row['p'], row['q']
    M = (p + q) / 2

    def _term(x):
        # x * log(x / M) tends to 0 as x -> 0; evaluating it directly would
        # give 0 * -inf = NaN (or divide by zero when p == q == 0).
        if x == 0:
            return 0.0
        return x * np.log(x / M)

    js = 0.5 * _term(p) + 0.5 * _term(q)

    return round(float(js), 6)


def adtributor(df, element_threshold=0.2, dimensions_threshold=0.8, n=1):
    """Return the elements of the ``n`` dimensions with the largest surprise.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame with the columns ``'维度'`` (dimension), ``'元素'``
            (element), ``'before'`` and ``'after'``. All dimensions are
            stacked in the same frame.
        element_threshold: Minimum EP an element needs to be considered.
        dimensions_threshold: Cumulative EP per dimension at which no further
            elements of that dimension are kept.
        n: Number of top dimensions (by summed surprise) to return.

    Returns:
        DataFrame with the columns ``维度, 元素, S, EP, EP_sum, EP_thres``
        restricted to the selected dimensions.

    Raises:
        ValueError: If the total of ``after`` equals the total of ``before``,
            because EP (share of the total change) is then undefined.
    """
    # Work on a private copy so the caller's frame keeps its columns and order.
    work = df.copy()

    # Every dimension is stacked in the frame, so column sums are divided by
    # the number of distinct dimensions.
    n_dims = len(work['维度'].unique())
    pre_sum = work['before'].sum() / n_dims
    aft_sum = work['after'].sum() / n_dims
    sum_dif = (work['after'].sum() - work['before'].sum()) / n_dims
    if sum_dif == 0:
        raise ValueError(
            "total 'after' equals total 'before'; EP is undefined"
        )

    # p / q: share of each element before / after.
    work['p'] = work['before'] / pre_sum
    work['q'] = work['after'] / aft_sum

    work['surprise'] = work.apply(JS_divergence, axis=1)
    # Most surprising element first inside each dimension; the running EP
    # total below is accumulated in this order.
    work = work.sort_values(['维度', 'surprise'], ascending=False)

    # EP of each row: its own change as a fraction of the total change.
    work['EP'] = (work['after'] - work['before']) / sum_dif

    # Keep elements whose EP is at least the per-element threshold.
    candidates = work.loc[
        work['EP'] >= element_threshold, ['维度', '元素', 'surprise', 'EP']
    ].copy()

    # Running EP total inside each dimension.
    candidates['EP_sum'] = candidates.groupby('维度')['EP'].cumsum()

    # Per dimension, the first running total that reaches the overall
    # threshold becomes that dimension's cut-off.
    reached = candidates.loc[candidates['EP_sum'] >= dimensions_threshold, :]
    cutoffs = (
        reached.groupby('维度')
        .head(1)[['维度', 'EP_sum']]
        .rename(columns={'EP_sum': 'EP_thres'})
    )

    result = pd.merge(candidates, cutoffs, on='维度', how='left')
    # Dimensions that never reach the threshold fall back to the threshold
    # itself. Only the cut-off column is filled so other columns keep their
    # values untouched.
    result['EP_thres'] = result['EP_thres'].fillna(dimensions_threshold)
    result = result.rename(columns={'surprise': 'S'})

    # Drop rows beyond the cut-off, i.e. keep rows with EP_sum <= EP_thres.
    result = result.loc[result['EP_sum'] <= result['EP_thres'], :]

    # Summed surprise per dimension, ranked; keep the first n dimensions.
    dim_scores = result.groupby('维度')['S'].sum().reset_index()
    top_dims = dim_scores.sort_values('S', ascending=False).iloc[:n, :]

    # Element-level rows of the selected dimensions.
    final = result.loc[result['维度'].isin(top_dims['维度']), :]
    return final

In [22]:
# Run the packaged pipeline on the dummy data with the thresholds used in the
# step-by-step cells, and display the outcome.
result = adtributor(
    data,
    element_threshold=0.2,
    dimensions_threshold=0.8,
    n=1,
)
result

Unnamed: 0,维度,元素,S,EP,EP_sum,EP_thres
0,渠道,B,0.017722,0.266667,0.266667,1.0
1,渠道,A,0.004543,0.373333,0.64,1.0
2,渠道,C,0.00167,0.36,1.0,1.0
