# 按收入金额对用户进行分组

主要是针对R1和R7对用户进行分组   
主要分为两个部分，一个部分进行区分，另一个部分进行校验   

区分的部分目前大致思路有两个，一个按照总金额划分，一个按照人数划分   
校验部分暂时选用之前的方式，用iOS整体数据进行验证

`jupyter-lab --allow-root --ip 192.168.40.62`

In [None]:
import pandas as pd
import os
import sys
sys.path.append('/src')
from src.maxCompute import execSql
from src.tools import getFilename

In [None]:
# userDf 要求每个用户一行，必须包含install_date列
# cvMapDf 要求必须包含 cv,min_event_revenue,max_event_revenue列
# usd 是userDf中需要转化分组美元金额的列，一般是r1usd或者r7usd
def checkCvMap(userDf,cvMapDf,usd = 'r1usd'):
    """Print how well a conversion-value map reproduces daily revenue.

    Each user gets the conversion value (cv) of the map row whose
    ``(min_event_revenue, max_event_revenue]`` range contains the user's
    revenue. The revenue is then approximated by the midpoint of that range,
    and the per-``install_date`` sums of real and approximated revenue are
    compared with MAPE and R2, which are printed.

    Args:
        userDf: One row per user; must contain ``install_date`` and the
            column named by ``usd``. The frame is copied, not modified.
        cvMapDf: Must contain ``min_event_revenue`` and
            ``max_event_revenue``. The cv of a row is its position in the
            frame, not the value of its ``cv`` column.
        usd: Revenue column of ``userDf`` to bucket, usually ``'r1usd'`` or
            ``'r7usd'``.
    """
    from sklearn.metrics import mean_absolute_percentage_error, r2_score

    df = userDf.copy()
    # Positional arrays: the map may carry any index (e.g. after filtering).
    lowerBounds = cvMapDf['min_event_revenue'].to_numpy()
    upperBounds = cvMapDf['max_event_revenue'].to_numpy()

    df['cv'] = 0
    lastValid = None
    for i, (lower, upper) in enumerate(zip(lowerBounds, upperBounds)):
        if pd.isna(upper):
            # Rows without a revenue range cannot match any user.
            continue
        df.loc[(df[usd] > lower) & (df[usd] <= upper), 'cv'] = i
        lastValid = i
    if lastValid is not None:
        # Users above the last defined range fall into that last bucket.
        df.loc[df[usd] > upperBounds[lastValid], 'cv'] = lastValid

    # Float column from the start: the midpoints assigned below are floats.
    df['cv_usd'] = 0.0
    for i, (lower, upper) in enumerate(zip(lowerBounds, upperBounds)):
        midpoint = (lower + upper) / 2
        # Undefined ranges and the non-paying bucket (-1, 0] contribute 0.
        if pd.isna(midpoint) or midpoint < 0:
            midpoint = 0.0
        df.loc[df['cv'] == i, 'cv_usd'] = midpoint

    mergeDf = df.groupby('install_date',as_index=False).agg({usd:'sum','cv_usd':'sum'})

    mape = mean_absolute_percentage_error(mergeDf[usd], mergeDf['cv_usd'])
    r2 = r2_score(mergeDf[usd], mergeDf['cv_usd'])

    print(f"MAPE: {mape}")
    print(f"R2 Score: {r2}")

In [None]:
# levels 没有0档位，类似 
# levels = [
#     2.4707,5.2468,18.9076,47.7314,94.9377,193.1167,234.78
# ]
def makeCvMap(levels):
    """Build a conversion-value map from a list of upper revenue bounds.

    Row 0 is always the range ``(-1, 0]``. Every entry of ``levels`` adds one
    further row whose lower bound is the previous row's upper bound, so
    ``levels`` itself must not contain a 0 tier.

    Args:
        levels: Upper bounds of cv 1..len(levels), in order.

    Returns:
        DataFrame with columns ``cv``, ``min_event_revenue`` and
        ``max_event_revenue`` and ``len(levels) + 1`` rows.
    """
    upperBounds = [0] + list(levels)
    # Each range starts where the previous one ended; the first starts at -1.
    lowerBounds = [-1] + upperBounds[:-1]
    return pd.DataFrame(data={
        'cv': list(range(len(upperBounds))),
        'min_event_revenue': lowerBounds,
        'max_event_revenue': upperBounds,
    })

In [None]:
# 这套方案的结论超越AF建议
# 下面的注释直接放到Cursor中，获得下面代码，略有bug

# userDf中列usd代表付费金额
# 求数组levels，数组长度为N
# 所有usd > 0 的用户按照levels范围进行分组，分成N组，即第一组用户usd >0 并且usd < levels[0],第二组用户 usd >= levels[0] 并且 usd < levels[1]，以此类推，大于levels[N-1]的用户也分到最后一组
# 要求每组用户的usd的和尽量接近
def makeLevels1(userDf,usd = 'r1usd',N = 7):
    """Pick revenue levels so that each group holds a similar revenue total.

    Paying users (``usd > 0``) are walked in ascending revenue order. Each
    time the running revenue total reaches ``total / N`` the current user's
    revenue becomes a level and the running total restarts from zero.

    Args:
        userDf: Must contain the column named by ``usd``.
        usd: Revenue column to split on.
        N: Number of groups wanted.

    Returns:
        List of at most ``N - 1`` ascending level values. Fewer are returned
        when the data runs out before every boundary is reached (for example
        when a few users hold most of the revenue). Unreached levels are
        omitted rather than reported as 0, because a trailing 0 would become
        the last upper bound of the cv map and pull every paying user into
        the last bucket. Empty for ``N <= 1`` or when nobody paid.
    """
    if N <= 1:
        # A single group needs no boundary.
        return []

    paying = userDf.loc[userDf[usd] > 0, usd].sort_values()
    target_usd = paying.sum() / N

    levels = []
    current_usd = 0
    # Plain Python values: same arithmetic as row-wise iteration, much faster.
    for value in paying.tolist():
        current_usd += value
        if current_usd >= target_usd:
            levels.append(value)
            current_usd = 0
            if len(levels) == N - 1:
                break

    return levels

In [None]:
# makeLevels1 test
# df = pd.read_csv(getFilename('iosCvCount20220701_20230201'))
# df = df.loc[df.install_date >= '2022-07-01']
# df = df.sort_values(['install_date','r1usd'])
# levels = makeLevels1(df)
# print(levels)

In [None]:
def test():
    """Check a fixed seven-level cv map against the saved user table.

    Reads the ``iosCvCount20220701_20230201`` CSV, keeps installs from
    2022-07-01 onwards, prints the map and lets checkCvMap print MAPE / R2
    for ``r1usd``.
    """
    fixedLevels = [2.4707,5.2468,18.9076,47.7314,94.9377,193.1167,234.78]

    users = pd.read_csv(getFilename('iosCvCount20220701_20230201'))
    users = users[users['install_date'] >= '2022-07-01']
    users = users.sort_values(['install_date','r1usd'])

    cvMapDf = makeCvMap(fixedLevels)
    print(cvMapDf)

    checkCvMap(users,cvMapDf,usd = 'r1usd')


In [None]:
# 按照cvMap添加cv到cvName列
def addCV(df,cvMapDf,usd='r1usd',cvName = 'cv'):
    """Label every user with a conversion value taken from a cv map.

    A user gets the position of the map row whose
    ``(min_event_revenue, max_event_revenue]`` range contains the user's
    revenue. Users matching no range keep 0.

    Args:
        df: User table containing the column named by ``usd``. It is modified
            in place (column ``cvName`` is added or overwritten).
        cvMapDf: Must contain ``min_event_revenue`` and
            ``max_event_revenue``. The cv of a row is its position in the
            frame, not the value of its ``cv`` column.
        usd: Revenue column to bucket.
        cvName: Name of the column that receives the cv.

    Returns:
        The same ``df`` object, for chaining.
    """
    # Positional arrays: the map may carry any index (e.g. after filtering).
    lowerBounds = cvMapDf['min_event_revenue'].to_numpy()
    upperBounds = cvMapDf['max_event_revenue'].to_numpy()

    df[cvName] = 0
    lastValid = None
    for i, (lower, upper) in enumerate(zip(lowerBounds, upperBounds)):
        if pd.isna(upper):
            # Rows without a revenue range cannot match any user.
            continue
        df.loc[(df[usd] > lower) & (df[usd] <= upper), cvName] = i
        lastValid = i
    if lastValid is not None:
        # Users above the last defined range fall into that last bucket.
        df.loc[df[usd] > upperBounds[lastValid], cvName] = lastValid
    return df

In [None]:
def getDataFromMC():
    """Fetch per-user day-1 and day-7 purchase revenue through execSql.

    The query reads ``af_purchase`` events of app ``com.topwar.gp`` (zone 0,
    day 20220701 to 20230201) from ``ods_platform_appsflyer_events`` and, per
    ``customer_user_id`` and install date, sums the USD revenue of events
    that happened within 1 day (``r1usd``) and within 7 days (``r7usd``) of
    the install.

    Returns:
        Whatever ``execSql`` returns for the query (presumably a DataFrame,
        since step1 calls ``to_csv`` on it; confirm against src.maxCompute).
    """
    query = '''
        select
            customer_user_id,
            to_char(
                to_date(install_time, "yyyy-mm-dd hh:mi:ss"),
                "yyyy-mm-dd"
            ) as install_date,
            sum(
                case
                    when event_timestamp - install_timestamp <= 1 * 24 * 3600 then cast (event_revenue_usd as double)
                    else 0
                end
            ) as r1usd,
            sum(
                case
                    when event_timestamp - install_timestamp <= 7 * 24 * 3600 then cast (event_revenue_usd as double)
                    else 0
                end
            ) as r7usd
        from
            ods_platform_appsflyer_events
        where
            app_id = 'com.topwar.gp'
            and event_name = 'af_purchase'
            and zone = 0
            and day >= 20220701
            and day <= 20230201
        group by
            install_date,
            customer_user_id
    '''
    return execSql(query)


In [None]:
# 获取原始数据，需要跑mc，比较慢，可以跳过
def step1():
    """Download the raw per-user revenue table and cache it as CSV.

    Runs the query in getDataFromMC and writes the result to the file that
    getFilename resolves for ``aosCvR1R7_20220701_20230201``.
    """
    rawDf = getDataFromMC()
    outputPath = getFilename('aosCvR1R7_20220701_20230201')
    rawDf.to_csv(outputPath)

In [None]:
# Run step 1: fetch the raw data with getDataFromMC() and write it to CSV.
step1()

In [None]:
# 将用户打入标签cv1和cv2，代表r1和r7的用户分组
def step2():
    """Label every user with cv1 (from r1usd) and cv7 (from r7usd).

    Reads the CSV written by step1, keeps installs from 2022-07-01 onwards,
    derives levels for both revenue columns with makeLevels1 (N=8), saves the
    two resulting cv maps, prints their accuracy through checkCvMap and
    finally writes the user table with the added ``cv1`` / ``cv7`` columns.
    """
    # (revenue column, label column, file name of the saved cv map)
    targets = (
        ('r1usd','cv1','cvMap1'),
        ('r7usd','cv7','cvMap7'),
    )

    users = pd.read_csv(getFilename('aosCvR1R7_20220701_20230201'))
    users = users[users['install_date'] >= '2022-07-01']
    users = users.sort_values(['install_date','r1usd'])

    levelsByUsd = {usd: makeLevels1(users,usd = usd,N=8) for usd, _, _ in targets}
    cvMaps = {usd: makeCvMap(levelsByUsd[usd]) for usd, _, _ in targets}

    for usd, _, mapName in targets:
        cvMaps[usd].to_csv(getFilename(mapName))

    # Check how accurately each map reproduces the daily revenue.
    for usd, _, _ in targets:
        checkCvMap(users,cvMaps[usd],usd = usd)

    for usd, cvName, _ in targets:
        users = addCV(users,cvMaps[usd],usd = usd,cvName = cvName)

    # The iOS variant of this output is named 'iosCvCount20220701_20230201_cv1cv7'.
    users.to_csv(getFilename('aosCvCount20220701_20230201_cv1cv7'))


In [None]:
# Run step 2: build the cv maps and write the user table labelled with cv1 / cv7.
step2()

In [None]:
# 按照cv1和cv7进行分组，然后计算相关度
def step3():
    """Print r1usd / r7usd correlations overall, per cv1 and per (cv1, cv7).

    Reads the labelled user table written by step2. Revenue is summed per
    ``install_date`` before each correlation is computed. Only the two
    revenue columns are correlated: the grouped frames also hold the string
    column ``install_date``, which ``DataFrame.corr`` rejects in
    pandas >= 2.0, and constant cv columns that only add NaN rows.
    """
    revenueCols = ['r1usd','r7usd']
    sumRevenue = {'r1usd':'sum','r7usd':'sum'}

    # The iOS variant of this input is named 'iosCvCount20220701_20230201_cv1cv7'.
    df = pd.read_csv(getFilename('aosCvCount20220701_20230201_cv1cv7'))

    # Baseline: correlation over all users.
    groupByDayDf = df.groupby(by = ['install_date'],as_index=False).agg(sumRevenue)
    print('groupByDayCorr:',groupByDayDf[revenueCols].corr())

    # Correlation inside each cv1 bucket.
    groupByDayDf = df.groupby(by = ['install_date','cv1'],as_index=False).agg(sumRevenue)
    for cv1 in groupByDayDf['cv1'].unique():
        groupByDayDf1 = groupByDayDf.loc[groupByDayDf.cv1 == cv1]
        print('groupByDayCorr cv1:',cv1,'\n',groupByDayDf1[revenueCols].corr())

    # Correlation inside each (cv1, cv7) cell that has users, in ascending
    # order; the number of buckets is taken from the data.
    for (cv1, cv7), cvDf in df.groupby(['cv1','cv7']):
        print('groupByCV %d %d:\n'%(cv1,cv7))
        print('共计%d人:'%(len(cvDf)))

        groupByCVDf = cvDf.groupby(by = ['install_date'],as_index=False).agg(sumRevenue)
        print('Corr:\n',groupByCVDf[revenueCols].corr())
            

In [None]:
# Run step 3: print the r1usd / r7usd correlations for the labelled user table.
step3()

# 暂时总结
观察后发现，大体上还是可以得到一些较大的群组。   
但是存在一些较小的群组，比如 0 7组人数就很少。   
人数多的群组相关性足够高，超过95%，人数少的相关性偏低，甚至出现负相关。   
目前只分了64组，如果可以考虑忽略掉一部分小概率人群，可能可以让相关性上升。   
一旦涉及到cv7 == 7 的情况，就会出现负相关，所以考虑对大R进行削弱，`设定大R上限`。   

# 新思路 速记

鉴于可以将用户有效地分为64组（或更少），可以对每组用户分别进行预测。
即根据首日付费金额 + 预测结果 -> CV

这种思路主要问题：
cv1分组和cv7分组的合理性，更加有效的将用户划分为64组。   
可能不能均分，也不能直接使用 cv1 x cv7 的组合，需要将增长率相似的用户合并成一组，腾出更多的档位，让那些增长率不稳定的用户或者大R有更多的选择。

这个可能暂时没有什么思路怎么动手，但是可以先将cv1和cv7的档位放大，做出更多的可能档位，然后尝试进行合并。   
可以合并线性相关度较高的相邻cv7组别，合并后线性相关性会下降，观察下降后的相关度是否足以继续合并（比如定个阈值90%）


In [None]:
# 尝试对cv7进行合并
# def catCv7(df,t = 0.9):
# df中有列'install_date','cv1','cv7','r1usd','r7usd'
# 并尝试尽量合并相邻的cv7并记作cv7m
# 要求合并后按install_date,cv1 和 cv7m进行groupby之后的'r1usd','r7usd'的相关系数corr > t

def _dailyR1R7Corr(cvDf):
    """Return the Pearson correlation of r1usd and r7usd summed per install_date."""
    daily = cvDf.groupby(by = ['install_date'],as_index=False).agg({'r1usd':'sum','r7usd':'sum'})
    # Series.corr avoids DataFrame.corr, which rejects the string
    # install_date column in pandas >= 2.0.
    return daily['r1usd'].corr(daily['r7usd'])


def catCv7(t = 0.9):
    """Greedily merge adjacent cv7 buckets inside every cv1 bucket and print them.

    For each cv1 other than 0, the cv7 values present for that cv1 are
    walked in ascending order and added to the current group while the
    correlation between daily summed r1usd and r7usd of the group stays at
    or above ``t``. A cv7 that pushes the correlation below ``t`` closes the
    current group and becomes the first member of the next one, so every
    cv7 ends up in exactly one group. An undefined (NaN) correlation does
    not close a group.

    Args:
        t: Minimum correlation a merged group has to keep.
    """
    df = pd.read_csv(getFilename('iosCvCount20220701_20230201_cv1cv7'))

    for cv1 in sorted(df['cv1'].unique()):
        if cv1 == 0:
            continue
        cv1Df = df.loc[df.cv1 == cv1]

        cv7Group = []
        for cv7 in sorted(cv1Df['cv7'].unique()):
            candidate = cv7Group + [cv7]
            corr = _dailyR1R7Corr(cv1Df.loc[cv1Df.cv7.isin(candidate)])
            print(candidate,corr)
            if cv7Group and corr < t:
                # Adding cv7 breaks the threshold: report the group without
                # it and let cv7 start the next group.
                print('!')
                groupCorr = _dailyR1R7Corr(cv1Df.loc[cv1Df.cv7.isin(cv7Group)])
                print('cv1:',cv1,'cv7Group:',cv7Group,'corr:',groupCorr)
                cv7Group = [cv7]
            else:
                cv7Group = candidate

        if cv7Group:
            # Report whatever is still open after the last cv7.
            print('!')
            groupCorr = _dailyR1R7Corr(cv1Df.loc[cv1Df.cv7.isin(cv7Group)])
            print('cv1:',cv1,'cv7Group:',cv7Group,'corr:',groupCorr)
        

In [None]:
# Run the cv7 merge experiment with the default threshold t = 0.9.
catCv7()