按照app_package、country_group、mediasource、tag分组，按install_day排序，计算最近N行（当前行及之前N-1行）的滚动均值；窗口不足N行时结果为NULL。输出的tag统一替换为avg_{N}。


In [0]:
def createAvgNView(N=21):
    """Create (or replace) the view of N-row rolling averages of cost/revenue.

    The two cohort source tables are unioned.  Within every
    (app_package, country_group, mediasource, tag) partition, ordered by
    install_day, each metric is averaged over the current row and the N-1
    preceding rows.  Rows whose window holds fewer than N rows get NULL
    instead of a partial average.  Every output row carries the constant tag
    'avg_{N}' in place of the source tag.

    NOTE(review): the frame counts rows, not calendar days; if a partition
    has gaps in install_day the window spans more than N days -- confirm
    that the source tables are dense per day.

    Args:
        N: Window length in rows; must be at least 1.

    Returns:
        The fully qualified name of the created view.

    Raises:
        ValueError: If N is smaller than 1, which would otherwise render an
            invalid window frame such as "rows between -1 preceding".
    """
    if N < 1:
        raise ValueError(f"N must be >= 1, got {N!r}")

    viewName = f"data_science.default.lw_20250903_cohort_onlyprofit_avg_{N}_view_by_j"

    # Single source of truth for the averaged columns; every SQL fragment
    # below is generated from this list so the clauses cannot drift apart.
    metrics = [
        "cost",
        "revenue_d1",
        "revenue_d3",
        "revenue_d7",
        "revenue_d14",
        "revenue_d30",
        "revenue_d60",
        "revenue_d90",
        "revenue_d120",
        "revenue_d135",
        "revenue_d150",
    ]

    # Trailing window shared by the row count and by every average.
    window = (
        "partition by app_package, country_group, mediasource, tag "
        "order by install_day "
        f"rows between {N - 1} preceding and current row"
    )

    raw_columns = ",\n        ".join(metrics)
    averaged_columns = ",\n        ".join(
        f"case when days_count >= {N} then avg({m}) over ({window}) "
        f"else null end as avg_{m}"
        for m in metrics
    )
    output_columns = ",\n    ".join(f"avg_{m} as {m}" for m in metrics)

    sql = f"""
CREATE OR REPLACE VIEW {viewName} as
with base_data as (
    select
        app_package,
        country_group,
        mediasource,
        tag,
        install_day,
        {raw_columns},
        -- 计算每个分组内有多少天的数据
        count(*) over ({window}) as days_count
    from
        (
            select * from data_science.default.lw_20250903_aos_gpir_cohort_onlyprofit_raw_table_by_j
            union all
            select * from data_science.default.lw_20250903_ios_af_cohort_onlyprofit_fit_table_by_j
        )
),
averaged_data as (
    select
        app_package,
        country_group,
        mediasource,
        tag,
        install_day,
        -- 只有当有足够N天数据时才计算平均值，否则为NULL
        {averaged_columns}
    from
        base_data
)
select
    app_package,
    country_group,
    mediasource,
    'avg_{N}' as tag,
    install_day,
    {output_columns}
from
    averaged_data;
    """
    print(f"Executing SQL: {sql}")
    spark.sql(sql)
    return viewName



按照app_package、country_group、mediasource、tag分组，按install_day排序，计算最近N行窗口内的指数移动平均（EMA，平滑因子alpha = 2/(N+1)，权重已归一化）；窗口不足N行时结果为NULL。输出的tag统一替换为ema_{N}。


In [0]:
def createEMAView(N=21):
    """Create (or replace) the view of N-row exponential moving averages.

    The two cohort source tables are unioned.  Within every
    (app_package, country_group, mediasource, tag) partition, ordered by
    install_day, the last N values of each metric are collected into an
    array and combined as

        sum_i x[i] * alpha * (1 - alpha)^(N-1-i) / (1 - (1 - alpha)^N)

    with alpha = 2 / (N + 1), so the newest row (i = N-1) has the largest
    weight and the weights sum to 1.  Every output row carries the constant
    tag 'ema_{N}' in place of the source tag.

    A metric is NULL for a row unless both the cost window and that metric's
    own window contain N values.  The per-metric check matters because
    collect_list drops NULL inputs: a window containing a NULL is shorter
    than N, and indexing it up to N-1 would read out of bounds (an error
    under ANSI mode, NULL otherwise).

    Args:
        N: Window length in rows; must be at least 1.

    Returns:
        The fully qualified name of the created view.

    Raises:
        ValueError: If N is smaller than 1 (N = -1 would otherwise divide by
            zero when computing alpha, and N = 0 renders an invalid frame).
    """
    if N < 1:
        raise ValueError(f"N must be >= 1, got {N!r}")

    alpha = 2.0 / (N + 1)  # smoothing factor
    one_minus_alpha = 1 - alpha
    viewName = f"data_science.default.lw_20250903_cohort_onlyprofit_ema_{N}_view_by_j"

    # Single source of truth for the smoothed columns; every SQL fragment
    # below is generated from this list so the clauses cannot drift apart.
    metrics = [
        "cost",
        "revenue_d1",
        "revenue_d3",
        "revenue_d7",
        "revenue_d14",
        "revenue_d30",
        "revenue_d60",
        "revenue_d90",
        "revenue_d120",
        "revenue_d135",
        "revenue_d150",
    ]

    # Trailing window shared by every collect_list.
    window = (
        "partition by app_package, country_group, mediasource, tag "
        "order by install_day "
        f"rows between {N - 1} preceding and current row"
    )

    raw_columns = ",\n        ".join(metrics)
    window_columns = ",\n        ".join(
        f"collect_list({m}) over ({window}) as {m}_window" for m in metrics
    )
    # window_size keeps the original gate on the cost window; the extra
    # size({m}_window) check keeps the array index inside the collected list.
    ema_columns = ",\n    ".join(
        f"case when window_size >= {N} and size({m}_window) >= {N} then\n"
        f"        aggregate(\n"
        f"            sequence(0, {N - 1}),\n"
        f"            cast(0.0 as double),\n"
        f"            (acc, i) -> acc + {m}_window[i] * {alpha} * power({one_minus_alpha}, {N - 1} - i)\n"
        f"        ) / (1 - power({one_minus_alpha}, {N}))\n"
        f"    else null end as {m}"
        for m in metrics
    )

    sql = f"""
CREATE OR REPLACE VIEW {viewName} as
with base_data as (
    select
        app_package,
        country_group,
        mediasource,
        tag,
        install_day,
        {raw_columns}
    from
        (
            select * from data_science.default.lw_20250903_aos_gpir_cohort_onlyprofit_raw_table_by_j
            union all
            select * from data_science.default.lw_20250903_ios_af_cohort_onlyprofit_fit_table_by_j
        )
),
windowed_data as (
    select
        *,
        -- 为窗口内每行计算相对位置（0到N-1）
        {window_columns},
        size(collect_list(cost) over ({window})) as window_size
    from base_data
)
select
    app_package,
    country_group,
    mediasource,
    'ema_{N}' as tag,
    install_day,
    -- 只有当窗口大小达到N时才计算EMA
    {ema_columns}
from
    windowed_data;
    """
    print(f"Executing SQL for EMA with N={N}, alpha={alpha:.4f}")
    spark.sql(sql)
    return viewName



将view都合并到一起，以便后续处理。


In [0]:
def createOnlyprofitAllFuncView(viewNames=None):
    """Create (or replace) the summary view that UNION ALLs the given views.

    The summary view is always named
    data_science.default.lw_20250903_onlyprofit_all_func_view_by_j and is
    defined as `SELECT * FROM <view>` for each entry, joined by UNION ALL.

    Args:
        viewNames: Iterable of view names to combine, or a single view name
            as a string.  If None, the built-in default list is used.

    Raises:
        ValueError: If viewNames contains no entries; the generated
            statement would have an empty body and be invalid SQL.
    """
    if viewNames is None:
        # Default configuration.
        # NOTE(review): this name is unqualified and does not match the views
        # produced by createAvgNView in this notebook
        # ('data_science.default.lw_20250903_cohort_onlyprofit_avg_{N}_view_by_j')
        # -- confirm it is still the intended default.
        viewNames = [
            'lw_20250903_aos_gpir_cohort_onlyprofit_avg_28_view_by_j',
        ]
    elif isinstance(viewNames, str):
        # A bare string would otherwise be iterated character by character.
        viewNames = [viewNames]

    # Materialise once so generators work and emptiness can be checked.
    view_list = list(viewNames)
    if not view_list:
        raise ValueError("viewNames must contain at least one view name")

    # Build the UNION ALL body dynamically from the view names.
    union_sql = "\nUNION ALL\n".join(
        f"SELECT\n*\nFROM {view_name}" for view_name in view_list
    )

    sql = f"""
CREATE OR REPLACE VIEW data_science.default.lw_20250903_onlyprofit_all_func_view_by_j as
{union_sql}
;
    """
    print(f"Executing SQL: {sql}")
    spark.sql(sql)



In [0]:
# Build the rolling-average and EMA views for every window length, keeping
# the avg/ema creation interleaved per window length.
avg_views, ema_views = [], []

for n in (28, 56, 84):
    avg_views.append(createAvgNView(n))
    ema_views.append(createEMAView(n))

# All view names: averages first, then EMAs.
all_view_names = [*avg_views, *ema_views]

# Combine every generated view into the summary view.
createOnlyprofitAllFuncView(all_view_names)
