In [0]:
%sql
-- Make data_science.default the current schema so the unqualified table and view
-- names used by the following cells resolve there.
USE data_science.default

尝试从目前数据中获得尽可能多的特征：
1、24小时内的付费次数、付费总金额，最大单次付费金额
2、48小时内的付费次数、付费总金额，最大单次付费金额
3、72小时内的付费次数、付费总金额，最大单次付费金额
以及最终的结果：
168小时付费金额

创建一个 view 来方便后续快速获取数据：`lw_20250820_aos_gpir_uid_revenue_view3_by_j`

这部分代码在serverless中运行，以便于快一点。

In [0]:
%sql
-- Per-user feature view for app 502 / package com.fun.lastwar.gp.
-- The inner query produces one row per (uid, install_timestamp, country, country_group,
-- mediasource, campaign_id, campaign_name) and aggregates that user's revenue events into
-- cumulative windows measured from install: 24h / 48h / 72h features and the 168h target.
-- The outer query buckets mediasource and renames revenue_168h to revenue_d7.
CREATE OR REPLACE VIEW lw_20250820_aos_gpir_uid_revenue_view3_by_j AS
select
	uid,
	install_day,
	-- country,
	country_group,
	-- Bucket the media source: applovin_int is split by whether the campaign name contains
	-- 'D7' or 'D28' (case-insensitive; 'D7' is tested first), five named networks are kept
	-- as-is, and everything else (including other applovin_int campaigns) becomes 'other'.
	CASE
		WHEN mediasource = 'applovin_int'
		AND UPPER(campaign_name) LIKE '%D7%' THEN 'applovin_int_d7'
		WHEN mediasource = 'applovin_int'
		AND UPPER(campaign_name) LIKE '%D28%' THEN 'applovin_int_d28'
		WHEN mediasource IN (
			'googleadwords_int',
			'Facebook Ads',
			'bytedanceglobal_int',
			'snapchat_int',
			'moloco_int'
		) THEN mediasource
		ELSE 'other'
	END as mediasource,
	campaign_id,
  -- campaign_name,
	payment_count_24h,
	revenue_24h as revenue_24h,
	max_payment_24h,
	payment_count_48h,
	revenue_48h,
	max_payment_48h,
	payment_count_72h,
	revenue_72h as revenue_72h,
	max_payment_72h,
	revenue_168h as revenue_d7
from
	(
		select
			t1.uid,
			t1.install_timestamp,
			-- install_day is a 'yyyyMMdd' STRING derived from install_timestamp
			-- (from_unixtime formats in the session time zone).
			date_format(from_unixtime(t1.install_timestamp), 'yyyyMMdd') as install_day,
			t1.country,
			-- Countries missing from the mapping table fall into 'other'.
			COALESCE(cg.country_group, 'other') AS country_group,
			t1.mediasource,
			t1.campaign_id,
      pub.campaign_name as campaign_name,
			-- Window arithmetic: event_time / 1000 - install_timestamp is the elapsed time since
			-- install (presumably event_time is in milliseconds and install_timestamp in seconds
			-- -- TODO confirm). BETWEEN is inclusive on both ends and every window starts at 0,
			-- so the windows are cumulative.
			-- count(...) counts only in-window events with a non-NULL revenue value; sum/max use
			-- ELSE 0, so users with no in-window payments get 0 rather than NULL.
			count(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 24 * 60 * 60 then t2.revenue_value_usd end) as payment_count_24h,
			sum(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 24 * 60 * 60 then t2.revenue_value_usd else 0 end) as revenue_24h,
			max(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 24 * 60 * 60 then t2.revenue_value_usd else 0 end) as max_payment_24h,
			count(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 48 * 60 * 60 then t2.revenue_value_usd end) as payment_count_48h,
			sum(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 48 * 60 * 60 then t2.revenue_value_usd else 0 end) as revenue_48h,
			max(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 48 * 60 * 60 then t2.revenue_value_usd else 0 end) as max_payment_48h,
			count(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 72 * 60 * 60 then t2.revenue_value_usd end) as payment_count_72h,
			sum(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 72 * 60 * 60 then t2.revenue_value_usd else 0 end) as revenue_72h,
			max(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 72 * 60 * 60 then t2.revenue_value_usd else 0 end) as max_payment_72h,
			-- Prediction target: total revenue within 168 hours (7 days) of install.
			sum(case when (t2.event_time / 1000 - t1.install_timestamp) between 0 and 168 * 60 * 60 then t2.revenue_value_usd else 0 end) as revenue_168h
		from
			marketing.attribution.dws_overseas_gpir_unique_uid t1
			-- LEFT JOIN keeps users that have no revenue rows at all.
			left join marketing.attribution.dwd_overseas_revenue_allproject t2 on t1.app = t2.app
			and t1.uid = t2.uid
			-- NOTE(review): assumes at most one mapping row per country; duplicates here would
			-- multiply the revenue rows being summed -- confirm.
			LEFT JOIN lw_country_group_table_by_j_20250703 cg ON t1.country = cg.country
			-- One campaign_name per campaign_id (MAX picks a single name when several exist).
			LEFT JOIN (
				SELECT
					campaign_id,
					MAX(campaign_name) AS campaign_name
				FROM
					prodb.public.applovin_campaign_info_new
				GROUP BY
					campaign_id
			) pub ON t1.campaign_id = pub.campaign_id
		where
			t1.app = 502
			and t1.app_package = 'com.fun.lastwar.gp'
		group by
			t1.uid,
			t1.install_timestamp,
			t1.country,
			COALESCE(cg.country_group, 'other'),
			t1.mediasource,
			t1.campaign_id,
			pub.campaign_name
	)
-- where
-- 	revenue_168h > 0
-- order by
-- 	revenue_168h desc
;

In [0]:
%sql
-- Drop a pre-existing VIEW of the same name so the table below can take its place.
-- NOTE(review): once this name refers to a table, DROP VIEW may be rejected on re-runs;
-- confirm, and remove this statement when the view-to-table migration is done.
DROP VIEW IF EXISTS lw_20250827_traindata_for_3p7_gbt_by_j;

-- Materialise the training set as a Delta table, partitioned by country_group.
-- Users are collapsed into distinct combinations of (country_group, features, label):
--   * revenue and max-payment columns are rounded to whole USD to reduce cardinality;
--   * `users` is the number of users sharing a combination (later used as sample weight);
--   * revenue_d7 (the label) is part of the grouping key, so it is a per-user value,
--     not a group total.
CREATE OR REPLACE TABLE lw_20250827_traindata_for_3p7_gbt_by_j
USING DELTA
PARTITIONED BY (country_group)
AS
select
  count(uid) as users,
  country_group,
  payment_count_24h,
  ROUND(revenue_24h) as revenue_24h,
  ROUND(max_payment_24h) as max_payment_24h,
  payment_count_48h,
  ROUND(revenue_48h) as revenue_48h,
  ROUND(max_payment_48h) as max_payment_48h,
  payment_count_72h,
  ROUND(revenue_72h) as revenue_72h,
  ROUND(max_payment_72h) as max_payment_72h,
  ROUND(revenue_d7) as revenue_d7
from lw_20250820_aos_gpir_uid_revenue_view3_by_j
-- install_day is a 'yyyyMMdd' string, so compare it with string literals instead of
-- relying on an implicit string-to-integer cast. For fixed-width yyyyMMdd values the
-- lexicographic range is the same as the numeric one.
where install_day between '20250101' and '20250615'
GROUP BY
  country_group,
  payment_count_24h,
  ROUND(revenue_24h),
  ROUND(max_payment_24h),
  payment_count_48h,
  ROUND(revenue_48h),
  ROUND(max_payment_48h),
  payment_count_72h,
  ROUND(revenue_72h),
  ROUND(max_payment_72h),
  ROUND(revenue_d7);

-- Refresh table statistics for the query optimizer.
ANALYZE TABLE lw_20250827_traindata_for_3p7_gbt_by_j COMPUTE STATISTICS;

In [0]:
%sql
-- Inspect the aggregated US training rows whose rounded 7-day revenue is zero.
SELECT
  users,
  country_group,
  payment_count_24h, revenue_24h, max_payment_24h,
  payment_count_48h, revenue_48h, max_payment_48h,
  payment_count_72h, revenue_72h, max_payment_72h,
  revenue_d7
FROM lw_20250827_traindata_for_3p7_gbt_by_j
WHERE revenue_d7 = 0
  AND country_group = 'US'
;

数据准备和探索

In [0]:
# Load the aggregated training rows. Each row is one distinct combination of
# (country_group, features, label); `users` is how many users share that combination.
sql = '''
select 
  users,
  country_group,
  payment_count_24h,
  revenue_24h,
  max_payment_24h,
  payment_count_48h,
  revenue_48h,
  max_payment_48h,
  payment_count_72h,
  revenue_72h,
  max_payment_72h,
  revenue_d7
from lw_20250827_traindata_for_3p7_gbt_by_j
where revenue_d7 is not null  -- 确保目标变量不为空
'''

df = spark.sql(sql)

# Basic shape of the data set.
print("=== 数据基本信息 ===")
print(f"总行数: {df.count()}")
print(f"总列数: {len(df.columns)}")

# Distribution per country_group.
print("\n=== Country Group 分布 ===")
from pyspark.sql.functions import count, avg, stddev
# Namespaced import: importing `sum` by name would shadow Python's built-in sum(),
# which later cells call on plain lists.
from pyspark.sql import functions as F

country_stats = df.groupBy("country_group").agg(
    # Row-level statistics: every distinct feature combination counts once.
    count("*").alias("sample_count"),
    avg("revenue_d7").alias("avg_revenue_d7"),
    stddev("revenue_d7").alias("std_revenue_d7"),
    # User-level statistics: rows are weighted by the number of users they represent,
    # which is what the per-user revenue distribution actually looks like.
    F.sum("users").alias("user_count"),
    (F.sum(F.col("revenue_d7") * F.col("users")) / F.sum("users")).alias(
        "user_weighted_avg_revenue_d7"
    ),
).orderBy("sample_count", ascending=False)

display(country_stats)

数据质量检查

In [0]:
# Data quality check: missing values, summary statistics, and per-country row counts.
print("=== 数据质量检查 ===")

from pyspark.sql.functions import col, count, when, isnan, isnull

# isnan() is only meaningful for float/double columns. Applying it to the string column
# country_group forces a string -> double cast, which is an error on ANSI-mode compute,
# so NaN is tested only where the column type can actually hold it.
float_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

null_counts = df.select([
    count(
        when(
            (col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in df.columns
])
print("空值统计:")
display(null_counts)

# Summary statistics for every column.
print("\n基本统计信息:")
display(df.describe())

# Country groups with enough aggregated rows to be worth training on.
min_samples_required = 100  # minimum number of rows per country_group
valid_countries = df.groupBy("country_group").count().filter(col("count") >= min_samples_required)
print(f"\n样本数 >= {min_samples_required} 的country_group:")
display(valid_countries)

定义训练函数

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
import time

def train_gbt_model_for_country(country_data, country_name):
    """
    Train and evaluate a GBT regressor that predicts revenue_d7 for one country_group.

    Args:
        country_data: Spark DataFrame holding the aggregated rows of a single
            country_group. It must contain the nine feature columns listed in
            `feature_cols`, the label `revenue_d7` and the weight column `users`.
        country_name: Value of country_group, used for logging and in the result.

    Returns:
        None when a feature column is missing; otherwise a dict with the keys
        'country_group', 'model' (fitted PipelineModel), 'rmse', 'mae', 'r2',
        'training_time' (seconds), 'train_samples', 'test_samples' and
        'predictions' (the test split with a `prediction` column).

    Note:
        Each row represents `users` users and the model is fitted with
        weightCol="users", so the metrics are weighted by `users` as well. They
        therefore describe per-user error and are consistent with the training
        objective.
    """
    print(f"\n{'='*50}")
    print(f"开始训练 Country Group: {country_name}")
    print(f"样本数量: {country_data.count()}")

    # Feature columns: payment count, total revenue and largest single payment
    # within 24h / 48h / 72h of install.
    feature_cols = [
        'payment_count_24h', 'revenue_24h', 'max_payment_24h',
        'payment_count_48h', 'revenue_48h', 'max_payment_48h',
        'payment_count_72h', 'revenue_72h', 'max_payment_72h'
    ]

    # Bail out early if the input is missing any feature column.
    missing_cols = [name for name in feature_cols if name not in country_data.columns]
    if missing_cols:
        print(f"警告: 缺少特征列 {missing_cols}")
        return None

    # 80/20 split with a fixed seed.
    train_data, test_data = country_data.randomSplit([0.8, 0.2], seed=42)

    # Count each split once; every count() is a separate Spark job.
    train_samples = train_data.count()
    test_samples = test_data.count()
    print(f"训练集样本数: {train_samples}")
    print(f"测试集样本数: {test_samples}")

    # Assemble the feature vector.
    vectorAssembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features"
    )

    # GBT regressor; `users` is the sample weight of each aggregated row.
    gbt = GBTRegressor(
        labelCol="revenue_d7",
        featuresCol="features",
        weightCol="users",
        maxIter=20,
        maxDepth=5,
        seed=42
    )

    pipeline = Pipeline(stages=[vectorAssembler, gbt])

    # Fit and time the pipeline.
    start_time = time.time()
    model = pipeline.fit(train_data)
    training_time = time.time() - start_time

    print(f"训练完成，耗时: {training_time:.2f} 秒")

    # Score the held-out split.
    predictions = model.transform(test_data)

    # Evaluate with the same `users` weights that were used for fitting.
    metrics = {
        metric_name: RegressionEvaluator(
            labelCol="revenue_d7",
            predictionCol="prediction",
            weightCol="users",
            metricName=metric_name,
        ).evaluate(predictions)
        for metric_name in ("rmse", "mae", "r2")
    }
    rmse = metrics["rmse"]
    mae = metrics["mae"]
    r2 = metrics["r2"]

    print(f"模型评估结果:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R²: {r2:.4f}")

    return {
        'country_group': country_name,
        'model': model,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'training_time': training_time,
        'train_samples': train_samples,
        'test_samples': test_samples,
        'predictions': predictions
    }

批量训练模型

In [0]:
# Collect the distinct country_group values. Collecting Rows directly avoids the
# DataFrame -> RDD round trip (the RDD API is not available on every compute type,
# e.g. Spark Connect based clusters).
countries = [row[0] for row in df.select("country_group").distinct().collect()]
print(f"需要训练的country_group数量: {len(countries)}")
print(f"Country groups: {countries}")

# Fitted models and metrics keyed by country_group, plus a flat list for reporting.
model_results = {}
training_summary = []

# Country groups with fewer aggregated rows than this are skipped.
min_train_samples = 50

# Train one model per country_group. A failure for one country is reported and does
# not stop the remaining countries (deliberate best-effort).
for i, country in enumerate(countries, 1):
    print(f"\n进度: {i}/{len(countries)}")

    try:
        country_data = df.filter(col("country_group") == country)

        sample_count = country_data.count()
        if sample_count < min_train_samples:
            print(f"跳过 {country}: 样本数太少 ({sample_count})")
        else:
            result = train_gbt_model_for_country(country_data, country)

            if result:
                model_results[country] = result
                training_summary.append({
                    'country_group': country,
                    'rmse': result['rmse'],
                    'mae': result['mae'],
                    'r2': result['r2'],
                    'training_time': result['training_time'],
                    'train_samples': result['train_samples'],
                    'test_samples': result['test_samples']
                })

                print(f"✅ {country} 训练成功")
            else:
                print(f"❌ {country} 训练失败")

    except Exception as e:
        print(f"❌ {country} 训练出错: {str(e)}")

    # Progress summary after every 5th country. No branch above uses `continue`, so
    # the summary is also shown when that country was skipped or failed.
    if i % 5 == 0:
        print(f"\n--- 进度摘要 (已完成 {len(model_results)}/{i}) ---")
        if training_summary:
            avg_rmse = sum(r['rmse'] for r in training_summary) / len(training_summary)
            avg_r2 = sum(r['r2'] for r in training_summary) / len(training_summary)
            print(f"平均 RMSE: {avg_rmse:.4f}")
            print(f"平均 R²: {avg_r2:.4f}")

print(f"\n🎉 所有训练完成! 成功训练了 {len(model_results)} 个模型")

训练结果汇总

In [0]:
# Report the per-country metrics collected during batch training.
if training_summary:
    summary_df = spark.createDataFrame(training_summary)
    ranked_by_r2 = summary_df.orderBy("r2", ascending=False)

    print("=== 训练结果汇总 ===")
    display(ranked_by_r2)

    # Unweighted mean of each metric across the trained country groups.
    print("\n=== 整体统计 ===")
    averaged_metrics = ["rmse", "mae", "r2", "training_time"]
    summary_stats = summary_df.agg(
        *[avg(metric).alias(f"avg_{metric}") for metric in averaged_metrics]
    )
    display(summary_stats)

    # Highest and lowest R² among the trained models.
    best_model = ranked_by_r2.first()
    worst_model = summary_df.orderBy("r2", ascending=True).first()

    print(f"\n表现最好的模型: {best_model['country_group']} (R² = {best_model['r2']:.4f})")
    print(f"表现最差的模型: {worst_model['country_group']} (R² = {worst_model['r2']:.4f})")

生成预测结果表

In [0]:
# Score the aggregated rows with the model trained for their country_group.
def generate_predictions_for_all_countries(model_results, df):
    """
    Apply every trained per-country model to that country's rows and stack the results.

    Args:
        model_results: dict mapping country_group -> result dict whose 'model' entry
            is the fitted pipeline for that country.
        df: Spark DataFrame with a country_group column plus the feature, `users`
            and `revenue_d7` columns.

    Returns:
        A DataFrame holding the input columns and `predicted_revenue_d7` for all
        countries that could be scored, or None when no country produced a result.
    """
    # Columns kept in the output: inputs, the actual label, and the renamed prediction.
    output_columns = [
        "country_group",
        "users",
        "payment_count_24h", "revenue_24h", "max_payment_24h",
        "payment_count_48h", "revenue_48h", "max_payment_48h",
        "payment_count_72h", "revenue_72h", "max_payment_72h",
        "revenue_d7",
        col("prediction").alias("predicted_revenue_d7"),
    ]
    per_country_frames = []

    for country, model_info in model_results.items():
        print(f"正在为 {country} 生成预测...")

        country_rows = df.filter(col("country_group") == country)
        if country_rows.count() == 0:
            print(f"跳过 {country}: 无数据")
            continue

        try:
            scored = model_info['model'].transform(country_rows)
            result = scored.select(*output_columns)
            per_country_frames.append(result)
            print(f"✅ {country} 预测完成，样本数: {result.count()}")
        except Exception as e:
            # Best effort: report the failure and move on to the next country.
            print(f"❌ {country} 预测失败: {str(e)}")
            continue

    if not per_country_frames:
        return None

    # Stack the per-country frames by position; they all share the same column list.
    final_predictions = per_country_frames[0]
    for frame in per_country_frames[1:]:
        final_predictions = final_predictions.union(frame)
    return final_predictions

# Score all rows of `df` with the per-country models.
# NOTE(review): `df` is the same frame the models were fitted on (80% of it was the
# training split), so errors computed downstream are largely in-sample.
print("开始生成所有country的预测结果...")
predictions_df = generate_predictions_for_all_countries(model_results, df)

# Compare against None explicitly instead of relying on DataFrame truthiness.
if predictions_df is not None:
    print(f"预测结果生成完成，总样本数: {predictions_df.count()}")

    # Show a sample. limit() keeps the result a DataFrame, which display() renders
    # as a table; head(10) would hand display() a plain Python list of Rows.
    print("\n预测结果样例:")
    display(predictions_df.limit(10))
else:
    print("预测结果生成失败")

In [0]:
# Expose the merged predictions to the SQL cells as the temp view `temp_predictions`.
# Fail with a clear message when no predictions were produced, instead of an opaque
# AttributeError on None.
if predictions_df is None:
    raise RuntimeError(
        "predictions_df is None: no predictions were generated, "
        "so the temp view 'temp_predictions' cannot be created"
    )
predictions_df.createOrReplaceTempView("temp_predictions")

In [0]:
%sql
-- Materialise the group-level predictions together with their error columns.
-- percentage_error is NULL when the actual revenue is not positive.
CREATE OR REPLACE TABLE lw_20250827_gbt_predictions_by_j
USING DELTA
PARTITIONED BY (country_group)
AS
WITH scored AS (
  SELECT
    t.*,
    (t.predicted_revenue_d7 - t.revenue_d7) AS prediction_error
  FROM temp_predictions t
)
SELECT
  country_group,
  users,
  payment_count_24h, revenue_24h, max_payment_24h,
  payment_count_48h, revenue_48h, max_payment_48h,
  payment_count_72h, revenue_72h, max_payment_72h,
  revenue_d7 AS actual_revenue_d7,
  predicted_revenue_d7,
  prediction_error,
  ABS(prediction_error) AS abs_prediction_error,
  CASE
    WHEN revenue_d7 > 0 THEN ABS(prediction_error) / revenue_d7 * 100
    ELSE NULL
  END AS percentage_error
FROM scored;

-- Refresh table statistics for the query optimizer.
ANALYZE TABLE lw_20250827_gbt_predictions_by_j COMPUTE STATISTICS;

-- Overview per country_group: row count and mean actual / predicted / error values.
SELECT
  country_group,
  COUNT(*)                  AS sample_count,
  AVG(actual_revenue_d7)    AS avg_actual_revenue,
  AVG(predicted_revenue_d7) AS avg_predicted_revenue,
  AVG(prediction_error)     AS avg_error,
  AVG(abs_prediction_error) AS avg_abs_error,
  AVG(percentage_error)     AS avg_percentage_error
FROM lw_20250827_gbt_predictions_by_j
GROUP BY country_group
ORDER BY sample_count DESC;

In [0]:
%sql
-- Build the user-level prediction table by attaching each user's group prediction.
-- Every user is matched to the row of lw_20250827_gbt_predictions_by_j that has the same
-- country_group, the same (rounded) features and the same rounded actual revenue, i.e.
-- exactly the grouping key of the training table, so each user matches one group row.
CREATE OR REPLACE TABLE lw_20250827_user_level_predictions_by_j
USING DELTA
PARTITIONED BY (country_group, install_day)
AS
WITH user_predictions AS (
  SELECT 
    o.uid,
    o.country_group,
    o.install_day,
    o.payment_count_24h,
    o.revenue_24h,
    o.max_payment_24h,
    o.payment_count_48h,
    o.revenue_48h,
    o.max_payment_48h,
    o.payment_count_72h,
    o.revenue_72h,
    o.max_payment_72h,
    o.revenue_d7 as actual_revenue_d7,
    p.predicted_revenue_d7,
    p.users as group_size,
    -- The model's label is the per-user revenue_d7 of a feature combination (`users` is
    -- only a sample weight), so predicted_revenue_d7 already is a per-user prediction.
    -- It must not be divided by the group size, nor rescaled by the user's actual
    -- revenue: both would feed the label back into the "prediction" and understate
    -- the real error.
    p.predicted_revenue_d7 as user_predicted_revenue_d7
  FROM lw_20250820_aos_gpir_uid_revenue_view3_by_j o
  INNER JOIN lw_20250827_gbt_predictions_by_j p
    ON o.country_group = p.country_group
    AND o.payment_count_24h = p.payment_count_24h
    AND ROUND(o.revenue_24h) = p.revenue_24h
    AND ROUND(o.max_payment_24h) = p.max_payment_24h
    AND o.payment_count_48h = p.payment_count_48h
    AND ROUND(o.revenue_48h) = p.revenue_48h
    AND ROUND(o.max_payment_48h) = p.max_payment_48h
    AND o.payment_count_72h = p.payment_count_72h
    AND ROUND(o.revenue_72h) = p.revenue_72h
    AND ROUND(o.max_payment_72h) = p.max_payment_72h
    -- Rows with identical features but different labels carry the same prediction;
    -- matching on the rounded label as well keeps the join one-to-one.
    AND ROUND(o.revenue_d7) = p.actual_revenue_d7
  -- install_day is a 'yyyyMMdd' string: compare with string literals rather than
  -- relying on an implicit string-to-integer cast.
  WHERE o.install_day BETWEEN '20250101' AND '20250615'
)
SELECT 
  *,
  (user_predicted_revenue_d7 - actual_revenue_d7) as user_prediction_error,
  abs(user_predicted_revenue_d7 - actual_revenue_d7) as user_abs_error,
  -- Percentage error is undefined (NULL) for users without positive actual revenue.
  CASE 
    WHEN actual_revenue_d7 > 0 
    THEN abs(user_predicted_revenue_d7 - actual_revenue_d7) / actual_revenue_d7 * 100
    ELSE NULL 
  END as user_percentage_error
FROM user_predictions;

-- Refresh table statistics for the query optimizer.
ANALYZE TABLE lw_20250827_user_level_predictions_by_j COMPUTE STATISTICS;

In [0]:
%sql
-- Preview 100 rows of the user-level prediction table.
SELECT *
FROM lw_20250827_user_level_predictions_by_j
LIMIT 100
;

汇总按照国家计算MAPE

In [0]:
%sql
-- Daily MAPE per country_group:
--   1. sum predicted and actual D7 revenue per (country_group, install_day),
--      keeping only days with positive actual revenue;
--   2. compute the absolute percentage error (APE) of each day's totals;
--   3. average the daily APEs per country_group.
WITH daily_country_ape AS (
  SELECT
    country_group,
    install_day,
    ABS(SUM(user_predicted_revenue_d7) - SUM(actual_revenue_d7))
      / SUM(actual_revenue_d7) * 100 AS daily_ape
  FROM lw_20250827_user_level_predictions_by_j
  GROUP BY country_group, install_day
  HAVING SUM(actual_revenue_d7) > 0
)
SELECT
  country_group,
  ROUND(AVG(daily_ape), 4) AS mape
FROM daily_country_ape
GROUP BY country_group
ORDER BY mape ASC;

In [0]:
%sql
-- Weekly MAPE per country_group.
WITH weekly_country_revenue AS (
  -- Step 1: total predicted and actual D7 revenue per country_group and install week.
  SELECT 
    country_group,
    -- Key each week by the Monday it starts on. Combining the calendar year of
    -- install_day with WEEKOFYEAR (an ISO week number) mislabels days around New Year:
    -- e.g. 2024-12-30 is ISO week 1 and would be filed under '2024-W01' together with
    -- the first days of January 2024.
    TRUNC(TO_DATE(install_day, 'yyyyMMdd'), 'WEEK') as install_week,
    SUM(user_predicted_revenue_d7) as weekly_predicted_revenue,
    SUM(actual_revenue_d7) as weekly_actual_revenue
  FROM lw_20250827_user_level_predictions_by_j
  GROUP BY 
    country_group,
    TRUNC(TO_DATE(install_day, 'yyyyMMdd'), 'WEEK')
  HAVING SUM(actual_revenue_d7) > 0  -- keep only weeks with positive actual revenue
),

weekly_country_mape AS (
  -- Step 2: absolute percentage error (APE) of each week's totals.
  SELECT 
    country_group,
    install_week,
    ABS(weekly_predicted_revenue - weekly_actual_revenue) / weekly_actual_revenue * 100 as weekly_ape
  FROM weekly_country_revenue
)

-- Step 3: MAPE per country_group = mean of its weekly APEs.
SELECT 
  country_group,
  ROUND(AVG(weekly_ape), 4) as weekly_mape
FROM weekly_country_mape
GROUP BY country_group
ORDER BY weekly_mape ASC;

分媒体 按天、按周

In [0]:
%sql
-- Daily MAPE per (country_group, media source); returns only country, media and MAPE.
-- The bucketed media source is looked up from the per-user view by uid. Users without
-- a matching view row (or with a NULL media source) are left out.
-- NOTE(review): the lookup joins on uid alone, so it assumes one view row per uid -- confirm.
WITH daily_country_media_ape AS (
  SELECT
    p.country_group,
    o.mediasource AS mediasource_corrected,
    p.install_day,
    ABS(SUM(p.user_predicted_revenue_d7) - SUM(p.actual_revenue_d7))
      / SUM(p.actual_revenue_d7) * 100 AS daily_ape
  FROM `data_science`.`default`.lw_20250827_user_level_predictions_by_j p
  INNER JOIN `data_science`.`default`.lw_20250820_aos_gpir_uid_revenue_view3_by_j o
    ON p.uid = o.uid
    AND o.mediasource IS NOT NULL
  GROUP BY p.country_group, p.install_day, o.mediasource
  -- Days without positive actual revenue have no defined percentage error.
  HAVING SUM(p.actual_revenue_d7) > 0
)
SELECT
  country_group,
  mediasource_corrected,
  ROUND(AVG(daily_ape), 4) AS mape
FROM daily_country_media_ape
GROUP BY country_group, mediasource_corrected
ORDER BY country_group, mediasource_corrected;

In [0]:
%sql
-- Weekly MAPE per (country_group, media source).
-- NOTE(review): the media-source lookup joins on uid alone, so it assumes one view row
-- per uid -- confirm.
WITH weekly_country_media_revenue AS (
  SELECT 
    p.country_group,
    -- Key each week by the Monday it starts on. Combining the calendar year of
    -- install_day with WEEKOFYEAR (an ISO week number) mislabels days around New Year:
    -- e.g. 2024-12-30 is ISO week 1 and would be filed under '2024-W01'.
    TRUNC(TO_DATE(p.install_day, 'yyyyMMdd'), 'WEEK') as install_week,
    o.mediasource as mediasource_corrected,
    SUM(p.user_predicted_revenue_d7) as weekly_predicted_revenue,
    SUM(p.actual_revenue_d7) as weekly_actual_revenue
  FROM `data_science`.`default`.lw_20250827_user_level_predictions_by_j p
  LEFT JOIN `data_science`.`default`.lw_20250820_aos_gpir_uid_revenue_view3_by_j o
    ON p.uid = o.uid
  -- Drops users that found no match in the view (their mediasource is NULL).
  WHERE o.mediasource IS NOT NULL
  GROUP BY 
    p.country_group, 
    TRUNC(TO_DATE(p.install_day, 'yyyyMMdd'), 'WEEK'),
    o.mediasource
  -- Weeks without positive actual revenue have no defined percentage error.
  HAVING SUM(p.actual_revenue_d7) > 0
),

weekly_country_media_mape AS (
  -- Absolute percentage error (APE) of each week's totals.
  SELECT 
    country_group,
    mediasource_corrected,
    install_week,
    ABS(weekly_predicted_revenue - weekly_actual_revenue) / weekly_actual_revenue * 100 as weekly_ape
  FROM weekly_country_media_revenue
)

-- MAPE per (country_group, media source) = mean of its weekly APEs.
SELECT 
  country_group,
  mediasource_corrected,
  ROUND(AVG(weekly_ape), 4) as weekly_mape
FROM weekly_country_media_mape
GROUP BY country_group, mediasource_corrected
ORDER BY country_group, mediasource_corrected;