In [2]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

In [3]:
# 파이썬 구글 빅쿼리 연동 코드
# json 파일

key_path = glob.glob("./*.json")[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials = credentials, 
                         project = credentials.project_id)

In [4]:
# sql 추출 및 데이터 프레임 변환

def sql_to_dataframe(sql:str) -> pd.DataFrame:
    """
    Args:
        sql (str): sql for extraction

    Returns:
        pd.DataFrame: extract data with sql
    """
    query_job = client.query(sql)
    df = query_job.to_dataframe()
    return df

### 클래식 리텐션

In [5]:
sql = """
SELECT
 period,
 FIRST_VALUE(cohort_retained) OVER(ORDER BY period) AS cohort_size,
 cohort_retained,
 cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(ORDER BY period) pct_retained
FROM(
  SELECT
    DATE_DIFF(PARSE_DATE("%Y%m%d", users.event_date),PARSE_DATE("%Y%m%d",user.event_date),DAY) AS period,
    COUNT(DISTINCT user.user_pseudo_id) AS cohort_retained
  FROM(
    SELECT
      user_pseudo_id,
      MIN(event_date) AS event_date
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    GROUP BY user_pseudo_id
  ) user
  INNER JOIN `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` users
  ON user.user_pseudo_id = users.user_pseudo_id
  GROUP BY period
)
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,period,cohort_size,cohort_retained,pct_retained
0,0,270154,270154,1.000000
1,1,270154,12538,0.046411
2,2,270154,4654,0.017227
3,3,270154,3226,0.011941
4,4,270154,2415,0.008939
...,...,...,...,...
86,86,270154,14,0.000052
87,87,270154,12,0.000044
88,88,270154,3,0.000011
89,89,270154,2,0.000007


### 클래식 리텐션을 활용한 코호트 분석

In [6]:
sql = """
SELECT
  category,
  period,
  FIRST_VALUE(cohort_retained) OVER(PARTITION BY category ORDER BY period) AS cohort_size,
  cohort_retained,
  cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(PARTITION BY category ORDER BY period) AS pct_retained
FROM(
    SELECT
    user.category,
    DATE_DIFF(PARSE_DATE("%Y%m%d",users.event_date), PARSE_DATE("%Y%m%d", user.event_date), DAY) AS period,
    COUNT(DISTINCT user.user_pseudo_id) AS cohort_retained
  FROM(
    SELECT
      user_pseudo_id,
      device.category,
      MIN(event_date) AS event_date
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    GROUP BY user_pseudo_id, device.category
  ) user
  INNER JOIN `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` users
  ON user.user_pseudo_id = users.user_pseudo_id AND user.category = users.device.category
  GROUP BY user.category, period
)
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,period,cohort_size,cohort_retained,pct_retained
0,desktop,0,158917,158917,1.000000
1,desktop,1,158917,7429,0.046748
2,desktop,2,158917,2791,0.017563
3,desktop,3,158917,1891,0.011899
4,desktop,4,158917,1406,0.008847
...,...,...,...,...,...
214,tablet,39,6250,1,0.000160
215,tablet,40,6250,1,0.000160
216,tablet,41,6250,2,0.000320
217,tablet,43,6250,1,0.000160


### 롤링 리텐션

In [7]:
sql = """
SELECT
  period,
  FIRST_VALUE(cohort_retained) OVER(ORDER BY period) AS cohort_size,
  cohort_retained,
  cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(ORDER BY period) AS pct_retained
FROM(
  SELECT
    DATE_DIFF(event_date,min_event_date,DAY) AS period,
    COUNT(DISTINCT user_pseudo_id) AS cohort_retained
  FROM(
    SELECT
      user_pseudo_id,
      MIN(PARSE_DATE("%Y%m%d",event_date)) AS min_event_date,
      GENERATE_DATE_ARRAY(MIN(PARSE_DATE("%Y%m%d",event_date)), MAX(PARSE_DATE("%Y%m%d",event_date)), INTERVAL 1 DAY) AS event_date
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    GROUP BY user_pseudo_id
  ), UNNEST(event_date) AS event_date
  GROUP BY period
)
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,period,cohort_size,cohort_retained,pct_retained
0,0,270154,270154,1.000000
1,1,270154,29713,0.109985
2,2,270154,20896,0.077348
3,3,270154,18026,0.066725
4,4,270154,16097,0.059585
...,...,...,...,...
86,86,270154,28,0.000104
87,87,270154,18,0.000067
88,88,270154,7,0.000026
89,89,270154,5,0.000019


### 롤링 리텐션을 활용한 코호트 분석

In [8]:
sql = """
SELECT
  category,
  period,
  FIRST_VALUE(cohort_retained) OVER(PARTITION BY category ORDER BY period) AS cohort_size,
  cohort_retained,
  cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(PARTITION BY category ORDER BY period) AS pct_retained
FROM(
  SELECT
    category,
    DATE_DIFF(event_date,min_event_date,DAY) AS period,
    COUNT(DISTINCT user_pseudo_id) AS cohort_retained
  FROM(
    SELECT
      user_pseudo_id,
      device.category,
      MIN(PARSE_DATE("%Y%m%d",event_date)) AS min_event_date,
      GENERATE_DATE_ARRAY(MIN(PARSE_DATE("%Y%m%d",event_date)), MAX(PARSE_DATE("%Y%m%d",event_date)), INTERVAL 1 DAY) AS event_date
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    GROUP BY device.category, user_pseudo_id
  ), UNNEST(event_date) AS event_date
  GROUP BY category, period
)
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,period,cohort_size,cohort_retained,pct_retained
0,desktop,0,158917,158917,1.000000
1,desktop,1,158917,16748,0.105388
2,desktop,2,158917,11423,0.071880
3,desktop,3,158917,9586,0.060321
4,desktop,4,158917,8423,0.053003
...,...,...,...,...,...
229,tablet,47,6250,1,0.000160
230,tablet,48,6250,1,0.000160
231,tablet,49,6250,1,0.000160
232,tablet,50,6250,1,0.000160


### 범위 리텐션

In [9]:
sql = """
WITH users AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201114" AND "20201224"  
)

SELECT
  period,
  FIRST_VALUE(cohort_retained) OVER(ORDER BY period) AS cohort_size,
  cohort_retained,
  cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(ORDER BY period) AS pct_retained
FROM(
  SELECT
    CASE
      WHEN users.event_date <= "20201120" THEN "0주차"
      WHEN users.event_date <= "20201127" THEN "1주차"
      WHEN users.event_date <= "20201203" THEN "2주차"
      WHEN users.event_date <= "20201210" THEN "3주차"
      WHEN users.event_date <= "20201217" THEN "4주차"
      ELSE "5주차"
    END AS period,
    COUNT(DISTINCT user_pseudo_id) AS cohort_retained
  FROM users
  WHERE EXISTS(
    SELECT
      user_pseudo_id
    FROM(
      SELECT
        user_pseudo_id,
        MIN(event_date) AS min_event_date
      FROM users
      GROUP BY user_pseudo_id
      HAVING min_event_date <= "20201120"
    ) user
    WHERE user.user_pseudo_id = users.user_pseudo_id
  )
  GROUP BY period
)
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,period,cohort_size,cohort_retained,pct_retained
0,0주차,19021,19021,1.0
1,1주차,19021,1356,0.07129
2,2주차,19021,702,0.036907
3,3주차,19021,669,0.035172
4,4주차,19021,515,0.027075
5,5주차,19021,242,0.012723


### 범위 리텐션을 활용한 코호트 분석

In [10]:
sql = """
WITH users AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201114" AND "20201224"  
)

SELECT
  week_period,
  CASE
    WHEN week_period = "11월 14일 - 11월 20일" THEN CONCAT(period,"주차")
    WHEN week_period = "11월 21일 - 11월 27일" THEN CONCAT(CAST(period AS INT64) - 1,"주차")
    WHEN week_period = "11월 28일 - 12월 3일" THEN CONCAT(CAST(period AS INT64) - 2,"주차")
    WHEN week_period = "12월 4일 - 12월 10일" THEN CONCAT(CAST(period AS INT64) - 3,"주차")
    WHEN week_period = "12월 11일 - 12월 17일" THEN CONCAT(CAST(period AS INT64) - 4,"주차")
    ELSE CONCAT(0,"주차")
  END AS period,
  FIRST_VALUE(cohort_retained) OVER(PARTITION BY week_period ORDER BY period) AS cohort_size,
  cohort_retained,
  cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(PARTITION BY week_period ORDER BY period) AS pct_retained
FROM(
    SELECT
      week_period,
      CASE
        WHEN users.event_date <= "20201120" THEN "0"
        WHEN users.event_date <= "20201127" THEN "1"
        WHEN users.event_date <= "20201203" THEN "2"
        WHEN users.event_date <= "20201210" THEN "3"
        WHEN users.event_date <= "20201217" THEN "4"
        ELSE "5"
      END AS period,
      COUNT(DISTINCT user.user_pseudo_id) AS cohort_retained
  FROM(
    SELECT
      min_event_date,
      user_pseudo_id,
      CASE
        WHEN min_event_date <= "20201120" THEN "11월 14일 - 11월 20일"
        WHEN min_event_date <= "20201127" THEN "11월 21일 - 11월 27일"
        WHEN min_event_date <= "20201203" THEN "11월 28일 - 12월 3일"
        WHEN min_event_date <= "20201210" THEN "12월 4일 - 12월 10일"
        WHEN min_event_date <= "20201217" THEN "12월 11일 - 12월 17일"
        ELSE "12월 18일 - 12월 24일"
      END AS week_period
    FROM(
      SELECT
        user_pseudo_id,
        MIN(event_date) AS min_event_date
      FROM users
      GROUP BY user_pseudo_id
    )
  ) user
  INNER JOIN users
  ON user.user_pseudo_id = users.user_pseudo_id
  GROUP BY week_period, period
)
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,week_period,period,cohort_size,cohort_retained,pct_retained
0,11월 14일 - 11월 20일,0주차,19021,19021,1.0
1,11월 14일 - 11월 20일,1주차,19021,1356,0.07129
2,11월 14일 - 11월 20일,2주차,19021,702,0.036907
3,11월 14일 - 11월 20일,3주차,19021,669,0.035172
4,11월 14일 - 11월 20일,4주차,19021,515,0.027075
5,11월 14일 - 11월 20일,5주차,19021,242,0.012723
6,11월 21일 - 11월 27일,0주차,20029,20029,1.0
7,11월 21일 - 11월 27일,1주차,20029,1150,0.057417
8,11월 21일 - 11월 27일,2주차,20029,623,0.031105
9,11월 21일 - 11월 27일,3주차,20029,448,0.022368
