In [1]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

In [2]:
# Connect Python to Google BigQuery.
# Authentication uses a service-account JSON key file located in the
# current working directory.

# glob returns matches in arbitrary order, so sort to make the choice
# deterministic when more than one *.json file is present.
key_candidates = sorted(glob.glob("./*.json"))
if not key_candidates:
    # Fail with an actionable message instead of a bare IndexError from [0].
    raise FileNotFoundError(
        "No service-account key file (*.json) found in the current directory."
    )
key_path = key_candidates[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
# The project is taken from the key file itself rather than hard-coded.
client = bigquery.Client(credentials = credentials, 
                         project = credentials.project_id)

In [3]:
def sql_to_dataframe(sql: str) -> pd.DataFrame:
    """Run a SQL statement through the module-level ``client`` and return the result.

    Args:
        sql (str): SQL text to submit.

    Returns:
        pd.DataFrame: the query result converted to a DataFrame.
    """
    # Submit the query and materialise its result in one chained expression.
    return client.query(sql).to_dataframe()

### 사용자 구매 여정

In [4]:
# Overall purchase funnel. Each CTE keeps one row per user together with the
# time of that user's first matching event. The final SELECT chains LEFT JOINs
# so that a user is counted at a step only if they also appear in every
# preceding step (session_start -> view_item -> add_to_cart -> begin_checkout
# -> purchase / in_app_purchase).
#
# event_timestamp holds microseconds, so TIMESTAMP_MICROS converts it directly;
# dividing by 1000000 and casting back to INT64 rounds to the nearest second.
#
# NOTE(review): the *_time columns are not used in the join conditions, so the
# funnel does not require the steps to occur in chronological order — confirm
# that this is intended.
sql = """
WITH session AS (
  SELECT
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS session_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "session_start"
  GROUP BY user_pseudo_id
),
item AS (
  SELECT
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS item_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "view_item"
  GROUP BY user_pseudo_id
),
cart AS (
  SELECT
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS cart_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "add_to_cart"
  GROUP BY user_pseudo_id
),
checkout AS (
  SELECT
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS begin_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "begin_checkout"
  GROUP BY user_pseudo_id
),
buy AS (
  SELECT
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS buy_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name IN ("purchase", "in_app_purchase")
  GROUP BY user_pseudo_id
)

SELECT
  COUNT(DISTINCT session.user_pseudo_id) AS session_count,
  COUNT(DISTINCT item.user_pseudo_id) AS item_count,
  COUNT(DISTINCT cart.user_pseudo_id) AS cart_count,
  COUNT(DISTINCT checkout.user_pseudo_id) AS begin_count,
  COUNT(DISTINCT buy.user_pseudo_id) AS buy_count
FROM session
LEFT JOIN item
  ON session.user_pseudo_id = item.user_pseudo_id
LEFT JOIN cart
  ON item.user_pseudo_id = cart.user_pseudo_id
LEFT JOIN checkout
  ON cart.user_pseudo_id = checkout.user_pseudo_id
LEFT JOIN buy
  ON checkout.user_pseudo_id = buy.user_pseudo_id
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,session_count,item_count,cart_count,begin_count,buy_count
0,75515,20865,3921,1945,817


### 세션 시작

In [5]:
# Distinct users with at least one "session_start" event, per device category.
# The CTE groups by (device.category, user_pseudo_id), so a user seen on more
# than one device category is counted once in each of them.
#
# event_timestamp holds microseconds, so TIMESTAMP_MICROS converts it directly;
# dividing by 1000000 and casting back to INT64 rounds to the nearest second.
sql = """
WITH session AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS session_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "session_start"
  GROUP BY device.category, user_pseudo_id
)

SELECT
  category,
  COUNT(DISTINCT user_pseudo_id) AS session_count
FROM session
GROUP BY category
ORDER BY session_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,session_count
0,desktop,44094
1,mobile,30305
2,tablet,1708


### 제품 보기

In [6]:
# Distinct users with at least one "view_item" event, per device category.
# The CTE groups by (device.category, user_pseudo_id), so a user seen on more
# than one device category is counted once in each of them.
#
# event_timestamp holds microseconds, so TIMESTAMP_MICROS converts it directly;
# dividing by 1000000 and casting back to INT64 rounds to the nearest second.
sql = """
WITH item AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS item_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "view_item"
  GROUP BY device.category, user_pseudo_id
)

SELECT
  category,
  COUNT(DISTINCT user_pseudo_id) AS item_count
FROM item
GROUP BY category
ORDER BY item_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,item_count
0,desktop,12428
1,mobile,8464
2,tablet,486


### 장바구니에 추가

In [7]:
# Distinct users with at least one "add_to_cart" event, per device category.
# The CTE groups by (device.category, user_pseudo_id), so a user seen on more
# than one device category is counted once in each of them.
#
# event_timestamp holds microseconds, so TIMESTAMP_MICROS converts it directly;
# dividing by 1000000 and casting back to INT64 rounds to the nearest second.
sql = """
WITH cart AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS cart_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "add_to_cart"
  GROUP BY device.category, user_pseudo_id
)

SELECT
  category,
  COUNT(DISTINCT user_pseudo_id) AS cart_count
FROM cart
GROUP BY category
ORDER BY cart_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,cart_count
0,desktop,2291
1,mobile,1574
2,tablet,92


### 결제 시작

In [8]:
# Distinct users with at least one "begin_checkout" event, per device category.
# The CTE groups by (device.category, user_pseudo_id), so a user seen on more
# than one device category is counted once in each of them.
#
# event_timestamp holds microseconds, so TIMESTAMP_MICROS converts it directly;
# dividing by 1000000 and casting back to INT64 rounds to the nearest second.
sql = """
WITH checkout AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS begin_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "begin_checkout"
  GROUP BY device.category, user_pseudo_id
)

SELECT
  category,
  COUNT(DISTINCT user_pseudo_id) AS begin_count
FROM checkout
GROUP BY category
ORDER BY begin_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,begin_count
0,desktop,2442
1,mobile,1710
2,tablet,94


### 구매

In [9]:
# Distinct users with at least one "purchase" or "in_app_purchase" event,
# per device category. The CTE groups by (device.category, user_pseudo_id),
# so a user seen on more than one device category is counted once in each.
#
# event_timestamp holds microseconds, so TIMESTAMP_MICROS converts it directly;
# dividing by 1000000 and casting back to INT64 rounds to the nearest second.
sql = """
WITH buy AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS buy_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name IN ("purchase", "in_app_purchase")
  GROUP BY device.category, user_pseudo_id
)

SELECT
  category,
  COUNT(DISTINCT user_pseudo_id) AS buy_count
FROM buy
GROUP BY category
ORDER BY buy_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,buy_count
0,desktop,951
1,mobile,704
2,tablet,34


### 보고서

In [10]:
# Purchase funnel broken down by device category. Each CTE keeps one row per
# (device.category, user_pseudo_id) with the time of the first matching event.
# The LEFT JOINs match on both user and category, so a user is counted at a
# step only if they reached every preceding step on the same device category.
#
# event_timestamp holds microseconds, so TIMESTAMP_MICROS converts it directly;
# dividing by 1000000 and casting back to INT64 rounds to the nearest second.
#
# The final ORDER BY makes the row order deterministic and consistent with the
# per-step queries in this notebook.
#
# NOTE(review): the *_time columns are not used in the join conditions, so the
# funnel does not require the steps to occur in chronological order — confirm
# that this is intended.
sql = """
WITH session AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS session_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "session_start"
  GROUP BY device.category, user_pseudo_id
),
item AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS item_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "view_item"
  GROUP BY device.category, user_pseudo_id
),
cart AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS cart_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "add_to_cart"
  GROUP BY device.category, user_pseudo_id
),
checkout AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS begin_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "begin_checkout"
  GROUP BY device.category, user_pseudo_id
),
buy AS (
  SELECT
    device.category,
    user_pseudo_id,
    MIN(TIMESTAMP_MICROS(event_timestamp)) AS buy_time
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name IN ("purchase", "in_app_purchase")
  GROUP BY device.category, user_pseudo_id
)

SELECT
  session.category,
  COUNT(DISTINCT session.user_pseudo_id) AS session_count,
  COUNT(DISTINCT item.user_pseudo_id) AS item_count,
  COUNT(DISTINCT cart.user_pseudo_id) AS cart_count,
  COUNT(DISTINCT checkout.user_pseudo_id) AS begin_count,
  COUNT(DISTINCT buy.user_pseudo_id) AS buy_count
FROM session
LEFT JOIN item
  ON session.user_pseudo_id = item.user_pseudo_id
  AND session.category = item.category
LEFT JOIN cart
  ON item.user_pseudo_id = cart.user_pseudo_id
  AND item.category = cart.category
LEFT JOIN checkout
  ON cart.user_pseudo_id = checkout.user_pseudo_id
  AND cart.category = checkout.category
LEFT JOIN buy
  ON checkout.user_pseudo_id = buy.user_pseudo_id
  AND checkout.category = buy.category
GROUP BY session.category
ORDER BY session_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,category,session_count,item_count,cart_count,begin_count,buy_count
0,desktop,44094,12233,2281,1123,461
1,mobile,30305,8333,1565,779,336
2,tablet,1708,474,90,41,18
