In [2]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

In [3]:
# Connect Python to Google BigQuery.
# Credentials are read from a service-account JSON key file located in the
# current working directory.

# glob returns matches in arbitrary order, so sort them to make the chosen key
# file deterministic, and fail with a clear message (rather than a bare
# IndexError) when no key file is present.
key_candidates = sorted(glob.glob("./*.json"))
if not key_candidates:
    raise FileNotFoundError(
        "No service-account key file (*.json) found in the current directory."
    )
key_path = key_candidates[0]

# Build the credentials from the key file and bind the client to the project
# recorded in those credentials.
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials,
                         project=credentials.project_id)

In [4]:
def sql_to_dataframe(sql: str) -> pd.DataFrame:
    """Run a query through the module-level ``client`` and return its rows.

    Args:
        sql (str): query text to submit.

    Returns:
        pd.DataFrame: result of the query job converted to a DataFrame.
    """
    # Submit the query, then materialise the finished job as a DataFrame.
    return client.query(sql).to_dataframe()

### 일자별·소스/매체별 사용자 수

In [5]:
# Distinct users per event_date and traffic source/medium.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# The EXISTS filter keeps only rows of users that have at least one
# session_start event in the window; the rows are then grouped by event_date and
# the "source/medium" string built from traffic_source.
# NOTE(review): the alias `new_user_count` is COUNT(DISTINCT user_pseudo_id) with
# no new-user filter, so it counts every qualifying user — confirm the intended metric.
# NOTE(review): there is no ORDER BY, so the row order of the result is not fixed.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  user.event_date,
  CONCAT(user.traffic_source.source,"/",user.traffic_source.medium) AS source_medium,
  COUNT(DISTINCT user.user_pseudo_id) AS new_user_count
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
GROUP BY user.event_date, source_medium
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,event_date,source_medium,new_user_count
0,20201122,<Other>/referral,259
1,20201122,shop.googlemerchandisestore.com/referral,220
2,20201122,(direct)/(none),608
3,20201122,(data deleted)/(data deleted),153
4,20201122,<Other>/<Other>,365
...,...,...,...
238,20201205,<Other>/(data deleted),6
239,20201201,<Other>/(data deleted),5
240,20201203,<Other>/(data deleted),3
241,20201114,<Other>/(data deleted),2


### 사용자

In [6]:
# Distinct users per traffic source/medium, largest first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# Only rows of users with at least one session_start event in the window are
# kept (EXISTS); user_count is COUNT(DISTINCT user_pseudo_id) per source_medium.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT user_pseudo_id) AS user_count
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
GROUP BY source_medium
ORDER BY user_count DESC"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,user_count
0,google/organic,29148
1,(direct)/(none),21684
2,<Other>/<Other>,14020
3,<Other>/referral,9424
4,shop.googlemerchandisestore.com/referral,7966
5,(data deleted)/(data deleted),6042
6,google/cpc,4285
7,<Other>/organic,2748
8,<Other>/(data deleted),137


### 세션수

In [7]:
# Session count per traffic source/medium, largest first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# A session is identified by user_pseudo_id concatenated with the int_value of
# the "ga_session_id" event parameter; session_count is the number of distinct
# such keys among rows of users that have a session_start event (EXISTS).
# NOTE(review): the two parts are concatenated without a separator — presumably
# safe for the id formats in this dataset, but confirm keys cannot collide.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
GROUP BY source_medium
ORDER BY session_count DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,session_count
0,google/organic,31748
1,(direct)/(none),23799
2,<Other>/<Other>,14327
3,<Other>/referral,9933
4,shop.googlemerchandisestore.com/referral,8745
5,(data deleted)/(data deleted),7229
6,google/cpc,4314
7,<Other>/organic,2762
8,<Other>/(data deleted),140


### 참여 세션수

In [8]:
# Engaged-session count per traffic source/medium, largest first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# Same session key as the plain session count (user_pseudo_id + "ga_session_id"
# int_value), but only event rows whose "session_engaged" parameter has
# int_value = 1 are counted.
# NOTE(review): only value.int_value is inspected for "session_engaged"; rows
# that carry the flag in another value field would be excluded — confirm
# against the export schema.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
GROUP BY source_medium
ORDER BY engaged_session_count DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,engaged_session_count
0,google/organic,28057
1,(direct)/(none),20104
2,<Other>/<Other>,13404
3,<Other>/referral,8172
4,shop.googlemerchandisestore.com/referral,6046
5,google/cpc,4070
6,(data deleted)/(data deleted),3560
7,<Other>/organic,2501
8,<Other>/(data deleted),67


### 세션당 평균 참여 시간

In [9]:
# Average engagement time (in whole seconds) per traffic source/medium.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# For each event the "engagement_time_msec" int_value is divided by 1000 and
# floored; these per-event seconds are summed per source_medium, divided by
# COUNT(DISTINCT user_pseudo_id), and floored again.
# NOTE(review): the denominator is the number of distinct users, not the number
# of sessions, so the result is seconds per user — confirm which metric is intended.
# NOTE(review): flooring each event before summing discards every sub-second
# remainder; summing milliseconds first would lose less — confirm this is intended.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  FLOOR(SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) / COUNT(DISTINCT user_pseudo_id)) AS engagement_time
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
GROUP BY source_medium
ORDER BY engagement_time DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,engagement_time
0,(data deleted)/(data deleted),133.0
1,shop.googlemerchandisestore.com/referral,101.0
2,<Other>/(data deleted),95.0
3,(direct)/(none),93.0
4,<Other>/referral,91.0
5,google/organic,90.0
6,<Other>/<Other>,84.0
7,google/cpc,81.0
8,<Other>/organic,80.0


### 사용자당 참여 세션수

In [10]:
# Engaged sessions per user, per traffic source/medium, largest ratio first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# Subquery `session` yields the distinct-user count per source_medium; subquery
# `engaged_session` yields the distinct session keys (user_pseudo_id +
# "ga_session_id") among events with "session_engaged" int_value = 1.
# The two are inner-joined on source_medium, so a source_medium present on only
# one side is dropped. `user_per_engagement_count` is
# engaged_session_count / user_count rounded to 2 decimals.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  engaged_session.source_medium,
  engaged_session.engaged_session_count,
  session.user_count,
  ROUND(engaged_session.engaged_session_count / session.user_count,2) AS user_per_engagement_count
FROM(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(DISTINCT user_pseudo_id) AS user_count
  FROM user
  WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
  GROUP BY source_medium
) session
INNER JOIN(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
  FROM user
  WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
  AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
  GROUP BY source_medium
) engaged_session
ON session.source_medium = engaged_session.source_medium
ORDER BY user_per_engagement_count DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,engaged_session_count,user_count,user_per_engagement_count
0,<Other>/<Other>,13404,14020,0.96
1,google/organic,28057,29148,0.96
2,google/cpc,4070,4285,0.95
3,(direct)/(none),20104,21684,0.93
4,<Other>/organic,2501,2748,0.91
5,<Other>/referral,8172,9424,0.87
6,shop.googlemerchandisestore.com/referral,6046,7966,0.76
7,(data deleted)/(data deleted),3560,6042,0.59
8,<Other>/(data deleted),67,137,0.49


### 세션당 이벤트

In [11]:
# Events per session, per traffic source/medium, ordered by session count.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# session_count is the number of distinct session keys (user_pseudo_id +
# "ga_session_id" int_value); event_count is COUNT(event_name).
# NOTE(review): despite its name, `session_per_event_count` is
# event_count / session_count (events per session), rounded to 2 decimals.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count,
  COUNT(event_name) AS event_count,
  ROUND(COUNT(event_name) / COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))),2) AS session_per_event_count
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
GROUP BY source_medium
ORDER BY session_count DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,session_count,event_count,session_per_event_count
0,google/organic,31748,445091,14.02
1,(direct)/(none),23799,323699,13.6
2,<Other>/<Other>,14327,199183,13.9
3,<Other>/referral,9933,138518,13.95
4,shop.googlemerchandisestore.com/referral,8745,123592,14.13
5,(data deleted)/(data deleted),7229,99523,13.77
6,google/cpc,4314,59259,13.74
7,<Other>/organic,2762,36982,13.39
8,<Other>/(data deleted),140,1793,12.81


### 참여율

In [12]:
# Engagement rate (percent) per traffic source/medium, highest first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# Subquery `session` counts all distinct session keys (user_pseudo_id +
# "ga_session_id" int_value); subquery `engaged_session` counts the same keys
# among events with "session_engaged" int_value = 1.
# The two are inner-joined on source_medium; `engagement_ratio` is
# engaged_session_count / session_count * 100 rounded to 2 decimals.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  engaged_session.source_medium,
  engaged_session.engaged_session_count,
  session.session_count,
  ROUND(engaged_session.engaged_session_count / session.session_count * 100,2) AS engagement_ratio
FROM(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count,
  FROM user
  WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
  GROUP BY source_medium
) session
INNER JOIN(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
  FROM user
  WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
  AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
  GROUP BY source_medium
) engaged_session
ON session.source_medium = engaged_session.source_medium
ORDER BY engagement_ratio DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,engaged_session_count,session_count,engagement_ratio
0,google/cpc,4070,4314,94.34
1,<Other>/<Other>,13404,14327,93.56
2,<Other>/organic,2501,2762,90.55
3,google/organic,28057,31748,88.37
4,(direct)/(none),20104,23799,84.47
5,<Other>/referral,8172,9933,82.27
6,shop.googlemerchandisestore.com/referral,6046,8745,69.14
7,(data deleted)/(data deleted),3560,7229,49.25
8,<Other>/(data deleted),67,140,47.86


### 이벤트수

In [13]:
# Total event count per traffic source/medium, largest first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# event_count is COUNT(event_name) over the rows of users that have at least
# one session_start event in the window (EXISTS).
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(event_name) AS event_count
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
GROUP BY source_medium
ORDER BY event_count DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,event_count
0,google/organic,445091
1,(direct)/(none),323699
2,<Other>/<Other>,199183
3,<Other>/referral,138518
4,shop.googlemerchandisestore.com/referral,123592
5,(data deleted)/(data deleted),99523
6,google/cpc,59259
7,<Other>/organic,36982
8,<Other>/(data deleted),1793


### 전환

In [14]:
# Conversion-event count per traffic source/medium, largest first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# An event counts as a conversion when its event_name is one of the names in
# the IN(...) list below; conversion_count is COUNT(event_name) over those rows.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(event_name) AS conversion_count
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
AND event_name IN("purchase","begin_checkout","first_visit","predict_ltv_payer","add_to_cart","view_item","view_cart","qualified_visit","Membership","predicted_top_spenders")
GROUP BY source_medium
ORDER BY conversion_count DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,conversion_count
0,google/organic,77231
1,(direct)/(none),54768
2,<Other>/<Other>,35901
3,<Other>/referral,23082
4,shop.googlemerchandisestore.com/referral,19138
5,(data deleted)/(data deleted),13756
6,google/cpc,10711
7,<Other>/organic,6496
8,<Other>/(data deleted),199


### 총수익

In [15]:
# Sum of the "value" event parameter per traffic source/medium, largest first.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# For each event the "value" parameter is read from int_value, float_value and
# double_value (each defaulting to 0 when absent) and the three are added; the
# per-source_medium total is rounded to 2 decimals as value_sum.
# NOTE(review): there is no event_name filter, so every event that carries a
# "value" parameter contributes — confirm this should not be limited to
# purchase events.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
)

SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  ROUND(SUM(COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value"),0)),2) AS value_sum
FROM user
WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
GROUP BY source_medium
ORDER BY value_sum DESC
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,value_sum
0,google/organic,38814.49
1,(direct)/(none),31399.18
2,(data deleted)/(data deleted),20503.57
3,shop.googlemerchandisestore.com/referral,17955.49
4,<Other>/<Other>,16395.91
5,<Other>/referral,15983.44
6,google/cpc,3776.92
7,<Other>/organic,2508.22
8,<Other>/(data deleted),170.0


### 보고서

In [16]:
# Combined report: one row per traffic source/medium with every metric.
# - CTE `user`: every event row of the events_* tables whose suffix lies in
#   20201110..20201206.
# - CTE `session_start`: the same window restricted to event_name = "session_start".
# - CTE `event_user`: conversion_count, i.e. COUNT(event_name) over events whose
#   name is in the IN(...) list.
# Subquery `session` computes user_count, session_count, engagement_time,
# event_count, session_per_event_count (event_count / session_count) and
# value_sum; subquery `engaged_session` computes engaged_session_count from
# events with "session_engaged" int_value = 1. Their inner join
# (`user_engaged_session`) adds engagement_ratio and user_per_engagement_count,
# and is finally inner-joined with `event_user` on source_medium.
# The outer SELECT emits source_medium once, followed by every other column of
# both join sides via `* EXCEPT(source_medium)`.
# NOTE(review): engagement_time divides by COUNT(DISTINCT user_pseudo_id), so it
# is seconds per user rather than per session — confirm the intended metric.
# NOTE(review): the ORDER BY inside `event_user` does not order the final
# result, and the outer query has no ORDER BY, so the row order is not fixed.
sql = """
WITH user AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
),
session_start AS(
  SELECT
    *
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "session_start"
),

event_user AS(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(event_name) AS conversion_count
  FROM user
  WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
  AND event_name IN("purchase","begin_checkout","first_visit","predict_ltv_payer","add_to_cart","view_item","view_cart","qualified_visit","Membership","predicted_top_spenders")
  GROUP BY source_medium
  ORDER BY conversion_count DESC
)

SELECT
  user_engaged_session.source_medium,
  * EXCEPT(source_medium)
FROM(
  SELECT
    engaged_session.source_medium,
    session.user_count,
    engaged_session.engaged_session_count,
    session.session_count,
    ROUND(engaged_session.engaged_session_count / session.session_count * 100,2) AS engagement_ratio,
    ROUND(engaged_session.engaged_session_count / session.user_count,2) AS user_per_engagement_count,
    session.session_per_event_count,
    session.engagement_time,
    session.event_count,
    session.value_sum
  FROM(
    SELECT
      CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
      COUNT(DISTINCT user_pseudo_id) AS user_count,
      COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count,
      FLOOR(SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) / COUNT(DISTINCT user_pseudo_id)) AS engagement_time,
      COUNT(event_name) AS event_count,
      ROUND(COUNT(event_name) / COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))),2) AS session_per_event_count,
      ROUND(SUM(COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value"),0)),2) AS value_sum
    FROM user
    WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
    GROUP BY source_medium
  ) session
  INNER JOIN(
    SELECT
      CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
      COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
    FROM user
    WHERE EXISTS(SELECT * FROM session_start WHERE user.user_pseudo_id = session_start.user_pseudo_id)
    AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
    GROUP BY source_medium
  ) engaged_session
  ON session.source_medium = engaged_session.source_medium
  ) user_engaged_session
INNER JOIN event_user
ON user_engaged_session.source_medium = event_user.source_medium
"""

# Run the query and show the resulting DataFrame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,user_count,engaged_session_count,session_count,engagement_ratio,user_per_engagement_count,session_per_event_count,engagement_time,event_count,value_sum,conversion_count
0,(direct)/(none),21684,20104,23799,84.47,0.93,13.6,93.0,323699,31399.18,54768
1,google/organic,29148,28057,31748,88.37,0.96,14.02,90.0,445091,38814.49,77231
2,<Other>/referral,9424,8172,9933,82.27,0.87,13.95,91.0,138518,15983.44,23082
3,shop.googlemerchandisestore.com/referral,7966,6046,8745,69.14,0.76,14.13,101.0,123592,17955.49,19138
4,(data deleted)/(data deleted),6042,3560,7229,49.25,0.59,13.77,133.0,99523,20503.57,13756
5,<Other>/<Other>,14020,13404,14327,93.56,0.96,13.9,84.0,199183,16395.91,35901
6,google/cpc,4285,4070,4314,94.34,0.95,13.74,81.0,59259,3776.92,10711
7,<Other>/organic,2748,2501,2762,90.55,0.91,13.39,80.0,36982,2508.22,6496
8,<Other>/(data deleted),137,67,140,47.86,0.49,12.81,95.0,1793,170.0,199
