In [1]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

In [2]:
# 파이썬 구글 빅쿼리 연동 코드
# json 파일

key_path = glob.glob("./*.json")[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials = credentials, 
                         project = credentials.project_id)

In [3]:
def sql_to_dataframe(sql:str) -> pd.DataFrame:
    """
    Args:
        sql (str): sql for extraction

    Returns:
        pd.DataFrame: extract data with sql
    """
    query_job = client.query(sql)
    df = query_job.to_dataframe()
    return df

### 시간별 국가 사용자

In [4]:
sql = """
SELECT
  event_date,
  geo.country,
  COUNT(DISTINCT user_pseudo_id) AS user_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY event_date, geo.country
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,event_date,country,user_count
0,20201202,United States,1778
1,20201202,India,368
2,20201202,France,80
3,20201202,Egypt,11
4,20201202,Japan,51
...,...,...,...
2511,20201130,Bahamas,1
2512,20201130,Jamaica,1
2513,20201130,Myanmar (Burma),1
2514,20201130,Costa Rica,1


### 사용자

In [5]:
sql = """
SELECT
  geo.country,
  COUNT(DISTINCT user_pseudo_id) AS user_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY geo.country
ORDER BY user_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,user_count
0,United States,33572
1,India,7118
2,Canada,5788
3,United Kingdom,2402
4,France,1528
...,...,...
104,Malta,15
105,Kosovo,14
106,Bahamas,13
107,Macao,7


### 새사용자

In [27]:
sql = """
SELECT
  geo.country,
  COUNT(DISTINCT user_pseudo_id) AS new_user_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name = "first_visit"
GROUP BY geo.country
ORDER BY new_user_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,new_user_count
0,United States,30478
1,India,6402
2,Canada,5214
3,United Kingdom,2187
4,France,1387
...,...,...
104,Kosovo,14
105,Bahamas,13
106,Luxembourg,13
107,Bahrain,6


### 참여 세션수

In [7]:
sql = """
SELECT
  geo.country,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged"),0) + CAST(COALESCE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = "session_engaged"),"0") AS INT64) = 1
GROUP BY geo.country
ORDER BY engaged_session_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,engaged_session_count
0,United States,38783
1,India,8256
2,Canada,6640
3,United Kingdom,2748
4,France,1803
...,...,...
104,Paraguay,17
105,Kosovo,14
106,Bahamas,13
107,Macao,12


### 참여율

In [8]:
sql = """
WITH session AS(
  SELECT
    geo.country,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  GROUP BY geo.country
),
engaged_session AS(
  SELECT
    geo.country,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged"),0) + CAST(COALESCE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = "session_engaged"),"0") AS INT64) = 1
  GROUP BY geo.country
)
SELECT
  session.country,
  ROUND(engaged_session.engaged_session_count / session.session_count,2) AS engagement_ratio
FROM session
LEFT JOIN engaged_session
ON session.country = engaged_session.country
ORDER BY engagement_ratio DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,engagement_ratio
0,Iceland,0.96
1,Malta,0.95
2,Ghana,0.94
3,Costa Rica,0.93
4,Bahamas,0.93
...,...,...
104,Lithuania,0.76
105,Palestine,0.75
106,Bosnia & Herzegovina,0.75
107,Paraguay,0.74


### 사용자당 참여 세션수

In [9]:
sql = """
WITH session AS(
  SELECT
    geo.country,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  GROUP BY geo.country
),
engaged_session AS(
  SELECT
    geo.country,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged"),0) + CAST(COALESCE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = "session_engaged"),"0") AS INT64) = 1
  GROUP BY geo.country
),
user AS(
  SELECT
    geo.country,
    COUNT(DISTINCT user_pseudo_id) AS user_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  GROUP BY geo.country
)
SELECT
  user.country,
  ROUND(engaged_session.engaged_session_count / user.user_count,2) AS user_per_engaged_session_count
FROM user
LEFT JOIN engaged_session
ON user.country = engaged_session.country
ORDER BY user_per_engaged_session_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,user_per_engaged_session_count
0,Macao,1.71
1,Cambodia,1.55
2,Mongolia,1.47
3,Honduras,1.40
4,El Salvador,1.32
...,...,...
104,Costa Rica,1.00
105,Bosnia & Herzegovina,1.00
106,Kosovo,1.00
107,Guatemala,1.00


### 평균 참여 시간

In [10]:
sql = """
SELECT
  geo.country,
  SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) AS engagement_time,
  COUNT(DISTINCT user_pseudo_id) AS user_count,
  SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) / COUNT(DISTINCT user_pseudo_id) AS user_engagement_time
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY geo.country
ORDER BY user_engagement_time DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,engagement_time,user_count,user_engagement_time
0,Macao,2563.0,7,366.142857
1,Albania,6305.0,24,262.708333
2,El Salvador,6388.0,25,255.520000
3,Panama,4416.0,18,245.333333
4,Kuwait,10539.0,50,210.780000
...,...,...,...,...
104,Bolivia,827.0,21,39.380952
105,Bahrain,250.0,7,35.714286
106,Estonia,827.0,28,29.535714
107,Kosovo,410.0,14,29.285714


### 이벤트 수

In [11]:
sql = """
SELECT
  geo.country,
  COUNT(event_name) AS event_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY geo.country
ORDER BY event_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,event_count
0,United States,642489
1,India,131268
2,Canada,111232
3,United Kingdom,43383
4,France,28895
...,...,...
104,Ghana,176
105,Oman,168
106,Paraguay,154
107,Kosovo,129


### 전환

In [12]:
sql = """
SELECT
  geo.country,
  COUNT(event_name) AS conversion_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name IN ("purchase","begin_checkout","first_visit","predict_ltv_payer","add_to_cart","view_item","view_cart","qualified_visit","Membership","predicted_top_spenders")
GROUP BY geo.country
ORDER BY conversion_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,conversion_count
0,United States,108847
1,India,21915
2,Canada,18663
3,United Kingdom,7267
4,France,4769
...,...,...
104,Bosnia & Herzegovina,26
105,Paraguay,23
106,Oman,20
107,Kosovo,19


### 총 수익

In [13]:
sql = """
SELECT
  geo.country,
  TRUNC(CAST(SUM(COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value"),0)) AS NUMERIC),2) AS value
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY geo.country
ORDER BY value DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,value
0,United States,68071.290000000
1,India,13861.670000000
2,Canada,11894.330000000
3,United Kingdom,3787.760000000
4,Spain,3561.300000000
...,...,...
104,Bahamas,0E-9
105,Kosovo,0E-9
106,Albania,0E-9
107,Luxembourg,0E-9


### 보고서

In [26]:
sql = """
WITH user AS(
  SELECT
    geo.country,
    COUNT(DISTINCT user_pseudo_id) AS user_count,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count,
    SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) AS engagement_time,
    SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) / COUNT(DISTINCT user_pseudo_id) AS user_engagement_time,
    COUNT(event_name) AS event_count,
    TRUNC(CAST(SUM(COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value"),0) + COALESCE((SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value"),0)) AS NUMERIC),2) AS value
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  GROUP BY geo.country
),
new_user AS(
  SELECT
    geo.country,
    COUNT(DISTINCT user_pseudo_id) AS new_user_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "first_visit"
  GROUP BY geo.country
),
engaged_session AS(
  SELECT
    geo.country,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND COALESCE((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged"),0) + CAST(COALESCE((SELECT value.string_value FROM UNNEST(event_params) WHERE key = "session_engaged"),"0") AS INT64) = 1
  GROUP BY geo.country
),
conversion AS(
  SELECT
    geo.country,
    COUNT(event_name) AS conversion_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name IN ("purchase","begin_checkout","first_visit","predict_ltv_payer","add_to_cart","view_item","view_cart","qualified_visit","Membership","predicted_top_spenders")
  GROUP BY geo.country
)

SELECT
  user.country,
  user.user_count,
  new_user.new_user_count,
  user.session_count,
  engaged_session.engaged_session_count,
  ROUND(engaged_session.engaged_session_count / user.session_count,2) AS engagement_ratio,
  ROUND(engaged_session.engaged_session_count / user.user_count,2) AS user_per_engagement_session_count,
  user.user_engagement_time,
  user.event_count,
  conversion.conversion_count,
  user.value
FROM user
LEFT JOIN new_user
ON user.country = new_user.country
LEFT JOIN engaged_session
ON user.country = engaged_session.country
LEFT JOIN conversion
ON user.country = conversion.country
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,country,user_count,new_user_count,session_count,engaged_session_count,engagement_ratio,user_per_engagement_session_count,user_engagement_time,event_count,conversion_count,value
0,Romania,198,179,257,224,0.87,1.13,97.747475,3196,556,186.560000000
1,United States,33572,30478,45807,38783,0.85,1.16,121.353747,642489,108847,68071.290000000
2,India,7118,6402,9711,8256,0.85,1.16,117.584434,131268,21915,13861.670000000
3,Australia,726,654,1004,839,0.84,1.16,112.705234,13259,2175,1052.380000000
4,Brazil,763,691,1066,899,0.84,1.18,113.512451,14367,2385,1673.460000000
...,...,...,...,...,...,...,...,...,...,...,...
104,Albania,24,22,34,29,0.85,1.21,262.708333,479,94,0E-9
105,Kosovo,14,14,16,14,0.88,1.00,29.285714,129,19,0E-9
106,El Salvador,25,20,42,33,0.79,1.32,255.520000,910,162,52.800000000
107,Cyprus,27,26,37,31,0.84,1.15,73.962963,400,61,14.520000000
