In [2]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

In [3]:
# 파이썬 구글 빅쿼리 연동 코드
# json 파일

key_path = glob.glob("./*.json")[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials = credentials, 
                         project = credentials.project_id)

In [4]:
def sql_to_dataframe(sql:str) -> pd.DataFrame:
    """
    Args:
        sql (str): sql for extraction

    Returns:
        pd.DataFrame: extract data with sql
    """
    query_job = client.query(sql)
    df = query_job.to_dataframe()
    return df

### 시간별 신규 사용자

In [5]:
sql = """
SELECT
  event_date,
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT user_pseudo_id) AS new_user_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name = "first_visit"
GROUP BY event_date, source_medium
ORDER BY new_user_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,event_date,source_medium,new_user_count
0,20201203,google/organic,1289
1,20201201,google/organic,1164
2,20201204,google/organic,1156
3,20201202,google/organic,1139
4,20201130,google/organic,1097
...,...,...,...
188,20201115,<Other>/organic,46
189,20201113,(data deleted)/(data deleted),2
190,20201111,(data deleted)/(data deleted),2
191,20201201,(data deleted)/(data deleted),1


### 새 사용자

In [7]:
sql = """
SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT user_pseudo_id) AS new_user_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name = "first_visit"
GROUP BY source_medium
ORDER BY new_user_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,new_user_count
0,google/organic,24441
1,(direct)/(none),16541
2,<Other>/<Other>,12491
3,<Other>/referral,6363
4,google/cpc,3835
5,shop.googlemerchandisestore.com/referral,3434
6,<Other>/organic,2245
7,(data deleted)/(data deleted),6


### 참여 세션수

In [8]:
sql = """
SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
GROUP BY source_medium
ORDER BY engaged_session_count DESC"""

df = sql_to_dataframe(sql)
df


Unnamed: 0,source_medium,engaged_session_count
0,google/organic,28057
1,(direct)/(none),20104
2,<Other>/<Other>,13404
3,<Other>/referral,8172
4,shop.googlemerchandisestore.com/referral,6046
5,google/cpc,4070
6,(data deleted)/(data deleted),3560
7,<Other>/organic,2501
8,<Other>/(data deleted),67


### 세션수

In [9]:
sql = """
SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY source_medium
ORDER BY session_count DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,session_count
0,google/organic,31962
1,(direct)/(none),24031
2,<Other>/<Other>,14379
3,<Other>/referral,10045
4,shop.googlemerchandisestore.com/referral,8901
5,(data deleted)/(data deleted),7462
6,google/cpc,4326
7,<Other>/organic,2783
8,<Other>/(data deleted),141


### 참여율

In [10]:
sql = """
SELECT
  engaged_session.source_medium,
  engaged_session.engaged_session_count,
  session.session_count,
  ROUND(engaged_session.engaged_session_count / session.session_count * 100,2) AS engagement_ratio
FROM(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
  GROUP BY source_medium
    ) engaged_session
LEFT JOIN(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  GROUP BY source_medium
        ) session
ON engaged_session.source_medium = session.source_medium
ORDER BY engagement_ratio DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,engaged_session_count,session_count,engagement_ratio
0,google/cpc,4070,4326,94.08
1,<Other>/<Other>,13404,14379,93.22
2,<Other>/organic,2501,2783,89.87
3,google/organic,28057,31962,87.78
4,(direct)/(none),20104,24031,83.66
5,<Other>/referral,8172,10045,81.35
6,shop.googlemerchandisestore.com/referral,6046,8901,67.92
7,(data deleted)/(data deleted),3560,7462,47.71
8,<Other>/(data deleted),67,141,47.52


### 사용자당 참여 세션수

In [13]:
sql = """
WITH new_user AS(
  SELECT
    CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
    COUNT(DISTINCT user_pseudo_id) AS new_user_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "first_visit"
  GROUP BY source_medium
  ORDER BY new_user_count DESC
  ),
  engaged_session AS(
    SELECT
      CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
      COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
    GROUP BY source_medium
    ORDER BY engaged_session_count DESC
      )

SELECT
  new_user.source_medium,
  new_user.new_user_count,
  engaged_session.engaged_session_count,
  engaged_session.engaged_session_count / new_user.new_user_count AS user_per_engagement_count
FROM new_user
INNER JOIN engaged_session
on new_user.source_medium = engaged_session.source_medium
ORDER BY user_per_engagement_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,new_user_count,engaged_session_count,user_per_engagement_count
0,(data deleted)/(data deleted),6,3560,593.333333
1,shop.googlemerchandisestore.com/referral,3434,6046,1.760629
2,<Other>/referral,6363,8172,1.2843
3,(direct)/(none),16541,20104,1.215404
4,google/organic,24441,28057,1.147948
5,<Other>/organic,2245,2501,1.114031
6,<Other>/<Other>,12491,13404,1.073093
7,google/cpc,3835,4070,1.061278


### 평균 참여 시간

In [14]:
sql = """
SELECT
  CONCAT(traffic_source.source,"/",traffic_source.medium) AS source_medium,
  FLOOR(SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) / COUNT(DISTINCT user_pseudo_id)) AS engagement_time,
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY source_medium
ORDER BY engagement_time DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,source_medium,engagement_time
0,(data deleted)/(data deleted),133.0
1,shop.googlemerchandisestore.com/referral,101.0
2,(direct)/(none),94.0
3,<Other>/(data deleted),94.0
4,google/organic,91.0
5,<Other>/referral,91.0
6,<Other>/<Other>,84.0
7,google/cpc,81.0
8,<Other>/organic,81.0


### 이벤트수

### 전환

In [12]:
sql = """
SELECT
  traffic_source.medium,
  COUNT(event_name) AS conversion_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name IN("purchase","begin_checkout","first_visit","predict_ltv_payer","add_to_cart","view_item","view_cart","qualified_visit","Membership","predicted_top_spenders")
GROUP BY traffic_source.medium
ORDER BY conversion_count DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,conversion_count
0,organic,84331
1,(none),55044
2,referral,42627
3,<Other>,35997
4,(data deleted),14314
5,cpc,10715


### 총수익

In [13]:
sql = """
WITH value AS(
  SELECT
    traffic_source.medium,
    CASE
      WHEN (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value") IS NULL THEN 0
      ELSE (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value")
    END AS int_value,
    CASE
      WHEN (SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value") IS NULL THEN 0
      ELSE (SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value")
    END AS float_value,
    CASE
      WHEN (SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value") IS NULL THEN 0
      ELSE (SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value")
    END AS double_value
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
)

SELECT
  medium,
  ROUND(SUM(int_value + float_value + double_value),2) AS value_sum
FROM value
GROUP BY medium
ORDER BY value_sum DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,value_sum
0,organic,43040.41
1,referral,34536.38
2,(none),31701.06
3,(data deleted),20798.09
4,<Other>,16632.07
5,cpc,3776.92
