In [1]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

In [2]:
# 파이썬 구글 빅쿼리 연동 코드
# json 파일

key_path = glob.glob("./*.json")[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials = credentials, 
                         project = credentials.project_id)

In [3]:
def sql_to_dataframe(sql:str) -> pd.DataFrame:
    """
    Args:
        sql (str): sql for extraction

    Returns:
        pd.DataFrame: extract data with sql
    """
    query_job = client.query(sql)
    df = query_job.to_dataframe()
    return df

### 시간별 신규 사용자

In [4]:
sql = """
SELECT
  event_date,
  traffic_source.medium,
  COUNT(DISTINCT user_pseudo_id) AS new_user_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name = "first_visit"
GROUP BY event_date,traffic_source.medium
ORDER BY new_user_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,event_date,medium,new_user_count
0,20201203,organic,1410
1,20201204,organic,1264
2,20201201,organic,1256
3,20201202,organic,1252
4,20201130,organic,1207
...,...,...,...
134,20201115,cpc,83
135,20201113,(data deleted),2
136,20201111,(data deleted),2
137,20201201,(data deleted),1


### 새 사용자

In [5]:
sql = """
SELECT
  traffic_source.medium,
  COUNT(DISTINCT user_pseudo_id) AS new_user_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name = "first_visit"
GROUP BY traffic_source.medium
ORDER BY new_user_count DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,new_user_count
0,organic,26686
1,(none),16541
2,<Other>,12491
3,referral,9797
4,cpc,3835
5,(data deleted),6


### 참여 세션수

In [6]:
sql = """
SELECT
  traffic_source.medium,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
GROUP BY traffic_source.medium
ORDER BY engaged_session_count DESC"""

df = sql_to_dataframe(sql)
df


Unnamed: 0,medium,engaged_session_count
0,organic,30558
1,(none),20104
2,referral,14218
3,<Other>,13404
4,cpc,4070
5,(data deleted),3627


### 세션수

In [7]:
sql = """
SELECT
  traffic_source.medium,
  COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY traffic_source.medium
ORDER BY session_count DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,session_count
0,organic,34745
1,(none),24031
2,referral,18946
3,<Other>,14379
4,(data deleted),7603
5,cpc,4326


### 참여율

In [8]:
sql = """
SELECT
  engaged_session.medium,
  engaged_session.engaged_session_count,
  session.session_count,
  ROUND(engaged_session.engaged_session_count / session.session_count * 100,2) AS engagement_ratio
FROM(
  SELECT
    traffic_source.medium,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
  GROUP BY traffic_source.medium
    ) engaged_session
LEFT JOIN(
  SELECT
    traffic_source.medium,
    COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS session_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  GROUP BY traffic_source.medium
        ) session
ON engaged_session.medium = session.medium
ORDER BY engagement_ratio DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,engaged_session_count,session_count,engagement_ratio
0,cpc,4070,4326,94.08
1,<Other>,13404,14379,93.22
2,organic,30558,34745,87.95
3,(none),20104,24031,83.66
4,referral,14218,18946,75.04
5,(data deleted),3627,7603,47.7


### 사용자당 참여 세션수

In [9]:
sql = """
WITH new_user AS(
  SELECT
    traffic_source.medium,
    COUNT(DISTINCT user_pseudo_id) AS new_user_count
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
  AND event_name = "first_visit"
  GROUP BY traffic_source.medium
  ORDER BY new_user_count DESC
  ),
  engaged_session AS(
    SELECT
      traffic_source.medium,
      COUNT(DISTINCT CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id"))) AS engaged_session_count
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "session_engaged") = 1
    GROUP BY traffic_source.medium
    ORDER BY engaged_session_count DESC
      )

SELECT
  new_user.medium,
  new_user.new_user_count,
  engaged_session.engaged_session_count,
  engaged_session.engaged_session_count / new_user.new_user_count AS user_per_engagement_count
FROM new_user
INNER JOIN engaged_session
on new_user.medium = engaged_session.medium
ORDER BY user_per_engagement_count DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,new_user_count,engaged_session_count,user_per_engagement_count
0,(data deleted),6,3627,604.5
1,referral,9797,14218,1.451261
2,(none),16541,20104,1.215404
3,organic,26686,30558,1.145095
4,<Other>,12491,13404,1.073093
5,cpc,3835,4070,1.061278


### 평균 참여 시간

In [10]:
sql = """
SELECT
  traffic_source.medium,
  FLOOR(SUM(FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000)) / COUNT(DISTINCT user_pseudo_id)) AS engagement_time,
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY traffic_source.medium
ORDER BY engagement_time DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,engagement_time
0,(data deleted),133.0
1,referral,102.0
2,(none),94.0
3,organic,91.0
4,<Other>,84.0
5,cpc,81.0


In [11]:
sql = """
SELECT
  traffic_source.medium,
  COUNT(event_name) AS event_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
GROUP BY traffic_source.medium
ORDER BY event_count DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,event_count
0,organic,486124
1,(none),326294
2,referral,265359
3,<Other>,200066
4,(data deleted),104218
5,cpc,59369


### 전환

In [12]:
sql = """
SELECT
  traffic_source.medium,
  COUNT(event_name) AS conversion_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _table_suffix BETWEEN "20201110" AND "20201206"
AND event_name IN("purchase","begin_checkout","first_visit","predict_ltv_payer","add_to_cart","view_item","view_cart","qualified_visit","Membership","predicted_top_spenders")
GROUP BY traffic_source.medium
ORDER BY conversion_count DESC"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,conversion_count
0,organic,84331
1,(none),55044
2,referral,42627
3,<Other>,35997
4,(data deleted),14314
5,cpc,10715


### 총수익

In [13]:
sql = """
WITH value AS(
  SELECT
    traffic_source.medium,
    CASE
      WHEN (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value") IS NULL THEN 0
      ELSE (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "value")
    END AS int_value,
    CASE
      WHEN (SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value") IS NULL THEN 0
      ELSE (SELECT value.float_value FROM UNNEST(event_params) WHERE key = "value")
    END AS float_value,
    CASE
      WHEN (SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value") IS NULL THEN 0
      ELSE (SELECT value.double_value FROM UNNEST(event_params) WHERE key = "value")
    END AS double_value
  FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
  WHERE _table_suffix BETWEEN "20201110" AND "20201206"
)

SELECT
  medium,
  ROUND(SUM(int_value + float_value + double_value),2) AS value_sum
FROM value
GROUP BY medium
ORDER BY value_sum DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,medium,value_sum
0,organic,43040.41
1,referral,34536.38
2,(none),31701.06
3,(data deleted),20798.09
4,<Other>,16632.07
5,cpc,3776.92
