In [1]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd

In [2]:
# 파이썬 구글 빅쿼리 연동 코드
# json 파일

key_path = glob.glob("./*.json")[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials = credentials, 
                         project = credentials.project_id)

In [3]:
def sql_to_dataframe(sql:str) -> pd.DataFrame:
    """
    Args:
        sql (str): sql for extraction

    Returns:
        pd.DataFrame: extract data with sql
    """
    query_job = client.query(sql)
    df = query_job.to_dataframe()
    return df

### 시간별 세션수

In [4]:
sql = """
SELECT
  event_date,
  page_location,
  COUNT(DISTINCT user) AS session
FROM(
  SELECT
    event_date,
    FIRST_VALUE(page_location) OVER(PARTITION BY user ORDER BY event_time) AS page_location,
    user
  FROM(
    SELECT
      event_date
      ,DATETIME(TIMESTAMP_SECONDS(CAST(CAST(event_timestamp AS INT64) / 1000000 AS INT64)), "Asia/Seoul") AS event_time,
      user_pseudo_id,
      (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id") AS ga_session_id,
      (SELECT value.string_value FROM UNNEST(event_params) WHERE key = "page_location") AS page_location,
      CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id")) AS user
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "page_view"
      )
    )
GROUP BY event_date, page_location
ORDER BY session DESC
"""

df = sql_to_dataframe(sql)
df

Unnamed: 0,event_date,page_location,session
0,20201203,https://shop.googlemerchandisestore.com/,1431
1,20201201,https://shop.googlemerchandisestore.com/,1389
2,20201202,https://shop.googlemerchandisestore.com/,1377
3,20201120,https://shop.googlemerchandisestore.com/,1235
4,20201130,https://shop.googlemerchandisestore.com/,1220
...,...,...,...
4716,20201123,https://shop.googlemerchandisestore.com/Wearab...,1
4717,20201126,https://shop.googlemerchandisestore.com/Google...,1
4718,20201204,https://shop.googlemerchandisestore.com/google...,1
4719,20201127,https://shop.googlemerchandisestore.com/),1


### 세션수

In [5]:
sql = """
SELECT
  page_location,
  COUNT(DISTINCT user) AS session
FROM(
  SELECT
    FIRST_VALUE(page_location) OVER(PARTITION BY user ORDER BY event_time) AS page_location,
    user
  FROM(
    SELECT
      DATETIME(TIMESTAMP_SECONDS(CAST(CAST(event_timestamp AS INT64) / 1000000 AS INT64)), "Asia/Seoul") AS event_time,
      user_pseudo_id,
      (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id") AS ga_session_id,
      (SELECT value.string_value FROM UNNEST(event_params) WHERE key = "page_location") AS page_location,
      CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id")) AS user
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "page_view"
      )
    )
GROUP BY page_location
ORDER BY session DESC
"""
df = sql_to_dataframe(sql)
df

Unnamed: 0,page_location,session
0,https://shop.googlemerchandisestore.com/,27326
1,https://googlemerchandisestore.com/,15260
2,https://shop.googlemerchandisestore.com/Google...,7352
3,https://shop.googlemerchandisestore.com/Google...,6173
4,https://www.googlemerchandisestore.com/,5482
...,...,...
732,https://shop.googlemerchandisestore.com/Google...,1
733,https://shop.googlemerchandisestore.com/Google...,1
734,https://shop.googlemerchandisestore.com/shop.a...,1
735,http://shop.googlemerchandisestore.com/Google ...,1


### 사용자수

In [6]:
sql = """
SELECT
  page_location,
  COUNT(DISTINCT user_pseudo_id) AS user_count
FROM(
  SELECT
    FIRST_VALUE(page_location) OVER(PARTITION BY user ORDER BY event_time) AS page_location,
    user,
    user_pseudo_id
  FROM(
    SELECT
      DATETIME(TIMESTAMP_SECONDS(CAST(CAST(event_timestamp AS INT64) / 1000000 AS INT64)), "Asia/Seoul") AS event_time,
      user_pseudo_id,
      (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id") AS ga_session_id,
      (SELECT value.string_value FROM UNNEST(event_params) WHERE key = "page_location") AS page_location,
      CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id")) AS user
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "page_view"
      )
    )
GROUP BY page_location
ORDER BY user_count DESC
"""
df = sql_to_dataframe(sql)
df

Unnamed: 0,page_location,user_count
0,https://shop.googlemerchandisestore.com/,23300
1,https://googlemerchandisestore.com/,13352
2,https://shop.googlemerchandisestore.com/Google...,7051
3,https://shop.googlemerchandisestore.com/Google...,5906
4,https://www.googlemerchandisestore.com/,5202
...,...,...
732,https://shop.googlemerchandisestore.com/Google...,1
733,https://shop.googlemerchandisestore.com/Eco/Or...,1
734,https://shop.googlemerchandisestore.com/google...,1
735,https://shop.googlemerchandisestore.com/google...,1


### 새사용자수

In [7]:
sql = """

SELECT
  page_location,
  COUNT(DISTINCT user_pseudo_id) AS user_count
FROM(
  SELECT
    FIRST_VALUE(page_location) OVER(PARTITION BY user ORDER BY event_time) AS page_location,
    user,
    user_pseudo_id
  FROM(
    SELECT
      DATETIME(TIMESTAMP_SECONDS(CAST(CAST(event_timestamp AS INT64) / 1000000 AS INT64)), "Asia/Seoul") AS event_time,
      user_pseudo_id,
      (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id") AS ga_session_id,
      (SELECT value.string_value FROM UNNEST(event_params) WHERE key = "page_location") AS page_location,
      CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id")) AS user
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    WHERE _table_suffix BETWEEN "20201110" AND "20201206"
    AND event_name = "first_visit"
      )
    )
GROUP BY page_location
ORDER BY user_count DESC
"""

df = sql_to_dataframe(sql)
df


Unnamed: 0,page_location,user_count
0,https://shop.googlemerchandisestore.com/,19140
1,https://googlemerchandisestore.com/,11698
2,https://shop.googlemerchandisestore.com/Google...,6814
3,https://shop.googlemerchandisestore.com/Google...,5409
4,https://www.googlemerchandisestore.com/,4572
...,...,...
584,https://shop.googlemerchandisestore.com/Google...,1
585,https://shop.googlemerchandisestore.com/Google...,1
586,https://shop.googlemerchandisestore.com/eco/an...,1
587,https://shop.googlemerchandisestore.com/google...,1


### 세션당 평균 참여 시간

In [8]:
sql = """
SELECT
  user_count.page_location,
  user_engagement.engagement_time,
  user_count.session,
  FLOOR(user_engagement.engagement_time / user_count.session) AS session_per_engagement_time
FROM(
  SELECT
    page_location,
    COUNT(DISTINCT user) AS session
  FROM(
    SELECT
      FIRST_VALUE(page_location) OVER(PARTITION BY user ORDER BY event_time) AS page_location,
      user
    FROM(
      SELECT
        DATETIME(TIMESTAMP_SECONDS(CAST(CAST(event_timestamp AS INT64) / 1000000 AS INT64)), "Asia/Seoul") AS event_time,
        user_pseudo_id,
        (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id") AS ga_session_id,
        (SELECT value.string_value FROM UNNEST(event_params) WHERE key = "page_location") AS page_location,
        CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id")) AS user
      FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
      WHERE _table_suffix BETWEEN "20201110" AND "20201206"
      AND event_name = "page_view"
        )
      )
  GROUP BY page_location
  ORDER BY session DESC
  ) user_count
LEFT JOIN(
  SELECT
    page_location,
    SUM(engagement_time) AS engagement_time
  FROM(
    SELECT
      user_time.user,
      user_time.engagement_time,
      user_session.page_location
    FROM(
      SELECT
        user,
        SUM(engagement_time) AS engagement_time
      FROM(
        SELECT
          DATETIME(TIMESTAMP_SECONDS(CAST(CAST(event_timestamp AS INT64) / 1000000 AS INT64)), "Asia/Seoul") AS event_time,
          user_pseudo_id,
          (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id") AS ga_session_id,
          (SELECT value.string_value FROM UNNEST(event_params) WHERE key = "page_location") AS page_location,
          CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id")) AS user,
          FLOOR((SELECT value.int_value FROM UNNEST(event_params) WHERE key = "engagement_time_msec") / 1000) AS engagement_time
        FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
        WHERE _table_suffix BETWEEN "20201110" AND "20201206"
          )
      GROUP BY user
        ) user_time
    LEFT JOIN(
      SELECT
        DISTINCT page_location, user
      FROM(
        SELECT
          FIRST_VALUE(page_location) OVER(PARTITION BY user ORDER BY event_time) AS page_location,
          user
        FROM(
          SELECT
            DATETIME(TIMESTAMP_SECONDS(CAST(CAST(event_timestamp AS INT64) / 1000000 AS INT64)), "Asia/Seoul") AS event_time,
            user_pseudo_id,
            (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id") AS ga_session_id,
            (SELECT value.string_value FROM UNNEST(event_params) WHERE key = "page_location") AS page_location,
            CONCAT(user_pseudo_id, (SELECT value.int_value FROM UNNEST(event_params) WHERE key = "ga_session_id")) AS user,
          FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
          WHERE _table_suffix BETWEEN "20201110" AND "20201206"
          AND event_name = "page_view"
            )
          )
        ) user_session
    ON user_time.user = user_session.user
    )
  GROUP BY page_location
  ) user_engagement
ON user_count.page_location = user_engagement.page_location
"""
df = sql_to_dataframe(sql)
df

Unnamed: 0,page_location,engagement_time,session,session_per_engagement_time
0,https://shop.googlemerchandisestore.com/Google...,235866.0,2121,111.0
1,https://shop.googlemerchandisestore.com/Google...,11560.0,160,72.0
2,https://shop.googlemerchandisestore.com/Google...,359249.0,6173,58.0
3,https://shop.googlemerchandisestore.com/,4199506.0,27326,153.0
4,https://googlemerchandisestore.com/,838786.0,15260,54.0
...,...,...,...,...
732,https://shop.googlemerchandisestore.com/google...,111.0,1,111.0
733,https://shop.googlemerchandisestore.com/Google...,,1,
734,https://shop.googlemerchandisestore.com/signin...,2.0,1,2.0
735,https://shop.googlemerchandisestore.com/google...,2.0,1,2.0
