In [1]:
import pandas as pd
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import seaborn as sns

In [2]:
# Connect Python to Google BigQuery.
# Authentication uses a service-account JSON key file found in the working directory.

# glob returns matches in arbitrary, OS-dependent order; sort so that the same
# key file is selected on every run when more than one *.json file is present.
key_files = sorted(glob.glob("*.json"))
if not key_files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "No service-account key (*.json) found in the current working directory."
    )
key_path = key_files[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
# The project is taken from the key file itself, so it always matches the credentials.
client = bigquery.Client(credentials=credentials,
                         project=credentials.project_id)

In [3]:
# Run a SQL query and convert its result to a DataFrame

def sql_to_dataframe(sql: str) -> pd.DataFrame:
    """Execute ``sql`` through the notebook-level ``client`` and return the rows.

    Args:
        sql (str): SQL text to run for the extraction.

    Returns:
        pd.DataFrame: the data produced by the query.
    """
    # Submit the query, then materialise the finished job as a DataFrame.
    return client.query(sql).to_dataframe()

### Recency 얼마나 최근 구매했는가?

In [5]:
# Load data: per-customer recency.
# The inner query takes each customer's latest Order_Date. The outer query then
# measures recency as the number of days from the fixed reference date
# 2020-01-01 to that latest order, so a larger value means a more recent purchase.
# Note: the `min_order_date` column is that constant reference date, not a
# minimum computed from the table.

sql = """
SELECT
  Customer_ID,
  max_order_date,
  CAST("2020-01-01" AS DATE) AS min_order_date,
  DATE_DIFF(max_order_date, CAST("2020-01-01" AS DATE), DAY) AS recency
FROM(
  SELECT
    Customer_ID,
    MAX(Order_Date) AS max_order_date
  FROM `rfm-analysis-392707.rfm_analysis.rfm_analysis`
  GROUP BY Customer_ID
)
"""

# Run the query and display the resulting frame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,Customer_ID,max_order_date,min_order_date,recency
0,NW-18400,2020-07-11,2020-01-01,192
1,TM-21010,2020-09-18,2020-01-01,261
2,ML-17755,2020-12-01,2020-01-01,335
3,RS-19870,2020-12-11,2020-01-01,345
4,PO-18850,2020-12-26,2020-01-01,360
...,...,...,...,...
688,MO-17950,2020-07-03,2020-01-01,184
689,JF-15565,2020-11-06,2020-01-01,310
690,KS-16300,2020-03-10,2020-01-01,69
691,EB-14170,2020-10-09,2020-01-01,282


### Frequency 얼마나 자주 구매했는가?

In [6]:
# Per-customer frequency: the number of distinct Order_ID values for each
# Customer_ID (DISTINCT so that multiple rows sharing an Order_ID count once).
sql = """
SELECT
  Customer_ID,
  COUNT(DISTINCT Order_ID) AS frequency
FROM `rfm-analysis-392707.rfm_analysis.rfm_analysis`
GROUP BY Customer_ID
"""
# Run the query and display the resulting frame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,Customer_ID,frequency
0,NW-18400,1
1,TM-21010,3
2,ML-17755,6
3,RS-19870,1
4,PO-18850,5
...,...,...
688,MO-17950,1
689,JF-15565,1
690,KS-16300,1
691,EB-14170,1


### Monetary 얼마나 많은 금액을 지출했는가?

In [7]:
# Per-customer monetary value: the sum of the Sales column for each Customer_ID.
sql = """
SELECT
  Customer_ID,
  SUM(Sales) AS monetary
FROM `rfm-analysis-392707.rfm_analysis.rfm_analysis`
GROUP BY Customer_ID
"""
# Run the query and display the resulting frame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,Customer_ID,monetary
0,NW-18400,30.320
1,TM-21010,536.336
2,ML-17755,1913.104
3,RS-19870,12.680
4,PO-18850,5308.107
...,...,...
688,MO-17950,24.000
689,JF-15565,831.368
690,KS-16300,61.876
691,EB-14170,10.816


### RFM 지표

In [8]:
# All three RFM metrics in a single query.
# The inner query aggregates once per Customer_ID (latest Order_Date, distinct
# Order_ID count, Sales total); the outer query adds recency as the number of
# days from the fixed reference date 2020-01-01 to the latest order.
# `min_order_date` is that constant reference date, not a computed minimum.
sql = """
SELECT
  Customer_ID,
  max_order_date,
  CAST("2020-01-01" AS DATE) AS min_order_date,
  DATE_DIFF(max_order_date, CAST("2020-01-01" AS DATE), DAY) AS recency,
  frequency,
  monetary
FROM(
  SELECT
    Customer_ID,
    MAX(Order_Date) AS max_order_date,
    COUNT(DISTINCT Order_ID) AS frequency,
    SUM(Sales) AS monetary
  FROM `rfm-analysis-392707.rfm_analysis.rfm_analysis`
  GROUP BY Customer_ID
)
"""
# Run the query and display the resulting frame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,Customer_ID,max_order_date,min_order_date,recency,frequency,monetary
0,NW-18400,2020-07-11,2020-01-01,192,1,30.320
1,TM-21010,2020-09-18,2020-01-01,261,3,536.336
2,ML-17755,2020-12-01,2020-01-01,335,6,1913.104
3,RS-19870,2020-12-11,2020-01-01,345,1,12.680
4,PO-18850,2020-12-26,2020-01-01,360,5,5308.107
...,...,...,...,...,...,...
688,MO-17950,2020-07-03,2020-01-01,184,1,24.000
689,JF-15565,2020-11-06,2020-01-01,310,1,831.368
690,KS-16300,2020-03-10,2020-01-01,69,1,61.876
691,EB-14170,2020-10-09,2020-01-01,282,1,10.816


### RFM Score

In [9]:
# RFM score: bucket each metric into a score from 1 to 4.
# - recency_score / monetary_score: quartile buckets, using the 25th / 50th /
#   75th percentiles computed over all customers (PERCENTILE_CONT ... OVER()).
# - frequency_score: fixed bands of distinct orders: 1-2, 3-4, 5-6, 7 or more.
# CASE branches are evaluated top-down and the first match wins, so each branch
# only needs its lower bound.
# The top frequency band is open-ended (>= 7) so that customers with more than
# 8 distinct orders still receive a score of 4 instead of NULL.
sql = """
SELECT
  *,
  CASE
    WHEN recency >= PERCENTILE_CONT(recency,0.75) OVER() THEN 4
    WHEN recency >= PERCENTILE_CONT(recency,0.5) OVER() THEN 3
    WHEN recency >= PERCENTILE_CONT(recency,0.25) OVER() THEN 2
    WHEN recency < PERCENTILE_CONT(recency,0.25) OVER() THEN 1
  END AS recency_score,
  CASE
    WHEN frequency >= 7 THEN 4
    WHEN frequency >= 5 THEN 3
    WHEN frequency >= 3 THEN 2
    WHEN frequency >= 1 THEN 1
  END AS frequency_score,
  CASE
    WHEN monetary >= PERCENTILE_CONT(monetary,0.75) OVER() THEN 4
    WHEN monetary >= PERCENTILE_CONT(monetary,0.5) OVER() THEN 3
    WHEN monetary >= PERCENTILE_CONT(monetary,0.25) OVER() THEN 2
    WHEN monetary < PERCENTILE_CONT(monetary,0.25) OVER() THEN 1
  END AS monetary_score
FROM(
  SELECT
    Customer_ID,
    max_order_date,
    CAST("2020-01-01" AS DATE) AS min_order_date,
    DATE_DIFF(max_order_date, CAST("2020-01-01" AS DATE), DAY) AS recency,
    frequency,
    monetary
  FROM(
    SELECT
      Customer_ID,
      MAX(Order_Date) AS max_order_date,
      COUNT(DISTINCT Order_ID) AS frequency,
      SUM(Sales) AS monetary
    FROM `rfm-analysis-392707.rfm_analysis.rfm_analysis`
    GROUP BY Customer_ID
  )
)
"""

# Run the query and display the resulting frame as the cell output.
df = sql_to_dataframe(sql)
df

Unnamed: 0,Customer_ID,max_order_date,min_order_date,recency,frequency,monetary,recency_score,frequency_score,monetary_score
0,AB-10105,2020-11-19,2020-01-01,323,5,2291.044,3,3,4
1,DK-12895,2020-11-15,2020-01-01,319,3,1621.552,3,2,4
2,BT-11680,2020-12-10,2020-01-01,344,2,101.360,4,1,1
3,DW-13480,2020-05-27,2020-01-01,147,1,598.310,1,1,2
4,CD-12280,2020-11-18,2020-01-01,322,2,1205.584,3,1,3
...,...,...,...,...,...,...,...,...,...
688,MM-17920,2020-12-23,2020-01-01,357,4,242.996,4,2,2
689,NR-18550,2020-06-03,2020-01-01,154,1,1111.698,1,1,3
690,AJ-10945,2020-10-06,2020-01-01,279,2,648.074,2,1,3
691,TS-21205,2020-06-15,2020-01-01,166,1,2942.784,1,1,4
