In [53]:
import torch.nn as nn
import glob
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import seaborn as sns
import numpy as np
import torch
from torch.utils.data import DataLoader,TensorDataset
from tqdm import tqdm_notebook

In [2]:
# 파이썬 구글 빅쿼리 연동 코드
# json 파일

key_path = glob.glob("*.json")[0]
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials = credentials, 
                         project = credentials.project_id)

In [3]:
# sql 추출 및 데이터 프레임 변환

def sql_to_dataframe(sql:str) -> pd.DataFrame:
    """
    Args:
        sql (str): sql for extraction

    Returns:
        pd.DataFrame: extract data with sql
    """
    query_job = client.query(sql)
    df = query_job.to_dataframe()
    return df

In [32]:
# 데이터 불러오기

sql = """
SELECT
  *,
  CASE
    WHEN recency >= PERCENTILE_CONT(recency,0.75) OVER() THEN 4
    WHEN recency >= PERCENTILE_CONT(recency,0.5) OVER() AND recency < PERCENTILE_CONT(recency,0.75) OVER() THEN 3
    WHEN recency >= PERCENTILE_CONT(recency,0.25) OVER() AND recency < PERCENTILE_CONT(recency,0.5) OVER() THEN 2
    WHEN recency < PERCENTILE_CONT(recency,0.25) OVER() THEN 1
  END AS recency_score,
  CASE
    WHEN frequency >= 1 AND frequency <= 2 THEN 1
    WHEN frequency >= 3 AND frequency <= 4 THEN 2
    WHEN frequency >= 5 AND frequency <= 6 THEN 3
    WHEN frequency >= 7 AND frequency <= 8 THEN 4
  END AS frequency_score,
  CASE
    WHEN monetary >= PERCENTILE_CONT(monetary,0.75) OVER() THEN 4
    WHEN monetary >= PERCENTILE_CONT(monetary,0.5) OVER() AND monetary < PERCENTILE_CONT(monetary,0.75) OVER() THEN 3
    WHEN monetary >= PERCENTILE_CONT(monetary,0.25) OVER() AND monetary < PERCENTILE_CONT(monetary,0.5) OVER() THEN 2
    WHEN monetary < PERCENTILE_CONT(monetary,0.25) OVER() THEN 1
  END AS monetary_score
FROM(
  SELECT
    Customer_ID,
    max_order_date,
    CAST("2020-01-01" AS DATE) AS min_order_date,
    DATE_DIFF(max_order_date, CAST("2020-01-01" AS DATE), DAY) AS recency,
    frequency,
    monetary
  FROM(
    SELECT
      Customer_ID,
      MAX(Order_Date) AS max_order_date,
      COUNT(DISTINCT Order_ID) AS frequency,
      SUM(Sales) AS monetary
    FROM `rfm-analysis-392707.rfm_analysis.rfm_analysis`
    GROUP BY Customer_ID
  )
)
"""
df = sql_to_dataframe(sql)
df

Unnamed: 0,Customer_ID,max_order_date,min_order_date,recency,frequency,monetary,recency_score,frequency_score,monetary_score
0,AB-10105,2020-11-19,2020-01-01,323,5,2291.044,3,3,4
1,DK-12895,2020-11-15,2020-01-01,319,3,1621.552,3,2,4
2,BT-11680,2020-12-10,2020-01-01,344,2,101.360,4,1,1
3,DW-13480,2020-05-27,2020-01-01,147,1,598.310,1,1,2
4,CD-12280,2020-11-18,2020-01-01,322,2,1205.584,3,1,3
...,...,...,...,...,...,...,...,...,...
688,MM-17920,2020-12-23,2020-01-01,357,4,242.996,4,2,2
689,NR-18550,2020-06-03,2020-01-01,154,1,1111.698,1,1,3
690,AJ-10945,2020-10-06,2020-01-01,279,2,648.074,2,1,3
691,TS-21205,2020-06-15,2020-01-01,166,1,2942.784,1,1,4


In [33]:
n = 100
weights = []
for i in range(n + 1):
    for j in range(n + 1 - i):
        weights.append([j / n , i/n, (n-i-j)/n])

In [56]:
max_std = 0
total_monetary = df["monetary"].sum()
total_monetary
max_std_monetary = 0
for w in tqdm_notebook(weights):
    score = df["recency_score"] * w[0] + df["frequency_score"] * w[1] + df["monetary_score"] * w[2]
    scores = pd.cut(score, bins = [0,1,2,3,4],labels= ["4등급","3등급","2등급","1등급"])
    data = pd.DataFrame({"rank":scores, "monetary":df["monetary"]})
    rank_monetary = data.groupby(["rank"]).agg({"monetary":sum}).reset_index()
    rank_monetary = rank_monetary["monetary"].map(lambda x: x / total_monetary)
    std_monetary = rank_monetary.std()
    if std_monetary > max_std_monetary:
        max_std_monetary = std_monetary
        best_weight = [w[0],w[1],w[2]]
print("best Recency의 가중치는: {}, best Frequency의 가중치는: {}, best Monetary의 가중치는: {} 입니다.".format(best_weight[0],best_weight[1],best_weight[2]))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for w in tqdm_notebook(weights):


  0%|          | 0/5151 [00:00<?, ?it/s]

best Recency의 가중치는: 0.2, best Frequency의 가중치는: 0.0, best Monetary의 가중치는: 0.8


In [81]:
final_score = df["recency_score"] * best_weight[0] + df["frequency_score"] * best_weight[1] + df["monetary_score"] * best_weight[2]
final_scores = pd.cut(final_score, bins = [0,1,2,3,4],labels= ["4등급","3등급","2등급","1등급"])
data = pd.DataFrame({"customer_id": df["Customer_ID"], "rank":final_scores, "monetary":df["monetary"]})
data = data.groupby(["rank"]).agg({"customer_id":"count","monetary":"sum"})
data["contribution_ratio"] = data["monetary"].map(lambda x: x / total_monetary * 100)
data = data.reset_index()
data

Unnamed: 0,rank,customer_id,monetary,contribution_ratio
0,4등급,64,5132.122,0.699948
1,3등급,201,43391.5412,5.917981
2,2등급,150,95145.8848,12.976528
3,1등급,278,589545.7072,80.405543


In [87]:
data.sort_values("rank",ascending=False)

Unnamed: 0,rank,customer_id,monetary,contribution_ratio
3,1등급,278,589545.7072,80.405543
2,2등급,150,95145.8848,12.976528
1,3등급,201,43391.5412,5.917981
0,4등급,64,5132.122,0.699948
