## 월간 보고용 : 누적분석

In [1]:
import pandas as pd
import re
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import apriori, association_rules
import datetime as dt
import os
from dateutil.relativedelta import relativedelta

### 0. 경로 설정

In [2]:
# Working directory for this analysis; the report folders are created beneath it.
input_path = 'C:/GMG_R&D/3. 장바구니 분석/1. 한총'
os.chdir(input_path)
path = os.getcwd()
print(path)

# The outer folder is named after the month before the run date, so both the
# year and the month must come from that previous month. Taking the year from
# the run date would label a January run as December of the *current* year.
x = dt.datetime.now()
report_month = x - relativedelta(months=1)
file_name = f'월간보고서_{report_month.year}년_{report_month.month}월/누적_{x.month}월 {x.day}일'

# Relative to the working directory set above. exist_ok makes re-runs harmless
# and removes the separate exists() check.
os.makedirs(file_name, exist_ok=True)

C:\GMG_R&D\3. 장바구니 분석\1. 한총


### 1. 한총 데이터 로드

In [3]:
# Read the tab-separated order export that was copied to the clipboard.
# NOTE(review): the result depends on whatever is on the clipboard when this
# cell runs, so the notebook cannot be re-run unattended.
df= pd.read_clipboard(sep='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46536 entries, Model Code (v41) to RF60B91U2W6,WF21T9500KW7T,KQ75QB65-W1B5,DW60T7065SSS
Data columns (total 1 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Order (purchase event)  46536 non-null  int64
dtypes: int64(1)
memory usage: 727.1+ KB


In [4]:
# The clipboard parse leaves the SKU strings in the index, with the header
# label "Model Code (v41)" as the first index entry. Drop that first row and
# move the index into a proper column.
# The guard keeps the cell idempotent: without it, a second run would discard a
# real data row and add a second "Model Code (v41)" column.
if "Model Code (v41)" not in df.columns:
    df = df[1:]
    df = df.reset_index().rename(columns={"index": "Model Code (v41)"})
df

Unnamed: 0,Model Code (v41),Order (purchase event)
0,"GP-FPR190HIBWK,SM-R190NZVAKOO",14590
1,"GP-FPR190HIBLK,SM-R190NZVAKOO",9676
2,"GP-FPR190HIBLK,SM-R190NZKAKOO",8483
3,"GP-FPR190HIBWK,SM-R190NZKAKOO",6120
4,"GP-FPR190HIBWK,SM-R190NZSAKOO",4957
...,...,...
46530,"SM-R860NZKAKOO,SM-R860NZKAKOO,EP-TA800XWKGKR",1
46531,"DF-WK,DF-AS3",1
46532,"sam0017,sam0019,sam0020",1
46533,"1CA10103,1CA10103,1CA10098,1CA10051,36266,36266",1


In [15]:
# Total number of orders over all basket rows; the transaction list built later
# repeats each basket once per order, so len(dataset) should equal this value.
df['Order (purchase event)'].sum()

215551

### 2. 카테고리 파일 로드

In [9]:
# Load the category rule table from the clipboard (tab-separated) and cast
# every column to str in one step.
# NOTE(review): the mapping loop further down reads the columns depth1,
# Category1D, Category2D and Category2D_Value; confirm the copied table
# contains all four before running it.
rex = pd.read_clipboard(sep='\t').astype(str)
rex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Value             9 non-null      object
 1   Category2D_Value  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [6]:
# Collect every distinct SKU that appears in any basket.
# Each basket is a comma-separated SKU string, so join all baskets into one
# string and split it once to get the flat list of SKUs.
sku_column = df['Model Code (v41)']
all_codes = ','.join(sku_column.tolist()).split(',')
model = list(set(all_codes))  # de-duplicated; element order is arbitrary

In [9]:
# Map every SKU to a category label of the form
# "<depth1>_<Category1D>_<Category2D>". Rules are tried in table order and the
# first regex that matches the SKU wins; SKUs that match no rule get '기타'.
dic_category = {}

# Compile each pattern and build each label once up front, instead of repeating
# that work for every (SKU, rule) pair inside the loops.
category_rules = [
    (re.compile(pattern), depth1 + '_' + cat1 + '_' + cat2)
    for pattern, depth1, cat1, cat2 in zip(
        rex['Category2D_Value'], rex['depth1'], rex['Category1D'], rex['Category2D']
    )
]

for m in model:
    for r, label in category_rules:
        if r.search(m):  # r.search(m).group() shows which part of the SKU matched
            dic_category[m] = label
            break
    else:
        # No rule matched this SKU.
        dic_category[m] = '기타'

In [11]:
# Build the transaction list: one list of category labels per order,
# shaped like [[a, b], [a, c], [a, b, c], ...].
dataset = []

# Walk the two columns positionally. Label lookups such as df[col][i] raise
# KeyError whenever the index is not exactly 0..n-1 (e.g. after filtering).
for sku, order_count in zip(df['Model Code (v41)'], df['Order (purchase event)']):
    # Translate each SKU in the basket to its category label.
    code = [dic_category[item] for item in sku.split(",")]
    # Each source row is an aggregated basket; repeat it once per order.
    dataset.extend(code for _ in range(order_count))
    

In [13]:
# Remove items whose top-level category (the text before the first '_') is
# 'nan' or '기타'. Baskets are kept even when they end up empty, so the number
# of transactions does not change.
rm_set = {'nan','기타'}
dataset = [
    [label for label in basket if label.split('_')[0] not in rm_set]
    for basket in dataset
]

## 장바구니 분석

In [14]:
# Number of transactions that will be passed to the encoder below.
len(dataset)

215551

In [16]:
# Market-basket analysis: one-hot encode the transactions, mine frequent
# itemsets, then derive association rules with lift >= 1.
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)  # boolean matrix, one column per item
df2 = pd.DataFrame(te_ary, columns=te.columns_)
itemsets_data = (
    apriori(df2, min_support=0.00001, use_colnames=True)
    .sort_values('support', ascending=False)
    .reset_index(drop=True)
)
rules = (
    association_rules(itemsets_data, metric='lift', min_threshold=1)
    .sort_values('lift', ascending=False)
    .reset_index(drop=True)
)
# support * n is a float and can land just below the true integer count
# (e.g. 2.9999999999999996). astype(int) truncates, so round first to avoid
# reporting a count that is one too low.
rules['count'] = (rules['support'] * len(dataset)).round().astype(int)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,count
0,"(IM_PC/주변기기_노트북, CE_생활가전_청소기, CE_주방가전_큐커)","(IM_PC/주변기기_모니터, CE_생활가전_공기청정기)",0.000014,0.000056,0.000014,1.000000,17962.583333,1.391705e-05,inf,3
1,"(IM_PC/주변기기_모니터, CE_생활가전_공기청정기, CE_생활가전_청소기)","(IM_PC/주변기기_노트북, CE_주방가전_큐커)",0.000028,0.000028,0.000014,0.500000,17962.583333,1.391705e-05,1.999944,3
2,"(IM_PC/주변기기_노트북, CE_주방가전_큐커)","(IM_PC/주변기기_모니터, CE_생활가전_공기청정기, CE_생활가전_청소기)",0.000028,0.000028,0.000014,0.500000,17962.583333,1.391705e-05,1.999944,3
3,"(IM_PC/주변기기_모니터, CE_생활가전_공기청정기)","(IM_PC/주변기기_노트북, CE_생활가전_청소기, CE_주방가전_큐커)",0.000056,0.000014,0.000014,0.250000,17962.583333,1.391705e-05,1.333315,3
4,"(IM_PC/주변기기_노트북, CE_생활가전_공기청정기, CE_생활가전_청소기)","(IM_PC/주변기기_모니터, CE_주방가전_큐커)",0.000046,0.000019,0.000014,0.300000,16166.325000,1.391696e-05,1.428545,3
...,...,...,...,...,...,...,...,...,...,...
12039,(IM_PC/주변기기_모니터),(CE_TV_TV 액세서리),0.002148,0.008406,0.000019,0.008639,1.027711,5.003634e-07,1.000235,3
12040,(CE_TV_TV 액세서리),(CE_주방가전_김치냉장고),0.008406,0.006500,0.000056,0.006623,1.018908,1.033097e-06,1.000124,12
12041,(CE_주방가전_김치냉장고),(CE_TV_TV 액세서리),0.006500,0.008406,0.000056,0.008565,1.018908,1.033097e-06,1.000160,12
12042,(IM_PC/주변기기_PC 액세서리),(CE_생활가전_세탁기),0.002023,0.015992,0.000032,0.016055,1.003969,1.283838e-07,1.000065,6


In [17]:
# Timestamp of this run.
x = dt.datetime.now()

# Number of items an exported rule must have in its consequent.
num_conq=1
# NOTE(review): N looks like a rounding precision for the exported metrics, but
# no active code in the visible cells applies it — confirm or remove.
N=4

def frozen_convert(sets):
    """Return the items of ``sets`` as a list wrapped in a one-element outer list."""
    return [[*sets]]

for num_ant in [1]:
    # Keep only rules with exactly num_ant antecedent items and num_conq
    # consequent items. Series.map(len) replaces two row-wise apply passes.
    size_mask = (
        (rules['antecedents'].map(len) == int(num_ant))
        & (rules['consequents'].map(len) == int(num_conq))
    )
    # Use a dedicated name so the raw order frame `df` is not overwritten
    # (earlier cells stay re-runnable), and take an explicit copy so the column
    # assignments below do not write into a slice of `rules`.
    rules_out = rules[size_mask].sort_values('confidence', ascending=False).copy()
    # Render each frozenset as plain list text, e.g. "['CE_TV_TV']".
    for col in ['antecedents', 'consequents']:
        rules_out[col] = rules_out[col].map(lambda items: str(list(items)))
    # NOTE(review): the file name does not include num_ant, so adding more
    # values to the loop would overwrite this workbook on each iteration.
    rules_out[['antecedents', 'consequents', 'count', 'confidence', 'support', 'lift', 'conviction']].to_excel(f'{path}/{file_name}/누적_장바구니_결과.xlsx')

