In [1]:
import pandas as pd
import re
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import apriori, association_rules
import datetime as dt
from dateutil.relativedelta import relativedelta
import os
from collections import Counter  

In [2]:
# Working directory that holds the analysis inputs and receives the reports.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
input_path = 'C:/GMG_R&D/3. 장바구니 분석/1. 한총' 
os.chdir(input_path)
path = os.getcwd()
print(path)

# Report folder: "<year>년_<month>월" of the previous calendar month, with a
# sub-folder named after today's run date.
x = dt.datetime.now()
report_month = x - relativedelta(months=1)
# Year and month must come from the same shifted date; taking the year from
# `x` would label a January run with the current year and month 12.
file_name = f'분기보고서_{report_month.year}년_{report_month.month}월/분기_{x.month}월 {x.day}일'

# exist_ok avoids the separate existence check (and its race) before creating.
os.makedirs(file_name, exist_ok=True)

C:\GMG_R&D\3. 장바구니 분석\1. 한총


In [3]:
#########  Helper function definitions #########
#1. Build the transaction dataset
def dataset_make(dic_category, month_df):
    """Expand an order table into a list of category transactions.

    Parameters
    ----------
    dic_category : mapping of str -> str
        Maps each individual model code to its category label.
    month_df : pandas.DataFrame
        Must contain a 'Model Code (v41)' column of comma-separated model
        codes. The second column (by position) is read as the number of
        times that combination was ordered.

    Returns
    -------
    list of list of str
        For every row, the list of category labels of its model codes,
        repeated once per order. Rows with a count of 0 contribute nothing.

    Raises
    ------
    KeyError
        If a model code is not present in ``dic_category``.
    """
    dataset = []
    # Walk the two columns by position rather than by index label, so frames
    # whose index is not 0..n-1 (e.g. a filtered frame) are handled as well.
    for sku, order_count in zip(month_df['Model Code (v41)'], month_df.iloc[:, 1]):
        code = [dic_category[model_code] for model_code in sku.split(",")]
        # The same list object is appended once per order (no copies made).
        dataset.extend([code] * int(order_count))
    return dataset

#2. Order counts per 2-depth category
def order_2depth(new_rex, dataset):
    """Count, for each 2-depth category, the transactions that contain it.

    Parameters
    ----------
    new_rex : pandas.DataFrame
        Only its 'Category2D' column is read.
    dataset : list of list of str
        Transactions; each inner list holds category labels.

    Returns
    -------
    pandas.DataFrame
        Columns 'Category2D' and 'count', one row per distinct category in
        first-seen order.
    """
    # A transaction counts once when the category text occurs inside any of
    # its labels (substring test, not equality).
    # NOTE(review): a category name contained in another category's label
    # would also be counted; confirm this is intended.
    orders_by_category = {
        category: sum(
            1
            for transaction in dataset
            if any(category in label for label in transaction)
        )
        for category in new_rex['Category2D']
    }

    counts = pd.DataFrame.from_dict(orders_by_category, orient="index").reset_index()
    return counts.rename(columns={'index': 'Category2D', 0: "count"})



#3. Wrap the members of an itemset in a nested list
def frozen_convert(sets) : 
    """Return the members of ``sets`` as a list wrapped in a one-element list."""
    members = [*sets]
    return [members]


#4. Market-basket analysis
def my_basket(dataset, min_support=0.00001, min_lift=1):
    """Mine one-to-one association rules from a list of transactions.

    Parameters
    ----------
    dataset : list of list of str
        Transactions; each inner list holds the item labels of one order.
    min_support : float, default 0.00001
        Minimum support passed to ``apriori``.
    min_lift : float, default 1
        Minimum lift passed to ``association_rules``.

    Returns
    -------
    pandas.DataFrame
        Rules whose antecedent and consequent each hold exactly one item,
        sorted by confidence (descending), with columns 'antecedents',
        'consequents', 'count', 'confidence', 'support', 'lift', 'conviction'.
        The two itemset columns are rendered as strings such as "['item']".
    """
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)  # boolean one-hot matrix
    onehot = pd.DataFrame(te_ary, columns=te.columns_)

    itemsets_data = (
        apriori(onehot, min_support=min_support, use_colnames=True)
        .sort_values('support', ascending=False)
        .reset_index(drop=True)
    )
    rules = (
        association_rules(itemsets_data, metric='lift', min_threshold=min_lift)
        .sort_values('lift', ascending=False)
        .reset_index(drop=True)
    )

    # support * n is a float that can land just below the true integer
    # (e.g. 2.9999999); round before casting so the count is not truncated.
    rules['count'] = (rules['support'] * len(dataset)).round().astype(int)

    # Keep only single-item -> single-item rules.
    is_one_to_one = (rules['antecedents'].map(len) == 1) & (rules['consequents'].map(len) == 1)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of `rules`.
    basket = rules.loc[is_one_to_one].sort_values('confidence', ascending=False).copy()

    # Render each frozenset as the string form of a plain list.
    for side in ('antecedents', 'consequents'):
        basket[side] = basket[side].map(lambda items: str(list(items)))

    return basket[['antecedents','consequents','count','confidence','support','lift','conviction']]

In [4]:
# Load the category table from the clipboard (tab-separated) and cast every
# column to str.
rex = pd.read_clipboard(sep='\t').astype(str)
rex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   depth1            41 non-null     object
 1   Category1D        41 non-null     object
 2   Category2D        41 non-null     object
 3   Category2D_Value  41 non-null     object
dtypes: object(4)
memory usage: 1.4+ KB


In [5]:
# Load the order table from the clipboard (tab-separated), drop its first row,
# and turn the index into the 'Model Code (v41)' column.
# NOTE(review): the dropped first row is presumably a totals/header row; confirm.
df = (
    pd.read_clipboard(sep='\t')[1:]
    .reset_index()
    .rename(columns={"index": "Model Code (v41)"})
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Model Code (v41)        10001 non-null  object
 1   Order (purchase event)  10001 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [6]:
# All orders (no filtering)
ord_df=df

# Collect every SKU of every row into one list
a = ord_df['Model Code (v41)'].tolist()
b = ','.join(a) # join the list into one comma-separated string
c = b.split(',') # split the string back into individual model codes 
model = list(set(c)) # remove duplicates

In [7]:
## 1. Build the category label of every model: depth1_Category1D_Category2D
# Compile each rule's regex once, instead of once per (model, rule) pair, and
# read the rule columns by position so no particular index is required.
category_rules = [
    (re.compile(pattern), depth1 + '_' + category_1d + '_' + category_2d)
    for pattern, depth1, category_1d, category_2d in zip(
        rex['Category2D_Value'], rex['depth1'], rex['Category1D'], rex['Category2D']
    )
]

dic_category_2depth = {}
for m in model:
    # The first rule (in table order) whose pattern is found in the model code
    # wins; models matching no rule get the fallback label '기타'.
    dic_category_2depth[m] = next(
        (label for pattern, label in category_rules if pattern.search(m)),
        '기타',
    )

## 2. Dataset
dataset_2depth = dataset_make(dic_category_2depth, ord_df)


## 4-1. Analysis: order counts per 2-depth category
all_order = order_2depth(rex, dataset_2depth)
# Disabled CSV export, kept for reference:
#all_order.sort_values(["count"]).to_csv(f'{path}/{file_name}/[분기] 전체_주문 건수_2depth.csv',index=False, encoding='utf-8-sig')
## 4-2. Analysis: market-basket analysis
my_basket(dataset_2depth).to_excel(f'{path}/{file_name}/[분기] 전체_장바구니 분석_2depth.xlsx',index=False)

In [8]:
# Co-purchases: keep only rows whose model-code string contains a comma
# (i.e. more than one model), with a non-zero value in the second column.
month_df=df[df['Model Code (v41)'].str.contains(',')]
month_df=month_df[month_df[month_df.columns[1]]!=0]  
month_df=month_df.reset_index(drop=True)

# Collect every SKU of every row into one list
a = month_df['Model Code (v41)'].tolist()
b = ','.join(a) # join the list into one comma-separated string
c = b.split(',') # split the string back into individual model codes 
model = list(set(c)) # remove duplicates

In [11]:
## 1. Build the category label of every model: depth1_Category1D_Category2D
# Compile each rule's regex once, instead of once per (model, rule) pair, and
# read the rule columns by position so no particular index is required.
category_rules = [
    (re.compile(pattern), depth1 + '_' + category_1d + '_' + category_2d)
    for pattern, depth1, category_1d, category_2d in zip(
        rex['Category2D_Value'], rex['depth1'], rex['Category1D'], rex['Category2D']
    )
]

dic_category_2depth = {}
for m in model:
    # The first rule (in table order) whose pattern is found in the model code
    # wins; models matching no rule get the fallback label '기타'.
    dic_category_2depth[m] = next(
        (label for pattern, label in category_rules if pattern.search(m)),
        '기타',
    )

## 2. Dataset
dataset_2depth = dataset_make(dic_category_2depth, month_df)


## 4-1. Analysis: order counts per 2-depth category
to_order = order_2depth(rex, dataset_2depth)
# Disabled CSV export, kept for reference:
#to_order.sort_values(["count"]).to_csv(f'{path}/{file_name}/[분기] 동시구매_주문 건수_2depth.csv',index=False, encoding='utf-8-sig')


In [15]:
# Join the all-orders and co-purchase counts per category, then express the
# co-purchase count as a percentage of the all-orders count.
top = all_order.merge(
    to_order,
    how='inner',
    on='Category2D',
    suffixes=('_all', '_two'),
).assign(**{'rate(%)': lambda counts: (counts['count_two'] / counts['count_all']) * 100})
top

Unnamed: 0,Category2D,count_all,count_two,rate(%)
0,기타 배제,0,0,
1,smartthings,921,388,42.128122
2,기타 제품,253,81,32.01581
3,음식,3004,1464,48.73502
4,기타 소형가전,144,10,6.944444
5,삼성 사운드바,198,35,17.676768
6,삼성 오디오,1268,145,11.435331
7,라이프스타일 오디오,2486,110,4.424779
8,PC 액세서리,382,33,8.638743
9,메모리/스토리지,0,0,


In [18]:
# Export 1: the rate table ordered by rate, highest first.
top_by_rate = top.sort_values('rate(%)', ascending=False)
top_by_rate.to_excel(f'{path}/{file_name}/[분기] sort_top selling search.xlsx', index=False)

# Export 2: the rate table with each category's Category1D attached.
top_with_category_1d = rex[['Category1D', 'Category2D']].merge(top, how='inner', on='Category2D')
top_with_category_1d.to_excel(f'{path}/{file_name}/[분기] 1d_top selling search.xlsx', index=False)