<a href="https://colab.research.google.com/github/sunnyl94/Data_Analysis/blob/main/Retail_Case_Study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **마케팅 고객 데이터 분석**



> Business Problem

> Data Description

출처 https://www.kaggle.com/datasets/darpan25bajaj/retail-case-study-data



In [39]:
# 필요한 library 설치
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [40]:
# 데이터 임포트
customer = pd.read_csv("/content/drive/MyDrive/workspace/Data Analysis/Retail Case Study/data/Customer.csv")
transaction = pd.read_csv("/content/drive/MyDrive/workspace/Data Analysis/Retail Case Study/data/Transactions.csv")
prod_info = pd.read_csv("/content/drive/MyDrive/workspace/Data Analysis/Retail Case Study/data/prod_cat_info.csv")

In [41]:
print("customer: ", customer.shape)
print("transaction: ", transaction.shape)
print("prod_info: ", prod_info.shape)

customer:  (5647, 4)
transaction:  (23051, 10)
prod_info:  (23, 4)


# **데이터 전처리**




## customer table

In [42]:
# DOB (Date of Birth) 컬럼이 object 타입으로 설정되어 있어 날짜로 변환 필요
customer.dtypes

customer_Id      int64
DOB             object
Gender          object
city_code      float64
dtype: object

In [43]:
customer["DOB"] = pd.to_datetime(customer["DOB"])
customer.dtypes

customer_Id             int64
DOB            datetime64[ns]
Gender                 object
city_code             float64
dtype: object

## transaction table

In [44]:
# null value가 없다
transaction.isnull().sum()

transaction_id      0
cust_id             0
tran_date           0
prod_subcat_code    0
prod_cat_code       0
Qty                 0
Rate                0
Tax                 0
total_amt           0
Store_type          0
dtype: int64

In [45]:
# tran_date 컬럼이 object 타입으로 설정되어 있어 날짜로 변환이 필요
transaction.dtypes

transaction_id        int64
cust_id               int64
tran_date            object
prod_subcat_code      int64
prod_cat_code         int64
Qty                   int64
Rate                  int64
Tax                 float64
total_amt           float64
Store_type           object
dtype: object

In [46]:
transaction["tran_date"] = pd.to_datetime(transaction['tran_date'])
transaction.dtypes

transaction_id               int64
cust_id                      int64
tran_date           datetime64[ns]
prod_subcat_code             int64
prod_cat_code                int64
Qty                          int64
Rate                         int64
Tax                        float64
total_amt                  float64
Store_type                  object
dtype: object

## prod_info table

In [47]:
# null value가 없다
prod_info.isnull().sum()

prod_cat_code        0
prod_cat             0
prod_sub_cat_code    0
prod_subcat          0
dtype: int64

In [48]:
# 각 컬럼 타입들이 잘 매칭되어 있다
prod_info.dtypes

prod_cat_code         int64
prod_cat             object
prod_sub_cat_code     int64
prod_subcat          object
dtype: object

## 파생 데이터

> 주문 건수, 취소 건수, 상품 전환율을 분석하기 위해 거래 상태를 구분할 수 있는 컬럼이 필요

> transaction table에 "Order_status" 컬럼을 추가
*   상품이 취소 되었으면 "cancel"
*   상품이 주문 되었으면 "order"







In [49]:
transaction["order_status"] = np.where(transaction["Qty"] < 0, "cancel", "order")
transaction.head()

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type,order_status
0,80712190438,270351,2014-02-28,1,1,-5,-772,405.3,-4265.3,e-Shop,cancel
1,29258453508,270384,2014-02-27,5,3,-5,-1497,785.925,-8270.925,e-Shop,cancel
2,51750724947,273420,2014-02-24,6,5,-2,-791,166.11,-1748.11,TeleShop,cancel
3,93274880719,271509,2014-02-24,11,6,-3,-1363,429.345,-4518.345,e-Shop,cancel
4,51750724947,273420,2014-02-23,6,5,-2,-791,166.11,-1748.11,TeleShop,cancel


In [50]:
transaction.shape

(23051, 11)



> transaction table에 "confirmed" 컬럼을 추가

1.   상품이 [주문 후 취소] 되었거나 혹은 원래 [취소] 상태였으면 되었으면 "cancel"
2.   상품이 주문 [완료] 되었으면 "confirmed"





In [51]:
# 주문이 완전히 취소된 행만을 추출한 후 "Order_status"를 새로 추가할 "confirmed"라는 컬럼명으로 바꾼다
cancel_confirmed = transaction[transaction["order_status"] == "cancel"][["transaction_id", "order_status"]].drop_duplicates().rename(columns={"order_status":"confirmed"})

# transaction table과 cancel_confirmed table을 합쳐 주문 후 취소한 상품 표시
transaction = pd.merge(transaction, cancel_confirmed, on="transaction_id", how="left")

# NaN value를 주문 되었다는 "confirmed"로 바꾸기
transaction["confirmed"] = transaction["confirmed"].replace(np.nan, "confirm")

transaction.head()

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type,order_status,confirmed
0,80712190438,270351,2014-02-28,1,1,-5,-772,405.3,-4265.3,e-Shop,cancel,cancel
1,29258453508,270384,2014-02-27,5,3,-5,-1497,785.925,-8270.925,e-Shop,cancel,cancel
2,51750724947,273420,2014-02-24,6,5,-2,-791,166.11,-1748.11,TeleShop,cancel,cancel
3,93274880719,271509,2014-02-24,11,6,-3,-1363,429.345,-4518.345,e-Shop,cancel,cancel
4,51750724947,273420,2014-02-23,6,5,-2,-791,166.11,-1748.11,TeleShop,cancel,cancel


In [52]:
num_cancel = transaction[transaction["order_status"] == "cancel"]["order_status"].count()
num_order = transaction[transaction["order_status"] == "order"]["order_status"].count()
num_confirm = transaction[transaction["confirmed"] == "confirm"]["confirmed"].count()

print(num_order, num_cancel, num_order - num_cancel, num_confirm)

20876 2175 18701 18819



> 문제: 총 주문 건수에서 총 취소 건수를 뺀 결과값과 "confirmed" 컬럼에서 확정된 총 주문수 불일치<br>
이상치 혹은 결측치가 존재한다는 뜻이기에 제거 필요




## 이상치 / 결측치 데이터 처리

### 이상치

In [53]:
transaction["transaction_id"].value_counts()

4170892941     4
32263938079    4
426787191      4
38053958046    3
89232425133    3
              ..
90835310705    1
36310127403    1
8422252533     1
96247253460    1
77960931771    1
Name: transaction_id, Length: 20876, dtype: int64



> 하나의 transaction id에 최대 2개 row가 존재할 수 있지만 확인 결과 3개 이상의 row가 있는것이 확인 됨

> 해결 방법: 중복으로 들어간 주문들을 제거해야 한다




In [54]:
# transaction id는 같아도 tran_date가 다를 수 있기 때문에 drop_duplicates()는 사용 불가
# transaction id와 order_status를 비교 후 중복으로 들어간 행만 제거 필요


# transaction id와 order_status의 내용을 합친 리스트를 만들고 transaction table에 추가
key = map(lambda id, status : str(id) + str(status), transaction["transaction_id"], transaction["order_status"])
transaction["key"] = list(key)
transaction.head()

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type,order_status,confirmed,key
0,80712190438,270351,2014-02-28,1,1,-5,-772,405.3,-4265.3,e-Shop,cancel,cancel,80712190438cancel
1,29258453508,270384,2014-02-27,5,3,-5,-1497,785.925,-8270.925,e-Shop,cancel,cancel,29258453508cancel
2,51750724947,273420,2014-02-24,6,5,-2,-791,166.11,-1748.11,TeleShop,cancel,cancel,51750724947cancel
3,93274880719,271509,2014-02-24,11,6,-3,-1363,429.345,-4518.345,e-Shop,cancel,cancel,93274880719cancel
4,51750724947,273420,2014-02-23,6,5,-2,-791,166.11,-1748.11,TeleShop,cancel,cancel,51750724947cancel


In [55]:
# transaction table을 cust_id, transaction_id, tran_date 순으로 정렬 
transaction.sort_values(by=["cust_id", "transaction_id", "tran_date"], inplace=True, ignore_index=True)

In [56]:
# duplicate가 있는지 확인
# 1이면 중복이 아니고 0이면 중복
transaction["chk"] = np.where(transaction["key"] != transaction["key"].shift(-1), 1, 0)

# 1인것만 남기기
filtered_trans = transaction[transaction["chk"] == 1].reset_index(drop=True)

In [57]:
num_cancel = filtered_trans[filtered_trans["order_status"] == "cancel"]["order_status"].count()
num_order = filtered_trans[filtered_trans["order_status"] == "order"]["order_status"].count()
num_confirm = filtered_trans[filtered_trans["confirmed"] == "confirm"]["confirmed"].count()

print(num_order, num_cancel, num_order - num_cancel, num_confirm)

20876 2057 18819 18819


### 결측치

In [58]:
# Gender와 city_code 컬럼에 각각 2개의 null value가 존재
customer.isnull().sum()

customer_Id    0
DOB            0
Gender         2
city_code      2
dtype: int64

In [59]:
# Gener는 M,F,none으로 세개의 범주를 만들어 결측치 해결
customer.fillna(value={"Gender":"none"}, inplace=True)

In [60]:
# city_code는 최빈값으로 결측치 해결
customer.fillna(value={"city_code":customer["city_code"].mode()[0]}, inplace=True)

## 테이블 조인

> Transaction Table안에 category code가 존재하지만 명칭이 없기 때문에 Transaction table과 prod_cat_info Table을 조인 필요



In [61]:
filtered_trans.head()

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type,order_status,confirmed,key,chk
0,8410316370,266783,2013-02-20,4,1,1,869,91.245,960.245,e-Shop,order,confirm,8410316370order,1
1,16999552161,266783,2013-02-09,10,5,2,835,175.35,1845.35,e-Shop,order,confirm,16999552161order,1
2,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,order,cancel,25890929042order,1
3,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,cancel,cancel,25890929042cancel,1
4,98477711300,266783,2012-10-21,4,1,3,93,29.295,308.295,TeleShop,order,confirm,98477711300order,1


In [62]:
prod_info.head()

Unnamed: 0,prod_cat_code,prod_cat,prod_sub_cat_code,prod_subcat
0,1,Clothing,4,Mens
1,1,Clothing,1,Women
2,1,Clothing,3,Kids
3,2,Footwear,1,Mens
4,2,Footwear,3,Women


In [63]:
# prod_cat_code 컬럼명은 같지만 prod_subcat_code 컬럼명은 다르기 때문에 조인을 쉽게 하기 위해 prod_info 테이블의 컬럼명을 transaction 테이블의 컬럼명과 똑같이 만들어 준다

prod_info.rename(columns= {"prod_sub_cat_code":"prod_subcat_code"}, inplace=True)
prod_info.head()

Unnamed: 0,prod_cat_code,prod_cat,prod_subcat_code,prod_subcat
0,1,Clothing,4,Mens
1,1,Clothing,1,Women
2,1,Clothing,3,Kids
3,2,Footwear,1,Mens
4,2,Footwear,3,Women


In [64]:
# prod_info 테이블과 transaction 테이블 조인 진행

new_data = pd.merge(
    left = filtered_trans,
    right = prod_info,
    on = ["prod_cat_code", "prod_subcat_code"],
    how = "left"
)

new_data.head()

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type,order_status,confirmed,key,chk,prod_cat,prod_subcat
0,8410316370,266783,2013-02-20,4,1,1,869,91.245,960.245,e-Shop,order,confirm,8410316370order,1,Clothing,Mens
1,16999552161,266783,2013-02-09,10,5,2,835,175.35,1845.35,e-Shop,order,confirm,16999552161order,1,Books,Non-Fiction
2,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,order,cancel,25890929042order,1,Footwear,Mens
3,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,cancel,cancel,25890929042cancel,1,Footwear,Mens
4,98477711300,266783,2012-10-21,4,1,3,93,29.295,308.295,TeleShop,order,confirm,98477711300order,1,Clothing,Mens


# 데이터 분석

## 상품 전환율 (주문 거래 건 중 취소가 발생되지 않고 구매가 확정된 거래 건 비중)



> 카테고리별로 상품 전환율의 차이는 어떻게 되는가?

* 고관여 상품군의 상품 전환율이 높을 것인가?<br>
        고관여 상픔의 특징
        1. 구매 전 상품에 대한 탐색 과정이 길다
        2. 거래 비용 발생이 상대적으로 크다



In [121]:
# prod_cat별 최종 구매 및 취소 수
cvr = new_data.groupby(["prod_cat"])["confirmed"].value_counts().unstack()

# 전체 구매
cvr["total"] = new_data.groupby(["prod_cat"])["confirmed"].count()

# 상품 전환율
# cvr["CVR(%)"] = ((cvr["confirm"] / cvr["total"]) * 100).round(1)

In [110]:
# prod_subcat별 최종 구매 및 취소 수
cvr_subcat = new_data.groupby(["prod_cat", "prod_subcat"])["confirmed"].value_counts().unstack()

# 전체 구매
cvr_subcat["total"] = new_data.groupby(["prod_cat", "prod_subcat"])["confirmed"].count()

# 상품 전환율
# cvr_subcat["CVR(%)"] = ((cvr_subcat["confirm"] / cvr_subcat["total"]) * 100).round(1)

In [133]:
temp = cvr

In [114]:
type(cvr_subcat)

pandas.core.frame.DataFrame

In [131]:
cvr.index

Index(['Bags', 'Books', 'Clothing', 'Electronics', 'Footwear',
       'Home and kitchen'],
      dtype='object', name='prod_cat')

In [134]:
sum = temp.sum()
sum.name = "Total"

temp = temp.append(sum.transpose())
temp

confirmed,cancel,confirm,total
prod_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bags,392,1593,1985
Books,1106,4933,6039
Clothing,540,2406,2946
Electronics,784,4093,4877
Footwear,544,2436,2980
Home and kitchen,748,3358,4106
Total,4114,18819,22933


In [140]:
temp = cvr_subcat
sum = temp.sum()
sum.name = "(Total, Total)"

temp = temp.append(sum.transpose())
temp

confirmed,cancel,confirm,total,CVR(%)
"(Bags, Mens)",188.0,809.0,997.0,81.1
"(Bags, Women)",204.0,784.0,988.0,79.4
"(Books, Academic)",194.0,766.0,960.0,79.8
"(Books, Children)",192.0,838.0,1030.0,81.4
"(Books, Comics)",188.0,834.0,1022.0,81.6
"(Books, DIY)",160.0,825.0,985.0,83.8
"(Books, Fiction)",202.0,839.0,1041.0,80.6
"(Books, Non-Fiction)",170.0,831.0,1001.0,83.0
"(Clothing, Kids)",166.0,818.0,984.0,83.1
"(Clothing, Mens)",178.0,785.0,963.0,81.5


In [141]:
cvr_subcat

Unnamed: 0_level_0,confirmed,cancel,confirm,total,CVR(%)
prod_cat,prod_subcat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bags,Mens,188,809,997,81.1
Bags,Women,204,784,988,79.4
Books,Academic,194,766,960,79.8
Books,Children,192,838,1030,81.4
Books,Comics,188,834,1022,81.6
Books,DIY,160,825,985,83.8
Books,Fiction,202,839,1041,80.6
Books,Non-Fiction,170,831,1001,83.0
Clothing,Kids,166,818,984,83.1
Clothing,Mens,178,785,963,81.5


In [150]:
new_data.pivot_table(index=['prod_cat', 'prod_subcat'], columns="confirmed", values= "transaction_id", aggfunc=["count", "sum"])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,count,sum,sum
Unnamed: 0_level_1,confirmed,cancel,confirm,cancel,confirm
prod_cat,prod_subcat,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Bags,Mens,188,809,9075657447336,40523265674638
Bags,Women,204,784,10429418873878,38784878884432
Books,Academic,194,766,10231076114274,36840140450398
Books,Children,192,838,9504809291572,41502988349769
Books,Comics,188,834,10025758756900,41694496428249
Books,DIY,160,825,8115648913310,41393664410179
Books,Fiction,202,839,10728660905298,42502498613104
Books,Non-Fiction,170,831,8551694929606,40934663253062
Clothing,Kids,166,818,7037544018444,39922092593539
Clothing,Mens,178,785,10249611175316,39370609724924


In [147]:
new_data

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type,order_status,confirmed,key,chk,prod_cat,prod_subcat
0,8410316370,266783,2013-02-20,4,1,1,869,91.245,960.245,e-Shop,order,confirm,8410316370order,1,Clothing,Mens
1,16999552161,266783,2013-02-09,10,5,2,835,175.350,1845.350,e-Shop,order,confirm,16999552161order,1,Books,Non-Fiction
2,25890929042,266783,2011-09-23,1,2,4,1321,554.820,5838.820,e-Shop,order,cancel,25890929042order,1,Footwear,Mens
3,25890929042,266783,2011-09-24,1,2,-4,-1321,554.820,-5838.820,e-Shop,cancel,cancel,25890929042cancel,1,Footwear,Mens
4,98477711300,266783,2012-10-21,4,1,3,93,29.295,308.295,TeleShop,order,confirm,98477711300order,1,Clothing,Mens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22928,81382444243,275264,2011-10-08,12,6,4,587,246.540,2594.540,e-Shop,order,confirm,81382444243order,1,Home and kitchen,Tools
22929,94712826085,275264,2011-08-05,10,5,5,221,116.025,1221.025,TeleShop,order,confirm,94712826085order,1,Books,Non-Fiction
22930,7214136016,275265,2011-12-17,1,4,1,222,23.310,245.310,TeleShop,order,confirm,7214136016order,1,Bags,Mens
22931,24113900219,275265,2013-04-03,2,6,3,719,226.485,2383.485,Flagship store,order,confirm,24113900219order,1,Home and kitchen,Furnishing
