In [1]:
import pandas as pd
#경로는 꼭 새롭게 설정 후 데이터가 가져와졌는지 확인 먼저 필요함
articles = pd.read_csv("../../data/articles_hm.csv")
customers = pd.read_csv("../../data/customer_hm.csv")
transactions = pd.read_csv("../../data/transactions_hm.csv")

In [3]:
#2. 원본 데이터 지키기 위해서 copy
df_cust = customers.copy()
df_tran = transactions.copy()
df_art = articles.copy()

In [22]:
transactions["sales_channel_id"].value_counts(dropna=False)

sales_channel_id
2    729192
1    319383
Name: count, dtype: int64

In [4]:
#3.join을 위해 동일한 타입으로 변환
df_tran['customer_id']= df_tran['customer_id'].astype(str)
df_cust['customer_id']= df_cust['customer_id'].astype(str)

df_tran['article_id']= df_tran['article_id'].astype(str)
df_art['article_id'] = df_art['article_id'].astype(str)


In [5]:
#4. 데이터 크기 및 중복 확인

print("df_tran (복사본) 크기:", df_tran.shape)
print("중복 데이터 수:", df_tran.duplicated().sum())

df_tran (복사본) 크기: (1048575, 5)
중복 데이터 수: 8474


In [6]:
#5.중복된 거래 로그를 제거하여 분석 결과의 왜곡을 방지
df_tran.drop_duplicates(inplace=True)

In [7]:
df_tran.duplicated().sum()

np.int64(0)

In [8]:
#6. 날짜 타입 변환
df_tran['t_dat'] = pd.to_datetime(df_tran['t_dat'], format='%Y-%m-%d')


In [9]:
df_tran.info()

<class 'pandas.DataFrame'>
Index: 1040101 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   t_dat             1040101 non-null  datetime64[us]
 1   customer_id       1040101 non-null  str           
 2   article_id        1040101 non-null  str           
 3   price             1040101 non-null  float64       
 4   sales_channel_id  1040101 non-null  int64         
dtypes: datetime64[us](1), float64(1), int64(1), str(2)
memory usage: 47.6 MB


In [10]:
#7. 판매 채널 컬럼 생성 및 값 변환
df_tran["channel"] = df_tran["sales_channel_id"].map({1: "오프라인", 2: "온라인"})
df_tran["channel"].value_counts() 
numbs = df_tran["channel"].value_counts()
print("온라인:", f"{numbs['온라인']:,}")
print("오프라인:", f"{numbs['오프라인']:,}")
print("전체 거래 건수:", f"{df_tran.shape[0]:,}")

온라인: 721,488
오프라인: 318,613
전체 거래 건수: 1,040,101


In [11]:
df_tran

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,channel
0,2019-11-05,3e2b60b679e62fb49516105b975560082922011dd752ec...,698328010,0.016932,2,온라인
1,2019-05-22,89647ac2274f54c770aaa4b326e0eea09610c252381f37...,760597002,0.033881,2,온라인
2,2019-05-10,2ebe392150feb60ca89caa8eff6c08b7ef1138cd6fdc71...,488561032,0.016932,2,온라인
3,2019-08-26,7b3205de4ca17a339624eb5e3086698e9984eba6b47c56...,682771001,0.033881,2,온라인
4,2019-08-10,3b77905de8b32045f08cedb79200cdfa477e9562429a39...,742400033,0.003220,1,오프라인
...,...,...,...,...,...,...
1048569,2019-01-23,9a34b77f0d5f7aa03ff810e47fdf64be1557cf9fb0fe87...,577675001,0.023712,2,온라인
1048570,2019-03-25,8e900818a1da0e73300a012794615993fbefb43f851430...,766346001,0.025407,2,온라인
1048571,2019-10-18,13aa106eeb8a6fa2f9e2ab888a3a2ee118a3900670f0d8...,811905001,0.016932,2,온라인
1048572,2019-04-06,5f51e21397c1248aafdf9712435bd4be063eb093f56229...,690108001,0.021729,2,온라인


In [12]:
#8. 가격 데이터 분포 확인
df_tran['price'].describe() 
print("price가 0.1 이상:", len(df_tran[df_tran["price"] >= 0.1]))
print("price가 0.2 이상:", len(df_tran[df_tran["price"] >= 0.2]))
print("price가 0.3 이상:", len(df_tran[df_tran["price"] >= 0.3]))
print("price가 0.4 이상:", len(df_tran[df_tran["price"] >= 0.4]))
print("price가 0.5 이상:", len(df_tran[df_tran["price"] >= 0.5]))

price가 0.1 이상: 10704
price가 0.2 이상: 719
price가 0.3 이상: 126
price가 0.4 이상: 29
price가 0.5 이상: 2


In [13]:
#9. 고가 거래 분리 및 채널 확인
high_price_transactions = df_tran[df_tran["price"] >= 0.4]
high_price_transactions['sales_channel_id'].replace({1: 'Offline', 2: 'Online'}).value_counts() 


sales_channel_id
Online    29
Name: count, dtype: int64

In [14]:
#10.월 컬럼 생성
df_tran['year_month'] = df_tran['t_dat'].dt.to_period('M') 

In [15]:
df_tran

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,channel,year_month
0,2019-11-05,3e2b60b679e62fb49516105b975560082922011dd752ec...,698328010,0.016932,2,온라인,2019-11
1,2019-05-22,89647ac2274f54c770aaa4b326e0eea09610c252381f37...,760597002,0.033881,2,온라인,2019-05
2,2019-05-10,2ebe392150feb60ca89caa8eff6c08b7ef1138cd6fdc71...,488561032,0.016932,2,온라인,2019-05
3,2019-08-26,7b3205de4ca17a339624eb5e3086698e9984eba6b47c56...,682771001,0.033881,2,온라인,2019-08
4,2019-08-10,3b77905de8b32045f08cedb79200cdfa477e9562429a39...,742400033,0.003220,1,오프라인,2019-08
...,...,...,...,...,...,...,...
1048569,2019-01-23,9a34b77f0d5f7aa03ff810e47fdf64be1557cf9fb0fe87...,577675001,0.023712,2,온라인,2019-01
1048570,2019-03-25,8e900818a1da0e73300a012794615993fbefb43f851430...,766346001,0.025407,2,온라인,2019-03
1048571,2019-10-18,13aa106eeb8a6fa2f9e2ab888a3a2ee118a3900670f0d8...,811905001,0.016932,2,온라인,2019-10
1048572,2019-04-06,5f51e21397c1248aafdf9712435bd4be063eb093f56229...,690108001,0.021729,2,온라인,2019-04


In [17]:
#11.월별 거래 건수 & 매출액 계산
monthly_sales = df_tran.groupby('year_month').size()
monthly_price = df_tran.groupby('year_month')['price'].sum()
print(monthly_price)

year_month
2019-01    2129.926131
2019-02    1989.217641
2019-03    2374.905504
2019-04    2703.443538
2019-05    2748.199469
2019-06    3088.776976
2019-07    2552.035334
2019-08    1943.422489
2019-09    2559.226862
2019-10    2358.486793
2019-11    2463.769270
2019-12    1985.492149
Freq: M, Name: price, dtype: float64


In [18]:
#12. 최고 매출 월 
best_month = monthly_price.idxmax()
best_value = monthly_price.max()
print("가장 매출이 높은 달:", best_month)
print("그 달의 매출:", best_value) 

가장 매출이 높은 달: 2019-06
그 달의 매출: 3088.776976142


In [21]:
df_tran["price"].min()

np.float64(0.000237288)

In [23]:
df_tran.shape

(1040101, 7)