## CJ더마켓 고객 주문 데이터를 활용한 프라임 회원 예측 모델링
- 팀명 : 대상이조
- 팀원 : 박시현, 손민규, 임성연, 정예린

### train 데이터 전처리

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('tmk_bda_train.csv')
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,F,2,Y,20230124,N
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,M,3,Y,20230124,N
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,F,4,N,20230125,N
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,F,4,N,20230126,Y
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,M,4,N,20230125,Y
...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,M,5,Y,20230102,Y
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,F,3,N,20230102,N
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,F,4,N,20230101,N
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,M,4,Y,20230101,Y


In [3]:
# 결측값 확인
df.isnull().sum()

scd              0
product_name     0
net_order_qty    0
net_order_amt    0
gender           0
age_grp          0
employee_yn      0
order_date       0
prime_yn         0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45875 entries, 0 to 45874
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   scd            45875 non-null  int64  
 1   product_name   45875 non-null  object 
 2   net_order_qty  45875 non-null  int64  
 3   net_order_amt  45875 non-null  float64
 4   gender         45875 non-null  object 
 5   age_grp        45875 non-null  int64  
 6   employee_yn    45875 non-null  object 
 7   order_date     45875 non-null  int64  
 8   prime_yn       45875 non-null  object 
dtypes: float64(1), int64(4), object(4)
memory usage: 3.2+ MB


In [5]:
# 프라임 회원 유무 인코딩 (1 = 프라임 회원, 0 = 일반 회원)
map_prime = {'N':0, 'Y':1}
df['prime_yn'] = df['prime_yn'].map(map_prime)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,F,2,Y,20230124,0
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,M,3,Y,20230124,0
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,F,4,N,20230125,0
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,F,4,N,20230126,1
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,M,4,N,20230125,1
...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,M,5,Y,20230102,1
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,F,3,N,20230102,0
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,F,4,N,20230101,0
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,M,4,Y,20230101,1


In [6]:
# 성별 인코딩 (1 = Male, 0 = Female)
map_gender = {'F':0, 'M':1}
df['gender'] = df['gender'].map(map_gender)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1
...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1


In [7]:
# 구매일자 인코딩
df['weekday'] = df['order_date'].apply(lambda x: pd.to_datetime(str(x), format='%Y-%m-%d'))
df['weekday'] = df['weekday'].dt.weekday
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn,weekday
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0,1
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0,1
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0,2
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1,3
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1,2
...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1,0
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0,0
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0,6
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1,6


In [8]:
df['weekday'].value_counts()

6    12022
0     8246
3     7707
1     5675
4     4786
5     4638
2     2801
Name: weekday, dtype: int64

In [9]:
# 1월 23일 월, 1월 24일 화 : 설 연휴
df['weekday'] = np.where((df['weekday']==5) | (df['weekday']==6) | (df['order_date']==2023-1-23) | (df['order_date']==2023-1-24), 0, 1)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn,weekday
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0,1
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0,1
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0,1
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1,1
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1,1
...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1,1
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0,1
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0,0
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1,0


In [10]:
# 평일 / 주말 및 공휴일로 분류 (1 = 평일, 0 = 주말 및 공휴일)
df['weekday'].value_counts()

1    29215
0    16660
Name: weekday, dtype: int64

In [11]:
# 이벤트 할인 제품 총 구매 횟수 (event_product)
# '['가 포함되어 있는 제품
# 식물성, 냉동, 배송, 유산균, 눈건강, 피부건강, 1BOX는 이벤트에서 제외
df['event_product'] = np.where((df['product_name'].str.contains(r'\[')) & (-df['product_name'].str.contains('식물성|냉동|배송|유산균|눈건강|피부건강|1BOX|쿡킷')), 1, 0)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn,weekday,event_product
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0,1,0
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0,1,0
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0,1,0
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1,1,0
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1,1,0
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0,1,1
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0,0,1
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1,0,0


In [12]:
# 설 연휴 데이터 제거
#df = df[~df['product_name'].str.contains('23설', na=False, case=False)]
#df

In [13]:
# cj더마켓 제품 크롤링
cj = pd.read_csv('cj_category.csv')
cj = cj.drop('Unnamed: 0', axis=1)
cj = cj.drop_duplicates()
cj.columns = ['category', 'product_name']
cj

Unnamed: 0,category,product_name
0,0,[생산직송]비비고 포기배추김치 5kg+총각김치 2kg
1,0,[생산직송]비비고 포기김치 5kg+열무김치 900gX2개
2,0,비비고 순살 고등어구이 60g
3,0,비비고 소고기 미역국 500gx18개(1box)
4,0,비비고 소고기 미역국 500gX6개
...,...,...
1722,11,크레잇 베이컨 1kg
1723,11,요리당5kg
1724,11,크레잇 햄야채볶음밥 280g
1725,11,쉐프솔루션 고기팡팡 미트볼1kg


- 0 = 국/김치/김/반찬/두부
- 1 = 스팸/닭가슴살/소시지
- 2 = 신선식품
- 3 = 만두/피자/치킨
- 4 = 핫도그/떡볶이/간식
- 5 = 돈까스/함박/구이
- 6 = 밥/죽/면
- 7 = 밀키트
- 8 = 건강식품
- 9 = 음료/생수/시럽
- 10 = 양념/소스/가루/오일
- 11 = 대용량 식자제

In [14]:
# 중복된 카테고리로 분류되어 있는 제품들이 있음을 확인
cj['product_name'].value_counts()

프레시웨이 간식/아이스크림 골라담기            3
백설 하얀설탕 5kg                    2
울트라레귤러컷 냉동감자 2kg               2
고메 삼선해물볶음밥 420g                2
해찬들 100% 우리쌀 매운 태양초 고추장 3kg    2
                              ..
비비고 청양고추 찐만두 392gX2개           1
고메 미니치즈너겟 400g                 1
비비고 왕만두490gx2개                 1
비비고 청양고추 찐만두 168g              1
크레잇 생활반찬 고추송송 고기말이1kg          1
Name: product_name, Length: 1667, dtype: int64

In [15]:
cj = cj.astype({'category': 'string'})
cj.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1724 entries, 0 to 1726
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   category      1724 non-null   string
 1   product_name  1724 non-null   object
dtypes: object(1), string(1)
memory usage: 40.4+ KB


In [16]:
# 이후 category 변수를 표현하기 위해 중복 카테고리 정보도 버리지 않음
cj = cj.groupby('product_name').agg({'category': lambda x: ",".join(set(x))})
cj

Unnamed: 0_level_0,category
product_name,Unnamed: 1_level_1
(냉동) 비비고 테이블 본갈비탕 700g,0
(냉동) 비비고 테이블 본갈비탕 700gX2개 + 특설렁탕 700gX2개 + 특양지곰탕 700gX2개,0
(냉동) 비비고 테이블 본갈비탕 700gX3개,0
(냉동) 비비고 테이블 특 선물세트 (본갈비탕700g + 특양지곰탕700g + 특설렁탕700g),0
(냉동) 비비고 테이블 특설렁탕 700g,0
...,...
헬씨누리 침향환 환심 10환,8
헬씨누리 침향환 환심 10환X6입(1BOX),8
호도과자용가루5 11kg,11
훈제대란 20구,0


In [17]:
cj_new = pd.merge(df, cj, on='product_name', how='left')
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn,weekday,event_product,category
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0,1,0,
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0,1,0,1
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0,1,0,3
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1,1,0,3
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1,1,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1,1,0,
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0,1,1,
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0,0,1,
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1,0,0,5


In [18]:
cj_new.isnull().sum()

scd                  0
product_name         0
net_order_qty        0
net_order_amt        0
gender               0
age_grp              0
employee_yn          0
order_date           0
prime_yn             0
weekday              0
event_product        0
category         21904
dtype: int64

In [19]:
# 결측치 12로 대체
cj_new = cj_new.fillna('12')

In [20]:
# 배정되지 않은 카테고리에 대하여 cj더마켓 제품들을 고려하여 리스트 생성
# 리스트로 잘 분류될 수 있도록 list_10, 6, 8는 분류 순서 고려
# list_10를 첫번째로
list_10 = ['포도씨유', '양념', '올리브유', '쌈장', '소스', '다진마늘', '고추장', '참치액', '가루', '기름', '맛술',
          '드레싱', '시즈닝', '카놀라유', '간장', '올리브유', '다시마', '요리유', '비빔장', '된장', '식초', '다시다',
          '올리고당', '설탕', '액젓', '믹스', '천일염', '참깨', '굴소스', '가루', '옥수수유', '해바라기씨유', '쌀엿']
# list_6를 두번째로
list_6 = ['햇반', '죽', '냉면', '짬뽕', '동치미', '메밀', '우동', '잡채', '짜장', '밀면', '파스타', '쌀국수', 
          '야끼소바', '콩국수', '소면', '당면', '죽', '스파게티', '덮밥', '쫄면', '수프', '마라탕', '리조또', 
          '떡볶이떡', '칼국수', '뇨끼', '밸런스밀', '병아리콩']
# list_8를 세번째로
list_8 = ['오메가', '루테인', '유산균', '영양', '전립소', '비타민', '팻다운', '리턴업', '바이오코어', '홍삼', '흑삼',
          '콜라겐', '멀티', '배도라지', '양배추', '한뿌리', '닥터뉴트리', '환심']

list_0 = ['김치', '구이', '탕', '미역국', '육개장', '국물', '두부', '찌개', '무국', '국', '김', '유부', '단무지', 
          '메추리알', '쌈무', '볶음', '어묵', '조림', '밥이랑', '육수', '조림', '마늘', '란', '오이지', '장아찌']
list_1 = ['닭가슴살', '햄', '베이컨', '후랑크', '비엔나', '스팸', '통삼겹', '소시지', '킬바사', '더블에이징', '부어스트','']
list_2 = ['오리', '배', '고구마', '망고', '블루베리', '오렌지', '양갈비', '육포', '랍스터', '삼겹살', '목심살', '사과', 
          '토마토', '키위', '매실', '항정살', '한우', '청도', '스위트콘', '대추', '명란젓', '새우튀김', '황도', '양파', 
          '갈비','감자', '한돈', '날개', '닭', '야자', '버섯', '멜론']
list_3 = ['피자', '치킨', '교자', '순살', '너겟', '만두', '탕수육', '봉', '구이', '윙', '칠리새우', '깐풍기', 
          '닭강정', '새우튀김']
list_4 = ['츄러스', '밤', '핫도그', '쁘띠첼', '떡볶이' '바삭칩', '맥스봉', '핫랩', '면볶이', '고메 베이커리', 
          '냉동감자', '팝콘', '찹쌀떡', '스크림', '아몬드']
list_5 = ['스테이크', '카츠', '미트볼', '떡갈비', '동그랑땡', '까스', '불고기', '완자', '너비아니', '고기말이', '바베큐바']
list_7 = ['쿡킷', '밀키트']
list_9 = ['아이시스', '트레비', '삼다수', '콜라', '석류', '토레타', '컨디션', '사이다', '얼티브', 'W차', '파우더', '청', 
          '미초', '커피', '시럽', '식혜', '아이누리', '플리또', '메티에', '보리']
list_11 = ['15kg', '3.6L', '5kg', '1.4kg', '1.5L', '1.25kg', '3kg', '20kg', '6.5kg', '14kg', '1.45kg', '2.45kg', 
           '11kg', '9kg', '25kg']

In [21]:
list_10 = '|'.join(list_10)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_10)), 'category'] = '10'

In [22]:
list_6 = '|'.join(list_6)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_6)), 'category'] = '6'

In [23]:
list_8 = '|'.join(list_8)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_8)), 'category'] = '8'

In [24]:
list_0 = '|'.join(list_0)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_0)), 'category'] = '0'

In [25]:
list_1 = '|'.join(list_1)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_1)), 'category'] = '1'

In [26]:
list_2 = '|'.join(list_2)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_2)), 'category'] = '2'

In [27]:
list_3 = '|'.join(list_3)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_3)), 'category'] = '3'

In [28]:
list_4 = '|'.join(list_4)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_4)), 'category'] = '4'

In [29]:
list_5 = '|'.join(list_5)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_5)), 'category'] = '5'

In [30]:
list_7 = '|'.join(list_7)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_7)), 'category'] = '7'

In [31]:
list_9 = '|'.join(list_9)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_9)), 'category'] = '9'

In [32]:
list_11 = '|'.join(list_11)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_11)), 'category'] = '11'
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn,weekday,event_product,category
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0,1,0,1
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0,1,0,1
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0,1,0,3
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1,1,0,3
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1,1,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1,1,0,1
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0,1,1,10
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0,0,1,10
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1,0,0,5


In [33]:
# 12로 대체했던 결측치가 모두 사라짐
cj_new['category'].value_counts()

0        11147
6        10012
1         9824
10        5419
3         3563
4         1820
5         1682
8         1210
3,6        357
9          350
11         204
2           87
0,4         57
11,10       54
11,0        29
11,4        26
6,4         15
11,1         9
9,8          7
3,2          3
Name: category, dtype: Int64

In [34]:
# 이후 num_category 변수 생성을 위해 변수 추가
cj_new['num_category'] = cj_new['category']

카테고리 축약
- 요리 : 국/김치/김/반찬/두부, 스팸/닭가슴살/소시지, 신선식품 (0, 1, 2)
- 간식 : 만두/피자/치킨, 핫도그/떡볶이/간식, 돈까스/함박/구이 (3, 4, 5)
- 식사 : 밥/죽/면, 밀키트 (6, 7)
- 기타 : 건강식품, 음료/생수/시럽, 양념/소스/가루/오일, 대용량 식자재 (8, 9, 10, 11)

In [35]:
# 카테고리를 축약하기 위해 0-11로 표현되어 있던 카테고리를 0-3으로 표현
# 앞서 중복 카테고리를 포함하고 있던 데이터도 분류
def cate(x):
    if x=='0' or x=='1' or x=='2' : return '0'
    elif x=='3' or x=='4' or x=='5' : return '1'
    elif x=='6' or x=='7' : return '2'
    elif x=='6,3' or x=='6,4' : return '1,2'
    elif x=='0,4' or x=='3,2' : return '0,1'
    elif x=='10,11' or x=='8,9' : return '3'
    elif x=='0,11' or x=='11,1' : return '0,3'
    elif x=='11,4' : return '1,3'
    else : return '3'

In [36]:
cj_new['category'] = cj_new['category'].apply(cate)
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn,weekday,event_product,category,num_category
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0,1,0,0,1
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0,1,0,0,1
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0,1,0,1,3
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1,1,0,1,3
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1,1,0,2,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1,1,0,0,1
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0,1,1,3,10
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0,0,1,3,10
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1,0,0,1,5


In [37]:
# amt/qty 변수 생성
cj_new['amt/qty'] = cj_new['net_order_amt']/cj_new['net_order_qty']
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn,weekday,event_product,category,num_category,amt/qty
0,20230124153976,잔칫집 식혜 240ml 30입,1,9.803170,0,2,Y,20230124,0,1,0,0,1,9.803170
1,20230124155563,백설 한입쏙 비엔나 120g*2,1,8.256607,1,3,Y,20230124,0,1,0,0,1,8.256607
2,20230125158386,비비고 왕교자 1.05kg,1,9.348449,0,4,N,20230125,0,1,0,1,3,9.348449
3,20230126164638,고메 바삭쫄깃한 탕수육 900g,1,9.667259,0,4,N,20230126,1,1,0,1,3,9.667259
4,20230125159705,햇반 매일잡곡밥210g,20,9.994653,1,4,N,20230125,1,1,0,2,6,0.499733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,고메 거멍 모짜체다핫도그 340g,1,8.648397,1,5,Y,20230102,1,1,0,0,1,8.648397
45871,20230102972720,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,3,N,20230102,0,1,1,3,10,9.639327
45872,20230101964953,[앱전용특가]비비고 차돌된장찌개 460gX4개,1,9.639327,0,4,N,20230101,0,0,1,3,10,9.639327
45873,20230101970142,[식물성]고메 플랜테이블 함박스테이크 150g,1,7.939872,1,4,Y,20230101,1,0,0,1,5,7.939872


In [38]:
cj_new = cj_new.rename(columns={'net_order_qty': 'total_qty', 'net_order_amt': 'total_amt'})

In [39]:
cj_new = cj_new[['scd', 'gender', 'age_grp', 'weekday', 'total_qty', 'total_amt', 'category', 'event_product', 'num_category', 'amt/qty', 'prime_yn', 'employee_yn']]
cj_new

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,prime_yn,employee_yn
0,20230124153976,0,2,1,1,9.803170,0,0,1,9.803170,0,Y
1,20230124155563,1,3,1,1,8.256607,0,0,1,8.256607,0,Y
2,20230125158386,0,4,1,1,9.348449,1,0,3,9.348449,0,N
3,20230126164638,0,4,1,1,9.667259,1,0,3,9.667259,1,N
4,20230125159705,1,4,1,20,9.994653,2,0,6,0.499733,1,N
...,...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,1,5,1,1,8.648397,0,0,1,8.648397,1,Y
45871,20230102972720,0,3,1,1,9.639327,3,1,10,9.639327,0,N
45872,20230101964953,0,4,0,1,9.639327,3,1,10,9.639327,0,N
45873,20230101970142,1,4,0,1,7.939872,1,0,5,7.939872,1,Y


- scd : 주문번호
- gender : 성별 (1 = Male; 0 = Female)
- age_grp : 나이대 (1 = 10대; 2 = 20대; 3 = 30대; 4 = 40대; 5 = 50대; 6 = 60대)
- weekday : 주문일자 (1 = 평일; 0 = 주말 및 공휴일)
- total_qty : 총 주문 수량
- total_amt : 총 주문 금액 (정규화)
- category : 구매한 카테고리 (0 = 요리; 1 = 간식; 2 = 식사; 3 = 기타)
- event_product : 이벤트 할인 제품 총 구매 횟수
- num_category : 구매 제품의 카테고리 개수
- amt/qty : 구매 수량 대비 구매 금액
- prime_yn : 프라임 회원 유무 (1 = 프라임 회원; 0 = 일반 회원)

In [40]:
cj_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45875 entries, 0 to 45874
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   scd            45875 non-null  int64  
 1   gender         45875 non-null  int64  
 2   age_grp        45875 non-null  int64  
 3   weekday        45875 non-null  int32  
 4   total_qty      45875 non-null  int64  
 5   total_amt      45875 non-null  float64
 6   category       45875 non-null  object 
 7   event_product  45875 non-null  int32  
 8   num_category   45875 non-null  string 
 9   amt/qty        45875 non-null  float64
 10  prime_yn       45875 non-null  int64  
 11  employee_yn    45875 non-null  object 
dtypes: float64(2), int32(2), int64(5), object(2), string(1)
memory usage: 4.2+ MB


In [41]:
cj_new = cj_new.astype({'category': 'string', 'num_category':'string'})

In [42]:
# category 변수를 리스트로 표현
temp=[]
for i in range(len(cj_new)):
    str = cj_new['category'].iloc[i]
    temp.append(str.split(','))
    print(str.split(','))

['0']
['0']
['1']
['1']
['2']
['2']
['2']
['1']
['1']
['2']
['0']
['1']
['2']
['1']
['1']
['0']
['1']
['3']
['3']
['1']
['3']
['1']
['2']
['3']
['0']
['0']
['2']
['0']
['0']
['2']
['2']
['1']
['2']
['1']
['1']
['1']
['2']
['0']
['2']
['1']
['0']
['1']
['0']
['1']
['1']
['1']
['0']
['0']
['0']
['1']
['1']
['0']
['0']
['3']
['0']
['0']
['3']
['0']
['1']
['3']
['1']
['1']
['0']
['1']
['3']
['0']
['3']
['0']
['0']
['3']
['0']
['0']
['0']
['3']
['2']
['0']
['2']
['3']
['2']
['0']
['1']
['0']
['1']
['2']
['3']
['3']
['3']
['3']
['1']
['1']
['2']
['0']
['0']
['2']
['1']
['3']
['1']
['2']
['3']
['0']
['0']
['0']
['0']
['3']
['3']
['3']
['1']
['2']
['1']
['0']
['1']
['0']
['0']
['2']
['1']
['1']
['1']
['0']
['3']
['2']
['0']
['2']
['2']
['0']
['0']
['1']
['3']
['0']
['1']
['3']
['3']
['3']
['3']
['3']
['0']
['0']
['1']
['1']
['0']
['1']
['0']
['1']
['2']
['0']
['1']
['0']
['0']
['3']
['3']
['3']
['2']
['0']
['1']
['0']
['1']
['0']
['1']
['0']
['3']
['0']
['3']
['1']
['2']
['1']
['1']
['1']
['1'

In [43]:
ctg_temp = pd.DataFrame({
    'ctg' : temp
})
ctg_temp

Unnamed: 0,ctg
0,[0]
1,[0]
2,[1]
3,[1]
4,[2]
...,...
45870,[0]
45871,[3]
45872,[3]
45873,[1]


In [44]:
cj_new['category'] = ctg_temp['ctg']
cj_new

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,prime_yn,employee_yn
0,20230124153976,0,2,1,1,9.803170,[0],0,1,9.803170,0,Y
1,20230124155563,1,3,1,1,8.256607,[0],0,1,8.256607,0,Y
2,20230125158386,0,4,1,1,9.348449,[1],0,3,9.348449,0,N
3,20230126164638,0,4,1,1,9.667259,[1],0,3,9.667259,1,N
4,20230125159705,1,4,1,20,9.994653,[2],0,6,0.499733,1,N
...,...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,1,5,1,1,8.648397,[0],0,1,8.648397,1,Y
45871,20230102972720,0,3,1,1,9.639327,[3],1,10,9.639327,0,N
45872,20230101964953,0,4,0,1,9.639327,[3],1,10,9.639327,0,N
45873,20230101970142,1,4,0,1,7.939872,[1],0,5,7.939872,1,Y


In [45]:
for i in range(len(cj_new)):
    cj_new['category'].iloc[i] = list(set(cj_new['category'].iloc[i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cj_new['category'].iloc[i] = list(set(cj_new['category'].iloc[i]))


In [46]:
cj_new['category'].value_counts()

[0]       21058
[2]       10012
[3]        7630
[1]        7065
[1, 0]       60
[3, 1]       26
[2, 1]       15
[3, 0]        9
Name: category, dtype: int64

In [47]:
for i in range(len(cj_new)):
    cj_new['category'].iloc[i] = ','.join(s for s in cj_new['category'].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cj_new['category'].iloc[i] = ','.join(s for s in cj_new['category'].iloc[i])


In [48]:
cj_new = cj_new.astype({'category': 'string'})

In [49]:
cj_new

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,prime_yn,employee_yn
0,20230124153976,0,2,1,1,9.803170,0,0,1,9.803170,0,Y
1,20230124155563,1,3,1,1,8.256607,0,0,1,8.256607,0,Y
2,20230125158386,0,4,1,1,9.348449,1,0,3,9.348449,0,N
3,20230126164638,0,4,1,1,9.667259,1,0,3,9.667259,1,N
4,20230125159705,1,4,1,20,9.994653,2,0,6,0.499733,1,N
...,...,...,...,...,...,...,...,...,...,...,...,...
45870,20230102972321,1,5,1,1,8.648397,0,0,1,8.648397,1,Y
45871,20230102972720,0,3,1,1,9.639327,3,1,10,9.639327,0,N
45872,20230101964953,0,4,0,1,9.639327,3,1,10,9.639327,0,N
45873,20230101970142,1,4,0,1,7.939872,1,0,5,7.939872,1,Y


In [50]:
# 주문번호 별로 합치기
# gender, age_grp, weekday, prim_yn, employee_yn -> 동일한 값
# total_qty, total_amt, event_product -> sum
# category -> 카테고리 리스트
# num_category -> nunique
# amt/qty -> mean
cj_grp = cj_new.groupby('scd').agg({'gender': pd.Series.mode,
                                   'age_grp': pd.Series.mode,
                                   'weekday': pd.Series.mode,
                                   'total_qty': 'sum',
                                   'total_amt': 'sum',
                                   'category': lambda x: ",".join(set(x)),
                                   'event_product': 'sum',
                                   'num_category': pd.Series.nunique,
                                   'amt/qty': 'mean', 
                                   'prime_yn': pd.Series.mode,
                                   'employee_yn': pd.Series.mode})
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,prime_yn,employee_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20230101963226,1,3,0,6,39.433806,2,2,1,7.609629,0,Y
20230101963235,0,4,0,2,19.234566,0,2,2,9.617283,1,Y
20230101963244,0,4,0,3,10.234373,1,0,1,3.411458,1,N
20230101963247,1,3,0,6,45.516855,10,0,3,8.070212,1,Y
20230101963251,0,3,0,2,10.845621,3,0,1,5.422811,1,Y
...,...,...,...,...,...,...,...,...,...,...,...
20230131216756,0,4,1,3,18.314758,1,0,2,6.796144,0,N
20230131216771,0,4,1,2,9.125762,0,0,1,4.562881,1,N
20230131216842,1,3,1,1,10.176411,2,1,1,10.176411,0,N
20230131216844,0,4,1,8,44.927942,310,0,3,6.307187,0,N


In [51]:
# category 변수를 각 주문번호 당 구매 카테고리 여부를 0/1로 표현하는 ctg_0, 1, 2, 3 변수로 표현
cj_grp['category'].value_counts()

0                  2996
2                  1204
2,0                1055
3                  1042
3,0                 699
1,0                 469
1                   460
2,3,0               432
3,2,0               350
2,1,0               235
2,3,1,0             230
1,2,3,0             215
1,2,0               209
3,2                 188
1,3,2,0             181
3,1,0               139
1,3,0               119
2,1                 111
3,1                  84
1,2                  70
2,3,1                53
1,3,2                34
1,2,1,0,0            10
2,1,1,0,3,0           9
2,1,3,1,3,0           5
3,1,1,0               5
1,3,1,0               4
3,1,0,0               4
3,1,1                 3
0,2,1                 3
2,1,0,1,0             3
3,0,3,0               2
2,1,2,1,3,0           2
3,1,0,2,0             2
2,1,0,2,1             2
1,3,1,2               2
2,1,0,1,3,0           2
3,0,3,1,0             1
1,2,1,0               1
1,1,0,2               1
1,1,0,3,1,3,0         1
3,1,2,0         

In [52]:
cj_grp['num_category'].value_counts()

1     5026
2     2214
3     1577
4     1020
5      478
6      219
7       97
8       19
9        1
11       1
10       1
Name: num_category, dtype: int64

In [53]:
for i in range(len(cj_grp)):
    str = cj_grp['category'].iloc[i]
    cj_grp['category'].iloc[i] = ','.join(dict.fromkeys(str))
    print(cj_grp['category'].iloc[i]) # abc

2
0
1
1,,,0
3
1,,,0
3
1,,,0
3
3
2,,,3,1,0
1,,,2,3,0
2
3
1,,,2,0
1,,,2,3,0
0
2,,,1
1
1,,,2,3,0
2,,,3,0
3
3
1,,,2
1,,,3,2
2,,,0
3,,,2,0
2,,,1,0
2,,,3,1,0
0
0
0
0
3,,,2,0
1,,,2,0
1,,,3,2,0
0
0
3,,,1,0
1
3,,,0
1,,,2,0
0
2
3,,,2
1,,,2,0
1,,,2,0
2,,,3,1,0
1,,,2,0
2,,,0
1,,,2,3,0
3,,,2,0
3
2,,,1,0
0
0
1,,,2,0
3
3,,,2,0
1,,,2,0
1,,,2,3,0
2
1
2,,,0
1
2
2,,,3,1,0
1,,,3,2,0
1,,,0
2
0
1,,,2,0
3
0
2,,,0
1,,,3,2,0
0
1,,,0
0
2
1,,,0
2
0
2
1
1,,,3,0
2
2,,,1
1,,,0
1,,,0
0
3,,,0
0
1,,,3,2,0
2,,,3,1
1,,,0
2,,,1,0
3,,,2,0
0
0
3,,,2
3,,,0
0
0
1
2
3,,,0
2,,,0
2
2
1
2
1,,,2,3,0
2,,,0
2,,,3,1
3,,,1,0
0
2,,,3,1
1,,,3,0
1,,,0
3
1,,,3,0
1
1,,,2,0
2,,,3,1,0
1,,,2,3,0
2,,,1,0
3,,,0
2,,,3,1,0
2,,,3,1,0
2
3
0
2,,,1,0
2
1,,,2,0
0
0
0
1,,,3,2
1,,,2,3,0
1,,,2,3,0
0
0
1
3,,,1,0
1
2
3,,,0
1,,,0
0
0
0
1
3,,,0
1,,,2,0
0
3
2
2,,,1,0
2,,,0
2
3
0
3
2,,,1
3
1,,,3,2
0
3,,,1,0
3
1,,,3,0
1,,,3,2,0
0
0
1,,,2,3,0
2,,,3,1,0
3
2,,,0
2
1,,,0
2,,,0
1,,,3,2,0
0
1,,,3,2
1,,,3,2
2
1,,,2,3,0
3,,,0
2
1,,,2,3,0
0
2,,,1
3
1
1,,,2,0
3
1,,,3,2,

In [54]:
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,prime_yn,employee_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20230101963226,1,3,0,6,39.433806,2,2,1,7.609629,0,Y
20230101963235,0,4,0,2,19.234566,0,2,2,9.617283,1,Y
20230101963244,0,4,0,3,10.234373,1,0,1,3.411458,1,N
20230101963247,1,3,0,6,45.516855,"1,,,0",0,3,8.070212,1,Y
20230101963251,0,3,0,2,10.845621,3,0,1,5.422811,1,Y
...,...,...,...,...,...,...,...,...,...,...,...
20230131216756,0,4,1,3,18.314758,1,0,2,6.796144,0,N
20230131216771,0,4,1,2,9.125762,0,0,1,4.562881,1,N
20230131216842,1,3,1,1,10.176411,2,1,1,10.176411,0,N
20230131216844,0,4,1,8,44.927942,"3,,,1,0",0,3,6.307187,0,N


In [55]:
cj_grp['ctg_0']=0
cj_grp['ctg_1']=0
cj_grp['ctg_2']=0
cj_grp['ctg_3']=0

In [56]:
ctg = ['0', '1', '2', '3']

for i in range(len(cj_grp)):
    for j in ctg:
        if j in cj_grp['category'].iloc[i]:
            col = 'ctg_' + j
            cj_grp[col].iloc[i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cj_grp[col].iloc[i] = 1


In [57]:
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,prime_yn,employee_yn,ctg_0,ctg_1,ctg_2,ctg_3
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20230101963226,1,3,0,6,39.433806,2,2,1,7.609629,0,Y,0,0,1,0
20230101963235,0,4,0,2,19.234566,0,2,2,9.617283,1,Y,1,0,0,0
20230101963244,0,4,0,3,10.234373,1,0,1,3.411458,1,N,0,1,0,0
20230101963247,1,3,0,6,45.516855,"1,,,0",0,3,8.070212,1,Y,1,1,0,0
20230101963251,0,3,0,2,10.845621,3,0,1,5.422811,1,Y,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230131216756,0,4,1,3,18.314758,1,0,2,6.796144,0,N,0,1,0,0
20230131216771,0,4,1,2,9.125762,0,0,1,4.562881,1,N,1,0,0,0
20230131216842,1,3,1,1,10.176411,2,1,1,10.176411,0,N,0,0,1,0
20230131216844,0,4,1,8,44.927942,"3,,,1,0",0,3,6.307187,0,N,1,1,0,1


In [58]:
cj_grp.drop(['category'], axis = 1, inplace=True)

In [59]:
cj_grp = cj_grp[['gender', 'age_grp', 'weekday', 'total_qty', 'total_amt', 'event_product', 'num_category', 'ctg_0', 'ctg_1', 'ctg_2', 'ctg_3', 'amt/qty', 'prime_yn', 'employee_yn']]
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty,prime_yn,employee_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
20230101963226,1,3,0,6,39.433806,2,1,0,0,1,0,7.609629,0,Y
20230101963235,0,4,0,2,19.234566,2,2,1,0,0,0,9.617283,1,Y
20230101963244,0,4,0,3,10.234373,0,1,0,1,0,0,3.411458,1,N
20230101963247,1,3,0,6,45.516855,0,3,1,1,0,0,8.070212,1,Y
20230101963251,0,3,0,2,10.845621,0,1,0,0,0,1,5.422811,1,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230131216756,0,4,1,3,18.314758,0,2,0,1,0,0,6.796144,0,N
20230131216771,0,4,1,2,9.125762,0,1,1,0,0,0,4.562881,1,N
20230131216842,1,3,1,1,10.176411,1,1,0,0,1,0,10.176411,0,N
20230131216844,0,4,1,8,44.927942,0,3,1,1,0,1,6.307187,0,N


In [60]:
# 임직원 데이터
cj_y = cj_grp[cj_grp['employee_yn']=='Y']
cj_y = cj_y.drop(['employee_yn'], axis = 1)

In [61]:
cj_y.to_csv('cj_empY_train.csv')

In [62]:
# 비임직원 데이터
cj_n = cj_grp[cj_grp['employee_yn']=='N']
cj_n = cj_n.drop(['employee_yn'], axis = 1)

In [63]:
cj_n.to_csv('cj_empN_train.csv')

### test 데이터 전처리

In [64]:
df = pd.read_csv('tmk_bda_test.csv')
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,prime_yn
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,M,4,N,20230101,
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,M,4,Y,20230101,
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,F,4,N,20230101,
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,F,3,N,20230102,
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,F,3,N,20230101,
...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,M,5,Y,20230103,
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,F,3,Y,20230102,
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,F,3,Y,20230103,
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,M,4,N,20230104,


In [65]:
df = df.drop('prime_yn', axis=1)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,M,4,N,20230101
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,M,4,Y,20230101
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,F,4,N,20230101
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,F,3,N,20230102
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,F,3,N,20230101
...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,M,5,Y,20230103
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,F,3,Y,20230102
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,F,3,Y,20230103
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,M,4,N,20230104


In [66]:
# 결측값 확인
df.isnull().sum()

scd              0
product_name     0
net_order_qty    0
net_order_amt    0
gender           0
age_grp          0
employee_yn      0
order_date       0
dtype: int64

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19660 entries, 0 to 19659
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   scd            19660 non-null  int64  
 1   product_name   19660 non-null  object 
 2   net_order_qty  19660 non-null  int64  
 3   net_order_amt  19660 non-null  float64
 4   gender         19660 non-null  object 
 5   age_grp        19660 non-null  int64  
 6   employee_yn    19660 non-null  object 
 7   order_date     19660 non-null  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 1.2+ MB


In [68]:
# 성별 인코딩 (1 = Male, 0 = Female)
map_gender = {'F':0, 'M':1}
df['gender'] = df['gender'].map(map_gender)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101
...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104


In [69]:
df = df.astype({'order_date': 'string'})

In [70]:
# 구매일자 인코딩
df['weekday'] = df['order_date'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))
df['weekday'] = df['weekday'].dt.weekday
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,weekday
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101,6
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101,6
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101,6
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102,0
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101,6
...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103,1
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102,0
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103,1
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104,2


In [71]:
df['weekday'].value_counts()

6    6292
0    4768
1    3047
3    1730
2    1495
4    1249
5    1079
Name: weekday, dtype: int64

In [72]:
# 1월 23일 월, 1월 24일 화 : 설 연휴
df['weekday'] = np.where((df['weekday']==5) | (df['weekday']==6) | (df['order_date']==2023-1-23) | (df['order_date']==2023-1-24), 0, 1)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,weekday
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101,0
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101,0
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101,0
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102,1
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101,0
...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103,1
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102,1
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103,1
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104,1


In [73]:
# 평일 / 주말 및 공휴일로 분류 (1 = 평일, 0 = 주말 및 공휴일)
df['weekday'].value_counts()

1    12289
0     7371
Name: weekday, dtype: int64

In [74]:
# 이벤트 할인 제품 총 구매 횟수 (event_product)
# '['가 포함되어 있는 제품
# 식물성, 냉동, 배송, 유산균, 눈건강, 피부건강, 1BOX는 이벤트에서 제외
df['event_product'] = np.where((df['product_name'].str.contains(r'\[')) & (-df['product_name'].str.contains('식물성|냉동|배송|유산균|눈건강|피부건강|1BOX|쿡킷')), 1, 0)
df

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,weekday,event_product
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101,0,0
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101,0,0
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101,0,0
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102,1,0
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101,0,0
...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103,1,0
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102,1,0
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103,1,0
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104,1,0


In [75]:
# 설 연휴 데이터 제거
#df = df[~df['product_name'].str.contains('23설', na=False, case=False)]
#df

In [76]:
# cj더마켓 제품 크롤링
cj = pd.read_csv('cj_category.csv')
cj = cj.drop('Unnamed: 0', axis=1)
cj = cj.drop_duplicates()
cj.columns = ['category', 'product_name']
cj

Unnamed: 0,category,product_name
0,0,[생산직송]비비고 포기배추김치 5kg+총각김치 2kg
1,0,[생산직송]비비고 포기김치 5kg+열무김치 900gX2개
2,0,비비고 순살 고등어구이 60g
3,0,비비고 소고기 미역국 500gx18개(1box)
4,0,비비고 소고기 미역국 500gX6개
...,...,...
1722,11,크레잇 베이컨 1kg
1723,11,요리당5kg
1724,11,크레잇 햄야채볶음밥 280g
1725,11,쉐프솔루션 고기팡팡 미트볼1kg


- 0 = 국/김치/김/반찬/두부
- 1 = 스팸/닭가슴살/소시지
- 2 = 신선식품
- 3 = 만두/피자/치킨
- 4 = 핫도그/떡볶이/간식
- 5 = 돈까스/함박/구이
- 6 = 밥/죽/면
- 7 = 밀키트
- 8 = 건강식품
- 9 = 음료/생수/시럽
- 10 = 양념/소스/가루/오일
- 11 = 대용량 식자제

In [77]:
# 중복된 카테고리로 분류되어 있는 제품들이 있음을 확인
cj['product_name'].value_counts()

프레시웨이 간식/아이스크림 골라담기            3
백설 하얀설탕 5kg                    2
울트라레귤러컷 냉동감자 2kg               2
고메 삼선해물볶음밥 420g                2
해찬들 100% 우리쌀 매운 태양초 고추장 3kg    2
                              ..
비비고 청양고추 찐만두 392gX2개           1
고메 미니치즈너겟 400g                 1
비비고 왕만두490gx2개                 1
비비고 청양고추 찐만두 168g              1
크레잇 생활반찬 고추송송 고기말이1kg          1
Name: product_name, Length: 1667, dtype: int64

In [78]:
cj = cj.astype({'category': 'string'})
cj.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1724 entries, 0 to 1726
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   category      1724 non-null   string
 1   product_name  1724 non-null   object
dtypes: object(1), string(1)
memory usage: 40.4+ KB


In [79]:
# 이후 category 변수를 표현하기 위해 중복 카테고리 정보도 버리지 않음
cj = cj.groupby('product_name').agg({'category': lambda x: ",".join(set(x))})
cj

Unnamed: 0_level_0,category
product_name,Unnamed: 1_level_1
(냉동) 비비고 테이블 본갈비탕 700g,0
(냉동) 비비고 테이블 본갈비탕 700gX2개 + 특설렁탕 700gX2개 + 특양지곰탕 700gX2개,0
(냉동) 비비고 테이블 본갈비탕 700gX3개,0
(냉동) 비비고 테이블 특 선물세트 (본갈비탕700g + 특양지곰탕700g + 특설렁탕700g),0
(냉동) 비비고 테이블 특설렁탕 700g,0
...,...
헬씨누리 침향환 환심 10환,8
헬씨누리 침향환 환심 10환X6입(1BOX),8
호도과자용가루5 11kg,11
훈제대란 20구,0


In [80]:
cj_new = pd.merge(df, cj, on='product_name', how='left')
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,weekday,event_product,category
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101,0,0,0
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101,0,0,
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101,0,0,
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102,1,0,0
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103,1,0,3
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102,1,0,10
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103,1,0,4
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104,1,0,5


In [81]:
cj_new.isnull().sum()

scd                 0
product_name        0
net_order_qty       0
net_order_amt       0
gender              0
age_grp             0
employee_yn         0
order_date          0
weekday             0
event_product       0
category         7015
dtype: int64

In [82]:
# 결측치 12로 대체
cj_new = cj_new.fillna('12')

In [83]:
# 배정되지 않은 카테고리에 대하여 cj더마켓 제품들을 고려하여 리스트 생성
# 리스트로 잘 분류될 수 있도록 list_11, 10, 2는 분류 순서 고려
# list_10를 첫번째로
list_10 = ['포도씨유', '양념', '올리브유', '쌈장', '소스', '다진마늘', '고추장', '참치액', '가루', '기름', '맛술',
          '드레싱', '시즈닝', '카놀라유', '간장', '올리브유', '다시마', '요리유', '비빔장', '된장', '식초', '다시다',
          '올리고당', '설탕', '액젓', '믹스', '천일염', '참깨', '굴소스', '가루', '옥수수유', '해바라기씨유', '쌀엿']
# list_6를 두번째로
list_6 = ['햇반', '죽', '냉면', '짬뽕', '동치미', '메밀', '우동', '잡채', '짜장', '밀면', '파스타', '쌀국수', 
          '야끼소바', '콩국수', '소면', '당면', '죽', '스파게티', '덮밥', '쫄면', '수프', '마라탕', '리조또', 
          '떡볶이떡', '칼국수', '뇨끼', '밸런스밀', '병아리콩']
# list_8를 세번째로
list_8 = ['오메가', '루테인', '유산균', '영양', '전립소', '비타민', '팻다운', '리턴업', '바이오코어', '홍삼', '흑삼',
          '콜라겐', '멀티', '배도라지', '양배추', '한뿌리', '닥터뉴트리', '환심']

list_0 = ['김치', '구이', '탕', '미역국', '육개장', '국물', '두부', '찌개', '무국', '국', '김', '유부', '단무지', 
          '메추리알', '쌈무', '볶음', '어묵', '조림', '밥이랑', '육수', '조림', '마늘', '란', '오이지', '장아찌']
list_1 = ['닭가슴살', '햄', '베이컨', '후랑크', '비엔나', '스팸', '통삼겹', '소시지', '킬바사', '더블에이징', '부어스트','']
list_2 = ['오리', '배', '고구마', '망고', '블루베리', '오렌지', '양갈비', '육포', '랍스터', '삼겹살', '목심살', '사과', 
          '토마토', '키위', '매실', '항정살', '한우', '청도', '스위트콘', '대추', '명란젓', '새우튀김', '황도', '양파', 
          '갈비','감자', '한돈', '날개', '닭', '야자', '버섯', '멜론']
list_3 = ['피자', '치킨', '교자', '순살', '너겟', '만두', '탕수육', '봉', '구이', '윙', '칠리새우', '깐풍기', 
          '닭강정', '새우튀김']
list_4 = ['츄러스', '밤', '핫도그', '쁘띠첼', '떡볶이' '바삭칩', '맥스봉', '핫랩', '면볶이', '고메 베이커리', 
          '냉동감자', '팝콘', '찹쌀떡', '스크림', '아몬드']
list_5 = ['스테이크', '카츠', '미트볼', '떡갈비', '동그랑땡', '까스', '불고기', '완자', '너비아니', '고기말이', '바베큐바']
list_7 = ['쿡킷', '밀키트']
list_9 = ['아이시스', '트레비', '삼다수', '콜라', '석류', '토레타', '컨디션', '사이다', '얼티브', 'W차', '파우더', '청', 
          '미초', '커피', '시럽', '식혜', '아이누리', '플리또', '메티에', '보리']
list_11 = ['15kg', '3.6L', '5kg', '1.4kg', '1.5L', '1.25kg', '3kg', '20kg', '6.5kg', '14kg', '1.45kg', '2.45kg', 
           '11kg', '9kg', '25kg']

In [84]:
list_10 = '|'.join(list_10)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_10)), 'category'] = '10'

In [85]:
list_6 = '|'.join(list_6)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_6)), 'category'] = '6'

In [86]:
list_8 = '|'.join(list_8)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_8)), 'category'] = '8'

In [87]:
list_0 = '|'.join(list_0)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_0)), 'category'] = '0'

In [88]:
list_1 = '|'.join(list_1)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_1)), 'category'] = '1'

In [89]:
list_2 = '|'.join(list_2)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_2)), 'category'] = '2'

In [90]:
list_3 = '|'.join(list_3)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_3)), 'category'] = '3'

In [91]:
list_4 = '|'.join(list_4)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_4)), 'category'] = '4'

In [92]:
list_5 = '|'.join(list_5)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_5)), 'category'] = '5'

In [93]:
list_7 = '|'.join(list_7)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_7)), 'category'] = '7'

In [94]:
list_9 = '|'.join(list_9)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_9)), 'category'] = '9'

In [95]:
list_11 = '|'.join(list_11)
cj_new.loc[(cj_new['category']=='12')&(cj_new['product_name'].str.contains(list_11)), 'category'] = '11'
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,weekday,event_product,category
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101,0,0,0
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101,0,0,0
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101,0,0,1
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102,1,0,0
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103,1,0,3
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102,1,0,10
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103,1,0,4
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104,1,0,5


In [96]:
# 12로 대체했던 결측치가 모두 사라짐
cj_new['category'].value_counts()

6        4378
0        4284
1        3319
3        2341
10       1962
4        1126
5         967
8         515
3,6       323
9         193
11         87
0,4        51
2          37
11,10      29
11,4       23
6,4        10
11,1        7
11,0        5
9,8         2
3,2         1
Name: category, dtype: Int64

In [97]:
# 이후 num_category 변수 생성을 위해 변수 추가
cj_new['num_category'] = cj_new['category']

카테고리 축약
- 요리 : 국/김치/김/반찬/두부, 스팸/닭가슴살/소시지, 신선식품 (0, 1, 2)
- 간식 : 만두/피자/치킨, 핫도그/떡볶이/간식, 돈까스/함박/구이 (3, 4, 5)
- 식사 : 밥/죽/면, 밀키트 (6, 7)
- 기타 : 건강식품, 음료/생수/시럽, 양념/소스/가루/오일, 대용량 식자재 (8, 9, 10, 11)

In [98]:
# 카테고리를 축약하기 위해 0-11로 표현되어 있던 카테고리를 0-3으로 표현
# 앞서 중복 카테고리를 포함하고 있던 데이터도 분류
def cate(x):
    if x=='0' or x=='1' or x=='2' : return '0'
    elif x=='3' or x=='4' or x=='5' : return '1'
    elif x=='6' or x=='7' : return '2'
    elif x=='6,3' or x=='6,4' : return '1,2'
    elif x=='0,4' or x=='3,2' : return '0,1'
    elif x=='10,11' or x=='8,9' : return '3'
    elif x=='0,11' or x=='11,1' : return '0,3'
    elif x=='11,4' : return '1,3'
    else : return '3'

In [99]:
cj_new['category'] = cj_new['category'].apply(cate)
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,weekday,event_product,category,num_category
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101,0,0,0,0
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101,0,0,0,0
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101,0,0,0,1
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102,1,0,0,0
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103,1,0,1,3
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102,1,0,3,10
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103,1,0,1,4
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104,1,0,1,5


In [100]:
# amt/qty 변수 생성
cj_new['amt/qty'] = cj_new['net_order_amt']/cj_new['net_order_qty']
cj_new

Unnamed: 0,scd,product_name,net_order_qty,net_order_amt,gender,age_grp,employee_yn,order_date,weekday,event_product,category,num_category,amt/qty
0,20230101964282,비비고 스팸부대찌개 460g,1,8.161946,1,4,N,20230101,0,0,0,0,8.161946
1,20230101970142,삼호 생선살어묵 야채 200g,1,8.098947,1,4,Y,20230101,0,0,0,0,8.098947
2,20230101965237,크레잇 블랙페퍼 폭찹 스테이크,1,8.277412,0,4,N,20230101,0,0,0,1,8.277412
3,20230102973798,비비고 소고기 미역국 500g,3,9.145375,0,3,N,20230102,1,0,0,0,3.048458
4,20230101965633,비비고 소고기 미역국 500g,2,8.613230,0,3,N,20230101,0,0,0,0,4.306615
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,비비고 찐만두168g,1,7.930566,1,5,Y,20230103,1,0,1,3,7.930566
19656,20230102979023,사골곰탕 한 그릇 100g (1-2인분X5개입),1,8.477412,0,3,Y,20230102,1,0,3,10,8.477412
19657,20230103989581,쁘띠첼 자몽상큼함듬뿍워터젤리 130ml,10,9.421249,0,3,Y,20230103,1,0,1,4,0.942125
19658,20230104999357,비비고 언양식 바싹불고기 460g,1,8.594895,1,4,N,20230104,1,0,1,5,8.594895


In [101]:
cj_new = cj_new.rename(columns={'net_order_qty': 'total_qty', 'net_order_amt': 'total_amt'})

In [102]:
cj_new = cj_new[['scd', 'gender', 'age_grp', 'weekday', 'total_qty', 'total_amt', 'category', 'event_product', 'num_category', 'amt/qty', 'employee_yn']]
cj_new

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,employee_yn
0,20230101964282,1,4,0,1,8.161946,0,0,0,8.161946,N
1,20230101970142,1,4,0,1,8.098947,0,0,0,8.098947,Y
2,20230101965237,0,4,0,1,8.277412,0,0,1,8.277412,N
3,20230102973798,0,3,1,3,9.145375,0,0,0,3.048458,N
4,20230101965633,0,3,0,2,8.613230,0,0,0,4.306615,N
...,...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,1,5,1,1,7.930566,1,0,3,7.930566,Y
19656,20230102979023,0,3,1,1,8.477412,3,0,10,8.477412,Y
19657,20230103989581,0,3,1,10,9.421249,1,0,4,0.942125,Y
19658,20230104999357,1,4,1,1,8.594895,1,0,5,8.594895,N


- scd : 주문번호
- gender : 성별 (1 = Male; 0 = Female)
- age_grp : 나이대 (1 = 10대; 2 = 20대; 3 = 30대; 4 = 40대; 5 = 50대; 6 = 60대)
- weekday : 주문일자 (1 = 평일; 0 = 주말 및 공휴일)
- total_qty : 총 주문 수량
- total_amt : 총 주문 금액 (정규화)
- category : 구매한 카테고리 (0 = 요리; 1 = 간식; 2 = 식사; 3 = 기타)
- event_product : 이벤트 할인 제품 총 구매 횟수
- num_category : 구매 제품의 카테고리 개수
- amt/qty : 구매 수량 대비 구매 금액
- prime_yn : 프라임 회원 유무 (1 = 프라임 회원; 0 = 일반 회원)

In [103]:
cj_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19660 entries, 0 to 19659
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   scd            19660 non-null  int64  
 1   gender         19660 non-null  int64  
 2   age_grp        19660 non-null  int64  
 3   weekday        19660 non-null  int32  
 4   total_qty      19660 non-null  int64  
 5   total_amt      19660 non-null  float64
 6   category       19660 non-null  object 
 7   event_product  19660 non-null  int32  
 8   num_category   19660 non-null  string 
 9   amt/qty        19660 non-null  float64
 10  employee_yn    19660 non-null  object 
dtypes: float64(2), int32(2), int64(4), object(2), string(1)
memory usage: 1.6+ MB


In [104]:
cj_new = cj_new.astype({'category': 'string', 'num_category':'string'})

In [105]:
# category 변수를 리스트로 표현
temp=[]
for i in range(len(cj_new)):
    str = cj_new['category'].iloc[i]
    temp.append(str.split(','))
    print(str.split(','))

['0']
['0']
['0']
['0']
['0']
['1']
['0']
['1']
['0']
['2']
['1']
['1']
['0']
['0']
['3']
['0']
['0']
['0']
['3']
['1']
['1']
['2']
['2']
['3']
['3']
['2']
['3']
['3']
['0']
['2']
['1']
['0']
['3']
['3']
['1']
['2']
['0']
['0']
['3']
['2']
['0']
['3']
['0']
['0']
['2']
['0']
['0']
['2']
['0']
['3']
['0']
['0']
['2']
['1']
['3']
['0']
['3']
['0']
['0']
['0']
['0']
['2']
['3']
['0']
['3']
['3']
['3']
['2']
['3']
['0']
['3']
['2']
['3']
['0']
['0']
['3']
['0']
['2']
['2']
['2']
['0']
['0']
['0']
['0']
['3']
['3']
['2']
['0']
['0']
['2']
['3']
['3']
['0']
['1']
['2']
['2']
['1']
['2']
['1']
['2']
['3']
['3']
['0']
['0']
['2']
['0']
['2']
['3']
['3']
['0']
['0']
['2']
['3']
['0']
['3']
['3']
['3']
['0']
['3']
['0']
['0']
['3']
['3']
['3']
['0']
['1']
['3']
['1']
['2']
['0']
['1']
['3']
['1']
['1']
['3']
['2']
['0']
['3']
['2']
['0']
['0']
['2']
['0']
['0']
['3']
['2']
['3']
['0']
['3']
['2']
['0']
['3']
['1']
['2']
['0']
['3']
['2']
['3']
['3']
['0']
['2']
['2']
['2']
['2']
['3']
['3']
['3'

In [106]:
ctg_temp = pd.DataFrame({
    'ctg' : temp
})
ctg_temp

Unnamed: 0,ctg
0,[0]
1,[0]
2,[0]
3,[0]
4,[0]
...,...
19655,[1]
19656,[3]
19657,[1]
19658,[1]


In [107]:
cj_new['category'] = ctg_temp['ctg']
cj_new

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,employee_yn
0,20230101964282,1,4,0,1,8.161946,[0],0,0,8.161946,N
1,20230101970142,1,4,0,1,8.098947,[0],0,0,8.098947,Y
2,20230101965237,0,4,0,1,8.277412,[0],0,1,8.277412,N
3,20230102973798,0,3,1,3,9.145375,[0],0,0,3.048458,N
4,20230101965633,0,3,0,2,8.613230,[0],0,0,4.306615,N
...,...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,1,5,1,1,7.930566,[1],0,3,7.930566,Y
19656,20230102979023,0,3,1,1,8.477412,[3],0,10,8.477412,Y
19657,20230103989581,0,3,1,10,9.421249,[1],0,4,0.942125,Y
19658,20230104999357,1,4,1,1,8.594895,[1],0,5,8.594895,N


In [108]:
for i in range(len(cj_new)):
    cj_new['category'].iloc[i] = list(set(cj_new['category'].iloc[i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cj_new['category'].iloc[i] = list(set(cj_new['category'].iloc[i]))


In [109]:
cj_new['category'].value_counts()

[0]       7640
[1]       4434
[2]       4378
[3]       3116
[1, 0]      52
[3, 1]      23
[2, 1]      10
[3, 0]       7
Name: category, dtype: int64

In [110]:
for i in range(len(cj_new)):
    cj_new['category'].iloc[i] = ','.join(s for s in cj_new['category'].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cj_new['category'].iloc[i] = ','.join(s for s in cj_new['category'].iloc[i])


In [111]:
cj_new = cj_new.astype({'category': 'string'})

In [112]:
cj_new

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,employee_yn
0,20230101964282,1,4,0,1,8.161946,0,0,0,8.161946,N
1,20230101970142,1,4,0,1,8.098947,0,0,0,8.098947,Y
2,20230101965237,0,4,0,1,8.277412,0,0,1,8.277412,N
3,20230102973798,0,3,1,3,9.145375,0,0,0,3.048458,N
4,20230101965633,0,3,0,2,8.613230,0,0,0,4.306615,N
...,...,...,...,...,...,...,...,...,...,...,...
19655,20230103986821,1,5,1,1,7.930566,1,0,3,7.930566,Y
19656,20230102979023,0,3,1,1,8.477412,3,0,10,8.477412,Y
19657,20230103989581,0,3,1,10,9.421249,1,0,4,0.942125,Y
19658,20230104999357,1,4,1,1,8.594895,1,0,5,8.594895,N


In [113]:
# 주문번호 별로 합치기
# gender, age_grp, weekday, employee_yn -> 동일한 값
# total_qty, total_amt, event_product -> sum
# category -> 카테고리 리스트
# num_category -> nunique
# amt/qty -> mean
cj_grp = cj_new.groupby('scd').agg({'gender': pd.Series.mode,
                                   'age_grp': pd.Series.mode,
                                   'weekday': pd.Series.mode,
                                   'total_qty': 'sum',
                                   'total_amt': 'sum',
                                   'category': lambda x: ",".join(set(x)),
                                   'event_product': 'sum',
                                   'num_category': pd.Series.nunique,
                                   'amt/qty': 'mean', 
                                   'employee_yn': pd.Series.mode})
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,employee_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20230101963221,0,3,0,1,7.959975,1,0,1,7.959975,N
20230101963226,1,3,0,8,37.321156,12,0,2,4.665145,Y
20230101963229,1,5,0,1,9.003808,2,0,1,9.003808,Y
20230101963246,1,2,0,2,9.574080,1,0,1,4.787040,Y
20230101963265,0,3,0,5,43.665790,120,0,3,8.733158,Y
...,...,...,...,...,...,...,...,...,...,...
20230131216228,0,3,1,2,9.021961,3,0,1,4.510980,N
20230131216338,1,4,1,3,16.394430,20,0,2,6.092155,N
20230131216371,0,3,1,2,16.019650,10,0,2,8.009825,N
20230131216446,0,4,1,1,8.655214,3,0,1,8.655214,N


In [114]:
# category 변수를 각 주문번호 당 구매 카테고리 여부를 0/1로 표현하는 ctg_0, 1, 2, 3 변수로 표현
cj_grp['category'].value_counts()

0                  1687
2                   813
3                   679
1                   536
2,0                 454
1,0                 361
3,0                 311
1,2,0               173
3,1                 117
3,2                 117
2,3,0               113
3,2,0               110
2,1,0               105
2,1                 103
1,2                  91
2,3,1,0              87
3,1,0                85
1,3,2,0              84
1,2,3,0              82
1,3,0                68
2,3,1                49
1,3,2                40
2,1,1,0,3,0           6
1,0,0                 5
3,1,2                 4
3,1,0,2               3
1,3,1,0,0             3
3,1,3,0               3
2,1,3,1,3,0           2
2,1,0,0               2
2,3,1,0,1             2
3,1,0,1,0             2
1,1,0                 2
1,2,1,0,0             2
1,1,0,0               2
1,3,1,0               2
1,0,2                 1
3,2,0,2,1             1
3,0,1                 1
1,3,1,0,2             1
1,2,1,0               1
2,1,0,1,0       

In [115]:
cj_grp['num_category'].value_counts()

1    3351
2    1422
3     774
4     442
5     214
6      91
7      25
8       8
9       1
Name: num_category, dtype: int64

In [116]:
for i in range(len(cj_grp)):
    str = cj_grp['category'].iloc[i]
    cj_grp['category'].iloc[i] = ','.join(dict.fromkeys(str))
    print(cj_grp['category'].iloc[i]) # abc

1
1,,,2
2
1
1,,,2,0
3,,,0
0
1
1,,,2,0
1
2
2
3,,,1
1
0
3
3
3
1
1,,,0
1,,,2
1,,,2
2
0
3,,,0
3,,,0
1,,,0
1
3
2
1
3
1,,,2,0
3,,,2,0
1
0
1,,,2,0
0
1
2,,,0
1,,,0
1
1
0
1
3,,,0
0
1,,,0
2
1
3
0
1
1,,,0
2,,,1
3,,,0
2,,,0
3
1,,,0
1,,,0
3
2
2,,,3,1
3,,,1,0,2
1,,,2,3,0
2,,,1
3,,,1,0
3,,,1,0
1,,,0
2
3,,,0
2,,,1
2
2
2,,,0
0
1
3
0
2
3,,,1
1,,,0
1,,,0
0
0
2,,,1,3,0
1,,,3,0
3,,,0
3,,,0
3,,,1
0
3
1,,,2,0
3
2
2,,,1,0
0
1,,,3,2,0
1,,,3,2,0
1
0
1
0
2,,,0
2
1
1
3,,,1
1,,,0
0
0
1
2,,,0
1
1,,,2,3,0
2,,,3,1,0
2,,,0
2,,,1
3,,,2,0
3,,,0
2,,,0
3,,,2
0
1
2,,,1,0,3
2
1,,,0
0
2
2,,,0
0
2,,,0
0
0
0
3,,,0
3
0
0
2
3
3
2
2,,,0
1,,,2,3,0
0
0
3,,,0
3
3
3,,,2,0
1,,,0
3
1,,,2
2,,,0
2
1
2,,,3,0
1
0
3,,,1
1,,,0
0
2
0
2
3
3,,,1,0,2
0
3,,,2
3,,,0
1,,,2,3,0
2,,,3,1,0
3,,,1
0
2,,,3,0
2,,,3,0
0
0
1
3,,,2,0
1
1,,,2,3,0
0
2,,,0
2
1,,,0
1,,,2
2,,,3,1,0
2,,,3,1
3,,,1
1
1
0
1,,,3,2,0
1
1,,,3,0
3
0
3,,,0
3
0
2
0
3,,,0
2
3,,,0
0
0
1,,,3,2
2
3
1,,,0
2
2,,,0
0
2,,,1
2,,,0
3
3,,,2,0
0
3,,,0
1,,,0
2
1,,,3,2,0
2
1,,,0
1
3
1
0
3,,,0
2,,,3,1,0


In [117]:
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,employee_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20230101963221,0,3,0,1,7.959975,1,0,1,7.959975,N
20230101963226,1,3,0,8,37.321156,"1,,,2",0,2,4.665145,Y
20230101963229,1,5,0,1,9.003808,2,0,1,9.003808,Y
20230101963246,1,2,0,2,9.574080,1,0,1,4.787040,Y
20230101963265,0,3,0,5,43.665790,"1,,,2,0",0,3,8.733158,Y
...,...,...,...,...,...,...,...,...,...,...
20230131216228,0,3,1,2,9.021961,3,0,1,4.510980,N
20230131216338,1,4,1,3,16.394430,"2,,,0",0,2,6.092155,N
20230131216371,0,3,1,2,16.019650,"1,,,0",0,2,8.009825,N
20230131216446,0,4,1,1,8.655214,3,0,1,8.655214,N


In [118]:
cj_grp['ctg_0']=0
cj_grp['ctg_1']=0
cj_grp['ctg_2']=0
cj_grp['ctg_3']=0

In [119]:
ctg = ['0', '1', '2', '3']

for i in range(len(cj_grp)):
    for j in ctg:
        if j in cj_grp['category'].iloc[i]:
            col = 'ctg_' + j
            cj_grp[col].iloc[i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cj_grp[col].iloc[i] = 1


In [120]:
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,category,event_product,num_category,amt/qty,employee_yn,ctg_0,ctg_1,ctg_2,ctg_3
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
20230101963221,0,3,0,1,7.959975,1,0,1,7.959975,N,0,1,0,0
20230101963226,1,3,0,8,37.321156,"1,,,2",0,2,4.665145,Y,0,1,1,0
20230101963229,1,5,0,1,9.003808,2,0,1,9.003808,Y,0,0,1,0
20230101963246,1,2,0,2,9.574080,1,0,1,4.787040,Y,0,1,0,0
20230101963265,0,3,0,5,43.665790,"1,,,2,0",0,3,8.733158,Y,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230131216228,0,3,1,2,9.021961,3,0,1,4.510980,N,0,0,0,1
20230131216338,1,4,1,3,16.394430,"2,,,0",0,2,6.092155,N,1,0,1,0
20230131216371,0,3,1,2,16.019650,"1,,,0",0,2,8.009825,N,1,1,0,0
20230131216446,0,4,1,1,8.655214,3,0,1,8.655214,N,0,0,0,1


In [121]:
cj_grp.drop(['category'], axis = 1, inplace=True)

In [122]:
cj_grp = cj_grp[['gender', 'age_grp', 'weekday', 'total_qty', 'total_amt', 'event_product', 'num_category', 'ctg_0', 'ctg_1', 'ctg_2', 'ctg_3', 'amt/qty', 'employee_yn']]
cj_grp

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty,employee_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
20230101963221,0,3,0,1,7.959975,0,1,0,1,0,0,7.959975,N
20230101963226,1,3,0,8,37.321156,0,2,0,1,1,0,4.665145,Y
20230101963229,1,5,0,1,9.003808,0,1,0,0,1,0,9.003808,Y
20230101963246,1,2,0,2,9.574080,0,1,0,1,0,0,4.787040,Y
20230101963265,0,3,0,5,43.665790,0,3,1,1,1,0,8.733158,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230131216228,0,3,1,2,9.021961,0,1,0,0,0,1,4.510980,N
20230131216338,1,4,1,3,16.394430,0,2,1,0,1,0,6.092155,N
20230131216371,0,3,1,2,16.019650,0,2,1,1,0,0,8.009825,N
20230131216446,0,4,1,1,8.655214,0,1,0,0,0,1,8.655214,N


In [123]:
# 임직원 데이터
cj_y = cj_grp[cj_grp['employee_yn']=='Y']
cj_y = cj_y.drop(['employee_yn'], axis = 1)

In [124]:
cj_y.to_csv('cj_empY.csv')

In [125]:
# 비임직원 데이터
cj_n = cj_grp[cj_grp['employee_yn']=='N']
cj_n = cj_n.drop(['employee_yn'], axis = 1)

In [126]:
cj_n.to_csv('cj_empN.csv')

## 모델링

### 임직원
1. Gradient Boosting

In [127]:
df_emp_y = pd.read_csv('cj_empY_train.csv')
df_emp_y

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty,prime_yn
0,20230101963226,1,3,0,6,39.433806,2,1,0,0,1,0,7.609629,0
1,20230101963235,0,4,0,2,19.234566,2,2,1,0,0,0,9.617283,1
2,20230101963247,1,3,0,6,45.516855,0,3,1,1,0,0,8.070212,1
3,20230101963251,0,3,0,2,10.845621,0,1,0,0,0,1,5.422811,1
4,20230101963253,1,2,0,6,47.014742,0,3,1,1,0,0,8.445540,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4708,20230131216567,0,5,1,6,27.652922,1,2,1,0,0,0,5.733350,1
4709,20230131216610,1,3,1,12,81.994470,6,4,1,0,1,1,8.342528,1
4710,20230131216674,1,4,1,4,37.317843,0,2,0,1,0,0,9.329461,0
4711,20230131216734,1,5,1,3,29.500314,3,3,1,0,1,1,9.833438,1


In [128]:
df_emp_y.set_index('scd',inplace=True)
df_emp_y

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty,prime_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
20230101963226,1,3,0,6,39.433806,2,1,0,0,1,0,7.609629,0
20230101963235,0,4,0,2,19.234566,2,2,1,0,0,0,9.617283,1
20230101963247,1,3,0,6,45.516855,0,3,1,1,0,0,8.070212,1
20230101963251,0,3,0,2,10.845621,0,1,0,0,0,1,5.422811,1
20230101963253,1,2,0,6,47.014742,0,3,1,1,0,0,8.445540,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230131216567,0,5,1,6,27.652922,1,2,1,0,0,0,5.733350,1
20230131216610,1,3,1,12,81.994470,6,4,1,0,1,1,8.342528,1
20230131216674,1,4,1,4,37.317843,0,2,0,1,0,0,9.329461,0
20230131216734,1,5,1,3,29.500314,3,3,1,0,1,1,9.833438,1


In [129]:
# X, y 나누기
X = df_emp_y.drop('prime_yn', axis=1)
y = df_emp_y['prime_yn']

In [130]:
# train, valid 데이터셋 나누기
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(3534, 12) (1179, 12) (3534,) (1179,)


In [131]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# 모델 학습, 파라미터 조정
model_1 = GradientBoostingClassifier()
model_1.fit(X_train, y_train)

# 예측
pred = model_1.predict(X_valid)

# 정확도
accuracy = accuracy_score(y_valid, pred)
print("Accuracy:", accuracy)

# 평가
cfreport = classification_report(y_valid, pred)
print("분류예측 레포트:\n", cfreport)

f1 = f1_score(y_valid, pred)
print("f1 score:\n", f1)

Accuracy: 0.6522476675148431
분류예측 레포트:
               precision    recall  f1-score   support

           0       0.67      0.25      0.37       471
           1       0.65      0.92      0.76       708

    accuracy                           0.65      1179
   macro avg       0.66      0.59      0.56      1179
weighted avg       0.66      0.65      0.60      1179

f1 score:
 0.7599531615925059


파라미터 조정하지 않은 모델이 가장 높은 성능을 보임

2. XGBoost

In [132]:
# XGBoost
import xgboost as xgb

# 모델 학습, 파라미터 조정
model_2 = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.02, max_depth=3, min_child_weight=3, colsample_bytree=0.5, reg_alpha=0.02)
model_2.fit(X_train, y_train)

# 예측
pred = model_2.predict(X_valid)

# 정확도
accuracy = accuracy_score(y_valid, pred)
print("Accuracy:", accuracy)

# 평가
cfreport = classification_report(y_valid, pred)
print("분류예측 레포트:\n", cfreport)

f1 = f1_score(y_valid, pred)
print("f1 score:\n", f1)

Accuracy: 0.6480067854113656
분류예측 레포트:
               precision    recall  f1-score   support

           0       0.63      0.29      0.39       471
           1       0.65      0.89      0.75       708

    accuracy                           0.65      1179
   macro avg       0.64      0.59      0.57      1179
weighted avg       0.64      0.65      0.61      1179

f1 score:
 0.7519426180514047


Gradient Boosting 모델 성능이 더 좋음

### 비임직원
1. Gradient Boosting

In [133]:
df_emp_n = pd.read_csv('cj_empN_train.csv')
df_emp_n

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty,prime_yn
0,20230101963244,0,4,0,3,10.234373,0,1,0,1,0,0,3.411458,1
1,20230101963277,1,3,0,1,9.190036,0,1,0,0,0,1,9.190036,1
2,20230101963302,0,4,0,17,83.446280,0,7,1,1,1,1,7.574948,0
3,20230101963319,1,4,0,23,73.452653,0,3,1,1,1,0,4.093077,0
4,20230101963336,0,3,0,30,119.943496,0,7,1,1,1,1,6.208069,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5935,20230131216709,0,4,1,6,27.526347,0,2,1,0,0,1,7.037744,1
5936,20230131216756,0,4,1,3,18.314758,0,2,0,1,0,0,6.796144,0
5937,20230131216771,0,4,1,2,9.125762,0,1,1,0,0,0,4.562881,1
5938,20230131216842,1,3,1,1,10.176411,1,1,0,0,1,0,10.176411,0


In [134]:
df_emp_n.set_index('scd',inplace=True)
df_emp_n

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty,prime_yn
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
20230101963244,0,4,0,3,10.234373,0,1,0,1,0,0,3.411458,1
20230101963277,1,3,0,1,9.190036,0,1,0,0,0,1,9.190036,1
20230101963302,0,4,0,17,83.446280,0,7,1,1,1,1,7.574948,0
20230101963319,1,4,0,23,73.452653,0,3,1,1,1,0,4.093077,0
20230101963336,0,3,0,30,119.943496,0,7,1,1,1,1,6.208069,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20230131216709,0,4,1,6,27.526347,0,2,1,0,0,1,7.037744,1
20230131216756,0,4,1,3,18.314758,0,2,0,1,0,0,6.796144,0
20230131216771,0,4,1,2,9.125762,0,1,1,0,0,0,4.562881,1
20230131216842,1,3,1,1,10.176411,1,1,0,0,1,0,10.176411,0


In [135]:
# X, y 나누기
X = df_emp_n.drop('prime_yn', axis=1)
y = df_emp_n['prime_yn']

In [136]:
# train, valid 데이터셋 나누기
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(4455, 12) (1485, 12) (4455,) (1485,)


In [137]:
# XGBoost
import xgboost as xgb

# 모델 학습, 파라미터 조정
model_3 = xgb.XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=500)
model_3.fit(X_train, y_train)

# 예측
pred = model_3.predict(X_valid)

# 정확도
accuracy = accuracy_score(y_valid, pred)
print("Accuracy:", accuracy)

# 평가
cfreport = classification_report(y_valid, pred)
print("분류예측 레포트:\n", cfreport)

f1 = f1_score(y_valid, pred)
print("f1 score:\n", f1)

Accuracy: 0.6397306397306397
분류예측 레포트:
               precision    recall  f1-score   support

           0       0.67      0.75      0.71       861
           1       0.59      0.49      0.53       624

    accuracy                           0.64      1485
   macro avg       0.63      0.62      0.62      1485
weighted avg       0.63      0.64      0.63      1485

f1 score:
 0.5335658238884046


## prime_yn 예측
### 임직원

In [138]:
test_data_emp_y = pd.read_csv('cj_empY.csv')
test_data_emp_y

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty
0,20230101963226,1,3,0,8,37.321156,0,2,0,1,1,0,4.665145
1,20230101963229,1,5,0,1,9.003808,0,1,0,0,1,0,9.003808
2,20230101963246,1,2,0,2,9.574080,0,1,0,1,0,0,4.787040
3,20230101963265,0,3,0,5,43.665790,0,3,1,1,1,0,8.733158
4,20230101963284,0,4,0,3,24.197278,0,2,1,0,0,1,8.065759
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2682,20230131215141,1,4,1,11,21.044025,0,1,0,0,0,1,5.510522
2683,20230131215302,0,4,1,2,18.579710,0,2,0,0,0,1,9.289855
2684,20230131215537,1,3,1,1,8.847216,0,1,0,1,0,0,8.847216
2685,20230131215893,1,3,1,1,8.302266,0,1,0,0,1,0,8.302266


In [139]:
test_data_emp_y.set_index('scd',inplace=True)
test_data_emp_y

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20230101963226,1,3,0,8,37.321156,0,2,0,1,1,0,4.665145
20230101963229,1,5,0,1,9.003808,0,1,0,0,1,0,9.003808
20230101963246,1,2,0,2,9.574080,0,1,0,1,0,0,4.787040
20230101963265,0,3,0,5,43.665790,0,3,1,1,1,0,8.733158
20230101963284,0,4,0,3,24.197278,0,2,1,0,0,1,8.065759
...,...,...,...,...,...,...,...,...,...,...,...,...
20230131215141,1,4,1,11,21.044025,0,1,0,0,0,1,5.510522
20230131215302,0,4,1,2,18.579710,0,2,0,0,0,1,9.289855
20230131215537,1,3,1,1,8.847216,0,1,0,1,0,0,8.847216
20230131215893,1,3,1,1,8.302266,0,1,0,0,1,0,8.302266


In [140]:
# 임직원 : Gradient Boosting 모델로 예측 수행
X_test = test_data_emp_y.copy()
pred = model_1.predict(X_test)
pred

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [141]:
# scd & 예측값 데이터프레임
scd = test_data_emp_y.index.tolist()
pred_emp_y = pd.DataFrame(pred,index = scd)
pred_emp_y

Unnamed: 0,0
20230101963226,1
20230101963229,1
20230101963246,0
20230101963265,1
20230101963284,1
...,...
20230131215141,1
20230131215302,1
20230131215537,1
20230131215893,1


In [142]:
# 컬럼 이름 변경
pred_emp_y.rename(columns={0: 'prime_yn_1'},inplace=True)
pred_emp_y = pred_emp_y.rename_axis('scd')
pred_emp_y

Unnamed: 0_level_0,prime_yn_1
scd,Unnamed: 1_level_1
20230101963226,1
20230101963229,1
20230101963246,0
20230101963265,1
20230101963284,1
...,...
20230131215141,1
20230131215302,1
20230131215537,1
20230131215893,1


### 비임직원

In [143]:
test_data_emp_n = pd.read_csv('cj_empN.csv')
test_data_emp_n

Unnamed: 0,scd,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty
0,20230101963221,0,3,0,1,7.959975,0,1,0,1,0,0,7.959975
1,20230101963286,0,3,0,3,9.857496,0,1,1,0,0,0,3.285832
2,20230101963302,0,4,0,2,16.855427,0,2,0,1,0,0,8.427714
3,20230101963306,0,3,0,14,91.066060,0,4,1,1,1,0,7.163753
4,20230101963315,0,3,0,4,10.185843,0,1,0,1,0,0,2.546461
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3636,20230131216228,0,3,1,2,9.021961,0,1,0,0,0,1,4.510980
3637,20230131216338,1,4,1,3,16.394430,0,2,1,0,1,0,6.092155
3638,20230131216371,0,3,1,2,16.019650,0,2,1,1,0,0,8.009825
3639,20230131216446,0,4,1,1,8.655214,0,1,0,0,0,1,8.655214


In [144]:
test_data_emp_n.set_index('scd',inplace=True)
test_data_emp_n

Unnamed: 0_level_0,gender,age_grp,weekday,total_qty,total_amt,event_product,num_category,ctg_0,ctg_1,ctg_2,ctg_3,amt/qty
scd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20230101963221,0,3,0,1,7.959975,0,1,0,1,0,0,7.959975
20230101963286,0,3,0,3,9.857496,0,1,1,0,0,0,3.285832
20230101963302,0,4,0,2,16.855427,0,2,0,1,0,0,8.427714
20230101963306,0,3,0,14,91.066060,0,4,1,1,1,0,7.163753
20230101963315,0,3,0,4,10.185843,0,1,0,1,0,0,2.546461
...,...,...,...,...,...,...,...,...,...,...,...,...
20230131216228,0,3,1,2,9.021961,0,1,0,0,0,1,4.510980
20230131216338,1,4,1,3,16.394430,0,2,1,0,1,0,6.092155
20230131216371,0,3,1,2,16.019650,0,2,1,1,0,0,8.009825
20230131216446,0,4,1,1,8.655214,0,1,0,0,0,1,8.655214


In [145]:
# 비임직원 : XGBoost 모델로 예측 수행
X_test = test_data_emp_n.copy()
pred = model_3.predict(X_test)
pred

array([0, 1, 0, ..., 0, 0, 0])

In [146]:
# scd & 예측값 데이터프레임
scd = test_data_emp_n.index.tolist()
pred_emp_n = pd.DataFrame(pred,index = scd)
pred_emp_n

Unnamed: 0,0
20230101963221,0
20230101963286,1
20230101963302,0
20230101963306,0
20230101963315,0
...,...
20230131216228,0
20230131216338,0
20230131216371,0
20230131216446,0


In [147]:
# 인덱스, 컬럼 이름 변경
pred_emp_n.rename(columns={0: 'prime_yn_2'},inplace=True)
pred_emp_n = pred_emp_n.rename_axis('scd')
pred_emp_n

Unnamed: 0_level_0,prime_yn_2
scd,Unnamed: 1_level_1
20230101963221,0
20230101963286,1
20230101963302,0
20230101963306,0
20230101963315,0
...,...
20230131216228,0
20230131216338,0
20230131216371,0
20230131216446,0


In [148]:
# test data scd 추출
test_data = pd.read_csv('tmk_bda_test.csv')
test_data.set_index('scd',inplace=True)
test_data = test_data.drop(columns=test_data.columns)
test_data

20230101964282
20230101970142
20230101965237
20230102973798
20230101965633
...
20230103986821
20230102979023
20230103989581
20230104999357
20230103989710


In [149]:
# test data 주문번호에 맞게 예측값 정렬 후 csv 파일로 저장
submission = pd.concat([test_data, pred_emp_y, pred_emp_n], axis=1)
submission

Unnamed: 0_level_0,prime_yn_1,prime_yn_2
scd,Unnamed: 1_level_1,Unnamed: 2_level_1
20230101964282,,1.0
20230101970142,1.0,
20230101965237,,0.0
20230102973798,,0.0
20230101965633,,0.0
...,...,...
20230103986821,1.0,
20230102979023,1.0,
20230103989581,0.0,
20230104999357,,0.0


In [150]:
# prime_yn 컬럼 합치기
submission['prime_yn'] = submission['prime_yn_1'].fillna(0.0) + submission['prime_yn_2'].fillna(0.0)
submission.drop(['prime_yn_1', 'prime_yn_2'], axis=1, inplace=True)
submission['prime_yn'] = submission['prime_yn'].astype(int)
submission

Unnamed: 0_level_0,prime_yn
scd,Unnamed: 1_level_1
20230101964282,1
20230101970142,1
20230101965237,0
20230102973798,0
20230101965633,0
...,...
20230103986821,1
20230102979023,1
20230103989581,0
20230104999357,0


In [151]:
submission['prime_yn'].value_counts()

1    11004
0     8656
Name: prime_yn, dtype: int64

In [152]:
submission.reset_index(drop=False, inplace=True)
submission

Unnamed: 0,scd,prime_yn
0,20230101964282,1
1,20230101970142,1
2,20230101965237,0
3,20230102973798,0
4,20230101965633,0
...,...,...
19655,20230103986821,1
19656,20230102979023,1
19657,20230103989581,0
19658,20230104999357,0


In [153]:
submission.to_csv('submission.csv', index=False)