In [1]:
import numpy as np
import pandas as pd

# Load data

In [2]:
sales_train = pd.read_excel('sales_train.xlsx', header=1)
sales_test = pd.read_excel('sales_test.xlsx', header=1)

In [3]:
sales_train

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액
0,2019-01-01 06:00:00,20.000000,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0
1,2019-01-01 06:00:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0
2,2019-01-01 06:20:00,20.000000,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0
3,2019-01-01 06:20:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0
4,2019-01-01 06:40:00,20.000000,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0
...,...,...,...,...,...,...,...,...
38304,2020-01-01 00:20:00,20.000000,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,0,
38305,2020-01-01 00:40:00,20.000000,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,0,
38306,2020-01-01 01:00:00,20.000000,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,0,
38307,2020-01-01 01:20:00,20.000000,100490,201478,더케이 예다함 상조서비스(티포트),무형,0,


In [4]:
sales_train.shape

(38309, 8)

In [5]:
sales_train.nunique()

방송일시     21525
노출(분)      103
마더코드       716
상품코드      2124
상품명       1770
상품군         12
판매단가       391
취급액      24140
dtype: int64

In [6]:
sales_test

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액
0,2020-06-01 06:20:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,
1,2020-06-01 06:40:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,
2,2020-06-01 07:00:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,
3,2020-06-01 07:20:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,
4,2020-06-01 07:40:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,
...,...,...,...,...,...,...,...,...
2886,2020-07-01 00:20:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0,
2887,2020-07-01 00:40:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0,
2888,2020-07-01 01:00:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0,
2889,2020-07-01 01:20:00,20.000000,100261,200875,아놀드파마 티셔츠레깅스세트,의류,69900,


In [7]:
sales_test.shape

(2891, 8)

In [8]:
sales_test.nunique()

방송일시     1780
노출(분)      21
마더코드      225
상품코드      417
상품명       377
상품군        12
판매단가      152
취급액         0
dtype: int64

# Preprocessing

## Remove data with price==0

In [9]:
sales_test = sales_test.loc[sales_test['판매단가']!=0,]
sales_test.shape

(2716, 8)

## Replace items that are not in train data

In [10]:
lst1 = list(set(sales_test['마더코드'].unique()).intersection(set(sales_train['마더코드'].unique())))
lst2 = list(set(sales_test['상품코드'].unique()).intersection(set(sales_train['상품코드'].unique())))
lst3 = list(set(sales_test['상품명'].unique()).intersection(set(sales_train['상품명'].unique())))

In [11]:
sales_test_replace = sales_test.copy()
sales_test_replace = sales_test_replace.reset_index(drop=True)

mother_code = []
product_code = []
product_name = []


for i in range(len(sales_test_replace)):
    
    if sales_test_replace['마더코드'][i] in lst1:
        temp1 = 0
    else:
        temp1 = 1
        
    if sales_test_replace['상품코드'][i] in lst2:
        temp2 = 0
    else:
        temp2 = 1
        
    if sales_test_replace['상품명'][i] in lst3:
        temp3 = 0
    else:
        temp3 = 1
        
    mother_code.append(temp1)
    product_code.append(temp2)
    product_name.append(temp3)

In [12]:
mother_code = np.array(mother_code)
product_code = np.array(product_code)
product_name = np.array(product_name)

In [13]:
sales_test_replace = sales_test_replace.assign(mother_code=mother_code, product_code=product_code, product_name=product_name,
                                               tt_sum=mother_code+product_code+product_name)

In [14]:
sales_test_replace

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,mother_code,product_code,product_name,tt_sum
0,2020-06-01 06:20:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,,1,1,1,3
1,2020-06-01 06:40:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,,1,1,1,3
2,2020-06-01 07:00:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,,1,1,1,3
3,2020-06-01 07:20:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,0,1,0,1
4,2020-06-01 07:40:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2711,2020-07-01 00:10:00,10.000000,100099,200273,[일시불]라쉬반 FC바로셀로나 드로즈 패키지,속옷,99000,,0,1,1,2
2712,2020-07-01 00:10:00,,100099,200272,[무이자]라쉬반 FC바로셀로나 드로즈 패키지,속옷,119000,,0,1,1,2
2713,2020-07-01 00:10:00,,100099,200274,라쉬반 FC바로셀로나 드로즈 8종,속옷,119000,,0,1,1,2
2714,2020-07-01 01:20:00,20.000000,100261,200875,아놀드파마 티셔츠레깅스세트,의류,69900,,1,1,1,3


In [15]:
len(sales_test_replace.loc[sales_test_replace['tt_sum']==0,])

181

- 마더코드,상품코드,상품명 모두 train에도 있는 경우 = 181개

In [16]:
sales_test_replace.loc[sales_test_replace['tt_sum']==1,][['mother_code','product_code','product_name']].apply(sum)

mother_code       0
product_code    272
product_name      0
dtype: int64

- 마더코드가 test에만 있는 경우(상품코드,상품명은 train에도 있음) = 0개
- 상품코드가 test에만 있는 경우(마더코드,상품명은 train에도 있음) = 272개
- 상품명이 test에만 있는 경우(마더코드,상품코드는 train에도 있음) = 0개

In [17]:
print(len(sales_test_replace.loc[sales_test_replace['tt_sum']==2,].loc[(sales_test_replace['mother_code']==1) & (sales_test_replace['product_code']==1),]))
print(len(sales_test_replace.loc[sales_test_replace['tt_sum']==2,].loc[(sales_test_replace['mother_code']==1) & (sales_test_replace['product_name']==1),]))
print(len(sales_test_replace.loc[sales_test_replace['tt_sum']==2,].loc[(sales_test_replace['product_code']==1) & (sales_test_replace['product_name']==1),]))

0
0
1066


- 마더코드,상품코드가 test에만 있는 경우(상품명은 train에도 있음) = 0개
- 마더코드,상품명이 test에만 있는 경우(상품코드는 train에도 있음) = 0개
- 상품코드,상품명이 test에만 있는 경우(마더코드는 train에도 있음) = 1066개

In [18]:
len(sales_test_replace.loc[sales_test_replace['tt_sum']==3,])

1197

- 마더코드,상품코드,상품명 모두 test에만 있는 경우 = 1197개

- 정리
 - 마더코드,상품코드,상품명 모두 train에도 있는 경우 = 181개(1순위, 별도로 추정할 칼럼 없음)
 - 상품코드가 test에만 있는 경우(마더코드,상품명은 train에도 있음) = 272개(2순위, 마더코드와 상품명 이용하여 상품코드 추정)
 - 상품코드,상품명이 test에만 있는 경우(마더코드는 train에도 있음) = 1066개(3순위, 마더코드를 이용하여 상품코드,상품명 추정)
 - 마더코드,상품코드,상품명 모두 test에만 있는 경우 = 1197개(4순위, 추정 불가능; 별도의 모델이 필요)

In [19]:
sales_test_replace1 = sales_test_replace.loc[sales_test_replace['tt_sum']==0,].reset_index(drop=True)
sales_test_replace2 = sales_test_replace.loc[(sales_test_replace['tt_sum']==1)&(sales_test_replace['product_code']==1),].reset_index(drop=True)
sales_test_replace3 = sales_test_replace.loc[(sales_test_replace['tt_sum']==2)&(sales_test_replace['product_code']==1)&((sales_test_replace['product_name']==1)),].reset_index(drop=True)
sales_test_replace4 = sales_test_replace.loc[sales_test_replace['tt_sum']==3,].reset_index(drop=True)

### sales_test_replace2

In [20]:
for i in range(len(sales_test_replace2)):
    refer1 = sales_train.loc[(sales_train['상품명']==sales_test_replace2['상품명'][i])&(sales_train['마더코드']==sales_test_replace2['마더코드'][i]),]
    if len(refer1)!=0:
        sales_test_replace2['상품명'][i] = refer1.iloc[0,4]
    else:
        refer2 = sales_train.loc[sales_train['상품명']==sales_test_replace2['상품명'][i],]
        if len(refer2)!=0:
            sales_test_replace2['상품명'][i] = refer2.iloc[0,4]
        else:
            refer3 = sales_train.loc[sales_train['마더코드']==sales_test_replace2['마더코드'][i],]
            sales_test_replace2['상품명'][i] = refer3.iloc[0,4]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_test_replace2['상품명'][i] = refer1.iloc[0,4]


In [21]:
set(sales_test_replace2['상품명'].unique()).difference(set(sales_train['상품명'].unique())) # checking

set()

### sales_test_replace3 

In [22]:
for i in range(len(sales_test_replace3)):
    refer = sales_train.loc[sales_train['마더코드']==sales_test_replace3['마더코드'][i],]
    sales_test_replace3['상품코드'][i] = refer.iloc[0,3]
    sales_test_replace3['상품명'][i] = refer.iloc[0,4]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_test_replace3['상품코드'][i] = refer.iloc[0,3]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_test_replace3['상품명'][i] = refer.iloc[0,4]


In [23]:
set(sales_test_replace3['상품코드'].unique()).difference(set(sales_train['상품코드'].unique())) # checking

set()

In [24]:
set(sales_test_replace3['상품명'].unique()).difference(set(sales_train['상품명'].unique())) # checking

set()

In [25]:
sales_test_replace_temp = pd.concat([sales_test_replace1,sales_test_replace2,sales_test_replace3])
sales_test_replace_temp = sales_test_replace_temp.assign(model=0)
sales_test_replace4 = sales_test_replace4.assign(model=1)

In [26]:
sales_test_replace = pd.concat([sales_test_replace_temp,sales_test_replace4])

In [27]:
sales_test_replace.shape

(2716, 13)

In [28]:
sales_test_replace.columns

Index(['방송일시', '노출(분)', '마더코드', '상품코드', '상품명', '상품군', '판매단가', '취급액',
       'mother_code', 'product_code', 'product_name', 'tt_sum', 'model'],
      dtype='object')

In [29]:
sales_test_replace.drop(['mother_code','product_code','product_name','tt_sum'], axis=1, inplace=True)

In [30]:
sales_test_replace.to_csv('sales_test_replace.csv', index=False)