In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
df = pd.read_csv('data/nike_dunk(raw).csv')
df.head(3)

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,"125,000원",21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
1,240,"123,000원",21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
2,245,"129,000원",21/10/01,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"


In [3]:
df.value_counts()

size  price     date      product                                    release_date  color                                  release_price
270   600,000원  21/08/12  Nike x Off-White Dunk Low The 50 - Lot 18  21/08/09      SAIL/NEUTRALGREY                       219,000원         24
265   440,000원  21/08/18  Nike x Off-White Dunk Low The 50 - Lot 46  21/08/16      SAIL/NEUTRALGREY                       219,000원         20
270   330,000원  21/01/23  Nike Dunk Low Retro Black                  21/01/14      WHITE/BLACK                            119,000원         20
      430,000원  21/08/18  Nike x Off-White Dunk Low The 50 - Lot 46  21/08/16      SAIL/NEUTRALGREY                       219,000원         19
275   240,000원  21/08/17  Nike Dunk Low SE Seoul                     21/08/12      BLACK/WHITE-RED-BLUE                   129,000원         19
                                                                                                                                           ..
250   254,00

In [4]:
df.shape

(223942, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223942 entries, 0 to 223941
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   size           223942 non-null  object
 1   price          223942 non-null  object
 2   date           223942 non-null  object
 3   product        223942 non-null  object
 4   release_date   223942 non-null  object
 5   color          223942 non-null  object
 6   release_price  223942 non-null  object
dtypes: object(7)
memory usage: 12.0+ MB


In [6]:
# null 값 보기 
tot = df.isnull().sum().sort_values(ascending=False)
pct = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([tot, pct], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
size,0,0.0
price,0,0.0
date,0,0.0
product,0,0.0
release_date,0,0.0
color,0,0.0
release_price,0,0.0


###  Preprocessing

In [7]:
# size 컬럼에서 사이즈만 추출  

def size (x) :
    return str(x)[:3]

df['size'] = df["size"].apply(size)

In [8]:
df['size'] = df['size'].apply(int)

In [9]:
# price 컬럼  " , " 과 " 원 " 제거 

import re

def extract_num (num_str) :
    if type(num_str) == str :
        num_str = re.sub('[^0-9]','',num_str)
    return num_str

df['price'] = df['price'].apply(extract_num)
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
1,240,123000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
2,245,129000,21/10/01,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
3,275,129000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
4,240,132000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"


In [10]:
df['price'] = df['price'].apply(int)

In [11]:
# "통화기호 , 한글제거 ""

def NotWon(x):
    if '약' in x:
        r = x.find('약')
        return x[r+1:-2]
    else:
        return x[:-1]

df['release_price'] = df['release_price'].apply(NotWon)

In [12]:
# 가격누락된 컬럼 값 추가 (재확인 필요)
df.loc[df['release_price'] == '', 'release_price']  = '115,000'

In [13]:
# 특수문자 제거 
df['release_price'] = df['release_price'].apply(extract_num)
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
1,240,123000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
2,245,129000,21/10/01,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
3,275,129000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
4,240,132000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000


In [14]:
df['release_price'] = df['release_price'].apply(int)

In [15]:
# date , release_date 컬럼 변경 

df['date'] = df["date"].str.replace('/','-')
df['release_date'] = df['release_date'].str.replace('/','-')

In [16]:
# yyyy -mm-dd 형식으로 변환 
df['date'] = pd.to_datetime(df['date'], format='%y-%m-%d')

In [17]:
#release_date 누락된 값 추가 
df.loc[df['release_date'] == '-', 'release_date']  = '21-07-22'

In [18]:
# yyyy -mm-dd 형식으로 변환 
df['release_date'] = pd.to_datetime(df['release_date'], format='%y-%m-%d')

In [19]:
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000


In [20]:
# # 향후 모델링을 고려해 날짜 형식을 Unix 형식으로 변환 

# import datetime as dt

# df['date'] = df['date'].map(dt.datetime.toordinal)
# df['release_date'] = df['release_date'].map(dt.datetime.toordinal)

In [21]:
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000


In [22]:
products = pd.read_csv('data/products/products_nike_dunk.csv')

In [23]:
products.drop('Unnamed: 0', axis=1, inplace=True)

In [24]:
print(products.shape)
products.head()

(360, 6)


Unnamed: 0,product,brand,category,number,wish,review
0,Nike Dunk Low Retro Black,Nike,Nike Dunk,28029,5.2만,7820
1,(W) Nike Dunk Low Black,Nike,Nike Dunk,28030,3.8만,2947
2,Nike Dunk Low Retro Grey Fog,Nike,Nike Dunk,41193,1.6만,84
3,(GS) Nike Dunk Low Black,Nike,Nike Dunk,28229,1.7만,1061
4,Nike Dunk High Retro Championship Navy,Nike,Nike Dunk,39628,1.6만,291


In [25]:
products['review'] = products['review'].apply(extract_num).apply(int)

In [26]:
float(products['wish'][0][:-1])

5.2

In [27]:
def number_transform(x):
    if '만' in x:
        return float(x[:-1])*10000
    else:
        return x

In [28]:
products['wish'] = products['wish'].apply(number_transform)

In [29]:
products['wish'] = products['wish'].apply(extract_num).apply(int)

In [30]:
dataset = pd.merge(df, products, how='left', on='product')
print(df.shape)
dataset.head()

(223942, 7)


Unnamed: 0,size,price,date,product,release_date,color,release_price,brand,category,number,wish,review
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0


In [31]:
transactions = dataset['product'].value_counts()

In [32]:
transactions = pd.DataFrame({'product':transactions.index , 'transactions':transactions.values})

In [33]:
dataset = pd.merge(dataset, transactions, how='left', on='product')

In [34]:
dataset

Unnamed: 0,size,price,date,product,release_date,color,release_price,brand,category,number,wish,review,transactions
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224187,240,238000,2022-01-06,(W) Nike Dunk High Black and White,2021-05-01,WHITE/BLACK-UNIVERSITYRED,129000,Nike,Nike Dunk,33866.0,9875.0,268.0,2950
224188,235,223000,2022-01-06,(W) Nike Dunk High Black and White,2021-05-01,WHITE/BLACK-UNIVERSITYRED,129000,Nike,Nike Dunk,33866.0,9875.0,268.0,2950
224189,255,260000,2022-01-06,(W) Nike Dunk High Black and White,2021-05-01,WHITE/BLACK-UNIVERSITYRED,129000,Nike,Nike Dunk,33866.0,9875.0,268.0,2950
224190,245,248000,2022-01-07,(W) Nike Dunk High Black and White,2021-05-01,WHITE/BLACK-UNIVERSITYRED,129000,Nike,Nike Dunk,33866.0,9875.0,268.0,2950


In [35]:
dataset['transactions'].value_counts()

1450     23200
18150    18150
4450      8900
7758      7758
6700      6700
         ...  
1           23
23          23
2           22
9           18
18          18
Name: transactions, Length: 211, dtype: int64

In [36]:
## 콜라보 떼어내기

In [37]:
# dataset['product'][99999]

In [38]:
# collabo = 'Nike x '
# r1 = dataset['product'][99999].find(string)
# r2 = dataset['product'][99999][r1+len(string):].find(' ')
# dataset['product'][99999][r1+len(string):r1+len(string)+r2]

In [39]:
# dataset.info()

In [40]:
dataset = dataset.reset_index(drop=True)

In [41]:
set(dataset['product'])

{'(GS) Nike Dunk High Anthracite',
 '(GS) Nike Dunk High Barely Green',
 '(GS) Nike Dunk High Black and White',
 '(GS) Nike Dunk High Cargo Khaki',
 '(GS) Nike Dunk High Championship Navy',
 '(GS) Nike Dunk High Game Royal',
 '(GS) Nike Dunk High Knicks',
 '(GS) Nike Dunk High Orange Blaze',
 '(GS) Nike Dunk High Retro Vast Grey',
 '(GS) Nike Dunk High SE First Use Sail',
 '(GS) Nike Dunk Low Black',
 '(GS) Nike Dunk Low Bright Crimson Game Royal',
 '(GS) Nike Dunk Low Championship Grey',
 '(GS) Nike Dunk Low Light Bone Tropical Twist',
 '(GS) Nike Dunk Low PRM Halloween',
 '(GS) Nike Dunk Low Pink Foam',
 '(GS) Nike Dunk Low Pink Velvet',
 '(GS) Nike Dunk Low Retro Championship Red',
 '(GS) Nike Dunk Low Retro Hyper Cobalt',
 '(GS) Nike Dunk Low SE Free.99 Black',
 '(GS) Nike Dunk Low SE Free.99 White',
 '(GS) Nike Dunk Low SE Next Nature Sequoia Olive',
 '(GS) Nike Dunk Low SE Sail Multi Camo',
 '(GS) Nike Dunk Low Ueno Panda',
 '(GS) Nike Dunk Low University Blue',
 '(GS) Nike Dunk 

In [42]:
dataset['product'].value_counts().head(20)

Nike Dunk Low Retro Black                    18150
Nike Dunk Low SE Seoul                        7758
(W) Nike Dunk Low Black                       6700
Nike x Ambush Dunk High Deep Royal Blue       4450
Nike Dunk Low Retro University Blue           4450
(W) Nike Dunk Low Light Bone                  4400
Nike x Kasina Dunk Low 80's Bus               4345
(GS) Nike Dunk Low Black                      4100
Nike Dunk Low Retro Varsity Green             4050
Nike x Off-White Dunk Low The 50 - Lot 18     3800
Nike Dunk Low Retro Medium Grey               3107
Nike x Off-White Dunk Low The 50 - Lot 46     3100
Nike Dunk Low Retro Championship Red          3000
(PS) Nike Dunk Low Black                      2985
(W) Nike Dunk High Black and White            2950
Nike Dunk High SP Varsity Maize               2900
Nike SB Dunk Low Pro Chicago                  2900
Nike Dunk High Retro Championship Navy        2853
(W) Nike Dunk Low Next Nature Pale Coral      2837
(W) Nike Dunk Low Triple White 

In [43]:
dataset.drop_duplicates()['product'].value_counts().head(20)

Nike Dunk Low Retro Black                    8290
Nike Dunk Low SE Seoul                       4480
(W) Nike Dunk Low Black                      4120
Nike x Kasina Dunk Low 80's Bus              3710
Nike x Ambush Dunk High Deep Royal Blue      3512
Nike Dunk Low Retro University Blue          3140
(GS) Nike Dunk Low Black                     2843
(W) Nike Dunk Low Light Bone                 2799
Nike SB Dunk Low Pro Chicago                 2789
(W) Nike Dunk High Black and White           2565
(PS) Nike Dunk Low Black                     2463
Nike Dunk Low Retro Varsity Green            2424
Nike x Off-White Dunk Low The 50 - Lot 18    2377
Nike Dunk Low Retro Medium Grey              2293
Nike Dunk High SP Varsity Maize              2229
Nike x Ambush Dunk High Black                2185
Nike Dunk Low Retro Championship Red         2165
Nike Dunk High Retro Championship Navy       1960
(W) Nike Dunk Low Triple White               1939
Nike Dunk High SP Maize & Blue               1905


In [44]:
# csv 파일로 저장 

dataset.to_csv('data/nike_dunk(pre).csv')