In [52]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

In [124]:
df = pd.read_csv('data/nike_dunk_low(raw).csv')
df.head(3)

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,"125,000원",21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
1,240,"123,000원",21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
2,245,"129,000원",21/10/01,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"


In [125]:
df.value_counts()

size  price     date      product                                    release_date  color                 release_price  
270   600,000원  21/08/12  Nike x Off-White Dunk Low The 50 - Lot 18  21/08/09      SAIL/NEUTRALGREY      219,000원           24
      330,000원  21/01/23  Nike Dunk Low Retro Black                  21/01/14      WHITE/BLACK           119,000원           20
265   440,000원  21/08/18  Nike x Off-White Dunk Low The 50 - Lot 46  21/08/16      SAIL/NEUTRALGREY      219,000원           20
275   240,000원  21/08/17  Nike Dunk Low SE Seoul                     21/08/12      BLACK/WHITE-RED-BLUE  129,000원           19
270   430,000원  21/08/18  Nike x Off-White Dunk Low The 50 - Lot 46  21/08/16      SAIL/NEUTRALGREY      219,000원           19
                                                                                                                            ..
250   415,000원  21/12/02  Nike Dunk Low Retro PRM Halloween          21/10/21      SAIL/STARFISH-BLACK   €120(약160,90

In [126]:
df.shape

(173732, 7)

In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173732 entries, 0 to 173731
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   size           173732 non-null  object
 1   price          173732 non-null  object
 2   date           173732 non-null  object
 3   product        173732 non-null  object
 4   release_date   173732 non-null  object
 5   color          173732 non-null  object
 6   release_price  173732 non-null  object
dtypes: object(7)
memory usage: 9.3+ MB


In [128]:
# null 값 보기 
tot = df.isnull().sum().sort_values(ascending=False)
pct = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([tot, pct], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
size,0,0.0
price,0,0.0
date,0,0.0
product,0,0.0
release_date,0,0.0
color,0,0.0
release_price,0,0.0


###  Preprocessing

In [129]:
# size 컬럼에서 사이즈만 추출  

def size (x) :
    return str(x)[:3]

df['size'] = df["size"].apply(size)

In [130]:
df['size'] = df['size'].apply(int)

In [131]:
# price 컬럼  " , " 과 " 원 " 제거 

import re

def extract_num (num_str) :
    if type(num_str) == str :
        num_str = re.sub('[^0-9]','',num_str)
    return num_str

df['price'] = df['price'].apply(extract_num)
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
1,240,123000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
2,245,129000,21/10/01,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
3,275,129000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"
4,240,132000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,"129,000원"


In [132]:
df['price'] = df['price'].apply(int)

In [133]:
# "통화기호 , 한글제거 ""

def NotWon(x):
    if '약' in x:
        r = x.find('약')
        return x[r+1:-2]
    else:
        return x[:-1]

df['release_price'] = df['release_price'].apply(NotWon)

In [134]:
# 가격누락된 컬럼 값 추가 (재확인 필요)
df.loc[df['release_price'] == '', 'release_price']  = '115,000'

In [135]:
# 특수문자 제거 
df['release_price'] = df['release_price'].apply(extract_num)
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
1,240,123000,21/09/30,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
2,245,129000,21/10/01,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
3,275,129000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000
4,240,132000,21/10/04,(W) Nike Dunk Low SE Primal Black,21/11/07,BLACK/MULTI-COLOR/BLACK,129000


In [136]:
df['release_price'] = df['release_price'].apply(int)

In [137]:
# date , release_date 컬럼 변경 

df['date'] = df["date"].str.replace('/','-')
df['release_date'] = df['release_date'].str.replace('/','-')

In [138]:
# yyyy -mm-dd 형식으로 변환 
df['date'] = pd.to_datetime(df['date'], format='%y-%m-%d')

In [139]:
#release_date 누락된 값 추가 
df.loc[df['release_date'] == '-', 'release_date']  = '21-07-22'

In [140]:
# yyyy -mm-dd 형식으로 변환 
df['release_date'] = pd.to_datetime(df['release_date'], format='%y-%m-%d')

In [141]:
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000


In [142]:
# # 향후 모델링을 고려해 날짜 형식을 Unix 형식으로 변환 

# import datetime as dt

# df['date'] = df['date'].map(dt.datetime.toordinal)
# df['release_date'] = df['release_date'].map(dt.datetime.toordinal)

In [143]:
df.head()

Unnamed: 0,size,price,date,product,release_date,color,release_price
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000


In [147]:
products = pd.read_csv('data/products/products_nike_dunk.csv')

In [148]:
products.drop('Unnamed: 0', axis=1, inplace=True)

In [149]:
print(products.shape)
products.head()

(360, 6)


Unnamed: 0,product,brand,category,number,wish,review
0,Nike Dunk Low Retro Black,Nike,Nike Dunk,28029,5.2만,7820
1,(W) Nike Dunk Low Black,Nike,Nike Dunk,28030,3.8만,2947
2,Nike Dunk Low Retro Grey Fog,Nike,Nike Dunk,41193,1.6만,84
3,(GS) Nike Dunk Low Black,Nike,Nike Dunk,28229,1.7만,1061
4,Nike Dunk High Retro Championship Navy,Nike,Nike Dunk,39628,1.6만,291


In [150]:
products['review'] = products['review'].apply(extract_num).apply(int)

In [151]:
float(products['wish'][0][:-1])

5.2

In [152]:
def number_transform(x):
    if '만' in x:
        return float(x[:-1])*10000
    else:
        return x

In [153]:
products['wish'] = products['wish'].apply(number_transform)

In [154]:
products['wish'] = products['wish'].apply(extract_num).apply(int)

In [155]:
dataset = pd.merge(df, products, how='left', on='product')
print(df.shape)
dataset.head()

(173732, 7)


Unnamed: 0,size,price,date,product,release_date,color,release_price,brand,category,number,wish,review
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0


In [156]:
transactions = dataset['product'].value_counts()

In [157]:
transactions = pd.DataFrame({'product':transactions.index , 'transactions':transactions.values})

In [159]:
dataset = pd.merge(dataset, transactions, how='left', on='product')

In [160]:
dataset

Unnamed: 0,size,price,date,product,release_date,color,release_price,brand,category,number,wish,review,transactions
0,245,125000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
1,240,123000,2021-09-30,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
2,245,129000,2021-10-01,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
3,275,129000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
4,240,132000,2021-10-04,(W) Nike Dunk Low SE Primal Black,2021-11-07,BLACK/MULTI-COLOR/BLACK,129000,Nike,Nike Dunk,41947.0,93.0,4.0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173977,260,231000,2021-07-15,Nike SB Dunk Low Pro Midnight Navy,2020-11-17,MIDNIGHTNAVY/BLACK/MIDNIGHTNAVY,119000,,,,,,132
173978,260,250000,2021-08-17,Nike SB Dunk Low Pro Midnight Navy,2020-11-17,MIDNIGHTNAVY/BLACK/MIDNIGHTNAVY,119000,,,,,,132
173979,245,280000,2021-09-26,Nike SB Dunk Low Pro Midnight Navy,2020-11-17,MIDNIGHTNAVY/BLACK/MIDNIGHTNAVY,119000,,,,,,132
173980,280,250000,2021-10-05,Nike SB Dunk Low Pro Midnight Navy,2020-11-17,MIDNIGHTNAVY/BLACK/MIDNIGHTNAVY,119000,,,,,,132


In [161]:
dataset['transactions'].value_counts()

18150    18150
1450     13050
7758      7758
6700      6700
4450      4450
         ...  
18          18
14          14
1           14
2           12
5           10
Name: transactions, Length: 174, dtype: int64

In [86]:
## 콜라보 떼어내기

In [87]:
# dataset['product'][99999]

'Nike x Parra SB Dunk Low Pro Abstract Art'

In [88]:
# collabo = 'Nike x '
# r1 = dataset['product'][99999].find(string)
# r2 = dataset['product'][99999][r1+len(string):].find(' ')
# dataset['product'][99999][r1+len(string):r1+len(string)+r2]

'Parra'

In [45]:
# dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145554 entries, 0 to 145553
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   size           145554 non-null  int64         
 1   price          145554 non-null  int64         
 2   date           145554 non-null  datetime64[ns]
 3   product        145554 non-null  object        
 4   release_date   145554 non-null  datetime64[ns]
 5   color          145554 non-null  object        
 6   release_price  145554 non-null  int64         
 7   number         145553 non-null  float64       
 8   wish           145553 non-null  float64       
 9   review         145553 non-null  float64       
 10  transactions   145554 non-null  int64         
dtypes: datetime64[ns](2), float64(3), int64(4), object(2)
memory usage: 12.2+ MB


In [162]:
dataset = dataset.reset_index(drop=True)

In [164]:
set(dataset['product'])

{'(GS) Nike Dunk Low Black',
 '(GS) Nike Dunk Low Bright Crimson Game Royal',
 '(GS) Nike Dunk Low Championship Grey',
 '(GS) Nike Dunk Low Light Bone Tropical Twist',
 '(GS) Nike Dunk Low PRM Halloween',
 '(GS) Nike Dunk Low Pink Foam',
 '(GS) Nike Dunk Low Pink Velvet',
 '(GS) Nike Dunk Low Retro Championship Red',
 '(GS) Nike Dunk Low Retro Hyper Cobalt',
 '(GS) Nike Dunk Low SE Free.99 Black',
 '(GS) Nike Dunk Low SE Free.99 White',
 '(GS) Nike Dunk Low SE Next Nature Sequoia Olive',
 '(GS) Nike Dunk Low SE Sail Multi Camo',
 '(GS) Nike Dunk Low Ueno Panda',
 '(GS) Nike Dunk Low University Blue',
 '(GS) Nike Dunk Low Varsity Maize',
 '(GS) Nike Dunk Low White Bone Peach Aqua',
 '(GS) Nike x NBA Dunk Low Chicago',
 '(GS) Nike x NBA Dunk Low Nets',
 '(PS) Nike Dunk Low Black',
 '(PS) Nike Dunk Low Bright Crimson Game Royal',
 '(PS) Nike Dunk Low Championship Grey',
 '(PS) Nike Dunk Low Gypsy Rose',
 '(PS) Nike Dunk Low Orange Pearl',
 '(PS) Nike Dunk Low PRM Halloween',
 '(PS) Nike D

In [173]:
dataset['product'].value_counts().head(20)

Nike Dunk Low Retro Black                    18150
Nike Dunk Low SE Seoul                        7758
(W) Nike Dunk Low Black                       6700
Nike Dunk Low Retro University Blue           4450
(W) Nike Dunk Low Light Bone                  4400
Nike x Kasina Dunk Low 80's Bus               4345
(GS) Nike Dunk Low Black                      4100
Nike Dunk Low Retro Varsity Green             4050
Nike x Off-White Dunk Low The 50 - Lot 18     3800
Nike Dunk Low Retro Medium Grey               3107
Nike x Off-White Dunk Low The 50 - Lot 46     3100
Nike Dunk Low Retro Championship Red          3000
(PS) Nike Dunk Low Black                      2985
Nike SB Dunk Low Pro Chicago                  2900
(W) Nike Dunk Low Next Nature Pale Coral      2837
(W) Nike Dunk Low Triple White                2767
Nike Dunk Low Retro Varsity Maize             2750
(W) Nike Dunk Low Next Nature Sail            2594
Nike Dunk Low SP Community Garden             2161
Nike Dunk Low Retro Grey Fog   

In [174]:
dataset.drop_duplicates()['product'].value_counts().head(20)

Nike Dunk Low Retro Black                              8290
Nike Dunk Low SE Seoul                                 4480
(W) Nike Dunk Low Black                                4120
Nike x Kasina Dunk Low 80's Bus                        3710
Nike Dunk Low Retro University Blue                    3140
(GS) Nike Dunk Low Black                               2843
(W) Nike Dunk Low Light Bone                           2799
Nike SB Dunk Low Pro Chicago                           2789
(PS) Nike Dunk Low Black                               2463
Nike Dunk Low Retro Varsity Green                      2424
Nike x Off-White Dunk Low The 50 - Lot 18              2377
Nike Dunk Low Retro Medium Grey                        2293
Nike Dunk Low Retro Championship Red                   2165
(W) Nike Dunk Low Triple White                         1939
Nike x Off-White Dunk Low The 50 - Lot 46              1792
Nike x Kasina Dunk Low Road Sign - Kasina Exclusive    1773
(W) Nike Dunk Low Next Nature Pale Coral

In [175]:
# csv 파일로 저장 

dataset.to_csv('data/nike_dunk_low(pre).csv')