# 1. 기초세팅

## 1-1) 필요 라이브러리 import

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', 100)

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm # color map

plt.style.use('ggplot')
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

import datetime
from datetime import datetime
import calendar

from scipy import stats
from scipy.stats import skew, norm, probplot, boxcox, kurtosis

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_score

import math


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## 1-2) 데이터 로드

+ Onlinesales_info : 2019년도 한해 동안 발생한 온라인 거래와 관련된 정보
+ Customer_info : 고객 관련 정보
+ Discount_info : 할인 관련 정보
+ Marketing_info : 마케팅 비용 관련 정보
+ Tax_info : 세금 관련 정보

In [2]:
online_sales = pd.read_csv('data/Onlinesales_info.csv')
discount_coupon = pd.read_csv('data/Discount_info.csv')
customer_data = pd.read_csv('data/customer_info.csv')
marketing_spend = pd.read_csv('data/marketing_info.csv')
tax_amount = pd.read_csv('data/Tax_info.csv')

## 2. EDA
### <p style="color: rgb(153, 204, 255);"> 2-1. 데이터 기본정보 확인</p>
+ 타입 확인
+ 갯수 확인
+ 중복값 확인
+ 결측치 확인
+ 결측치 비율 확인
+ 고유값 확인

In [3]:
def str_summary(df, pred=None):
    obs = df.shape[0]
    types = df.dtypes
    counts = df.apply(lambda x: x.count())
    uniques = df.apply(lambda x: [x.unique()]).T.squeeze() # 각 feature의 고유값을 시리즈 타입으로 반환
    nulls = df.apply(lambda x: x.isnull().sum())
    distincts = df.apply(lambda x: x.unique().shape[0]) # 각 feature의 고유값 개수
    missing_ratio = (df.isnull().sum() / obs) * 100
    
    print("Data shape: ", df.shape)

    cols = ['Types', 'Counts', 'Distincts', 'Nulls', 'Missing_ratio', 'Uniques']
    structure = pd.concat([types, counts, distincts, nulls, missing_ratio, uniques], axis=1, sort=True)

    structure.columns = cols

    print("================================================")
    print("Data types: ")
    print(structure['Types'].value_counts())
    print("================================================")
    print("\n\n")

    return structure

In [4]:
data_summary_1 = str_summary(online_sales)
data_summary_2 = str_summary(discount_coupon)
data_summary_3 = str_summary(customer_data)
data_summary_4 = str_summary(marketing_spend)
data_summary_5 = str_summary(tax_amount)

Data shape:  (52924, 9)
Data types: 
Types
object     6
float64    2
int64      1
Name: count, dtype: int64



Data shape:  (204, 4)
Data types: 
Types
object    3
int64     1
Name: count, dtype: int64



Data shape:  (1468, 4)
Data types: 
Types
object    3
int64     1
Name: count, dtype: int64



Data shape:  (365, 3)
Data types: 
Types
object     1
int64      1
float64    1
Name: count, dtype: int64



Data shape:  (20, 2)
Data types: 
Types
float64    1
object     1
Name: count, dtype: int64





In [5]:
data_summary_2

Unnamed: 0,Types,Counts,Distincts,Nulls,Missing_ratio,Uniques
월,object,204,12,0,0.0,"[Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec]"
제품카테고리,object,204,17,0,0.0,"[Apparel, Nest-USA, Office, Drinkware, Lifestyle, Bags, Notebooks, Headgear, Nest, Waze, Bottles..."
쿠폰코드,object,204,48,0,0.0,"[SALE10, SALE20, SALE30, ELEC10, ELEC20, ELEC30, OFF10, OFF20, OFF30, EXTRA10, EXTRA20, EXTRA30,..."
할인율,int64,204,3,0,0.0,"[10, 20, 30]"


5개 데이터셋 모두 결측치가 포함되어 있지 않다. 

online_sales 특이사항
+ [고객ID]는 중복값이 포함되어 있다. 즉, 한명의 고객이 여러번 구매했다.
+ [거래ID]는 중복값이 포함되어 있고, 하나의 거래 ID에 서로 다른 고객 ID가 포함되어 있다. 그리고 중복 거래ID는 모두 같은 날자에 발생하였다. 하지만 같은 [거래날짜]라고 [거래ID] 같은 것은 아니다. 하루 안에서 비슷한 시간대?에 구매가 이뤄졌거나 다른 이유가 있을 것 같다. (공구 같은 서비스가 있었나..?)
+ [쿠폰상태]의 Clicked는 쿠폰에 관심이 있어 클릭했지만 결제까지 연결되지 못한 데이터라고 판단됨.

In [6]:
online_sales['거래ID'].value_counts()

거래ID
Transaction_12261    35
Transaction_4716     30
Transaction_19047    29
Transaction_13487    28
Transaction_16759    27
                     ..
Transaction_10305     1
Transaction_10306     1
Transaction_10309     1
Transaction_10312     1
Transaction_25060     1
Name: count, Length: 25061, dtype: int64

In [7]:
# 같은 거래ID인 데이터 살펴보기
display(online_sales[online_sales['거래ID']=='Transaction_12261'].sample(10))
display(online_sales[online_sales['거래ID']=='Transaction_4716'].sample(10))
display(online_sales[online_sales['거래ID']=='Transaction_13487'].sample(10))

Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태
26222,USER_0118,Transaction_12261,2019-07-13,Product_0169,Apparel,21,13.29,6.0,Used
26210,USER_0050,Transaction_12261,2019-07-13,Product_0155,Apparel,26,11.89,6.0,Clicked
26223,USER_0118,Transaction_12261,2019-07-13,Product_0170,Apparel,9,18.99,6.0,Not Used
26233,USER_1391,Transaction_12261,2019-07-13,Product_0729,Apparel,11,16.79,6.0,Used
26240,USER_1241,Transaction_12261,2019-07-13,Product_0747,Apparel,7,23.99,6.0,Clicked
26227,USER_0118,Transaction_12261,2019-07-13,Product_0723,Apparel,22,26.59,6.0,Used
26235,USER_1391,Transaction_12261,2019-07-13,Product_0734,Apparel,29,26.59,6.0,Used
26237,USER_1391,Transaction_12261,2019-07-13,Product_0739,Apparel,76,23.79,6.0,Used
26220,USER_0118,Transaction_12261,2019-07-13,Product_0167,Apparel,16,13.29,6.0,Clicked
26216,USER_0050,Transaction_12261,2019-07-13,Product_0163,Apparel,18,11.89,6.0,Clicked


Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태
9369,USER_0187,Transaction_4716,2019-03-15,Product_0914,Office,20,0.79,6.5,Clicked
9370,USER_0187,Transaction_4716,2019-03-15,Product_0915,Lifestyle,5,2.8,6.5,Used
9361,USER_0187,Transaction_4716,2019-03-15,Product_0880,Drinkware,10,2.39,6.5,Clicked
9367,USER_0187,Transaction_4716,2019-03-15,Product_0900,Drinkware,1,10.39,6.5,Clicked
9366,USER_0187,Transaction_4716,2019-03-15,Product_0894,Drinkware,5,2.39,6.5,Not Used
9380,USER_1178,Transaction_4716,2019-03-15,Product_0966,Office,15,1.59,6.5,Used
9377,USER_0187,Transaction_4716,2019-03-15,Product_0940,Office,20,0.79,6.5,Clicked
9373,USER_0187,Transaction_4716,2019-03-15,Product_0918,Lifestyle,5,2.8,6.5,Not Used
9375,USER_0187,Transaction_4716,2019-03-15,Product_0930,Office,5,1.59,6.5,Clicked
9355,USER_0187,Transaction_4716,2019-03-15,Product_0388,Apparel,1,15.99,6.5,Not Used


Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태
29486,USER_0350,Transaction_13487,2019-07-30,Product_0040,Apparel,2,6.0,6.0,Used
29506,USER_0205,Transaction_13487,2019-07-30,Product_0942,Office,20,2.5,6.0,Clicked
29492,USER_0312,Transaction_13487,2019-07-30,Product_0250,Apparel,3,6.0,6.0,Clicked
29494,USER_0312,Transaction_13487,2019-07-30,Product_0442,Apparel,2,5.1,6.0,Clicked
29505,USER_0312,Transaction_13487,2019-07-30,Product_0880,Drinkware,10,2.99,6.0,Used
29503,USER_0312,Transaction_13487,2019-07-30,Product_0843,Bags,14,13.99,6.0,Clicked
29483,USER_0350,Transaction_13487,2019-07-30,Product_0037,Apparel,2,6.0,6.0,Clicked
29482,USER_0350,Transaction_13487,2019-07-30,Product_0036,Apparel,2,6.0,6.0,Clicked
29497,USER_0312,Transaction_13487,2019-07-30,Product_0534,Apparel,2,5.7,6.0,Clicked
29491,USER_0312,Transaction_13487,2019-07-30,Product_0249,Apparel,3,6.0,6.0,Clicked


In [8]:
# 같은 날짜에 발생한 것들은 같은 거래ID를 가질까? 
online_sales[online_sales['거래날짜']=='2019-07-30']

Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태
29437,USER_0350,Transaction_13466,2019-07-30,Product_0976,Nest-USA,3,119.00,6.00,Clicked
29438,USER_0350,Transaction_13466,2019-07-30,Product_0983,Nest-USA,2,119.00,6.00,Used
29439,USER_0350,Transaction_13467,2019-07-30,Product_0854,Bags,1,4.99,6.00,Not Used
29440,USER_0350,Transaction_13468,2019-07-30,Product_0983,Nest-USA,2,119.00,19.99,Clicked
29441,USER_0350,Transaction_13469,2019-07-30,Product_0984,Nest-USA,1,79.00,6.00,Not Used
...,...,...,...,...,...,...,...,...,...
29563,USER_1411,Transaction_13510,2019-07-30,Product_0448,Apparel,1,5.10,6.00,Clicked
29564,USER_1411,Transaction_13510,2019-07-30,Product_0536,Apparel,1,5.70,6.00,Used
29565,USER_1411,Transaction_13510,2019-07-30,Product_0537,Apparel,1,5.70,6.00,Clicked
29566,USER_1411,Transaction_13511,2019-07-30,Product_0981,Nest-USA,1,149.00,6.00,Used


## 2-2) 데이터 통합을 위한 전처리

In [9]:
df = online_sales.copy()

# customers_data 통합
df['거래날짜'] = pd.to_datetime(df['거래날짜']) # datetime으로 형 변환
df  = df.merge(customer_data, on='고객ID')

str_summary(df)

Data shape:  (52924, 12)
Data types: 
Types
object            7
int64             2
float64           2
datetime64[ns]    1
Name: count, dtype: int64





Unnamed: 0,Types,Counts,Distincts,Nulls,Missing_ratio,Uniques
가입기간,int64,52924,49,0,0.0,"[12, 43, 33, 30, 49, 32, 46, 24, 40, 10, 19, 14, 25, 50, 39, 21, 29, 26, 41, 28, 15, 18, 31, 27,..."
거래ID,object,52924,25061,0,0.0,"[Transaction_0000, Transaction_0001, Transaction_0002, Transaction_0003, Transaction_0004, Trans..."
거래날짜,datetime64[ns],52924,365,0,0.0,"[2019-01-01 00:00:00, 2019-01-02 00:00:00, 2019-01-03 00:00:00, 2019-01-04 00:00:00, 2019-01-05 ..."
고객ID,object,52924,1468,0,0.0,"[USER_1358, USER_0190, USER_0066, USER_0345, USER_0683, USER_0730, USER_0585, USER_1347, USER_07..."
고객지역,object,52924,5,0,0.0,"[Chicago, California, New York, New Jersey, Washington DC]"
배송료,float64,52924,267,0,0.0,"[6.5, 102.79, 28.78, 8.7, 20.0, 17.96, 24.47, 35.96, 18.47, 74.74, 35.3, 20.78, 13.78, 122.74, 1..."
성별,object,52924,2,0,0.0,"[남, 여]"
수량,int64,52924,151,0,0.0,"[1, 5, 15, 52, 31, 2, 3, 26, 10, 4, 6, 57, 103, 62, 12, 41, 30, 21, 206, 516, 14, 155, 34, 258, ..."
제품ID,object,52924,1145,0,0.0,"[Product_0981, Product_0904, Product_0203, Product_0848, Product_0854, Product_0880, Product_088..."
제품카테고리,object,52924,20,0,0.0,"[Nest-USA, Office, Apparel, Bags, Drinkware, Lifestyle, Notebooks & Journals, Headgear, Waze, Fu..."


In [10]:
str_summary(discount_coupon)

Data shape:  (204, 4)
Data types: 
Types
object    3
int64     1
Name: count, dtype: int64





Unnamed: 0,Types,Counts,Distincts,Nulls,Missing_ratio,Uniques
월,object,204,12,0,0.0,"[Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec]"
제품카테고리,object,204,17,0,0.0,"[Apparel, Nest-USA, Office, Drinkware, Lifestyle, Bags, Notebooks, Headgear, Nest, Waze, Bottles..."
쿠폰코드,object,204,48,0,0.0,"[SALE10, SALE20, SALE30, ELEC10, ELEC20, ELEC30, OFF10, OFF20, OFF30, EXTRA10, EXTRA20, EXTRA30,..."
할인율,int64,204,3,0,0.0,"[10, 20, 30]"


In [11]:
discount_coupon.sample(15)

Unnamed: 0,월,제품카테고리,쿠폰코드,할인율
90,Apr,Accessories,ACC10,10
26,Mar,Nest,NE30,30
49,Feb,Android,AND20,20
153,Oct,Apparel,SALE10,10
160,Nov,Office,OFF20,20
61,May,Drinkware,EXTRA20,20
131,Sep,Waze,WEMP30,30
88,May,Housewares,HOU20,20
62,Jun,Drinkware,EXTRA30,30
39,Jan,Accessories,ACC10,10


In [12]:
# discount_data 통합
df['월'] = df['거래날짜'].dt.month
month_dict = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}

discount_coupon['월'] = discount_coupon['월'].map(month_dict)
df = df.merge(discount_coupon, on=['월', '제품카테고리'], how='left')

In [13]:
df.sample(10)

Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태,성별,고객지역,가입기간,월,쿠폰코드,할인율
13797,USER_0864,Transaction_6645,2019-04-16,Product_0273,Apparel,1,15.99,6.0,Not Used,남,Chicago,4,4,SALE10,10.0
41214,USER_1121,Transaction_18432,2019-10-07,Product_0922,Apparel,1,10.99,6.0,Clicked,여,New Jersey,37,10,SALE10,10.0
36116,USER_0664,Transaction_16061,2019-09-01,Product_1034,Apparel,1,15.19,6.0,Used,남,Chicago,10,9,SALE30,30.0
12301,USER_0124,Transaction_6008,2019-04-06,Product_0179,Drinkware,50,9.79,6.5,Not Used,여,Chicago,22,4,EXTRA10,10.0
47056,USER_0970,Transaction_21542,2019-11-22,Product_0969,Office,2,4.79,6.5,Clicked,남,Washington DC,6,11,OFF20,20.0
22759,USER_1432,Transaction_10669,2019-06-17,Product_0722,Apparel,1,30.39,6.0,Not Used,여,Chicago,38,6,SALE30,30.0
21175,USER_0675,Transaction_9954,2019-06-07,Product_0880,Drinkware,16,2.99,6.0,Clicked,남,California,7,6,EXTRA30,30.0
32901,USER_0675,Transaction_14826,2019-08-16,Product_0961,Notebooks & Journals,5,11.99,12.48,Used,남,California,7,8,NJ20,20.0
30401,USER_0046,Transaction_13821,2019-08-03,Product_0981,Nest-USA,1,149.0,6.0,Not Used,남,Chicago,8,8,ELEC20,20.0
13171,USER_0483,Transaction_6351,2019-04-12,Product_0184,Office,2,1.99,6.5,Used,여,Washington DC,18,4,OFF10,10.0


In [14]:
df.columns, tax_amount.columns

(Index(['고객ID', '거래ID', '거래날짜', '제품ID', '제품카테고리', '수량', '평균금액', '배송료', '쿠폰상태',
        '성별', '고객지역', '가입기간', '월', '쿠폰코드', '할인율'],
       dtype='object'),
 Index(['제품카테고리', 'GST'], dtype='object'))

In [15]:
# tax_amount 통합
df = df.merge(tax_amount, on='제품카테고리', how='outer')

In [16]:
df.sample(10)

Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태,성별,고객지역,가입기간,월,쿠폰코드,할인율,GST
12917,USER_0558,Transaction_15373,2019-08-23,Product_0152,Apparel,1,19.19,6.0,Used,여,Chicago,30,8,SALE20,20.0,0.18
36421,USER_0193,Transaction_8992,2019-05-21,Product_0981,Nest-USA,1,149.0,6.0,Not Used,여,Chicago,10,5,ELEC20,20.0,0.1
30421,USER_1218,Transaction_23847,2019-12-15,Product_0994,Nest,1,355.74,6.5,Clicked,여,California,44,12,NE30,30.0,0.05
2229,USER_0234,Transaction_3893,2019-03-02,Product_0470,Apparel,3,29.99,6.5,Used,남,Chicago,25,3,SALE30,30.0,0.18
15590,USER_0358,Transaction_18762,2019-10-11,Product_0360,Apparel,1,37.49,19.99,Clicked,남,California,37,10,SALE10,10.0,0.18
41824,USER_0544,Transaction_19457,2019-10-22,Product_0984,Nest-USA,1,79.0,12.99,Clicked,남,California,14,10,ELEC10,10.0,0.1
7971,USER_0591,Transaction_10700,2019-06-18,Product_0141,Apparel,1,13.59,6.0,Clicked,남,New York,45,6,SALE30,30.0,0.18
29008,USER_0425,Transaction_18175,2019-10-03,Product_0995,Nest,1,279.0,6.0,Clicked,남,Washington DC,39,10,NE10,10.0,0.05
12409,USER_0012,Transaction_14944,2019-08-17,Product_0228,Apparel,1,4.56,6.0,Used,여,Chicago,39,8,SALE20,20.0,0.18
36867,USER_0755,Transaction_9769,2019-06-03,Product_0976,Nest-USA,1,119.0,6.0,Not Used,남,Washington DC,24,6,ELEC30,30.0,0.1


In [17]:
df['할인율'].isnull().sum() # 이걸 하는 이유는 left 조인을 했기 때문. 

400

In [19]:
df['쿠폰코드'].isnull().sum()

400

할인은 쿠폰을 사용해야 받을 수 있다. 너무 당연한 사실..

In [18]:
df[df['할인율'].isnull()]

Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태,성별,고객지역,가입기간,월,쿠폰코드,할인율,GST
18403,USER_0585,Transaction_0046,2019-01-02,Product_0863,Backpacks,1,103.15,6.50,Clicked,여,New York,46,1,,,0.10
18404,USER_1358,Transaction_1109,2019-01-17,Product_0863,Backpacks,1,82.52,13.78,Not Used,남,Chicago,12,1,,,0.10
18405,USER_0499,Transaction_1983,2019-01-30,Product_0863,Backpacks,1,82.52,20.74,Not Used,남,New Jersey,4,1,,,0.10
18406,USER_1104,Transaction_2173,2019-02-02,Product_0861,Backpacks,1,99.99,6.50,Clicked,남,Chicago,12,2,,,0.10
18407,USER_1104,Transaction_2173,2019-02-02,Product_0863,Backpacks,1,99.99,6.50,Used,남,Chicago,12,2,,,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28575,USER_1374,Transaction_8818,2019-05-19,Product_0850,More Bags,1,21.99,6.00,Used,여,Chicago,27,5,,,0.18
28576,USER_0578,Transaction_9014,2019-05-22,Product_0850,More Bags,1,17.59,6.00,Clicked,여,California,30,5,,,0.18
28577,USER_0994,Transaction_9049,2019-05-22,Product_0850,More Bags,1,17.59,51.99,Used,여,New York,46,5,,,0.18
28578,USER_0539,Transaction_9083,2019-05-23,Product_0850,More Bags,1,21.99,6.00,Clicked,여,New York,15,5,,,0.18


In [21]:
# 쿠폰상태가 used인 상태에만 할인적용율
df.loc[df['쿠폰상태'] != 'Used', '할인율'] = 0.0

In [41]:
df[df['할인율'].isnull()==True]['할인율'].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['할인율'].isnull()==True]['할인율'].fillna(0, inplace=True)


In [44]:
df['할인율'] = df['할인율'].fillna(0.0)

In [29]:
df[df.isna().any(axis=1)] # 어디든 Null이 존재하는 데이터만 살펴보기. 

Unnamed: 0,고객ID,거래ID,거래날짜,제품ID,제품카테고리,수량,평균금액,배송료,쿠폰상태,성별,고객지역,가입기간,월,쿠폰코드,할인율,GST
18403,USER_0585,Transaction_0046,2019-01-02,Product_0863,Backpacks,1,103.15,6.50,Clicked,여,New York,46,1,,0.0,0.10
18404,USER_1358,Transaction_1109,2019-01-17,Product_0863,Backpacks,1,82.52,13.78,Not Used,남,Chicago,12,1,,0.0,0.10
18405,USER_0499,Transaction_1983,2019-01-30,Product_0863,Backpacks,1,82.52,20.74,Not Used,남,New Jersey,4,1,,0.0,0.10
18406,USER_1104,Transaction_2173,2019-02-02,Product_0861,Backpacks,1,99.99,6.50,Clicked,남,Chicago,12,2,,0.0,0.10
18407,USER_1104,Transaction_2173,2019-02-02,Product_0863,Backpacks,1,99.99,6.50,Used,남,Chicago,12,2,,,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28575,USER_1374,Transaction_8818,2019-05-19,Product_0850,More Bags,1,21.99,6.00,Used,여,Chicago,27,5,,,0.18
28576,USER_0578,Transaction_9014,2019-05-22,Product_0850,More Bags,1,17.59,6.00,Clicked,여,California,30,5,,0.0,0.18
28577,USER_0994,Transaction_9049,2019-05-22,Product_0850,More Bags,1,17.59,51.99,Used,여,New York,46,5,,,0.18
28578,USER_0539,Transaction_9083,2019-05-23,Product_0850,More Bags,1,21.99,6.00,Clicked,여,New York,15,5,,0.0,0.18


In [45]:
df[df.isna().any(axis=1)].apply(lambda x: x.isnull().sum())

고객ID        0
거래ID        0
거래날짜        0
제품ID        0
제품카테고리      0
수량          0
평균금액        0
배송료         0
쿠폰상태        0
성별          0
고객지역        0
가입기간        0
월           0
쿠폰코드      400
할인율         0
GST         0
dtype: int64

In [46]:
df['쿠폰코드'] = df['쿠폰코드'].fillna('NULL')

In [47]:
df.isnull().sum()

고객ID      0
거래ID      0
거래날짜      0
제품ID      0
제품카테고리    0
수량        0
평균금액      0
배송료       0
쿠폰상태      0
성별        0
고객지역      0
가입기간      0
월         0
쿠폰코드      0
할인율       0
GST       0
dtype: int64