# 와인 분석

In [None]:
import os
import numpy as np
import pandas as pd
from plt_rcs import *
import hds

In [None]:
# Load the white-wine dataset from the short URL; the file is semicolon-separated.
df = pd.read_csv('https://bit.ly/WhiteWine_Data', sep=';')

In [None]:
# Preview the first few rows of the raw data.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics per column, rounded to three decimals.
df.describe().round(3)

In [None]:
# Frequency of each quality score, ordered by score.
# The commented lines below are the recorded output of this cell.
df['quality'].value_counts().sort_index()
# quality
# 3      20
# 4     163
# 5    1457
# 6    2198
# 7     880
# 8     175
# 9       5
# Name: count, dtype: int64

In [None]:
# List the column names.
df.columns

In [None]:
# Ordinal -> nominal: collapse the quality score into a binary target,
# 1 when quality is 7 or higher and 0 otherwise.
df['grade'] = (df['quality'] >= 7).astype(int)

In [None]:
# Remove the columns that are no longer needed: one input variable and the
# original quality score that 'grade' was derived from.
df = df.drop(labels=['free sulfur dioxide', 'quality'], axis='columns')

In [None]:
# Preview the frame again after adding 'grade' and dropping columns.
df.head()

## 목표변수 도수 확인

In [None]:
# Set the default figure size to 4x4 for the plots that follow.
# NOTE(review): `plt` is not imported by name here; presumably it comes from
# the `plt_rcs` star import — confirm.
plt.rc(group='figure', figsize=(4, 4))

In [None]:
# Plot the target variable 'grade' with hds.plot.bar_freq
# (presumably a bar chart of class frequencies — confirm against the hds docs).
hds.plot.bar_freq(data=df, x='grade')

## 연속형 변수 간 상관관계 확인

In [None]:
# Correlation heatmap over the columns of df, with a small font size.
# NOTE(review): df still contains the binary 'grade' column at this point —
# confirm it is meant to be part of the heatmap.
hds.plot.corr_heatmap(data=df, fontsize=7)

## 입력 변수와 관계 확인

In [None]:
# Call hds.plot.box_group once per input variable, with the target 'grade'
# on the x axis and the input variable on the y axis.
# Select every column except the target by name. Indexing the column list
# with [-1] would return the single string 'grade', and looping over a string
# yields its characters ('g', 'r', ...) rather than column names.
input_vars = [col for col in df.columns if col != 'grade']
for var_name in input_vars:
    hds.plot.box_group(data=df, x='grade', y=var_name, palette=['skyblue', 'orange'])
    # Render each figure before the next one is drawn.
    plt.show()

## 특성 행렬과 타겟 벡터로 분리

In [None]:
# Name of the target column.
yvar = 'grade'

# Target vector: an independent copy of the target column.
y = df[yvar].copy()
# Feature matrix: every remaining column.
X = df.drop(columns=yvar)

# Show both objects; display() is used so that each one is rendered.
display(X)
display(y)

## 실습 데이터셋 분할

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Report the (rows, columns) shape of each split.
# Both values go through display() explicitly, matching how X and y are shown
# earlier in this notebook: by default a cell only renders its final bare
# expression, so the first shape would otherwise not appear.
display(X_train.shape)
# (3428, 10)
display(X_valid.shape)
# (1470, 10)

In [None]:
# Check the relative class frequencies of the target in each split.
# Both tables go through display() explicitly, matching how X and y are shown
# earlier in this notebook: by default a cell only renders its final bare
# expression, so the training-set table would otherwise not appear.
display(y_train.value_counts(normalize=True).sort_index())
# grade
# 0    0.782089
# 1    0.217911
# Name: proportion, dtype: float64
display(y_valid.value_counts(normalize=True).sort_index())
# grade
# 0    0.787075
# 1    0.212925
# Name: proportion, dtype: float64

## 외부 파일로 저장

In [None]:
# Show the current working directory before changing it.
os.getcwd()

In [None]:
# Switch to the data directory. The path is relative, so the destination
# depends on the current working directory; re-running this cell resolves the
# path against the new location rather than the original one.
os.chdir('../../data')

In [None]:
# Bundle the four split objects under their own names so they can be saved
# and restored together.
objs = dict(
    X_train=X_train,
    X_valid=X_valid,
    y_train=y_train,
    y_valid=y_valid,
)

In [None]:
# Serialize the bundle of splits to 'WhiteWine.pkl'; the file name is
# relative, so it is written to the current working directory.
pd.to_pickle(obj=objs, filepath_or_buffer='WhiteWine.pkl')