# 과제 제출
* 해당 결과물은 pdf 와 ipynb 코드 파일로 제출
* pdf 로 다운로드 방법 : 파일 > 인쇄 > 저장
    * 코드의 출력물이 보이는 상태로 다운로드하기
* ipynb로 다운로드 방법 : 파일 > 다운로드 > ipynb 다운로드

# 과제 목표
* 머신러닝 모델로 모델링 및 예측값 도출
    * wine 데이터셋에서 red 와인인지 white 와인인지 예측하는 모델을 설계하고 예측값을 구한다.

# 필요한 package import

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

In [None]:
import warnings
# Install a blanket 'ignore' filter so that warnings are not displayed.
# NOTE(review): this also hides any warnings raised while fitting the models
# further down — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split # train, test 분류 
from sklearn.tree import DecisionTreeClassifier # 의사결정나무
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀

# 분석할 dataset 불러오기

와인 측정 데이터
* 포르투갈 서북쪽의 대서양을 맞닿고 위치한 비뉴 베르드(Vinho Verde) 지방에서 만들어진 와인을 측정한 데이터
* 레드와인 1,599개를 등급과 맛, 산도를 측정해 분석하고, 화이트 와인 샘플 4,898개를 동일방식으로 분석한 데이터

데이터 설명
* fixed acidity: 주석산 농도
* volatile acidity: 아세트산 농도 
* citric acid: 구연산 농도
* residual sugar: 잔류 당분 농도
* chlorides : 염화나트륨 농도
* free sulfur dioxide : 유리 아황산 농도
* total sulfur dioxide : 총 아황산 농도
* density : 밀도
* pH : pH
* sulphates : 황산칼륨 농도
* alcohol : 알코올 도수
* quality : 와인등급

분석 목적
* 데이터에서 red, white 와인을 판단할 수 있는 모델을 구축하려고 한다.

In [None]:
# Load the red-wine and white-wine tables; both files use ';' as the separator.
red_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
white_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
red, white = (pd.read_csv(url, sep=';') for url in (red_url, white_url))

In [None]:
# Preview the first two rows of the red-wine table.
red.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5


In [None]:
# Preview the first two rows of the white-wine table.
white.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6


In [None]:
# Print row count, column dtypes and non-null counts for the red-wine table.
red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [None]:
# Print row count, column dtypes and non-null counts for the white-wine table.
white.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


# red, white 와인 데이터셋 합치기
* 각 dataset에 'wine'이라는 column을 추가
* 각 dataset의 'wine' column에는 red는 0, white는 1을 할당함

In [None]:
# Target variable: tag every row with its source table (red -> 0, white -> 1).
for label, frame in enumerate((red, white)):
    frame['wine'] = label

In [None]:
# Stack the red rows on top of the white rows.
frames = [red, white]
df = pd.concat(frames, axis='index')
# Each table keeps its own row labels, so the combined frame has 6497 rows but
# an index running only from 0 to 4897, i.e. index values are duplicated.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  wine                  6497 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 710.6 KB


In [None]:
# Replace the duplicated row labels with a fresh 0..n-1 index, then display it.
df.index = pd.RangeIndex(len(df))
df.index

RangeIndex(start=0, stop=6497, step=1)

In [None]:
# Swap spaces for underscores in every column name, then display the frame.
df.columns = [name.replace(' ', '_') for name in df.columns]
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,1
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,1
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1


* 일반적으로 와인 퀄리티가 와인의 종류를 구분하지 않기 때문에 quality 변수 제거


In [None]:
# Remove the quality column, then display the remaining frame.
df = df.drop(columns='quality')
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,1
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1


# data split

In [None]:
seed = 1004
# Hold out 30% of the rows as the test set; the fixed seed keeps the shuffle
# repeatable across runs.
train, test = train_test_split(df, random_state=seed, test_size=0.3)

In [None]:
# Preview the first rows of the held-out test split.
test.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,wine
4443,8.0,0.22,0.31,5.6,0.049,24.0,97.0,0.993,3.1,0.42,10.9,1
1563,7.2,0.695,0.13,2.0,0.076,12.0,20.0,0.99546,3.29,0.54,10.1,0
3555,7.2,0.15,0.33,1.1,0.027,16.0,63.0,0.9937,3.37,0.4,9.9,1
4073,6.1,0.27,0.31,1.5,0.035,17.0,83.0,0.99076,3.32,0.44,11.1,1
6052,6.6,0.38,0.29,2.9,0.035,15.0,101.0,0.98916,3.04,0.37,12.5,1


In [None]:
# Split the training rows into the label column and the remaining feature columns.
train_y = train['wine']
train_x = train.drop('wine', axis=1)

In [None]:
# Preview the first two rows of the training features.
train_x.head(2)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
3821,8.0,0.45,0.28,10.8,0.051,25.0,157.0,0.9957,3.06,0.47,11.4
3117,8.4,0.23,0.49,7.8,0.035,22.0,95.0,0.9935,3.04,0.34,12.0


In [None]:
# Preview the first two training labels.
train_y.head(2)

3821    1
3117    1
Name: wine, dtype: int64

In [None]:
# Split the test rows into the label column and the remaining feature columns.
test_y = test['wine']
test_x = test.drop('wine', axis=1)

# modeling (LogisticRegression)

In [None]:
# Logistic regression with scikit-learn's default settings.
# NOTE(review): the features are unscaled and warnings are silenced earlier in
# the notebook, so a convergence warning would go unseen — confirm the fit
# actually converged.
lr = LogisticRegression()
lr.fit(X=train_x, y=train_y)

LogisticRegression()

In [None]:
# Mean accuracy of the logistic-regression model on the training split.
lr.score(train_x, train_y)

0.9804266549373213

In [None]:
# Mean accuracy of the logistic-regression model on the held-out test split.
lr.score(test_x, test_y)

0.9769230769230769

# modeling (DecisionTree)

In [None]:
# Decision tree with default hyper-parameters. random_state is pinned to the
# notebook-wide `seed` because scikit-learn randomly permutes the features at
# each split, so an unseeded tree (and its scores) can differ between runs.
tr = DecisionTreeClassifier(random_state=seed)
tr.fit(train_x, train_y)

DecisionTreeClassifier()

In [None]:
# Mean accuracy of the decision tree on the training split.
tr.score(train_x, train_y)

0.9997800747745766

In [None]:
# Mean accuracy of the decision tree on the held-out test split.
tr.score(test_x, test_y)

0.9830769230769231

# modeling(RandomForest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 20 trees, each limited to depth 5, with a fixed random state.
rf = RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    random_state=0,
)
rf.fit(X=train_x, y=train_y)

RandomForestClassifier(max_depth=5, n_estimators=20, random_state=0)

In [None]:
# Mean accuracy of the random forest on the training split.
rf.score(train_x, train_y)

0.9940620189135694

In [None]:
# Mean accuracy of the random forest on the held-out test split.
rf.score(test_x, test_y)

0.9887179487179487

# 예측값 구하기

In [None]:
# Predicted labels for the test features from the logistic-regression model;
# the bare name on the last line displays the resulting array.
predict_lr = lr.predict(test_x)
predict_lr

array([1, 0, 1, ..., 1, 1, 1])

In [None]:
# Predicted labels for the test features from the decision tree;
# the bare name on the last line displays the resulting array.
predict_tr = tr.predict(test_x)
predict_tr

array([1, 0, 1, ..., 1, 1, 1])

In [None]:
# Predicted labels for the test features from the random forest;
# the bare name on the last line displays the resulting array.
predict_rf = rf.predict(test_x)
predict_rf

array([1, 0, 1, ..., 1, 1, 1])

- 예측값

In [None]:
# Three independent copies of the test features, one per model.
test_x_lr, test_x_tr, test_x_rf = (test_x.copy() for _ in range(3))

In [None]:
# Attach the logistic-regression predictions to their copy of the test features
# and preview that same frame (the preview previously showed test_x_tr, which
# has no 'wine' column at this point).
test_x_lr['wine'] = predict_lr
test_x_lr.head(3)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
4443,8.0,0.22,0.31,5.6,0.049,24.0,97.0,0.993,3.1,0.42,10.9
1563,7.2,0.695,0.13,2.0,0.076,12.0,20.0,0.99546,3.29,0.54,10.1
3555,7.2,0.15,0.33,1.1,0.027,16.0,63.0,0.9937,3.37,0.4,9.9


In [None]:
# Attach the decision-tree and random-forest predictions to their own copies
# of the test features as a 'wine' column.
test_x_tr['wine'] = predict_tr
test_x_rf['wine'] = predict_rf

In [None]:
from sklearn.metrics import accuracy_score # 정확도 함수

# 예측값 결과 비교 확인

In [None]:
# Print the test-set accuracy of each model, with a blank line between entries.
labelled_predictions = (
    ('LogisticRegression : ', predict_lr),
    ('DecisionTree : ', predict_tr),
    ('RandomForest : ', predict_rf),
)
for position, (label, predicted) in enumerate(labelled_predictions):
    if position:
        print()
    print(label, accuracy_score(test_y, predicted))

LogisticRegression :  0.9769230769230769

DecisionTree :  0.9830769230769231

RandomForest :  0.9887179487179487


### 결과 : RandomForest가 가장 정확도가 높음