## 와인 품질 데이터
- 와인 성분 변수들을 이용하여 품질을 예측하는 유명한 데이터 <br> <br>

##### 사족
- pd.set_option을 설정하면 한번에 보이는 로우와 칼럼수를 무제한으로 바꿀 수 있습니다
- 칼럼이 중간에 끊겨 보이는 현상을 예방할 수 있습니다
- 그렇지만 데이터가 크면 다 보여주려다가 주피터 노트북이 뻑날 수 있으니 head()를 사용하는 습관 필수

In [39]:
import warnings

import numpy as np
import pandas as pd

# Lift pandas' display limits so wide or long frames are shown without truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

#### 데이터 불러오기 및 앞부분 확인

In [20]:
# Read the wine data set from the working directory and preview its first five rows.
data = pd.read_csv('wine.csv')
data.head(n=5)

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,white
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,white


#### 데이터 shape 확인

In [21]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(5497, 14)

#### 컬럼명 확인

In [12]:
# Column labels of the loaded frame.
data.columns

Index(['index', 'quality', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'type'],
      dtype='object')

#### NaN, 데이터 타입, 개수 확인

In [17]:
# Print per-column non-null counts and dtypes, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5497 entries, 0 to 5496
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 5497 non-null   int64  
 1   quality               5497 non-null   int64  
 2   fixed acidity         5497 non-null   float64
 3   volatile acidity      5497 non-null   float64
 4   citric acid           5497 non-null   float64
 5   residual sugar        5497 non-null   float64
 6   chlorides             5497 non-null   float64
 7   free sulfur dioxide   5497 non-null   float64
 8   total sulfur dioxide  5497 non-null   float64
 9   density               5497 non-null   float64
 10  pH                    5497 non-null   float64
 11  sulphates             5497 non-null   float64
 12  alcohol               5497 non-null   float64
 13  type                  5497 non-null   object 
dtypes: float64(11), int64(2), object(1)
memory usage: 601.4+ KB


#### 통계값 확인

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0
mean,2748.0,5.818992,7.210115,0.338163,0.318543,5.438075,0.055808,30.417682,115.566491,0.994673,3.219502,0.530524,10.504918
std,1586.991546,0.870311,1.287579,0.163224,0.145104,4.756676,0.034653,17.673881,56.288223,0.003014,0.160713,0.149396,1.194524
min,0.0,3.0,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.74,0.22,8.0
25%,1374.0,5.0,6.4,0.23,0.25,1.8,0.038,17.0,78.0,0.9923,3.11,0.43,9.5
50%,2748.0,6.0,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.9948,3.21,0.51,10.3
75%,4122.0,6.0,7.7,0.4,0.39,8.1,0.064,41.0,155.0,0.99693,3.32,0.6,11.3
max,5496.0,9.0,15.9,1.58,1.66,65.8,0.61,289.0,440.0,1.03898,4.01,2.0,14.9


#### quality의 unique 확인

In [22]:
# Distinct quality scores, in order of first appearance.
data['quality'].unique()

array([5, 6, 7, 8, 4, 3, 9])

#### quality별로 개수 확인

In [28]:
# Number of wines per quality score, most frequent first.
data['quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

#### 중요하다고 생각되는 부분만 추출

In [41]:
# Keep only the columns used in the rest of the analysis. The explicit .copy()
# makes data_sample an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning and can never write through to `data`.
data_sample = data[['quality', 'residual sugar', 'pH', 'alcohol', 'type']].copy()
data_sample.head()

Unnamed: 0,quality,residual sugar,pH,alcohol,type
0,5,6.8,3.44,10.2,white
1,5,2.4,3.19,9.5,red
2,5,2.0,3.05,10.9,white
3,6,6.0,3.26,10.8,white
4,6,9.5,3.04,10.9,white


#### groupby를 통해 quality로 묶고 통계값(min,mean,max) 확인

In [42]:
# Aggregate only the numeric measurements. data_sample also carries the string
# column 'type'; pandas >= 2.0 raises TypeError when asked for its mean instead
# of silently dropping the column, so the numeric columns are selected explicitly.
data_sample.groupby("quality")[["residual sugar", "pH", "alcohol"]].agg(["min", "mean", "max"])

Unnamed: 0_level_0,residual sugar,residual sugar,residual sugar,pH,pH,pH,alcohol,alcohol,alcohol
Unnamed: 0_level_1,min,mean,max,min,mean,max,min,mean,max
quality,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
3,1.15,5.203846,16.2,2.87,3.250385,3.63,8.0,10.251923,12.6
4,0.7,4.14543,15.4,2.74,3.228602,3.9,8.6,10.180376,13.5
5,0.6,5.792422,23.5,2.79,3.213384,3.79,8.0,9.846875,14.9
6,0.7,5.535141,65.8,2.74,3.219611,4.01,8.4,10.606015,14.0
7,0.9,4.748918,19.25,2.84,3.228333,3.82,8.6,11.383864,14.2
8,0.8,5.581579,14.8,2.94,3.216711,3.57,8.8,11.680921,14.0
9,1.6,4.12,10.6,3.2,3.308,3.41,10.4,12.18,12.9


#### groupby를 통해 quality/type로 묶고 개수 세기

In [75]:
# Row count for every (quality, type) combination present in the data.
data_sample.groupby(by=["quality", "type"]).size()

quality  type 
3        red         9
         white      17
4        red        41
         white     145
5        red       571
         white    1217
6        red       535
         white    1881
7        red       169
         white     755
8        red        13
         white     139
9        white       5
dtype: int64

#### 알코올 함량을 기준으로 오름차순 정리 (20개)

In [43]:
# The 20 wines with the lowest alcohol content.
data_sample.sort_values(by='alcohol', ascending=True).head(n=20)

Unnamed: 0,quality,residual sugar,pH,alcohol,type
4254,3,5.1,3.42,8.0,white
5077,5,0.95,3.34,8.0,white
1768,3,2.1,3.16,8.4,red
4176,5,3.3,3.16,8.4,white
479,6,1.8,2.86,8.4,red
3725,5,20.15,3.01,8.5,white
3772,6,18.0,2.98,8.5,white
172,6,18.0,2.98,8.5,white
255,5,1.6,3.15,8.5,red
3062,5,9.1,3.37,8.5,white


#### 알코올 함량을 기준으로 내림차순 정리 (20개)

In [44]:
# The 20 wines with the highest alcohol content.
data_sample.sort_values(by='alcohol', ascending=False).head(n=20)

Unnamed: 0,quality,residual sugar,pH,alcohol,type
2787,5,7.5,2.98,14.9,red
5340,7,1.6,3.12,14.2,white
4964,7,8.4,3.26,14.05,white
4059,6,1.8,3.68,14.0,red
3005,7,1.9,3.21,14.0,white
4150,7,2.6,3.39,14.0,white
1963,6,2.6,3.32,14.0,red
984,8,1.2,3.33,14.0,white
876,7,2.1,3.71,14.0,red
2142,6,1.8,3.68,14.0,red


#### quality가 7 이상인 데이터 추출 (20개)

In [45]:
# First 20 rows whose quality score is 7 or higher.
data_sample.loc[data_sample['quality'] >= 7].head(n=20)

Unnamed: 0,quality,residual sugar,pH,alcohol,type
11,7,1.1,3.32,10.9,white
12,7,6.85,3.03,11.9,white
17,7,4.8,3.17,12.2,white
25,8,3.9,3.24,12.6,white
45,7,1.4,3.22,11.5,white
46,7,12.8,3.14,9.1,white
53,7,1.8,3.08,13.7,white
54,8,6.1,3.08,12.5,white
55,7,14.6,3.21,8.6,white
60,7,3.6,3.31,13.1,white


#### alcohol을 기준으로 10도가 넘으면 'strong' 아니면 'weak'으로 표시 (lambda 쓰세요)

In [46]:
# Tag each wine by alcohol content: above 10 -> 'strong', otherwise 'weak'.
data_sample['alcohol_intensity'] = data_sample['alcohol'].apply(
    lambda abv: 'strong' if abv > 10 else 'weak'
)

data_sample.head(n=5)

Unnamed: 0,quality,residual sugar,pH,alcohol,type,alcohol_intensity
0,5,6.8,3.44,10.2,white,strong
1,5,2.4,3.19,9.5,red,weak
2,5,2.0,3.05,10.9,white,strong
3,6,6.0,3.26,10.8,white,strong
4,6,9.5,3.04,10.9,white,strong


#### merge 실습을 위해 데이터를 인위적으로 나눠볼게요

In [60]:
# Build two partially overlapping subsets for the merge exercise:
# df1 holds quality 5-7 rows, df2 holds quality 7-9 rows (7 appears in both).
df1 = data.loc[
    data['quality'].isin([5, 6, 7]),
    ['quality', 'fixed acidity', 'citric acid'],
]
df2 = data.loc[
    data['quality'].isin([7, 8, 9]),
    ['quality', 'volatile acidity', 'chlorides', 'alcohol'],
]

In [61]:
# Preview the first five rows of df1.
df1.head()

Unnamed: 0,quality,fixed acidity,citric acid
0,5,5.6,0.06
1,5,8.8,0.14
2,5,7.9,0.39
3,6,7.0,0.31
4,6,7.8,0.26


In [62]:
# Preview the first five rows of df2.
df2.head()

Unnamed: 0,quality,volatile acidity,chlorides,alcohol
11,7,0.16,0.057,10.9
12,7,0.33,0.038,11.9
17,7,0.28,0.029,12.2
25,8,0.29,0.027,12.6
45,7,0.305,0.047,11.5


#### quality를 기준으로 inner join

In [67]:
# Inner join on 'quality': only quality values found in both frames are kept,
# and each df1 row is paired with every df2 row that shares its quality.
df3 = pd.merge(left=df1, right=df2, on='quality', how='inner')
df3.head(n=10)

Unnamed: 0,quality,fixed acidity,citric acid,volatile acidity,chlorides,alcohol
0,7,6.2,0.33,0.16,0.057,10.9
1,7,6.2,0.33,0.33,0.038,11.9
2,7,6.2,0.33,0.28,0.029,12.2
3,7,6.2,0.33,0.305,0.047,11.5
4,7,6.2,0.33,0.19,0.053,9.1
5,7,6.2,0.33,0.29,0.036,13.7
6,7,6.2,0.33,0.22,0.044,8.6
7,7,6.2,0.33,0.29,0.026,13.1
8,7,6.2,0.33,0.28,0.093,12.4
9,7,6.2,0.33,0.1,0.041,10.3


#### quality를 기준으로 outer join

In [70]:
# Outer join on 'quality': rows from both frames are kept, and columns with no
# match on the other side are filled with NaN.
df4 = pd.merge(left=df1, right=df2, on='quality', how='outer')
df4.head(n=5)

Unnamed: 0,quality,fixed acidity,citric acid,volatile acidity,chlorides,alcohol
0,5,5.6,0.06,,,
1,5,8.8,0.14,,,
2,5,7.9,0.39,,,
3,5,6.1,0.49,,,
4,5,6.8,0.31,,,


### 끝

(넘파이/판다스 처음 해본 신입기수들에게)<br>
사실 이 과제 자체보단 넘파이/판다스에서 중요한 건<br>
실습 시간에 했던 코드를 "직접 손으로 쳐보면서 익숙해지는 것"이라고 생각해요<br>
그래서 실습 코드를 베끼면 쉽게 답을 찾을 수 있도록 과제를 냈으니까<br>
이 과제 자체에 연연하기 보다는 직접 손으로 쳐보면서 익숙해지기를 바랍니다<br>
눈으로 보고 shift + enter로 결과 내면서 실습을 지나치면<br>
이 뒤의 수업들도 얻어가는 게 적지 않을까 싶습니다<br><br>

코딩은 눈으로 하는 게 아니라 손으로 하는 거니까요<br>
그러니까 어떤 실습이든, 교재든, 강의든<br>
코드문을 직접 손으로 하나 하나 베껴가면서 학습하길 바라요<br> <br>

- 곧 5학년이 될 죄인 15기가,,,^^
    ＠>>----☆(~.^)/