Red Wine Quality Classification
-----------------------

# 1. get dataset

In [13]:
import pandas as pd

dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df = pd.read_csv(dataset_url, sep=';')# special seperator

In [14]:
df.shape

(1599, 12)

In [15]:
df.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1594,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
1595,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
1598,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


# 2. check dataset

In [17]:
# check if null exist in dataset
df.isnull().any()
# no null data

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [20]:
# Detect missing values.
df.isna().any()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [22]:
print(df['quality'].value_counts())

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64


**imbalanace data**

# 3. RandomOverSampler for the imbalance data

In [24]:
from imblearn.over_sampling import RandomOverSampler
# create dataset
x_old_data = df.iloc[:,:-1].values  
y_old_data = df['quality'].values

In [27]:
from collections import Counter
Counter(y_old_data)# y data count, the data is imbalance

Counter({3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18})

In [28]:
ros = RandomOverSampler() 
x_new_data, y_new_data = ros.fit_sample(x_old_data, y_old_data)
from collections import Counter
Counter(y_new_data)# y data count, the data is balance

Counter({3: 681, 4: 681, 5: 681, 6: 681, 7: 681, 8: 681})

# 4. create train&test dataset

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_new_data, y_new_data, test_size=0.2,random_state=0)

In [30]:
x_train.shape, x_test.shape

((3268, 11), (818, 11))

In [33]:
from collections import Counter

Counter( y_train ), Counter( y_test )

(Counter({3: 555, 4: 554, 5: 531, 6: 536, 7: 549, 8: 543}),
 Counter({3: 126, 4: 127, 5: 150, 6: 145, 7: 132, 8: 138}))

# 5. RandomForestClassifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train, y_train)
model.score(x_test, y_test)



0.8887530562347188

# ref

* https://zhuanlan.zhihu.com/p/32553213