In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# The file uses ';' as its field separator, not the default comma.
df = pd.read_csv('./Data/winequality-white.csv', sep=';')

In [3]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


Viewing data types of each column.

In [4]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


- variable types are fine.
- *quality* does not need to be one-hot encoded because it is the target variable (y), not a feature.

Viewing counts per unique value of *quality*.

In [14]:
# Number of rows per raw quality score, most frequent first.
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

Consider simplifying the values of *quality* into the following three buckets:
- range {3,4} = 1 (low quality)
- range {5,6,7} = 2 (medium quality)
- range {8,9} = 3 (good quality)

In [15]:
# Collapse the seven raw scores into three classes: 1 = low, 2 = medium, 3 = good.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df_mod['quality']`: that chained form acts
# on a temporary Series under pandas Copy-on-Write and leaves `df_mod` unchanged.
df_mod = df.copy()
df_mod['quality'] = df_mod['quality'].replace([3,4,5,6,7,8,9], [1,1,2,2,2,3,3])

In [16]:
# Rows per binned quality class, to confirm the remapping took effect.
df_mod['quality'].value_counts()

2    4535
1     183
3     180
Name: quality, dtype: int64

The quality scores are now binned into three classes {1, 2, 3}. Note that the classes are heavily imbalanced: class 2 (medium) holds 4535 of the 4898 rows.

In [17]:
# Name of the target column, then every other column as a feature.
y_col = 'quality'
X_cols = [name for name in df_mod.columns if name != y_col]

In [18]:
# Feature frame and target series, both taken from the binned copy.
X = df_mod[X_cols]
y = df_mod[y_col]

In [19]:
# Confirm that only feature columns remain ('quality' must not appear).
X.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [20]:
def standardize_data(X, y, test_size=0.30, SEED=0, stratify=False) -> tuple:
    """Split features/target into train and test sets and z-score the features.

    The scaler is fitted on the training split only and then applied to the
    test split, so no test-set statistics leak into the training features.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target values, row-aligned with ``X``.
    test_size : float or int, default 0.30
        Forwarded to ``train_test_split``; as a float it is the fraction of
        rows held out for testing.
    SEED : int, default 0
        ``random_state`` for the split, so the partition is reproducible.
    stratify : bool, default False
        When True, split so that both partitions keep the class proportions
        of ``y`` (useful when the classes are imbalanced). The default keeps
        the plain, unstratified shuffle split.

    Returns
    -------
    tuple
        ``(Xz_train, Xz_test, y_train, y_test)``. The two feature entries are
        the standardized arrays returned by the scaler; the two target
        entries are the corresponding slices of ``y``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=SEED,
        stratify=y if stratify else None,
    )
    scaler = StandardScaler()
    # Fit on train only; the test split reuses the training mean and std.
    Xz_train = scaler.fit_transform(X_train)
    Xz_test = scaler.transform(X_test)
    return (Xz_train, Xz_test, y_train, y_test)


In [21]:
# Uses the function defaults: test_size=0.30 (30% held out) and SEED=0.
Xz_train, Xz_test, y_train, y_test = standardize_data(X, y)

In [22]:
# First training row after scaling: each value is a z-score computed with the
# scaler that was fitted on the training split.
Xz_train[0]

array([-0.54987946,  0.10802059,  0.90421574, -0.54833003,  6.81327372,
        2.32630491,  1.04952979,  0.07318938, -1.17885395, -0.95161186,
       -1.16015744])