In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


from scipy.stats import skew , kurtosis
import statsmodels.api as sm
from scipy.stats import shapiro # Shapiro-Wilk test: checks whether a sample looks normally distributed
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE # Used to check whether the dataset is good enough for making clusters

In [2]:
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# pandas display settings, kept in one table and applied in a single pass.
display_options = {
    'display.max_rows': 200,
    'display.max_columns': 50,
    'display.width': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [3]:
# Load the red-wine quality dataset as the starting point for basic EDA
# (mean, median, mode, ...).
# NOTE: the file is ';'-delimited, so reading it with the default ',' separator
# leaves every field packed into one column (see the preview that follows).
# Passing sep=';' here would parse the columns directly, but the later cells
# rely on the packed column, so the call is left as it was.
df_red = pd.read_csv(filepath_or_buffer='winequality-red.csv')


In [4]:
# Preview the first rows to check how the CSV was parsed.
df_red.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


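We can see that all the fields are crammed into a single column: the file is semicolon-delimited, but it was read with the default comma separator. Before doing any analysis we need to separate them into individual columns.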

In [5]:
# Header of the single packed column produced by the comma-separated read.
# It must match df_red's column name exactly, so it is kept verbatim.
base_column_name = 'fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'

# Target names for the ';'-separated fields, in the order they appear in each row.
split_column_names = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'ph',
    'sulphates',
    'alcohol',
    'quality',
]

# Split each packed row once (vectorised) instead of re-splitting it with a
# separate row-wise lambda for every target column.
split_fields = df_red[base_column_name].astype(str).str.split(';', expand=True)

# Fail loudly if any row does not carry exactly one field per target column;
# a short row shows up as missing cells, a long row as extra columns.
if split_fields.shape[1] != len(split_column_names) or split_fields.isna().any().any():
    raise ValueError(
        f"expected {len(split_column_names)} ';'-separated fields in every row, "
        f"but the split produced {split_fields.shape[1]} column(s) "
        "or left some fields empty"
    )

# The values stay as strings here; they are converted to numbers in a later cell.
for position, column_name in enumerate(split_column_names):
    df_red[column_name] = split_fields[position]

df_red.head()


Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality""",fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


In [6]:
# Keep only the split-out columns; df_red itself is left untouched.
df = df_red.drop(columns=[base_column_name])

In [7]:
# Confirm the packed column is gone and only the split columns remain.
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


In [8]:
# Inspect the dtype and non-null count of every column after the split.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   fixed_acidity         1599 non-null   object
 1   volatile_acidity      1599 non-null   object
 2   citric_acid           1599 non-null   object
 3   residual_sugar        1599 non-null   object
 4   chlorides             1599 non-null   object
 5   free_sulfur_dioxide   1599 non-null   object
 6   total_sulfur_dioxide  1599 non-null   object
 7   density               1599 non-null   object
 8   ph                    1599 non-null   object
 9   sulphates             1599 non-null   object
 10  alcohol               1599 non-null   object
 11  quality               1599 non-null   object
dtypes: object(12)
memory usage: 150.0+ KB


All columns have the object (string) dtype, so we need to convert them to float before doing any numeric analysis.

In [9]:
# Columns that were split out as strings and need a numeric dtype.
column_names = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
    'ph', 'sulphates', 'alcohol', 'quality',
]

# Cast every listed column to float in a single call.
df = df.astype({column: float for column in column_names})

# Alternative: pd.to_numeric(df[column], errors='coerce') per column, which
# turns unparsable values into NaN instead of raising.


In [10]:
# Re-check the dtypes to confirm the float conversion took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free_sulfur_dioxide   1599 non-null   float64
 6   total_sulfur_dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   ph                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   float64
dtypes: float64(12)
memory usage: 150.0 KB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [12]:
# (rows, columns) of the cleaned frame.
df.shape

(1599, 12)

In [13]:
# Number of missing values in each column.
df.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
ph                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [14]:
# Count rows that exactly repeat an earlier row; the first occurrence of each
# repeated row is not counted.
df.duplicated(keep='first').sum()

np.int64(240)