In [2]:
import pandas as pd, numpy as np

In [3]:
## Load the raw red and white wine tables
# Both source files are semicolon-delimited, so the same reader settings
# are applied to each path in turn.
df_red, df_white = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('winequality-red.csv', 'winequality-white.csv')
)

In [4]:
## Tag the red rows, then check length and schema of the raw data
# Every row from this file gets wine == 'red' so its origin survives the merge.
df_red = df_red.assign(wine='red')
df_red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
 12  wine                  1599 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 162.5+ KB


In [5]:
## Tag the white rows, then check length and schema of the raw data
# Every row from this file gets wine == 'white' so its origin survives the merge.
df_white = df_white.assign(wine='white')
df_white.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
 12  wine                  4898 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 497.6+ KB


In [13]:
## Merging data frames
# Stack red on top of white. Each frame keeps its own row labels, so the
# combined index contains duplicate labels; the steps in this cell never
# look rows up by label.
wine_quality = pd.concat([df_red, df_white])

## Removing density outliers
# The threshold is applied to every row regardless of wine type (the cut was
# originally described as targeting white-wine outliers). .copy() makes the
# filtered frame independent before a new column is added to it.
MAX_DENSITY = 1.004
wine_quality = wine_quality.loc[wine_quality['density'] < MAX_DENSITY].copy()

## Adding final relative quality column
# Vectorized three-way bucketing around a score of 6:
#   < 6 -> 'Below Average', > 6 -> 'Above Average', otherwise 'Average'.
quality_score = wine_quality['quality']
wine_quality['quality_factor'] = np.select(
    [quality_score < 6, quality_score > 6],
    ['Below Average', 'Above Average'],
    default='Average',
)

## Reorganizing columns
cols = ['wine','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality','quality_factor']
# Put 'wine' first, then title-case every header except 'pH', which
# str.title would otherwise turn into 'Ph'.
wine_quality = wine_quality[cols].rename(
    columns=lambda name: name if name == 'pH' else name.title()
)

## Check length of new data
# 1599 red + 4898 white = 6497 rows before filtering; the density cut removes
# a few of them (6494 rows remained on the original run).
wine_quality.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6494 entries, 0 to 4897
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Wine                  6494 non-null   object 
 1   Fixed Acidity         6494 non-null   float64
 2   Volatile Acidity      6494 non-null   float64
 3   Citric Acid           6494 non-null   float64
 4   Residual Sugar        6494 non-null   float64
 5   Chlorides             6494 non-null   float64
 6   Free Sulfur Dioxide   6494 non-null   float64
 7   Total Sulfur Dioxide  6494 non-null   float64
 8   Density               6494 non-null   float64
 9   pH                    6494 non-null   float64
 10  Sulphates             6494 non-null   float64
 11  Alcohol               6494 non-null   float64
 12  Quality               6494 non-null   int64  
 13  Quality_Factor        6494 non-null   object 
dtypes: float64(11), int64(1), object(2)
memory usage: 761.0+ KB


In [14]:
## Final look at the frame's structure
# Eight randomly drawn rows (no seed is set, so the rows differ on each run).
wine_quality.sample(n=8)

Unnamed: 0,Wine,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,Quality,Quality_Factor
1060,white,8.0,0.26,0.36,2.0,0.054,30.0,121.0,0.992,3.09,0.72,11.6,7,Above Average
592,red,9.0,0.45,0.49,2.6,0.084,21.0,75.0,0.9987,3.35,0.57,9.7,5,Below Average
1006,red,9.1,0.29,0.33,2.05,0.063,13.0,27.0,0.99516,3.26,0.84,11.7,7,Above Average
262,red,8.0,0.52,0.03,1.7,0.07,10.0,35.0,0.99575,3.34,0.57,10.0,5,Below Average
556,white,8.4,0.35,0.71,12.2,0.046,22.0,160.0,0.9982,2.98,0.65,9.4,5,Below Average
4257,white,5.8,0.32,0.23,1.5,0.033,39.0,121.0,0.9887,2.96,0.35,12.0,5,Below Average
4488,white,6.7,0.48,0.49,2.9,0.03,28.0,122.0,0.98926,3.13,0.4,13.0,6,Average
391,white,6.5,0.36,0.32,1.1,0.031,13.0,66.0,0.9916,3.1,0.46,10.6,5,Below Average


In [15]:
## Export
# Write the combined table to disk; index=False leaves the row labels out of the file.
wine_quality.to_csv(path_or_buf='wine_quality.csv', index=False)

In [16]:
## Test import
# Read the exported file back and display it to confirm it loads.
pd.read_csv(filepath_or_buffer='wine_quality.csv')

Unnamed: 0,Wine,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,Quality,Quality_Factor
0,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
1,red,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,Below Average
2,red,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,Below Average
3,red,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,Average
4,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6489,white,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,Average
6490,white,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,Below Average
6491,white,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,Average
6492,white,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,Above Average
