In [1]:
import pandas as pd, numpy as np

In [2]:
## Import raw data

df_red = pd.read_csv('raw/winequality-red.csv', sep = ';')
df_white = pd.read_csv('raw/winequality-white.csv', sep = ';')

In [3]:
## Check length of raw data

df_red['wine'] = 'red'
df_red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
 12  wine                  1599 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 162.5+ KB


In [4]:
## Check length of raw data

df_white['wine'] = 'white'
df_white.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
 12  wine                  4898 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 497.6+ KB


In [5]:
## Merging data frames
wine_quality = pd.concat([df_red, df_white])

## Adding final relative quality column
wine_quality['final_quality'] = wine_quality['quality'].apply(lambda x: 'Below Average' if x < 6 
                                                              else ('Above Average' if x > 6 else 'Average'))

## Reorganizing columns
cols = ['wine','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality','final_quality']
wine_quality = wine_quality[cols]

## Check length of new data (6497 = 4898 + 1599 )
wine_quality.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   wine                  6497 non-null   object 
 1   fixed acidity         6497 non-null   float64
 2   volatile acidity      6497 non-null   float64
 3   citric acid           6497 non-null   float64
 4   residual sugar        6497 non-null   float64
 5   chlorides             6497 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6497 non-null   float64
 10  sulphates             6497 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
 13  final_quality         6497 non-null   object 
dtypes: float64(11), int64(1), object(2)
memory usage: 761.4+ KB


In [6]:
## Last check on df structure
wine_quality.sample(8)

Unnamed: 0,wine,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,final_quality
1390,white,5.9,0.19,0.26,7.4,0.034,33.0,123.0,0.995,3.49,0.42,10.1,6,Average
311,white,5.0,0.55,0.14,8.3,0.032,35.0,164.0,0.9918,3.53,0.51,12.5,8,Above Average
4846,white,5.1,0.3,0.3,2.3,0.048,40.0,150.0,0.98944,3.29,0.46,12.2,6,Average
1874,white,7.9,0.29,0.31,7.35,0.034,37.0,154.0,0.9938,3.06,0.31,10.8,5,Below Average
2261,white,6.5,0.36,0.31,4.1,0.061,20.0,134.0,0.99475,3.18,0.45,9.0,6,Average
4808,white,7.3,0.35,0.67,8.3,0.053,10.0,100.0,0.9959,3.19,0.5,10.9,5,Below Average
1251,white,5.7,0.245,0.33,1.1,0.049,28.0,150.0,0.9927,3.13,0.42,9.3,5,Below Average
934,red,6.6,0.61,0.01,1.9,0.08,8.0,25.0,0.99746,3.69,0.73,10.5,5,Below Average


In [7]:
## Export
wine_quality.to_csv('wine_quality.csv', index=False)

In [8]:
## Test import
pd.read_csv('wine_quality.csv')

Unnamed: 0,wine,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,final_quality
0,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
1,red,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,Below Average
2,red,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,Below Average
3,red,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,Average
4,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,white,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,Average
6493,white,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,Below Average
6494,white,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,Average
6495,white,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,Above Average
