In [3]:
import pandas as pd, numpy as np

In [5]:
## Import raw data

df_red = pd.read_csv('winequality-red.csv', sep = ';')
df_white = pd.read_csv('winequality-white.csv', sep = ';')

In [6]:
## Check length of raw data

df_red['wine'] = 'red'
df_red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
 12  wine                  1599 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 162.5+ KB


In [7]:
## Check length of raw data

df_white['wine'] = 'white'
df_white.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
 12  wine                  4898 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 497.6+ KB


In [9]:
## Merging data frames
wine_quality = pd.concat([df_red, df_white])

## Adding final relative quality column
wine_quality['final_quality'] = wine_quality['quality'].apply(lambda x: 'Below Average' if x < 6 
                                                              else ('Above Average' if x > 6 else 'Average'))

## Reorganizing columns
cols = ['wine','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality','final_quality']
wine_quality = wine_quality[cols]
wine_quality.columns = map(str.title, wine_quality.columns)

## Check length of new data (6497 = 4898 + 1599 )
wine_quality.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Wine                  6497 non-null   object 
 1   Fixed Acidity         6497 non-null   float64
 2   Volatile Acidity      6497 non-null   float64
 3   Citric Acid           6497 non-null   float64
 4   Residual Sugar        6497 non-null   float64
 5   Chlorides             6497 non-null   float64
 6   Free Sulfur Dioxide   6497 non-null   float64
 7   Total Sulfur Dioxide  6497 non-null   float64
 8   Density               6497 non-null   float64
 9   Ph                    6497 non-null   float64
 10  Sulphates             6497 non-null   float64
 11  Alcohol               6497 non-null   float64
 12  Quality               6497 non-null   int64  
 13  Final_Quality         6497 non-null   object 
dtypes: float64(11), int64(1), object(2)
memory usage: 761.4+ KB


In [10]:
## Last check on df structure
wine_quality.sample(8)

Unnamed: 0,Wine,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,Ph,Sulphates,Alcohol,Quality,Final_Quality
1524,white,6.6,0.21,0.49,18.15,0.042,41.0,158.0,0.9997,3.28,0.39,8.7,6,Average
3207,white,6.8,0.21,0.42,1.2,0.045,24.0,126.0,0.99234,3.09,0.87,10.9,6,Average
298,red,7.2,0.65,0.02,2.3,0.094,5.0,31.0,0.9993,3.67,0.8,9.7,5,Below Average
974,red,8.8,0.33,0.41,5.9,0.073,7.0,13.0,0.99658,3.3,0.62,12.1,7,Above Average
664,red,12.1,0.4,0.52,2.0,0.092,15.0,54.0,1.0,3.03,0.66,10.2,5,Below Average
3320,white,7.5,0.29,0.36,15.7,0.05,29.0,124.0,0.9968,3.06,0.54,10.4,5,Below Average
1539,white,7.3,0.26,0.49,5.0,0.028,32.0,107.0,0.9936,3.24,0.54,10.8,6,Average
4639,white,6.9,0.54,0.26,12.7,0.049,59.0,195.0,0.99596,3.26,0.54,10.5,6,Average


In [11]:
## Export
wine_quality.to_csv('wine_quality.csv', index=False)

In [12]:
## Test import
pd.read_csv('wine_quality.csv')

Unnamed: 0,Wine,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,Ph,Sulphates,Alcohol,Quality,Final_Quality
0,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
1,red,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,Below Average
2,red,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,Below Average
3,red,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,Average
4,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,white,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,Average
6493,white,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,Below Average
6494,white,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,Average
6495,white,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,Above Average
