In [1]:
import pandas as pd, numpy as np

In [2]:
## Import raw data

df_red = pd.read_csv('winequality-red.csv', sep = ';')
df_white = pd.read_csv('winequality-white.csv', sep = ';')

In [3]:
## Check length of raw data

df_red['wine'] = 'red'
df_red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
 12  wine                  1599 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 162.5+ KB


In [4]:
## Check length of raw data

df_white['wine'] = 'white'
df_white.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
 12  wine                  4898 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 497.6+ KB


In [8]:
## Merging data frames
wine_quality = pd.concat([df_red, df_white])

## Adding final relative quality column
wine_quality['quality factor'] = wine_quality['quality'].apply(lambda x: 'Below Average' if x < 6 
                                                              else ('Above Average' if x > 6 else 'Average'))

## Reorganizing columns
cols = ['wine','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality','quality factor']
wine_quality = wine_quality[cols]
wine_quality.columns = map(str.title, wine_quality.columns)
wine_quality = wine_quality.rename(columns={"Ph": "pH"})

## Check length of new data (6497 = 4898 + 1599 )
wine_quality.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Wine                  6497 non-null   object 
 1   Fixed Acidity         6497 non-null   float64
 2   Volatile Acidity      6497 non-null   float64
 3   Citric Acid           6497 non-null   float64
 4   Residual Sugar        6497 non-null   float64
 5   Chlorides             6497 non-null   float64
 6   Free Sulfur Dioxide   6497 non-null   float64
 7   Total Sulfur Dioxide  6497 non-null   float64
 8   Density               6497 non-null   float64
 9   pH                    6497 non-null   float64
 10  Sulphates             6497 non-null   float64
 11  Alcohol               6497 non-null   float64
 12  Quality               6497 non-null   int64  
 13  Quality Factor        6497 non-null   object 
dtypes: float64(11), int64(1), object(2)
memory usage: 761.4+ KB


In [9]:
## Last check on df structure
wine_quality.sample(8)

Unnamed: 0,Wine,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,Quality,Quality Factor
3811,white,7.2,0.28,0.38,2.0,0.052,23.0,156.0,0.9912,3.13,0.52,11.1,5,Below Average
2262,white,8.5,0.25,0.31,2.8,0.032,11.0,61.0,0.99189,3.06,0.44,11.5,6,Average
1896,white,7.5,0.29,0.67,8.1,0.037,53.0,166.0,0.9966,2.9,0.41,8.9,6,Average
4705,white,5.7,0.27,0.16,9.0,0.053,32.0,111.0,0.99474,3.36,0.37,10.4,6,Average
377,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7,Above Average
106,red,7.8,0.41,0.68,1.7,0.467,18.0,69.0,0.9973,3.08,1.31,9.3,5,Below Average
2355,white,7.6,0.48,0.33,7.0,0.024,14.0,130.0,0.9918,3.25,0.45,12.5,7,Above Average
572,red,10.2,0.24,0.49,2.4,0.075,10.0,28.0,0.9978,3.14,0.61,10.4,5,Below Average


In [16]:
## Export
wine_quality.to_csv('wine_quality.csv', index=False)

In [17]:
## Test import
pd.read_csv('wine_quality.csv')

Unnamed: 0,Wine,Fixed Acidity,Volatile Acidity,Citric Acid,Residual Sugar,Chlorides,Free Sulfur Dioxide,Total Sulfur Dioxide,Density,pH,Sulphates,Alcohol,Quality,Quality Factor
0,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
1,red,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,Below Average
2,red,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,Below Average
3,red,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,Average
4,red,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,Below Average
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,white,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,Average
6493,white,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,Below Average
6494,white,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,Average
6495,white,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,Above Average


In [10]:
# Data wrangling for correlation plot
wine = wine_quality #This is the naming convention I had in my work and didn't want to change it
# Get correlations for each wine type
corr_df_white = wine.loc[wine['Wine'] == 'white'].select_dtypes('number').corr('spearman').stack().reset_index(name='corr')
corr_df_white["Wine"] = "white"

corr_df_red = wine.loc[wine['Wine'] == 'red'].select_dtypes('number').corr('spearman').stack().reset_index(name='corr')
corr_df_red["Wine"] = "red"

# Bind them together
corr_df = corr_df_white.append(corr_df_red)
corr_df["Quality Factor"] = 3 # For all qualities

# Subset by quality and for each and bind
for i in [0,1,2]:
    #Create white df at ith quality
    corr_df_white = wine.loc[(wine['Wine'] == 'white') & (wine["Quality Factor"] == i)].select_dtypes('number').corr('spearman').stack().reset_index(name='corr')
    corr_df_white["Wine"] = "white"
    corr_df_white["Quality Factor"] = i
    #create red df at ith quality
    corr_df_red = wine.loc[(wine['Wine'] == 'red') & (wine["Quality Factor"] == i)].select_dtypes('number').corr('spearman').stack().reset_index(name='corr')
    corr_df_red["Wine"] = "red"
    corr_df_red["Quality Factor"] = i
    # bind to main df
    corr_df = corr_df.append(corr_df_red)
    corr_df = corr_df.append(corr_df_white)

In [13]:
corr_df.sample(10)

Unnamed: 0,level_0,level_1,corr,Wine,Quality Factor
113,Sulphates,Free Sulfur Dioxide,0.052252,white,3
2,Fixed Acidity,Citric Acid,0.297878,white,3
91,Density,Density,1.0,white,3
19,Volatile Acidity,Density,0.025014,red,3
114,Sulphates,Total Sulfur Dioxide,0.157825,white,3
117,Sulphates,Sulphates,1.0,white,3
3,Fixed Acidity,Residual Sugar,0.106725,white,3
141,Quality,Sulphates,0.37706,red,3
52,Chlorides,Chlorides,1.0,white,3
63,Free Sulfur Dioxide,Residual Sugar,0.346107,white,3


In [15]:
corr_df.to_csv('correlation.csv', index=False)