In [2]:
import pandas as pd
import numpy as np
import altair as alt

# Hand chart data to Altair through its local data server rather than embedding
# it in the chart spec (presumably because the combined frame is too large for
# Altair's default inline-data limit -- confirm against the installed version).
alt.data_transformers.enable('data_server')

# Wrangling

# Both source files are semicolon-delimited.
whitewine = pd.read_csv('Data/winequality-white.csv', sep=';')
redwine = pd.read_csv('Data/winequality-red.csv', sep=';')

# Tag every row with its wine type so the two sets stay distinguishable once stacked.
whitewine["type"] = "white"
redwine["type"] = "red"

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same
# way (red first, then white, each frame keeping its own row labels).
wine = pd.concat([redwine, whitewine])
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [52]:
# Correlation Graph


def _spearman_long(frame, wine_type):
    """Return the pairwise Spearman correlations of one wine type in long form.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding a ``type`` column plus the numeric feature columns.
    wine_type : str
        Value of ``type`` that selects the rows to correlate.

    Returns
    -------
    pandas.DataFrame
        One row per feature pair with columns ``level_0``, ``level_1``,
        ``corr`` and ``type``.
    """
    long_corr = (
        frame.loc[frame['type'] == wine_type]
        .select_dtypes('number')
        .corr('spearman')
        .stack()
        .reset_index(name='corr')
    )
    long_corr["type"] = wine_type
    return long_corr


# Get correlations for each wine type
corr_df_white = _spearman_long(wine, "white")
corr_df_red = _spearman_long(wine, "red")

# Bind them together (DataFrame.append was removed in pandas 2.0; pd.concat
# stacks the rows identically).
corr_df = pd.concat([corr_df_white, corr_df_red])

# Remove the self-correlations on the diagonal. Matching on the feature names
# rather than on `corr == 1` keeps genuinely perfect off-diagonal correlations
# and does not depend on exact floating-point equality.
corr_df.loc[corr_df['level_0'] == corr_df['level_1'], 'corr'] = 0
corr_df['abs'] = corr_df['corr'].abs()


# Make chart
# Clicking a legend entry keeps that wine type visible and hides the other.
click = alt.selection_multi(fields=['type'], bind='legend')
alt.Chart(corr_df, title="Correlation Plot for Numeric Features").mark_square().encode(
    color=alt.Color('type', scale=alt.Scale(domain=['red', 'white'],
            range=['darkred', 'blue'])),
    x='level_0',
    y='level_1',
    # Square size encodes correlation strength regardless of sign.
    size='abs',
    opacity=alt.condition(click, alt.value(0.7), alt.value(0)),
    tooltip=["type", "corr"]
).configure_title(fontSize=18).properties(height=250, width=250).add_selection(click)

In [65]:
# Scatterplot
# Clicking a legend entry keeps that wine type visible and hides the other.
click = alt.selection_multi(fields=['type'], bind='legend')
chart = alt.Chart(wine).mark_point().encode(
    alt.X("alcohol"),
    alt.Y("sulphates"),
    alt.Color("type"),
    opacity = alt.condition(click, alt.value(0.7), alt.value(0))
)

# By default lines don't go beyond data and are hard to read in this dense dataset,
# so extend the fitted lines one unit past the observed alcohol range on each side.
# Series.min()/max() are vectorized and skip NaN, unlike the builtin min()/max();
# float() hands Altair plain Python numbers for the spec.
alcohol_extent = [
    float(wine['alcohol'].min()) - 1,
    float(wine['alcohol'].max()) + 1,
]

# One regression line per wine type, drawn on top of the points.
regression = chart.transform_regression(
    'alcohol', 'sulphates', groupby=["type"], extent=alcohol_extent
).mark_line(size=5)

(chart + regression).add_selection(click)

In [54]:
# List the distinct feature names found in the `level_0` column of corr_df.
pd.unique(corr_df["level_0"])

array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'], dtype=object)