In [13]:
import pandas as pd
import numpy as np
import altair as alt

alt.data_transformers.enable('data_server')

# Wrangling

whitewine = pd.read_csv('Data/raw/winequality-red.csv', sep=';')
redwine = pd.read_csv('Data/raw/winequality-white.csv', sep=';')

whitewine["type"] = "white"
redwine["type"] = "red"

wine = redwine.append(whitewine)
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,red
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,red
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,red
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,red
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,white
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,white
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,white
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,white


In [11]:
pwd

'/Users/yuxuancui/Desktop/MDS/data551/WineVison'

In [14]:
# Correlation Graph

# Get correlations for each wine type
corr_df_white = wine.loc[wine['type'] == 'white'].select_dtypes('number').corr('spearman').stack().reset_index(name='corr')
corr_df_white["type"] = "white"

corr_df_red = wine.loc[wine['type'] == 'red'].select_dtypes('number').corr('spearman').stack().reset_index(name='corr')
corr_df_red["type"] = "red"

# Bind them together
corr_df = corr_df_white.append(corr_df_red)

corr_df.loc[corr_df['corr'] == 1, 'corr'] = 0 #Remove full correlations on diag
corr_df['abs'] = corr_df['corr'].abs()


# Make chart
click = alt.selection_multi(fields=['type'], bind='legend') 
alt.Chart(corr_df,title="Correlation Plot for Numeric Features").mark_square().encode(
    color=alt.Color('type', scale=alt.Scale(domain=['red', 'white'],
            range=['darkred', 'blue'])),
    x='level_0',
    y='level_1',
    size='abs',
    opacity=alt.condition(click, alt.value(0.7), alt.value(0)),
    tooltip=["type", "corr"]
).configure_title(fontSize=18).properties(height=250, width=250).add_selection(click)

In [15]:
# Scatterplot
click = alt.selection_multi(fields=['type'], bind='legend')
chart = alt.Chart(wine).mark_point().encode(
    alt.X("alcohol"),
    alt.Y("sulphates"),
    alt.Color("type"),
    opacity = alt.condition(click, alt.value(0.7), alt.value(0))
)

regression = chart.transform_regression('alcohol','sulphates', groupby = ["type"],
                                        # By default lines don't go beyond data and are hard to read in this dense dataset
                                       extent = [min(wine['alcohol']) - 1, max(wine['alcohol']) + 1]).mark_line(size = 5)

(chart + regression).add_selection(click)

In [16]:
corr_df["level_0"].unique()

array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'], dtype=object)