## Import packages and get path

In [22]:
%%capture
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import missingno as msno
! pip install ptitprince
import ptitprince as pt

In [4]:
import os

# Walk the Kaggle input directory tree and echo the full path of every file
# found, so the dataset location can be copied into the load step.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

## Import data, inspect shape, sample rows, and summary statistics

In [5]:
# Load the red wine quality CSV (winequality-red.csv) into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [7]:
# Preview the first rows (pandas default: 5) to check column names and values.
df.head()

In [8]:
# Summary statistics per column, transposed so each column of df is one row.
df.describe().T

## Inspect for null values

In [9]:
# Visual check for missing values: missingno's matrix plot of df.
msno.matrix(df)
# Text check from pandas: per-column non-null counts and dtypes.
df.info()

## Check for duplicate values

In [10]:
# Count rows that exactly repeat an earlier row
# (duplicated() leaves the first occurrence unmarked by default).
df.duplicated().sum()

There are 240 duplicated rows; we will evaluate later whether these should be removed.

## Visualize distributions w/ histograms

In [11]:
# One histogram per numeric column of df, laid out on a 20x20-inch figure.
df.hist(figsize=(20, 20))
# Render the figure explicitly.
plt.show()

### Histogram insights:
- Most variables appear to be approximately normally distributed
- However, fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, and alcohol appear to be right-skewed

## Visualize relationships w/ heatmap

In [12]:
# Pairwise correlations between all columns of df.
corr_mat = df.corr()

# Boolean mask for seaborn: a True cell is hidden. Hide only the strict upper
# triangle (k=1) so the lower triangle and the diagonal stay visible.
# The mask is derived from the matrix *shape*, not from its values: masking
# with the correlation values themselves would leave any upper-triangle
# correlation of exactly 0 visible, because 0 is falsy.
mask = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)

# Explicit figure/axes instead of resizing whatever figure happens to be current.
fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=corr_mat, mask=mask, square=True, annot=True, cbar=True, ax=ax)

In [13]:
# Rank every pairwise correlation from most negative to most positive.
# The matrix is symmetric, so each pair appears twice; keep only the cells
# strictly below the diagonal (k=-1) and turn everything else into NaN.
strict_lower = np.tril(np.ones(corr_mat.shape, dtype=bool), k=-1)
low_corr_mat = corr_mat.where(strict_lower)

# Flatten to a Series indexed by column pair, then order by value.
sorted_mat = low_corr_mat.unstack().sort_values()

# Show the ranking without the NaN placeholders left by the masking step.
sorted_mat.dropna()

### Heatmap insights:
- Positive correlations:
    - fixed acidity & density
    - fixed acidity & citric acid
    - free sulfur dioxide & total sulfur dioxide
- Negative correlations:
    - fixed acidity & pH
    - volatile acidity & citric acid
    - citric acid & pH
    - density & alcohol

In [14]:
# View correlations on just the target variable
target_corr_mat = corr_mat["quality"]
target_corr_mat

### Correlations to the target variable are not particularly strong

## Visualize distributions with raincloud plots

In [36]:
# Settings shared by every raincloud plot in this cell.
group_col = "quality"   # passed as x= to RainCloud
orientation = "h"       # passed as orient= to RainCloud
palette = "Set2"        # passed as palette= to RainCloud

# One separate figure per feature, each grouped by the target column.
for feature in ("total sulfur dioxide", "residual sugar", "citric acid"):
    fig, ax = plt.subplots(figsize=(15, 10))
    pt.RainCloud(
        x=group_col,
        y=feature,
        data=df,
        palette=palette,
        width_viol=.7,
        ax=ax,
        orient=orientation,
    )
    ax.set_title(f"{group_col} vs. {feature}")


## Lastly, visualize distributions and relationships with a pairplot

In [35]:
# Grid of pairwise plots for the columns of df (seaborn defaults).
# NOTE(review): the grid has one panel per column pair, so this cell may be slow to render.
sns.pairplot(df)

# This concludes the exploratory data analysis
Next: see predictive modelling notebook

-Turner Luke  
Thanks for viewing, keep in touch  
https://www.linkedin.com/in/turnermluke/  
https://github.com/turnerluke  
https://www.kaggle.com/turnerluke