In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# The interactive color brewer chooser (sns.choose_colorbrewer_palette) also needs:
#  pip install ipywidgets
#  jupyter nbextension enable --py widgetsnbextension

In [None]:
# Show the seaborn version in use
sns.__version__

In [None]:
# Vehicle data - https://www.fueleconomy.gov/feg/download.shtml
# Documentation https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle
# Datasets for All Model Years (1984–2018)

# Read the zipped CSV from the data directory next to this notebook's folder
fueleco = pd.read_csv('../data/vehicles.csv.zip')

In [None]:
# Display the raw frame
fueleco

In [None]:
# Column dtypes - used below to separate object (categorical) from numeric columns
fueleco.dtypes

In [None]:
# Categorical (object dtype) columns of interest:
#   drive  - drive axle type
#   make   - manufacturer (division)
#   model  - model name (carline)
#   trany  - transmission
#   VClass - EPA vehicle size class
# Transposed so that every column is shown as a row.
fueleco.select_dtypes(include='object').transpose()

In [None]:
# Numeric columns of interest:
#   barrels08 - annual petroleum consumption in barrels for fuelType1 (1)
#   city08    - city MPG for fuelType1 (2), (11)
#   comb08    - combined MPG for fuelType1 (2), (11)
#   cylinders - engine cylinders
#   highway08 - highway MPG for fuelType1 (2), (11)
#   year      - model year
# cylinders and year could also be considered categorical.

# 'number' matches every numeric dtype; the Python types int/float only match
# the platform's default-width integer/float dtypes, which can silently drop
# columns on platforms where the default int is 32-bit.
fueleco.select_dtypes(include='number').T

In [None]:
cols = 'drive,make,model,trany,VClass,barrels08,city08,comb08,cylinders,highway08,year'
auto = fueleco[cols.split(',')]

In [None]:
auto.T

## 1D - Categorical

In [None]:
# 1D - Categorical
sns.countplot(x='make', data=auto)

In [None]:
# 1D - Categorical
# Bump size
fig, ax = plt.subplots(figsize=(8,6))
sns.countplot(x='make', data=auto, ax=ax)

In [None]:
auto.make.value_counts()

In [None]:
# Keep the first ten makes from value_counts and lump every other make into 'Other'
avc = auto.make.value_counts()
top10 = avc.index[:10]
make2 = auto.make.where(auto.make.isin(top10), 'Other')
sns.countplot(data=auto.assign(make2=make2), y='make2')

In [None]:
# 1D - Categorical
# Can add order
fig, ax = plt.subplots(figsize=(8,6))
sns.countplot(x='make', data=auto, ax=ax, 
              order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
# 1D - Categorical
# Can add hue option to embed small 2nd dimension
fig, ax = plt.subplots(figsize=(8,6))
sns.countplot(x='make', data=auto, ax=ax, 
              order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
             hue='year')

## 1D Exercise
* Plot a 1D plot for categorical data


## 1D - Continuous Plots

In [None]:
# 1D - Continuous
sns.displot(auto.city08)

In [None]:
# 1D - Continuous - add "rug"
sns.displot(auto.city08, rug=True)

In [None]:
# 1D - Continuous
# Be sure to check out the documentation for various options
# 
sns.displot(auto.city08, 
            stat='density', # option to sns.histplot
            bins=12)

In [None]:
# what about plotting the city/comb/highway?
# Need to melt data
(pd.melt(auto, value_vars=['city08', 'comb08', 'highway08'], 
         value_name='mpg', var_name='type', id_vars=['year'])
)

In [None]:
sns.displot(data=(pd.melt(auto, value_vars=['city08', 'comb08', 'highway08'],
                          value_name='mpg', var_name='type', id_vars=['year'])),
            x='mpg', hue='type')

In [None]:
sns.displot(data=(pd.melt(auto, value_vars=['city08', 'comb08', 'highway08'], value_name='mpg', var_name='type', id_vars=['year'])
                 .query('mpg < 40')),
            x='mpg', hue='type', aspect=2, bins=30)

In [None]:
sns.displot(data=(pd.melt(auto, value_vars=['city08', 'comb08', 'highway08'],value_name='mpg', var_name='type', id_vars=['year'])
                 .query('mpg < 40')),
            kind='kde', fill=True, x='mpg', hue='type', aspect=2)

In [None]:
sns.displot(data=(pd.melt(auto, value_vars=['city08', 'comb08', 'highway08'], value_name='mpg', var_name='type', id_vars=['year'])
                 .query('mpg < 40')),
            kind='kde', fill=True, x='mpg', hue='type', aspect=2,
           col='year', col_wrap=2, col_order=[1985, 1990,1995,2000,2005,2010],
           palette='viridis')

In [None]:
# If you just want KDE plot can use kdeplot
# Draw the three mpg columns onto the same Axes
fig, ax = plt.subplots(figsize=(7, 5))
for mpg_col in ['city08', 'comb08', 'highway08']:
    sns.kdeplot(auto[mpg_col], ax=ax, fill=True)
ax.set_xlim(0, 50)

## 1D Exercise

* Plot a 1D plot for continuous data

## 2D - Cont-Cont
We can use ``relplot``, ``lmplot``, and ``jointplot``

In [None]:
# Scatter of city vs. combined MPG
sns.relplot(data=auto, x='city08', y='comb08')

In [None]:
# relplot is a figure-level function: it creates its own figure and does not
# draw onto an existing Axes, so passing ``ax`` only leaves an empty figure.
# To draw onto ``ax`` use the axes-level scatterplot
# (for relplot, size the figure with height/aspect instead).
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x='city08', y='comb08', data=auto, alpha=.5, ax=ax)

In [None]:
# Note return type (can use ax, better to use height/aspect)
sns.relplot(x='city08', y='comb08', data=auto, alpha=.5,
            height=5, aspect=2)

In [None]:
# Can add multiple dimensions with size (cont), hue (cat)
sns.relplot(x='city08', y='highway08', data=auto, alpha=.5,
            height=5, aspect=1.6, size='cylinders', hue='year')

In [None]:
# Can add multiple dimensions with size (cont), hue (cat)
# bump up size with sizes (min/max)
# Can also facet with row/col
sns.relplot(x='city08', y='highway08', data=auto, alpha=.5,
            height=5, aspect=1, #size='cylinders', 
            col='cylinders', col_wrap=3,
            hue='year')

In [None]:
# use lmplot to draw regression
sns.lmplot(x='city08', y='comb08', data=auto)

In [None]:
# use lmplot to draw regression
sns.lmplot(x='city08', y='comb08', data=auto, hue='year', hue_order=[1985, 1990, 1995, 2000, 2005, 2010])

In [None]:
# use lmplot to draw regression
sns.lmplot(x='city08', y='comb08', data=auto, col='year', col_order=[1985, 1990, 1995, 2000, 2005, 2010], col_wrap=2)

In [None]:
# use lmplot to draw regression
sns.lmplot(x='year', y='comb08', data=auto[auto.year > 2005])

In [None]:
# use lmplot to draw regression
# can add jitter (height 0.9 param, use size in 0.8)
sns.lmplot(x='year', y='comb08', data=auto[auto.year > 2005],
           x_jitter=.3,
           height=4
          )

In [None]:
# use lmplot to draw regression
# can add jitter
# use hue/row/col to add more dimensions
# NOTE: the facet size parameter is ``height`` (renamed from ``size`` in
# seaborn 0.9); ``size`` is deprecated and rejected by current releases.
sns.lmplot(x='year', y='comb08', data=auto[(auto.year > 2005) &
                    (auto.make.isin(['Ford', 'Toyota']))],
          x_jitter=.2, height=8,
           col='make')

In [None]:
# All the continuous values
# (Careful can take some time!)
# Plot a fixed-seed sample of 1,000 rows so the figure is reproducible on re-run
sns.pairplot(auto.sample(1_000, random_state=42))

In [None]:
# Correlation heatmap of the numeric columns.
# ``auto`` also holds object columns (make, model, ...); pandas 2.x raises if
# corr() is asked to correlate them, so restrict to numeric dtypes first.
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(auto.select_dtypes(include='number').corr(), annot=True, cmap='RdBu',
            annot_kws={'size':20}, ax=ax,
            vmin=-1, vmax=1)

In [None]:
# use figsize for clustermap (it builds its own figure)
# Correlate only the numeric columns: pandas 2.x raises if corr() is given
# the object columns (make, model, ...) that ``auto`` also contains.
sns.clustermap(auto.select_dtypes(include='number').corr(), annot=True, cmap='RdBu',
               annot_kws={'size':12},
               figsize=(6,6),
               vmin=-1, vmax=1)

## 2D Cont-Cont Exercise
* Plot a 2D plot of two continuous variables
* Plot a pairplot with the data (or try a portion first)

## 2D Cat-Cont

In [None]:
sns.catplot(x='city08', y='make', data=auto)

In [None]:
sns.catplot(x='city08', y='make', data=auto, order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
sns.catplot(x='city08', y='make', data=auto, order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
           kind='box')

In [None]:
sns.catplot(x='city08', y='make', data=auto, order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
           kind='violin')

In [None]:
sns.catplot(x='city08', y='make', data=auto, order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
           kind='boxen')

In [None]:
sns.catplot(x='city08', y='make', data=auto, order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
           kind='strip')

In [None]:
# add hue for a new dimension
sns.catplot(x='city08', y='make', data=auto, order=['Chevrolet', 'Ford'],# 'Dodge', 'GMC', 'Toyota'],
           kind='boxen', hue='cylinders', palette='viridis', aspect=2)

## 2D Cat-Cont Exercise:
* Using the ``catplot`` function, create two plots of your data using two different kinds of plots. Which plot kind works better for your plot? Why did you choose it?

## Cat-Cat
@randyzwitch I don't really like stacked bar charts, I'd suggest maybe using pointplot / factorplot with kind=point

— Michael Waskom (@michaelwaskom) September 4, 2014

Status (2018): ``pointplot`` requires a continuous variable, and ``factorplot`` is deprecated in favor of ``catplot``.

In [None]:
# Count of vehicles for every (VClass, make) pair
pd.crosstab(index=auto.VClass, columns=auto.make)

In [None]:
# For count plots can only specify x or y (not both)
# Put one category in x|y the other in hue
mask = auto.make.isin(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])
# The facet size parameter is ``height`` (``size`` is the deprecated pre-0.9
# name); the sample is seeded so the plot is reproducible on re-run.
sns.catplot(y='make', hue='VClass',
            data=auto[mask].sample(100, random_state=42), kind='count', height=10)

In [None]:
# stacked with pandas
mask = auto.make.isin(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

(
    auto[mask]
    .sample(100, random_state=42)   # seeded so the chart is reproducible
    .groupby(['VClass', 'make'])
    .size()                         # count per (VClass, make)
    .unstack()                      # makes become columns -> one stack segment each
    .plot(kind='bar', stacked=True)
    .legend(bbox_to_anchor=(1,1))   # move the legend outside the axes
)

In [None]:
# above is simpler with crosstab (here on every matching row, not a sample)
mask = auto.make.isin(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])
(
    auto[mask]
    .pipe(lambda df: pd.crosstab(df.VClass, df.make))
    .plot(kind='bar', stacked=True)
    .legend(bbox_to_anchor=(1, 1))
)

In [None]:
# normalized stacked with pandas: every bar sums to 1
makes = ['Chevrolet', 'Toyota', 'Ford', 'Dodge', 'GMC']
mask = auto.make.isin(makes)
counts = pd.crosstab(auto.loc[mask, 'VClass'], auto.loc[mask, 'make'])
shares = counts.div(counts.sum(axis=1), axis=0)  # divide every row by its total
(
    shares
    .sort_values('Chevrolet', ascending=False)  # highest to lowest
    .loc[:, makes]                              # this changes the stack order
    .plot(kind='bar', stacked=True, cmap='Set3')
    .legend(bbox_to_anchor=(1, 1))
)

In [None]:
# not stacked with pandas
# Rebuild the mask here so the cell does not silently depend on an earlier
# cell's state, and seed the sample so the chart is reproducible on re-run.
makes = ['Chevrolet', 'Toyota', 'Ford', 'Dodge', 'GMC']
mask = auto.make.isin(makes)
fig, ax = plt.subplots(figsize=(8,6))
(
    auto[mask]
    .sample(100, random_state=42)
    .pipe(lambda df: pd.crosstab(df.VClass, df.make))
    .plot(kind='bar', stacked=False, ax=ax)
)

In [None]:
# stacked with pandas - one bar per make, stacked by vehicle class
sampled = auto.loc[mask].sample(n=100, random_state=42)
(
    pd.crosstab(sampled.make, sampled.VClass)
    .plot(kind='bar', stacked=True)
    .legend(bbox_to_anchor=(1, 1))
)

In [None]:
# comparing categorical with target (Classification)
# stacked with pandas: share of Ford vs. Toyota within every vehicle class
ford_mask = auto.make == 'Ford'
toyota_mask = auto.make == 'Toyota'
ford_or_toyota = auto.loc[ford_mask | toyota_mask]
class_counts = pd.crosstab(ford_or_toyota.VClass, ford_or_toyota.make)
(
    class_counts
    .div(class_counts.sum(axis=1), axis=0)  # normalize every row to sum to 1
    .sort_values('Ford', ascending=False)
    .plot(kind='bar', stacked=True)
    .legend(bbox_to_anchor=(1, 1))
)

## Cat-cat Exercise
* Plot two categorical columns against each other

## Time series (date-cont)

In [None]:
# lineplot in 0.9
# code in pandas for 0.8
auto.groupby('year')['city08'].mean().plot()

In [None]:
# lineplot in 0.9
# code in pandas for 0.8
auto.groupby('year')['city08'].agg(['min', 'mean',  'median']).plot()

In [None]:
# lineplot is 0.9
sns.lineplot(x='year', y='city08', data=auto)

In [None]:
fig, ax = plt.subplots(figsize=(8,6))
sns.lineplot(x='year', y='city08', data=auto, ax=ax)

In [None]:
# can use hue, style, size to add dimensions
fig, ax = plt.subplots(figsize=(8,6))
sns.lineplot(x='year', y='city08', size='VClass', data=auto, ax=ax)
ax.legend(bbox_to_anchor=(1,1))

Use ``relplot`` (``kind='line'``) to facet by col/row.

In [None]:
# Mean city MPG per year for every vehicle class, drawn with pandas.
# 'Midsize Cars' gets its own color ('#900'); all other classes share '#8888'.
fig, ax = plt.subplots(figsize=(8, 6))
data = auto.groupby(['year', 'VClass']).city08.mean().unstack()
colors = ['#900' if c == 'Midsize Cars' else '#8888' for c in data.columns]
data.plot(color=colors, ax=ax)
ax.legend(bbox_to_anchor=(1, 1))
ax.set_title('Average City Mileage')

## Timeseries Exercise
* Plot a timeseries plot
* Add an extra dimension to it

## Seaborn Extras

In [None]:
# default style
sns.set() 

In [None]:
plt.plot(range(10))

In [None]:
# styles - white, dark, whitegrid, darkgrid, ticks
# use set_style for more permanence
with sns.axes_style('dark'):
    plt.plot(range(10))

In [None]:
with sns.axes_style('ticks'):
    plt.plot(range(10))
    sns.despine()

In [None]:
sns.set_style('ticks')
plt.plot(range(10))

In [None]:
# get the current style - can pass in a dictionary like this as 2nd param to *_style
sns.axes_style()

In [None]:
# list the font names known to matplotlib's font manager
import matplotlib.font_manager

font_names = [font.name for font in matplotlib.font_manager.fontManager.ttflist]
print(sorted(font_names))

In [None]:
sns.set()
sns.set_style('ticks' , {'font.sans-serif': ['Iosevka Fixed SS01']})
with sns.plotting_context('poster'):
    plt.plot(range(10))

In [None]:
sns.set()
with sns.plotting_context('poster'):
    plt.plot(range(10))

In [None]:
s = {'axes.facecolor': 'pink'}
with sns.axes_style(None, s):
    plt.plot(range(10))

In [None]:
# reset to default values
sns.set()

In [None]:
plt.plot(range(10))

In [None]:
# paper, notebook, talk, poster
# set_context - permanent
# plotting_context - with statement
with sns.plotting_context('poster'):
    plt.plot(range(10))

In [None]:
# colors
# set_palette - permanent
# color_palette - context manager
# categorical (qualitative) - no order (deep, muted, pastel, bright, dark, and colorblind)
sns.palplot(sns.color_palette())

In [None]:
# "Circular" - For large numbers of categories or repeating (season/time)
# HSLuv - equal luminance
sns.palplot(sns.color_palette('husl', 30))

In [None]:
# continuous (sequential) - Ordered  - unimportant to more important
sns.palplot(sns.color_palette('Blues'))

In [None]:
# continuous (sequential) - Ordered - unimportant to more important
# Reverse order by tacking _r onto end
sns.palplot(sns.color_palette('Blues_r'))

In [None]:
# continuous (diverging) - Ordered - High and low are interesting (correlation)
sns.palplot(sns.color_palette('RdBu'))

In [None]:
# fancy tool to help (see docs for 1st param)
# also other sns.choose_*
# diverging, sequential, qualitative
sns.choose_colorbrewer_palette('qualitative')

In [None]:
# Can specify own colors
bad = ['#c07fef', '#deadbe', '#fef70c']
sns.palplot(sns.color_palette(bad))

In [None]:
# need to set font outside of context manager!
sns.set_style('ticks', rc={'font.sans-serif': ['Roboto']})

# color_palette takes the list of colors directly and acts as a context manager
with sns.color_palette(bad), sns.plotting_context('talk'):
    sns.violinplot(
        data=auto, x='city08', y='make',
        order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
    )

In [None]:
# IPython help syntax: show the documentation for sns.color_palette
sns.color_palette?

## Extras Exercise
* With the ``plotting_context`` change the size of one of your previous plots
* Create your own color palette for one of your plots. What type should it be (diverging, sequential, qualitative)?
* With the ``color_palette`` use your palette to update the plot