In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Vehicle data - https://www.fueleconomy.gov/feg/download.shtml
# Documentation https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle
# Datasets for All Model Years (1984–2018)

fueleco = pd.read_csv('../data/vehicles.csv.zip')

In [None]:
fueleco

In [None]:
fueleco.dtypes

In [None]:
# Let's look at categorical columns
# drive, make, model, trany, VClass, 
# drive - drive axle type
# make - manufacturer (division)
# model - model name (carline)
# trany - transmission
# VClass - EPA vehicle size class
fueleco.select_dtypes(include=["object"]).T

In [None]:
# Numeric columns of interest (cylinders and year could arguably be
# treated as categorical instead):
# barrels08 - annual petroleum consumption in barrels for fuelType1 (1)
# city08 - city MPG for fuelType1 (2), (11)
# comb08 - combined MPG for fuelType1 (2), (11)
# cylinders - engine cylinders
# highway08 - highway MPG for fuelType1 (2), (11)
# year - model year

# 'number' matches every numeric dtype. The builtin int/float only match
# particular bit widths, so columns stored at another width (e.g. float32,
# or int32 vs int64 depending on platform) could silently be left out.
fueleco.select_dtypes(include='number').T

In [None]:
# Columns explored in the rest of the notebook: the categorical ones
# (drive, make, model, trany, VClass) followed by the numeric ones.
cols = 'drive,make,model,trany,VClass,barrels08,city08,comb08,cylinders,highway08,year'
auto = fueleco[cols.split(',')]

In [None]:
auto

## 1D - Categorical

In [None]:
# 1D - Categorical
sns.countplot(x='make', data=auto)

In [None]:
auto.make.value_counts()

In [None]:
# 1D - Categorical
# Bump size
fig, ax = plt.subplots(figsize=(10,8))
sns.countplot(x='make', data=auto, ax=ax)

In [None]:
auto.make.value_counts()

In [None]:
# Keep the ten most frequent makes and lump every other make into 'Other'
# so the count plot stays readable.
avc = auto.make.value_counts()   # most frequent make first
top10 = avc.index[:10]
sns.countplot(
    y='make2',
    data=auto.assign(
        make2=lambda df: df.make.where(df.make.isin(top10), 'Other')
    ),
)

In [None]:
# 1D - Categorical
# Can add order
fig, ax = plt.subplots(figsize=(10,8))
sns.countplot(x='make', data=auto, ax=ax, 
              order=top10)

In [None]:
# 1D - Categorical
# Can add order
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(y='make', data=auto, ax=ax, 
              order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
# 1D - Categorical
# Can add hue option to embed small 2nd dimension
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='make', data=auto, ax=ax, 
              order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
             hue='year', hue_order=[1985, 1990, 1995, 2000])

In [None]:
auto

## 1D Exercise
* Plot a 1D plot for categorical data


## 1D - Continuous Plots

In [None]:
# 1D - Continuous
sns.distplot(auto.city08)

In [None]:
# 1D - Continuous
fig, ax = plt.subplots(figsize=(6,4))
sns.distplot(auto.city08, ax=ax, rug=True)

In [None]:
# 1D - Continuous
# Be sure to check out the documentation for various options
# 
fig, ax = plt.subplots(figsize=(10,8))
sns.distplot(auto.city08, ax=ax, #rug=True, 
             norm_hist=True, bins=12)

In [None]:
# kdeplot draws only the density estimate; calling it repeatedly with the
# same ax overlays the curves on one set of axes.
fig, ax = plt.subplots(figsize=(7,5))
for mpg_col in ('city08', 'comb08', 'highway08'):
    sns.kdeplot(auto[mpg_col], ax=ax, shade=True)
ax.set_xlim(0, 50)

## 1D Exercise

* Plot a 1D plot for continuous data

## 2D - Cont-Cont
We can use ``relplot``, ``lmplot``, and ``jointplot``

In [None]:
# relplot in 0.9.0
sns.relplot(x='city08', y='comb08', data=auto)

In [None]:
# Note the return type shown in the cell output.
# The ax created here is passed through to relplot, but sizing the plot with
# height/aspect (see the next cell) is the better approach.
# NOTE(review): relplot appears to build its own figure, so the one created
# by plt.subplots may end up blank or trigger a warning depending on the
# seaborn version -- confirm before relying on ax= here.
fig, ax = plt.subplots(figsize=(6,6))
sns.relplot(x='city08', y='comb08', data=auto, alpha=.5, ax=ax)

In [None]:
# Note return type (can use ax, better to use height/aspect)
sns.relplot(x='city08', y='comb08', data=auto, alpha=.5,
            height=5, aspect=1.5)

In [None]:
auto.city08.corr(auto.comb08)

In [None]:
# Can add multiple dimensions with size (cont), hue (cat)
sns.relplot(x='city08', y='highway08', data=auto.sample(2000, random_state=42), alpha=.5,
            height=5, aspect=1.6, size='cylinders', hue='year')

In [None]:
# Can add multiple dimensions with size (cont), hue (cat)
# bump up size with sizes (min/max)
sns.relplot(x='city08', y='highway08', data=auto, alpha=.5,
            height=5, aspect=1.6, size='cylinders', 
            sizes=(3, 200), hue='year')

In [None]:
# Can add multiple dimensions with size (cont), hue (cat)
# bump up size with sizes (min/max)
# Can also facet with row/col
sns.relplot(x='city08', y='highway08', data=auto, alpha=.5,
            height=5, aspect=1, #size='cylinders', 
            col='cylinders', col_wrap=3,
            hue='year')

# use lmplot to draw regression
sns.lmplot(x='city08', y='comb08', data=auto)

In [None]:
# use lmplot to draw regression
sns.lmplot(x='city08', y='comb08', data=auto, lowess=True)

In [None]:
# use lmplot to draw regression
sns.lmplot(x='year', y='comb08', data=auto[auto.year > 2005])

In [None]:
# use lmplot to draw regression
# x_jitter spreads the integer years horizontally so overlapping points
# become visible
# height sets the facet height (seaborn >= 0.9); the 0.8 spelling `size`
# is deprecated and rejected by newer releases
sns.lmplot(x='year', y='comb08', data=auto[auto.year > 2005],
           x_jitter=.3,
           height=10
          )

In [None]:
# use lmplot to draw regression
# can add jitter
# use hue/row/col to add more dimensions
sns.lmplot(x='year', y='comb08',
           data=auto[(auto.year > 2005) &
                     (auto.make.isin(['Ford', 'Toyota']))],
           x_jitter=.2,
           height=10,  # replaces the deprecated `size` argument (seaborn >= 0.9)
           col='make')

In [None]:
# note return type
sns.jointplot(x='city08', y='comb08', data=auto, alpha=.5)

In [None]:
# attempt to change size
# NOTE(review): jointplot appears to create its own figure, so the ax made
# here is presumably ignored or rejected -- the next cell shows the
# supported way to size the plot. Confirm this cell is meant as a
# demonstration of what does not work.
fig, ax = plt.subplots(figsize=(10,8))
sns.jointplot(x='city08', y='comb08', data=auto, alpha=.5, ax=ax)

In [None]:
# control the jointplot size with height (seaborn >= 0.9);
# `size` is the deprecated 0.8 spelling
jg = sns.jointplot(x='city08', y='comb08', data=auto, alpha=.5,
                   height=10)

In [None]:
# ax_joint, and fig are matplotlib objects
print(dir(jg))

In [None]:
# can do different "kinds"
# set_axis_labels(k) writes the kind name on the x axis so the figures
# can be told apart
for k in 'scatter,reg,resid,kde,hex'.split(','):
    jg = sns.jointplot(x='city08', y='comb08', data=auto.sample(500),
                       height=5,  # `size` is deprecated since seaborn 0.9
                       kind=k).set_axis_labels(k)

In [None]:
# combining  with plot_joint
jg = sns.jointplot(x='year', y='highway08', data=auto.sample(1_000),
                  kind='kde')
jg.plot_joint(plt.scatter, c='k', marker='.', linewidth=1, alpha=.1)

In [None]:
# All the continuous values
# (Careful can take some time!)
sns.pairplot(auto.sample(1_000).dropna(), diag_kind="kde")

In [None]:
auto.year.corr(auto.highway08)

In [None]:

g = sns.PairGrid(auto.sample(500).dropna(), diag_sharey=False)
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=2)

In [None]:
# All the continuous values
pg = sns.pairplot(auto.sample(1_000).dropna(), kind='reg')
pg.map_upper(sns.kdeplot)

In [None]:
# Correlation heatmap of the numeric columns.
# auto also holds text columns (make, model, ...). Select the numeric ones
# explicitly: recent pandas versions raise on non-numeric data in corr()
# instead of silently dropping it.
# vmin/vmax pin the diverging palette to the full [-1, 1] correlation range.
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(auto.select_dtypes('number').corr(), annot=True, cmap='RdBu',
            annot_kws={'size':10}, ax=ax,
            vmin=-1, vmax=1)

In [None]:
# Restrict to numeric columns first: auto contains text columns and recent
# pandas versions raise on them in corr() instead of dropping them.
auto.select_dtypes('number').corr()

In [None]:
# use figsize for clustermap
# Only numeric columns are correlated: auto also holds text columns, which
# recent pandas versions refuse to pass through corr().
sns.clustermap(auto.select_dtypes('number').corr(), annot=True, cmap='RdBu',
               annot_kws={'size':10},
               figsize=(5,5),
               vmin=-1, vmax=1)

## 2D Cont-Cont Exercise
* Plot a 2D plot of two continuous variables
* Plot a pairplot with the data (or try a portion first)

## 2D Cat-Cont

In [None]:
auto.dtypes

In [None]:
# box, violin, boxen (0.9), point, bar, count
sns.boxplot(x='city08', y='make', data=auto)

In [None]:
sns.boxplot(x='city08', y='make', data=auto,
           order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
sns.pointplot(x='city08', y='make', data=auto)

In [None]:
sns.pointplot(x='city08', y='make', data=auto,
             order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
sns.violinplot(x='city08', y='make', data=auto,
             order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
sns.barplot(x='city08', y='make', data=auto,
             order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
sns.boxenplot(x='city08', y='make', data=auto,
             order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

In [None]:
# Overlay the city08 distribution of two makes on the same axes
# (hist=False leaves only the density curve).
# Useful for exploring classification models
# (ie try to predict Ford/Toyota based on city08)
ax = None   # first call creates the axes, later calls reuse them
for make_name in ('Ford', 'Toyota'):
    ax = sns.distplot(auto.loc[auto.make == make_name, 'city08'],
                      ax=ax, label=make_name, hist=False)
ax.legend()
ax.set_xlim(0, 40)

In [None]:
# catplot is a 0.9 feature
sns.catplot(x='city08', y='make', data=auto)

In [None]:
sns.catplot(y='city08', x='make', data=auto)

In [None]:
sns.catplot(x='city08', y='make', data=auto,
            order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
           alpha=.5)

In [None]:
# Draw the same five makes with every catplot kind for comparison.
# A 1000-row sample is used so the swarm plot stays readable.
for kind in ['strip', 'swarm', 'box', 'violin', 'boxen', 'point', 'bar']:
    print("Kind", kind)
    cp = sns.catplot(
        x='city08',
        y='make',
        data=auto.sample(1000).dropna(),
        order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
        kind=kind,
    )

In [None]:
# Sampled catplot
# can add extra dimension with hue
for kind in 'strip,swarm'.split(','):
    print("Kind", kind)
    # Filter the sample with a mask computed on the sample itself. Indexing
    # it with `auto.year > 2005` (built from the full frame) forces pandas
    # to reindex the mask and emits a "Boolean Series key will be
    # reindexed" warning.
    cp = sns.catplot(x='city08', y='make',
                     data=auto.sample(1000).loc[lambda df: df.year > 2005],
                     order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
                     hue='year',
                     kind=kind)

In [None]:
# Sampled catplot
# can add extra dimension with hue
# can further "facet" with col/row
for kind in 'boxen,swarm'.split(','):
    print("Kind", kind)
    # Filter the sample with a mask computed on the sample itself. Indexing
    # it with `auto.year > 2005` (built from the full frame) forces pandas
    # to reindex the mask and emits a "Boolean Series key will be
    # reindexed" warning.
    cp = sns.catplot(x='city08', y='make',
                     data=auto.sample(5000).loc[lambda df: df.year > 2005],
                     order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
                     hue='year', hue_order=[2008, 2009, 2015],
                     col='cylinders',
                     col_wrap=3,
                     col_order=[4,6,8],
                     kind=kind)

## 2D Cat-Cont Exercise:
* Using the ``catplot`` function, create two plots of your data using two different kinds of plots. Which plot kind works better for your plot? Why did you choose it?

## Cat-Cat
@randyzwitch I don't really like stacked bar charts, I'd suggest maybe using pointplot / factorplot with kind=point

— Michael Waskom (@michaelwaskom) September 4, 2014

Status 2018: ``pointplot`` requires a continuous variable, and ``factorplot`` is deprecated in favor of ``catplot``.

In [None]:
sns.pointplot

In [None]:
# For count plots can only specify x or y (not both)
# Put one category in x|y the other in hue
mask = auto.make.isin(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])
sns.catplot(y='make', hue='VClass',
            data=auto[mask].sample(100), kind='count',
            height=10)  # catplot sizes facets with height; `size` is deprecated

In [None]:
# Stacked bar chart built with plain pandas: count rows per
# (VClass, make) pair, pivot the makes into columns, then stack the bars.
mask = auto.make.isin(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])

(
    auto
    .loc[mask]
    .groupby(['VClass', 'make'])
    .size()
    .unstack(fill_value=0)   # pairs that never occur become 0 instead of NaN
    .astype(int)
    .plot.bar(stacked=True)
    .legend(bbox_to_anchor=(1,1))   # move the legend outside the axes
)

In [None]:
pd.crosstab(auto[mask].VClass, auto[mask].make)

In [None]:
# Same stacked chart, but crosstab does the counting in a single call
mask = auto.make.isin(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'])
(
    auto
    .loc[mask]
    .pipe(lambda df: pd.crosstab(df.VClass, df.make))
    .plot.bar(stacked=True)
    .legend(bbox_to_anchor=(1,1))
)

In [None]:
# Normalized stacked bars: each VClass row is divided by its row total so
# every bar sums to 1 and the share of each make can be compared.
makes = ['Chevrolet', 'Toyota', 'Ford', 'Dodge', 'GMC']
mask = auto.make.isin(makes)
(
    auto[mask]
    .pipe(lambda df: pd.crosstab(df.VClass, df.make))
    .pipe(lambda counts: counts.div(counts.sum(axis='columns'), axis='index'))
    .sort_values('Chevrolet', ascending=False)  # largest Chevrolet share first
    .loc[:, makes]   # column order controls the stacking order
    .plot.bar(stacked=True, width=1)
    .legend(bbox_to_anchor=(1,1))
)

In [None]:
# not stacked with pandas
fig, ax = plt.subplots(figsize=(10,8))
(
    auto[mask]
    .sample(100)
    .pipe(lambda df: pd.crosstab(df.VClass, df.make))
    .plot(kind='bar', stacked=False, ax=ax)
)

In [None]:
# stacked with pandas
(
    auto[mask]
    .sample(100, random_state=42)
    .pipe(lambda df: pd.crosstab(df.make, df.VClass))
    .plot(kind='bar', stacked=True)
    .legend(bbox_to_anchor=(1,1))
)

In [None]:
# Compare a categorical feature (VClass) against a two-class target
# (Ford vs Toyota) as normalized stacked bars built with pandas.
ford_mask = auto.make.eq('Ford')
toyota_mask = auto.make.eq('Toyota')
(
    auto
    .loc[ford_mask | toyota_mask]
    .pipe(lambda df: pd.crosstab(df.VClass, df.make))
    .pipe(lambda counts: counts.div(counts.sum(axis='columns'), axis='index'))
    .sort_values('Ford', ascending=False)   # largest Ford share first
    .plot.bar(stacked=True)
    .legend(bbox_to_anchor=(1,1))
)

## Cat-cat Exercise
* Plot two categorical columns against each other

## Time series (date-cont)

In [None]:
# lineplot in 0.9
# code in pandas for 0.8
auto.groupby('year')['city08'].mean().plot()

In [None]:
# lineplot in 0.9
# code in pandas for 0.8
auto.groupby('year')['city08'].agg(['min', 'mean',  'median']).plot()

In [None]:
sns.lmplot(x='year', y='city08', data=auto)

In [None]:
# lineplot is 0.9
sns.lineplot(x='year', y='city08', data=auto)

In [None]:
fig, ax = plt.subplots(figsize=(6,4))
sns.lineplot(x='year', y='city08', data=auto, ax=ax)

In [None]:
# can use hue, style, size to add dimensions
fig, ax = plt.subplots(figsize=(10,8))
sns.lineplot(x='year', y='city08', hue='VClass', data=auto, ax=ax)

Use ``relplot`` (``kind='line'``) to facet by col/row.

In [None]:
# can use hue, style, size to add dimensions
fig, ax = plt.subplots(figsize=(10,8))
#sns.lineplot(x='year', y='city08', size='VClass', data=auto, ax=ax)
auto.groupby(['year', 'VClass'])['city08'].mean().unstack().plot(ax=ax)
#sns.despine()

## Timeseries Exercise
* Plot a timeseries plot
* Add an extra dimension to it

## Seaborn Extras

In [None]:
# default style
sns.set() 

In [None]:
plt.plot(range(10))

In [None]:
# styles - white, dark, whitegrid, darkgrid, ticks
# use set_style for more permanence
with sns.axes_style('ticks'):
    plt.plot(range(10))

In [None]:
with sns.axes_style('ticks'):
    plt.plot(range(10))
    sns.despine()

In [None]:
sns.set_style('ticks')
plt.plot(range(10))

In [None]:
# get the current style - can pass in a dictionary like this as 2nd param to *_style
sns.axes_style()

In [None]:
# Pass a dict of overrides as the rc argument to tweak single style entries
# for the duration of the with block.
s = {'axes.facecolor': 'pink'}
with sns.axes_style(style=None, rc=s):
    plt.plot(range(10))

In [None]:
# reset to default values
sns.set()

In [None]:
plt.plot(range(10))

In [None]:
# paper, notebook, talk, poster
# set_context - permanent
# plotting_context - with statement
with sns.plotting_context('poster'):
    plt.plot(range(10))

In [None]:
# colors
# set_palette - permanent
# color_palette - context manager
# categorical (qualitative) - no order (deep, muted, pastel, bright, dark, and colorblind)
sns.palplot(sns.color_palette())

In [None]:
# "Circular" - For large numbers of categories
# HSLuv - equal luminance
sns.palplot(sns.color_palette('husl', 30))

In [None]:
# continuous (sequential) - Ordered  - unimportant to more important
sns.palplot(sns.color_palette('Blues'))

In [None]:
# continuous (sequential) - Ordered - unimportant to more important
# Reverse order by tacking _r onto end
sns.palplot(sns.color_palette('Blues_r'))

In [None]:
# continuous (diverging) - Ordered - High and low are interesting
sns.palplot(sns.color_palette('RdBu'))

In [None]:
# fancy tool to help (see docs for 1st param)
# also other sns.choose_*
sns.choose_colorbrewer_palette('diverging')

In [None]:
# Can specify own colors
bad = ['#c07fef', '#deadbe', '#fef70c']
sns.palplot(sns.color_palette(bad))

In [None]:
# Apply the 'talk' plotting context and the custom palette together,
# both only for this one plot.
with sns.plotting_context('talk'), sns.color_palette(sns.color_palette(bad)):
    sns.violinplot(
        x='city08',
        y='make',
        data=auto,
        order=['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'],
    )

## Extras Exercise
* With the ``plotting_context`` change the size of one of your previous plots
* Create your own color palette for one of your plots. What type should it be (diverging, sequential, qualitative)?
* With the ``color_palette`` use your palette to update the plot