In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import rpy2
%load_ext rmagic
%pylab inline

## Visualization of the Cars Dataset

In [None]:
# '?' is the file's placeholder for a missing value, so parse it as NaN on load.
cars_csv = 'data/cars_multivariate.csv'
df = pd.read_csv(cars_csv, na_values='?')
df.head()

### Make sure nominal/ordinal variables are interpretable

### GGPlot is your best bet for high-dimensional visual encodings
- GGPlot implements a _Grammar of Graphics_ (http://vita.had.co.nz/papers/layered-grammar.pdf)
- The Python `ggplot` port is still way behind the original R implementation (`ggplot2`)
- We only teach you one tool, but you should learn many!!!

In [None]:
from ggplot import *

### Examine MPG as dependent variable

In [None]:
# Count cars per cylinder count. Uses the raw lowercase `cylinders` column
# (the same column the R cells read from this CSV), so the cell runs on a
# fresh kernel without relying on a derived `Cylinders` column.
df['cylinders'].value_counts()

### Or, in R
- http://rpy.sourceforge.net/rpy2/doc-2.4/html/index.html
- Requires a local installation of R, plus the R packages loaded below (`ggplot2`, `reshape2`, `plyr`)

In [None]:
%%R -w 900 -h 600

library(ggplot2)
library(reshape2)
library(plyr)

# Load the data; '?' entries in the file are read as NA
df = read.csv('data/cars_multivariate.csv',na.strings = c('?'))
head(df)

# Work on a copy restricted to 4-, 6- and 8-cylinder cars
temp = df
temp = temp[temp$cylinders %in% c(4,6,8),]

# Derived columns used as visual encodings
temp$model <- temp$model + 1900   # presumably two-digit model years -> four-digit years
temp$Era = cut(temp$model, 4)     # four equal-width year bins (facet columns)
temp$Cylinders = paste0(temp$cylinders, '-cylinder')
# Origin codes in the UCI Auto MPG data are 1 = USA, 2 = Europe, 3 = Japan.
# Levels are spelled out so each label is tied to its code instead of to sort
# order (the previous labels swapped Japan and Europe).
# NOTE(review): assumes this CSV keeps the UCI numeric coding; any other value
# becomes NA — verify against the file.
temp$Origin <- factor(temp$origin,
                      levels = c(1, 2, 3),
                      labels = c('USA', 'Europe', 'Japan'))

# Plot: mpg vs. horsepower, with weight, displacement and cylinder count as
# extra encodings, faceted by origin (rows) and era (columns)
p = ggplot(temp, aes(x = horsepower,
                     y = mpg,
                     size = weight,
                     color = log(displacement),
                     shape = Cylinders)) +
  geom_point() +
  scale_color_gradient(low='green', high='red') +
  theme_bw() +
  facet_grid(Origin ~ Era)
print(p)


### Editorialize: US cars are bigger and less efficient than foreign cars
- Bar charts of average mileage, weight, and displacement

### R implementation has a few more features

In [None]:
%%R -w 900 -h 600

# Histograms of each measure, split into US vs. foreign cars.
# The logical comparison sorts as FALSE, TRUE, so FALSE -> 'Foreign', TRUE -> 'US'.
temp$is_us <- factor(temp$Origin == 'USA', labels = c('Foreign', 'US'))

# Long format: one row per (car, measure) pair
melted <- melt(temp,
               id.vars = c('is_us'),
               measure.vars = c('mpg', 'weight', 'displacement'))

# Per-group mean of every measure (computed here; the plot below does not use it)
agg <- ddply(melted, c('is_us', 'variable'), summarise, value = mean(value))

# Build the figure one layer at a time
p <- ggplot(melted, aes(x = value, fill = is_us))
p <- p + geom_histogram(alpha = .9)
p <- p + scale_fill_brewer(palette = 'Set1', name = '')
p <- p + theme_bw()
p <- p + labs(x = '', y = '')
p <- p + facet_grid(. ~ variable, scales = 'free')
p <- p + ggtitle('Weights and mileage for cars manufactured by US companies \n compared to foreign companies')
print(p)


### Matplotlib

In [None]:
# `temp` and its `is_us` column exist only inside the R session, so rebuild the
# same comparison frame from the pandas `df` to keep this cell runnable on a
# fresh kernel.
# Same 4/6/8-cylinder subset as the R cells.
cars_us_foreign = df[df['cylinders'].isin([4, 6, 8])].copy()
# NOTE(review): assumes origin code 1 means USA (UCI Auto MPG coding, and the
# first factor label in the R cell) — verify against the CSV.
cars_us_foreign['is_us'] = np.where(cars_us_foreign['origin'] == 1, 'US', 'Foreign')

metrics = ['weight', 'mpg', 'displacement']
# One groupby for all three measures instead of one per loop iteration.
group_means = cars_us_foreign.groupby('is_us')[metrics].mean()

fig, ax = plt.subplots(1, len(metrics), figsize=(12, 4))
for panel, metric in zip(ax, metrics):
    group_means[metric].plot(kind='bar', ax=panel)
    panel.set_title(metric)
    panel.set_xlabel('')
plt.tight_layout()
plt.show()