In [None]:
import numpy as np
# Summarize (rather than fully print) any array with more than 50 elements.
np.set_printoptions(threshold=50)
# Star import: presumably the source of Table and make_array used in later cells.
from datascience import *


# Draw matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' style sheet to every plot in this notebook.
plots.style.use('fivethirtyeight')

# Print the installed matplotlib version so it can be checked.
print(matplotlib.__version__)

Make sure you are using matplotlib version 2.0. Do not use version 2.1: its histogram implementation has a bug.

##  Binning


In [None]:
# Read the top-movies table from a local CSV file and display it.
# To read the hosted copy instead, use:
#   data = 'http://inferentialthinking.com/notebooks/top_movies.csv'

data = 'top_movies.csv'
top = Table.read_table(data)
top


In [None]:
# Add an "Age" column: the number of years between each movie's release
# year and the reference year 2017.
reference_year = 2017
release_years = top.column('Year')
aged = top.with_column("Age", reference_year - release_years)
aged

In [None]:
# Group the rows by their exact Age value: one row per distinct age.
aged.group("Age")

In [None]:
# You choose which ages are grouped together into bins.
# Arguments: the column to bin, and an array of bin boundaries.
# The boundaries do not have to be evenly spaced.
uneven_age_edges = make_array(0, 5, 10, 20, 40, 65, 100)
aged.bin('Age', bins=uneven_age_edges)

In [None]:
# The last row of the table does not represent a bin.
# It represents the right endpoint of the last bin.
#
# What happens if your bins don't capture all of the data?
truncated_age_edges = make_array(0, 5, 10, 20, 40)
aged.bin('Age', bins=truncated_age_edges)

In [None]:
# bin does not realize that there is more data beyond the last bin.
# Choose your bins carefully.

# What happened to the bin with a left endpoint of 20? Compare with the table above.
# To investigate, look at the movies whose Age is exactly 40.

aged.where('Age', 40)

In [None]:
# Call bin without specifying any bins, and show the result.

aged.bin('Age').show()

# By default the data is split into ten evenly spaced bins.
# Here each bin has width (96 - 2)/10 = 94/10 = 9.4.
# The last row is not really a bin (it is always empty); it just shows where the previous bin ends.

In [None]:
# Build your own evenly spaced bins with np.arange:
# boundaries from 0 up to 100 in steps of 20.
twenty_year_edges = np.arange(0, 101, 20)
aged.bin('Age', bins=twenty_year_edges)

## Histograms

In [None]:
# Display the image file hist.png in the notebook, 600 pixels wide.
from IPython.display import Image
Image("hist.png", width=600)

In [None]:
# We are ready for a histogram of Age, using the ten default bins.
aged.hist('Age')

In [None]:
# The horizontal axis is a number line with a fixed scale.
#
# A tidier arrangement: our five custom bins, each 20 wide.
# (Also try including unit='year' in the call.)
five_age_bins = np.arange(0, 101, 20)
aged.hist('Age', bins=five_age_bins)

In [None]:
# Change the vertical axis to a count of how many movies fall into each bin.
# Why is the default vertical axis so strange?
# NOTE(review): `normed` is the older matplotlib spelling of `density`;
# confirm the installed datascience/matplotlib versions accept it.
count_hist_options = dict(bins=np.arange(0, 101, 20), normed=False)
aged.hist('Age', **count_hist_options)

back to slides

## Combining Bins

In [None]:
# Combine the two middle bins into one: boundaries 0, 20, 60, 80, 100,
# drawn with normed=True.
merged_edges = make_array(0, 20, 60, 80, 100)
aged.hist('Age', bins=merged_edges, normed=True)

In [None]:
# Why do we not just display the counts?
# The option that does this is normed=False.
merged_count_edges = make_array(0, 20, 60, 80, 100)
aged.hist('Age', bins=merged_count_edges, normed=False)

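This is not a histogram: it violates the area principle. The bins have different widths but the bar heights are raw counts, so the areas of the bars are not proportional to the number of movies in each bin.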

In [None]:
# Remember what the distribution looked like, using bins five years wide.
five_year_edges = np.arange(0, 101, 5)
aged.hist('Age', bins=five_year_edges)

In [None]:
# A rougher approximation of the same distribution, combining some of the bins.
rough_edges = make_array(0, 5, 10, 20, 50, 60, 65, 75, 80, 100)
aged.hist('Age', bins=rough_edges, unit='year', normed=True)

In [None]:
# Compare the density of the bin with 20 (five wide) against the bin with 85 (30 wide).
# NOTE(review): 20 and 85 presumably refer to the counts shown in this table -- confirm.
rough_table_edges = make_array(0, 5, 10, 20, 50, 60, 65, 75, 80, 100)
aged.bin('Age', bins=rough_table_edges)

In [None]:
# The same unequal bins, with counts on the vertical axis (normed=False).
rough_count_edges = make_array(0, 5, 10, 20, 50, 60, 65, 75, 80, 100)
aged.hist('Age', bins=rough_count_edges, unit='year', normed=False)

In [None]:
#compare representation of 20 vs 85

## Overlaid Graphs

In [None]:
# Read the Galton heights subset from a local CSV file.
# Hosted copy: 'http://inferentialthinking.com/notebooks/galton_subset.csv'
data = 'galton_subset.csv'
# Reorder the columns by index: the column at index 2 first, then 0, then 1.
heights = Table.read_table(data).select(2, 0, 1)
heights

In [None]:
# Histogram of the 'son' column, with the horizontal axis in inches.
heights.hist('son', unit='inch')

In [None]:
# Histogram of the 'father' column, with the horizontal axis in inches.
heights.hist('father', unit='inch')

In [None]:
# Histograms of 'son' and 'father' requested in a single call, so they can be compared.
heights.hist('son', 'father', unit='inch')

In [None]:
# Histograms of 'son', 'father' and 'mother' requested in a single call.
heights.hist('son', 'father', 'mother', unit='inch')

In [None]:
# No column labels are given here; presumably hist then draws every column of the table.
heights.hist(unit='inch')

In [None]:
# Use one-inch bins from 55 to 80 inches, then relabel the horizontal axis.
one_inch_bins = np.arange(55, 81, 1)
heights.hist(bins=one_inch_bins, unit='inch')
# Assigning the result to _ keeps the returned label object out of the cell output.
_ = plots.xlabel('Height (inches)')

In [None]:
# In a histogram you can't tell which father goes with which son.
# A scatter plot pairs them: each point uses the 'son' and 'father' values from one row.

heights.scatter('son', 'father')

In [None]:
# The same kind of plot, pairing the 'son' and 'mother' values from each row.
heights.scatter('son', 'mother')

back to slides