In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 17 ##

## Swain vs. Alabama

In [None]:
# Let's make an array of population proportions in Talladega County in 1965 (26% black, 74% non-black).
# Order matters: index 0 is the black share, which panel_proportion() reads with .item(0).
population_proportions = make_array(.26, .74)
population_proportions

In [None]:
# Let's use the sample_proportions function to draw one random sample of size 100 from a population
# with the above distribution. The result is an array of the sample's proportions in each category,
# in the same order as population_proportions.
sample_proportions(100, population_proportions)

In [None]:
# Let's define a function which will randomly sample a panel of 100 from the population
# and return the proportion of black jurors on that panel.
def panel_proportion():
    """Return the proportion of black jurors in one randomly sampled panel of 100."""
    sampled_panel = sample_proportions(100, population_proportions)
    # Index 0 holds the black share (same order as population_proportions).
    return sampled_panel.item(0)

In [None]:
# Try one simulated panel; the sampling is random, so the value changes from run to run.
panel_proportion()

In [None]:
# Let's run this 10000 times and record results:
# Note that this pattern is often used!  Make sure you understand it.
# 1. Start with an empty array.
# 2. Simulate one value per loop iteration.
# 3. Append that value to the array.
panels = make_array()

for i in np.arange(10000):
    # Multiply the proportion by 100 to get it on the scale of a count out of 100 jurors.
    new_panel = panel_proportion() * 100
    panels = np.append(panels, new_panel)

In [None]:
# Let's make a histogram of the simulated counts.
# The bin edges sit on half-integers (5.5, 6.5, ...), so each bar is centered on a whole-number count.

Table().with_column(
    'Number of Black People on Panel of 100', panels
).hist(bins=np.arange(5.5,40.))

# Plotting details; ignore this code
# It fixes the vertical range and draws a red dot at 8 on the horizontal axis --
# presumably the count observed on the actual panel (TODO: confirm against the lecture).
plots.ylim(-0.002, 0.09)
plots.scatter(8, 0, color='red', s=30);

## Mendel and Pea Flowers

In [None]:
# Mendel had 929 plants, of which 709 had purple flowers.
# observed_purples is the observed proportion (a fraction between 0 and 1, not a percent).
observed_purples = 709 / 929
observed_purples

In [None]:
# Does the evidence Mendel collected represent evidence against the model that 75% of flowers are purple?
# Let's simulate the evidence we would obtain if the model were true:
# one random sample of 929 plants, with index 0 holding the purple share.
predicted_proportions = make_array(.75, .25)
sample_proportions(929, predicted_proportions)

In [None]:
def purple_flowers():
    """Return the percent (0-100) of purple-flowering plants in one simulated sample of 929."""
    simulated_sample = sample_proportions(929, predicted_proportions)
    # Index 0 is the purple share; scale the proportion to a percent.
    purple_share = simulated_sample.item(0)
    return purple_share * 100

In [None]:
# Try one simulated sample; the result is a percent and changes from run to run.
purple_flowers()

In [None]:
# Let's simulate our model 10000 times, and append the percent of purple flowers to an array.
# (purple_flowers() already returns a percent, so no further scaling is needed here.)
# Note this pattern is very often used!  Make sure you understand it.
purples = make_array()

for i in np.arange(10000):
    new_purple = purple_flowers()
    purples = np.append(purples, new_purple)

In [None]:
# Let's visualize the values of our simulated data (one simulated percent per repetition):
Table().with_column('Percent of purple flowers in sample of 929', purples).hist()

In [None]:
# Let's apply a statistic and see its distribution:
# the statistic is the absolute distance between each simulated percent and the model's 75%.
Table().with_column('Discrepancy in sample of 929 if the model is true', abs(purples- 75)).hist()

In [None]:
# What's the value of the statistic for the data that Mendel observed?
# observed_purples is a proportion, so convert it to a percent before comparing with 75.
abs(observed_purples * 100 - 75)

## Alameda County Jury Panels ##

In [None]:
# Load ethnicity data for Alameda County in 2010.
# Each numeric column is a distribution over the five groups (its entries add up to 1):
#   'Eligible' - presumably the shares among eligible jurors (used as the model below)
#   'Panels'   - presumably the shares observed on the actual jury panels
jury = Table().with_columns(
    'Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'),
    'Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01),
    'Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04)
)

jury

In [None]:
# Let's visualize with a bar chart.  Passing only the categorical column as the argument draws,
# for each category, one bar per remaining quantitative column.

jury.barh('Ethnicity')

In [None]:
# Under the model, this is the true distribution of people
# from which the jurors are randomly sampled.
# These are the same values, in the same order, as the 'Eligible' column of jury.
model = make_array(0.15, 0.18, 0.12, 0.54, 0.01)

In [None]:
# Let's simulate a random draw of 1423 jurors from this distribution.
# The result holds the simulated proportion in each of the five categories.
simulated = sample_proportions(1423, model)
simulated

In [None]:
# The actual observed distribution (Panels) looks quite different
# from the simulation -- try running this several times to confirm!
# (Re-run the cell that creates `simulated` first to get a fresh random draw.)
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated

In [None]:
# Compare the Eligible, Panels and Simulated distributions side by side for each group.
jury_with_simulated.barh('Ethnicity')

## Distance Between Distributions

In [None]:
# In the last lecture, the difference between observed black/purple
# and their expected values (26%/75%) was our statistic.
#
# In this case, we need to understand how each of the 5 categories
# differs from its expected value according to the model.
#
# A positive difference means the group's share on the panels is larger than
# its share among the eligible population; a negative one means it is smaller.

diffs = jury.column('Panels') - jury.column('Eligible')
jury_with_difference = jury.with_column('Difference', diffs)
jury_with_difference

## Total Variation Distance

In [None]:
def tvd(dist1, dist2):
    """Return the total variation distance (TVD) between two distributions.

    The TVD is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    dist1, dist2: array-likes (NumPy arrays, lists or tuples) of equal
        length, holding the proportion in each category in the same order.
    """
    # Convert up front so plain lists and tuples work too; subtracting two
    # Python lists would otherwise raise a TypeError.
    dist1 = np.asarray(dist1, dtype=float)
    dist2 = np.asarray(dist2, dtype=float)
    # Summing along axis 0 gives the same result the built-in sum() gave on an array.
    return np.sum(np.abs(dist1 - dist2), axis=0) / 2

In [None]:
# The TVD of our observed data (Panels) from their expected values
# assuming the model is true (Eligible)
obsvd_tvd = tvd(jury.column('Panels'), jury.column('Eligible'))
obsvd_tvd

In [None]:
# The TVD of a model simulation from its expected values.
# The 'Eligible' column holds the same values as `model`, so this measures how far
# one random sample of 1423 lands from the distribution it was drawn from.
tvd(sample_proportions(1423, model), jury.column('Eligible'))

In [None]:
def simulated_tvd():
    """Return the TVD between one simulated sample of 1423 jurors and the model."""
    simulated_panel = sample_proportions(1423, model)
    return tvd(simulated_panel, model)

# Collect the simulated statistic: start with an empty array, then append one
# simulated TVD per repetition.
tvds = make_array()

num_simulations = 10000
for i in np.arange(num_simulations):
    new_tvd = simulated_tvd()
    tvds = np.append(tvds, new_tvd)

In [None]:
# Histogram of the simulated TVDs, with bin edges from 0 up to (but not including) 0.05
# in steps of 0.005.
title = 'Simulated TVDs (if model is true)'
bins = np.arange(0, .05, .005)

Table().with_column(title, tvds).hist(bins = bins)
# The observed TVD (0.14 for the jury table above) lies well beyond these bins,
# so it is printed rather than drawn on the histogram.
print('Observed TVD: ' + str(obsvd_tvd))