In [None]:
from datascience import *
import numpy as np

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' style sheet to every plot drawn below.
plots.style.use('fivethirtyeight')

## Lecture 16

## Alameda County Jury Panels ##

In [None]:
# Ethnicity distribution of the eligible juror population ('Eligible') and of
# the people who actually appeared on jury panels ('Panels').
# Each proportion column sums to 1.
jury = (
    Table()
    .with_column('Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01))
    .with_column('Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04))
)

jury

# Null hypothesis: the jury panels were randomly selected from the eligible
#   population of Alameda County.
# Alternative hypothesis: the jury panels were NOT randomly selected from that
#   population (i.e. the selection was biased).
# Test statistic: Total Variation Distance (TVD). Measure the distance between
#   the two columns (Eligible and Panels) category by category, take the sum of
#   those distances, and divide by 2.
# The TVD summarizes how the ethnic make-up of the jury panels compares to the
#   TRUE proportions of ethnicities in the population.
# Larger TVD: more difference; the panels look less like the population
#   (over-/under-representation).
# Smaller TVD: less difference; the panels look more like the population
#   (more appropriate representation).

**Question:** How could we visualize the data?

In [None]:
# Horizontal bar chart grouped by ethnicity: for each ethnicity, one bar per
# remaining column of `jury` ('Eligible' and 'Panels').
jury.barh('Ethnicity') # bar chart flipped horizontally

**Task:** Let's simulate under the assumption that the panel was selected randomly from the eligible population.
(There are 1423 people on the panel)

In [None]:
# The model: the TRUE proportions of ethnicities among eligible jurors in
# Alameda County. The order follows the 'Ethnicity' column of `jury`:
# first element is Asian, second is Black, ...
model = jury.column('Eligible')
# Only the last expression of a cell is displayed automatically, so print the
# intermediate values we want to inspect.
print('Model proportions:', model)

# Simulate one random draw of 1423 jurors (the size of the actual panel) from
# the model distribution; the result is the proportion of the sample that fell
# in each category.
simulated = sample_proportions(1423, model)
print('Number of categories:', len(simulated))
simulated

**Task:** Let's:
- Add a single simulation to our table (as a new column)
- Visualize the eligible proportions, the panel proportions and the simulated proportions

In [None]:
# Add a single simulation to our table (as a new column).
# `simulated` is the array of simulated proportions produced by the
# sample_proportions cell above.
#
# The actual observed distribution (Panels) looks quite different
# from the simulation -- try running this several times to confirm!
# (Re-run the sampling cell first: re-running only this cell reuses the same
# `simulated` array and will not draw a new sample.)
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated

In [None]:
# Visualize the eligible proportions, the panel proportions and the simulated
# proportions side by side for each ethnicity.
jury_with_simulated.barh('Ethnicity')

## Distance Between Distributions
In the last lecture, our statistic was the difference between the observed
percentage (black / purple) and its expected value (26% / 75%).

In this case, we need to understand how each of the 5 categories
differs from its expected value according to the model.

**Question**: How could we do this? How about the differences?

In [None]:
# Signed difference per ethnicity: observed panel proportion minus eligible
# proportion. Positive means over-represented on the panels, negative means
# under-represented.
# Both columns sum to 1, so these signed differences cancel out to (about) 0
# when added -- a plain sum cannot serve as a distance.
diffs = jury.column('Panels') - jury.column('Eligible')
jury_with_difference = jury.with_column('Difference', diffs)
jury_with_difference

---
back to slides

---
## Total Variation Distance

In [None]:
def tvd(dist1, dist2):
    """Return the total variation distance (TVD) between two distributions.

    The TVD is half the sum of the absolute differences between corresponding
    entries of the two distributions.

    Parameters
    ----------
    dist1, dist2 : array-like of numbers
        The two distributions, given as proportions in the same category
        order. NumPy arrays, lists and tuples are all accepted; both must
        have the same length.

    Returns
    -------
    float
        sum(|dist1 - dist2|) / 2.
    """
    # Convert first so that plain Python sequences support element-wise
    # subtraction; arrays pass through unchanged.
    differences = np.asarray(dist1) - np.asarray(dist2)
    return sum(abs(differences)) / 2

In [None]:
# The TVD of our observed data (Panels) from their expected values
# assuming the model is true (Eligible).
# For the proportions in `jury` this works out to about 0.14.
obsvd_tvd = tvd(jury.column('Panels'), jury.column('Eligible'))
obsvd_tvd

In [None]:
# The TVD of a single model simulation (1423 random draws from `model`) from
# its expected values. The sample is random, so the result changes every time
# this cell is run.
tvd(sample_proportions(1423, model), jury.column('Eligible'))

In [None]:
def simulated_tvd(sample_size=1423, distribution=None):
    """Simulate one random panel and return its TVD from the distribution.

    Parameters
    ----------
    sample_size : int, optional
        Number of people drawn for the simulated panel. Defaults to 1423,
        the size of the actual panel.
    distribution : array of proportions, optional
        The distribution to sample from and compare against. Defaults to the
        notebook-level `model` (the eligible-population proportions), looked
        up at call time.

    Returns
    -------
    float
        The total variation distance between the simulated sample
        proportions and `distribution`.
    """
    if distribution is None:
        distribution = model
    return tvd(sample_proportions(sample_size, distribution), distribution)

# Number of panels to simulate under the null hypothesis.
num_simulations = 10000

# Collect one simulated TVD per repetition. Building a list and converting it
# once avoids np.append, which copies the whole array on every iteration.
tvds = np.array([simulated_tvd() for _ in range(num_simulations)])

In [None]:
# Column label for the histogram, and bins of width 0.005 starting at 0 and
# stopping before 0.05.
title = 'Simulated TVDs (if model is true)'
bins = np.arange(0, 0.05, 0.005)

# Empirical distribution of the TVD under the model, plus the observed value.
Table().with_column(title, tvds).hist(bins=bins)
print(f'Observed TVD: {obsvd_tvd!s}')

# Plotting details; ignore this code.
# Draw the observed TVD as a red dot on the horizontal axis.
plots.ylim(-2, 55)
plots.scatter(obsvd_tvd, 0, color='red', s=30);

---
back to slides

---