In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# German Warplanes

In [None]:
# Serial numbers 1..300 stand in for the whole population of planes.
serialno = Table().with_column('Serial number', np.arange(1, 301))

repetitions = 1000
sample_size = 30

# One float slot per repetition for each of the two estimators.
doubles_mean = np.zeros(repetitions)
max_array = np.zeros(repetitions)

for i in np.arange(repetitions):
    # Draw sample_size serial numbers at random and keep them as an array.
    sample = serialno.sample(sample_size).column(0)
    max_array[i] = sample.max()          # estimator 1: largest number seen
    doubles_mean[i] = 2 * sample.mean()  # estimator 2: twice the sample average

# One row per repetition, one column per estimator.
estimates = Table().with_columns("2*mean", doubles_mean, 'max', max_array)
estimates

In [None]:
# Histogram of the simulated estimates, with bin edges 1, 11, 21, ..., 391.
estimates.hist(bins = np.arange(1, 400, 10))

Can we come up with a new estimate that's less biased than the largest number seen, and less variable than twice the average?





















# Jury selection

In [None]:
# Background: 11 felony trials, and 1453 people who reported for jury service.
# The ethnic composition of those 1453 people is compared with the ethnic
# composition of the eligible jurors.

# The table below holds the two distributions we are going to compare:
#   "Eligible": proportions in the eligible population
#   "Actual":   proportions among those who reported for jury service

jury = (
    Table()
    .with_column('Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01))
    .with_column('Actual', make_array(0.26, 0.08, 0.08, 0.54, 0.04))
)

jury

In [None]:
# To compare the two distributions visually, draw a bar chart with one group of bars per ethnicity:

jury.barh('Ethnicity')

In [None]:
# Which ethnicities were under-represented, based on the bar chart above?

In [None]:
# Main question: if you select people at random from the eligible (blue) distribution, you do not expect
# the sample to match that distribution exactly; it will be somewhat off.
# Is the actual panel off in the way a random sample would be off, or is it off in some other way?

# Let's quantify the word "off".

### A measure of the difference between two distributions

In [None]:
# Extend the table with the signed difference (Actual minus Eligible) for each ethnicity

differences = jury.column('Actual') - jury.column('Eligible')
jury_with_diffs = jury.with_column('Difference', differences)
jury_with_diffs

Exercise: Find the total in the last column by adding the positive differences and then adding the negative differences

In [None]:


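# Reason: the differences always add up to zero, because each column of proportions sums to 1:
#         (x1-y1) + (x2-y2) + ... + (x5-y5)
#             = (x1 + x2 + ... + x5) - (y1 + y2 + ... + y5) = 1 - 1 = 0.

# Therefore, averaging the signed differences does not make sense: the positives and negatives cancel.

# In fact, the sum of the positive differences, 0.14, is a measure of the distance between the two distributions.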

In [None]:
# To avoid the cancellation, take absolute values so every difference counts as positive

abs_differences = np.abs(jury_with_diffs.column('Difference'))
jury_with_diffs = jury_with_diffs.with_column('Abs. Difference', abs_differences)

jury_with_diffs

In [None]:
# Add up the absolute differences and divide by 2: the result is the
# total variation distance between the two distributions.
# (back to slides)

np.sum(jury_with_diffs.column('Abs. Difference')) / 2

In [None]:
#takes two arrays (representing distributions)
#returns total variation distance between them

def total_variation_distance(distribution_1, distribution_2):
    '''Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    Parameters:
        distribution_1: array-like of proportions (NumPy array, list or tuple).
        distribution_2: array-like of proportions for the same categories,
            in the same order as distribution_1.

    Returns:
        Half the sum of |distribution_1 - distribution_2|.
    '''
    # np.asarray lets plain lists and tuples work as well; NumPy arrays pass
    # through unchanged, so existing callers see exactly the same result.
    first = np.asarray(distribution_1)
    second = np.asarray(distribution_2)
    return np.abs(first - second).sum() / 2

In [None]:
#what is the purpose of this function?

def table_tvd(table, label, other_label):
    '''Total variation distance between two columns of a table.

    Reads the columns named by `label` and `other_label` from `table` and
    passes them, in that order, to total_variation_distance.
    '''
    first_distribution = table.column(label)
    second_distribution = table.column(other_label)
    return total_variation_distance(first_distribution, second_distribution)
 

In [None]:
# What answer do you expect?
# (This is half the sum of the absolute differences between the 'Eligible' and 'Actual' columns.)
table_tvd(jury, 'Eligible', 'Actual')

In [None]:
# Step back: What was our goal?













In [None]:
# Size of each simulated panel: the 1453 people who reported for jury service.
panel_size = 1453

In [None]:
# proportions_from_distribution is provided for you; it is not defined in this notebook
# (presumably it comes from `from datascience import *` -- TODO confirm).
panels_and_sample = proportions_from_distribution(jury, 'Eligible', panel_size)
panels_and_sample

What does this function do? According to the documentation:

#### `proportions_from_distribution(table, label, sample_size, column_name='Random Sample')`

Adds a column named `column_name` containing the proportions of a random draw using the distribution in the column `label`.

In [None]:
panels_and_sample.barh('Ethnicity')
# What do you see here?

We will simulate the total variation distance between the distribution of eligible jurors and a random sample from that distribution. 

In [None]:
# Distance between the eligible distribution and one random sample drawn from it.
table_tvd(panels_and_sample, 'Eligible', 'Random Sample')

In [None]:
# Same thing, many times: build up the simulated values of the TVD statistic

panel_size = 1453
repetitions = 5000

# One slot per simulated panel.
tvds = np.zeros(repetitions)

for i in np.arange(repetitions):
    # Draw a fresh random panel using the 'Eligible' distribution,
    # then record how far its proportions are from that distribution.
    new_sample = proportions_from_distribution(jury, 'Eligible', panel_size)
    tvds[i] = table_tvd(new_sample, 'Eligible', 'Random Sample')

results = Table().with_column('TVD', tvds)
results

In [None]:
# Histogram of the simulated TVDs, with bins of width 0.005 starting at 0.
results.hist(bins=np.arange(0, 0.2, 0.005))

This is an empirical distribution of our statistic, the total variation distance.

Back to slides for question.

# San Francisco Employee Incomes

In [None]:
# Read the data from a local file; the trailing comment records a URL for the same file name.
data = 'san_francisco_2015.csv' # 'http://inferentialthinking.com/notebooks/san_francisco_2015.csv'

# Keep only the CSV columns at positions 3, 11 and 21.
# NOTE(review): positional indices depend on the CSV's column order -- confirm. A later cell selects
# a column labeled "Total Compensation", presumably the one taken from position 21 here.
sf = Table.read_table(data).select(3, 11, 21)
# Apply NumberFormatter(0) to column 2 (presumably: display with no decimal places -- TODO confirm).
sf.set_format(2, NumberFormatter(0))
# Keep only the rows whose column-2 value is above 10000.
sf = sf.where(2, are.above(10000))
sf.show(5)

In [None]:
# Show the rows ordered by column 2, largest values first.
sf.sort(2, descending  =  True)

In [None]:
# Bin edges every 25000 from 0 up to (not including) 700000.
comp_bins = np.arange(0, 700000, 25000)
# Histogram of column 2 for the whole population.
sf.hist(2, bins=comp_bins, unit='dollar')

In [None]:
# Suppose we only have access to a sample of 200 rows rather than the whole population:

sample_from_population = sf.sample(200, with_replacement=False)  # without replacement, so the sampled workers are unique
sample_from_population.show(3)

In [None]:
# Same bin edges as for the population histogram, so the two charts are directly comparable.
comp_bins = np.arange(0, 700000, 25000)
# Histogram of column 2 for the sample only.
sample_from_population.hist(2, bins=comp_bins, unit='dollar')

What is the median compensation in the population? This is a parameter.
Presumably, you would not know it.

Use a statistic instead.
Calculate the median of the sample, and use this sample median as an estimate for the population median.

In [None]:
# Sample median of column 2: our estimate of the population median.
np.median(sample_from_population.column(2))

back to slides for Variability of the Estimate

## Bootstrap

In [None]:
# Reminder of what the original sample looks like (first 3 rows).
sample_from_population.show(3)

In [None]:
# Median of column 2 in the original sample, for comparison with the resampled medians.
np.median(sample_from_population.column(2))


In [None]:
# Table.sample() with no arguments does the bootstrap resampling for you
# (presumably: as many rows as the original sample, drawn with replacement -- TODO confirm against the datascience docs).

resample = sample_from_population.sample()
resample.show(3)


In [None]:
# Histogram of the "Total Compensation" column of the resample, with bin edges every 25000 from 0.
resample.select("Total Compensation").hist(bins = np.arange(0, 700000, 25000))

In [None]:
# Median of column 2 in the resample.
np.median(resample.column(2))

Now we have another estimate for the population median. Do it again, a thousand times!

In [None]:
# Collect one median per bootstrap resample.
resampled_medians = []

for i in range(1000):
    # Resample from the original sample, then record the median of column 2.
    resample = sample_from_population.sample()
    median = np.median(resample.column(2))
    resampled_medians.append(median)

# Histogram of the 1000 resampled medians.
resampled_medians_table = Table().with_column("Resampled median", resampled_medians)
resampled_medians_table.hist(unit='dollar')

What is the difference between the two histograms above?

In [None]:
# Where is the true population median? Here we have the luxury of being able to access the full population.
# Normally, we would only have the sample (and resamples) from which to make a guess.

np.median(sf.column(2))

In [None]:
# Compute the population median first, then draw the histogram of resampled medians.
pop_median = np.median(sf.column(2))
Table().with_column('Resampled median', resampled_medians).hist(unit='dollar')
# Mark the population median with a large red dot at height 0.
plots.scatter(pop_median, 0, color='red', s=400)