In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Planes

In [None]:
# N is the total number of planes; serial numbers run from 1 to N inclusive.
N = 300
# np.arange excludes its stop value, so the stop must be N + 1 for plane N
# itself to be part of the population.
serialno = Table().with_column('Serial number', np.arange(1, N + 1))
serialno

In [None]:
serialno.sample(30)

In [None]:
serialno.sample(30).column(0).max()

In [None]:
# Repeat the sampling experiment many times, recording the largest serial
# number observed in each sample.
repetitions = 1000
sample_size = 30

maxes = make_array()
for i in np.arange(repetitions):
    observed = serialno.sample(sample_size).column(0)
    maxes = np.append(maxes, np.max(observed))
maxes

In [None]:
estimates = Table().with_column("estimated_N", maxes)
estimates

In [None]:
estimates.hist(0)

In [None]:
estimates.hist(0, bins = np.arange(1, 400, 10))

In [None]:
# back to slides

## Another Estimate

In [None]:
# Compare two estimates of N over repeated samples:
#   "largest # seen" -- the largest serial number in the sample
#   "2*average"      -- twice the average of the sampled serial numbers
repetitions = 1000
sample_size = 30

maxes = make_array()
doubles = make_array()
for i in np.arange(repetitions):
    sample = serialno.sample(sample_size).column(0)
    maxes = np.append(maxes, np.max(sample))
    doubles = np.append(doubles, 2 * np.average(sample))

estimates = Table().with_columns(
    "largest # seen", maxes,
    '2*average', doubles,
)
estimates

In [None]:
estimates.hist(bins = np.arange(1, 400, 10))

In [None]:
#back to slides

##  Bias and Variance

Can we come up with a new estimate that's less biased than the largest number seen, and less variable than twice the average?





















### Clever Estimate

In [None]:


























# N is the total number of planes; serial numbers run from 1 to N inclusive.
N = 300
# np.arange excludes its stop value, so the stop must be N + 1 for plane N
# itself to be part of the population.
serialno = Table().with_column('Serial number', np.arange(1, N + 1))

repetitions = 1000
sample_size = 30
maxes = make_array()         # largest serial number in each sample
doubles = make_array()       # twice the average of each sample
max_plus_min = make_array()  # largest plus smallest serial number in each sample

for i in np.arange(repetitions):
    sample = serialno.sample(sample_size).column(0)
    maxes = np.append(maxes, sample.max())
    doubles = np.append(doubles, np.average(sample) * 2)
    max_plus_min = np.append(max_plus_min, sample.max() + sample.min())

# One row per repetition, one column per estimate of N.
estimates = Table().with_columns(
    "largest # seen", maxes,
    '2*average', doubles,
    'max+min', max_plus_min,
)
estimates

In [None]:
estimates.hist(bins = np.arange(1, 400, 10))

In [None]:
#back to slides

# Jury selection

In [None]:
# Data: 11 felony trials and the 1453 people who reported for jury service.
# The ethnic composition of those 1453 people is compared with the ethnic
# composition of the eligible jurors.
#
# We are going to look at these two distributions and compare them.
#   Column "Eligible": proportions in the eligible population (elaborated on later).
#   Column "Actual":   proportions among the 1453 people who reported.

jury = (
    Table()
    .with_column('Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01))
    .with_column('Actual', make_array(0.26, 0.08, 0.08, 0.54, 0.04))
)

jury

In [None]:
#to compare:

jury.barh('Ethnicity')

In [None]:
#What ethnicities were under-represented based on the bar chart above?

In [None]:
# Main question: if you select at random from the blue distribution, you do not expect to get exactly the blue distribution;
# the sample will be off. Is the actual selection off in the way a random sample would be off, or is it off in some other way?

# Let's quantify the word "off".

### Measure of the difference between two distributions.

In [None]:
# Add a column of signed differences, Actual minus Eligible, one per ethnicity.

differences = jury.column('Actual') - jury.column('Eligible')
jury_with_diffs = jury.with_column('Difference', differences)
jury_with_diffs

In [None]:
# 1. Exercise for you: Add the positive differences and then add the negative differences

# Reason: (x1-y1) + (x2-y2) + ... + (x5-y5)
#         = (x1 + x2 + ... + x5) - (y1 + y2 + ... + y5) = 1 - 1 = 0.

# 2. Therefore, averaging the differences does not make sense: they always sum to 0.

# 3. 0.14 (the sum of the positive differences) is a measure of the distance between the two.

In [None]:
# The signed differences cancel each other out, so use their absolute values.

abs_differences = np.abs(jury_with_diffs.column('Difference'))
jury_with_diffs = jury_with_diffs.with_column('Abs. Difference', abs_differences)

jury_with_diffs

In [None]:
#then you add them up and divide by 2:
#total variation distance between the two distributions
#(back to slides)

jury_with_diffs.column('Abs. Difference').sum()/2

In [None]:
def total_variation_distance(distribution_1, distribution_2):
    '''Return the total variation distance between two distributions.

    Parameters
    ----------
    distribution_1, distribution_2 : array-like of numbers
        The two distributions to compare, with entries in matching order.
        NumPy arrays, lists and tuples are all accepted.

    Returns
    -------
    Half the sum of the absolute element-wise differences.
    '''
    # Convert first so that `-` is element-wise even for plain Python
    # sequences, which do not support subtraction on their own.
    first = np.asarray(distribution_1)
    second = np.asarray(distribution_2)
    return np.abs(first - second).sum() / 2

In [None]:
#what is the purpose of this function?

def table_tvd(table, label, other_label):
    '''Return the total variation distance between two columns of `table`.

    `label` and `other_label` name the two columns; their contents are passed
    to `total_variation_distance` in that order.
    '''
    first_distribution = table.column(label)
    second_distribution = table.column(other_label)
    return total_variation_distance(first_distribution, second_distribution)
 

In [None]:
#What answer do you expect?
table_tvd(jury, 'Eligible', 'Actual')

In [None]:
# Step back: What was our goal?
# Please, talk to each other to come up with a plan to achieve this goal.













In [None]:
panel_size = 1453

In [None]:
#proportions_from_distribution method is defined for you
panels_and_sample = proportions_from_distribution(jury, 'Eligible', panel_size)
panels_and_sample

What does this function do? According to the documentation, 

### proportions_from_distribution(table, label, sample_size, column_name='Random Sample')

Adds a column named column_name containing the proportions of a random draw using the distribution in label.

In [None]:
panels_and_sample.barh('Ethnicity')
# What do you see here?

In [None]:
#we are going to simulate the total variation distance 
#between distribution of eligible jurors and a random sample from that distribution 

In [None]:
table_tvd(panels_and_sample, 'Eligible', 'Random Sample')

In [None]:
# Same thing, many times: draw a random panel from the eligible distribution
# and record its TVD from that distribution.

panel_size = 1453
repetitions = 5000

tvds = make_array()
for i in np.arange(repetitions):
    simulated_panel = proportions_from_distribution(jury, 'Eligible', panel_size)
    simulated_tvd = table_tvd(simulated_panel, 'Eligible', 'Random Sample')
    tvds = np.append(tvds, simulated_tvd)

results = Table().with_column('TVD', tvds)
results

In [None]:
results.hist(bins=np.arange(0, 0.2, 0.005))

In [None]:
#back to slides

# San Francisco Employee Incomes

In [None]:
# Load the San Francisco 2015 employee data from a local CSV file
# (the trailing comment records a URL for the same file name).
data = 'san_francisco_2015.csv' # 'http://inferentialthinking.com/notebooks/san_francisco_2015.csv'

# Keep only the CSV columns at positions 3, 11 and 21.
sf = Table.read_table(data).select(3, 11, 21)
# Attach a number formatter to the third kept column (index 2).
# NOTE(review): presumably this is the compensation column, since later cells
# plot it in dollars and take its median -- confirm against the CSV header.
sf.set_format(2, NumberFormatter(0))
# Keep only the rows whose value in that column is above 10000.
sf = sf.where(2, are.above(10000))
sf.show(3)

In [None]:
sf.sort(2, descending  =  True)

In [None]:
comp_bins = np.arange(0, 700000, 25000)
sf.hist(2, bins=comp_bins, unit='dollar')

In [None]:
#suppose we only have access to the sample:

sample_from_population = sf.sample(200, with_replacement=False)  #unique workers 
sample_from_population.show(3)

In [None]:
# What is the median compensation in the population? Presumably, you do not know it. 
np.median(sample_from_population.column(2))

# Is this median accurate?

In [None]:
#back to slides

## Aside: lists and append

In [None]:
a = make_array(2, 3, 4)
a

In [None]:
np.append(a, 5)


In [None]:
a

In [None]:
b = [2, 3, 4]
b

In [None]:
# this append is different from np.append 
b.append(5)
b

## Sample variability

In [None]:
# Draw many samples and record each sample's median, to see how much the
# median varies from one sample to the next.

medians = []
repetitions = 100

for i in np.arange(repetitions):
    sample_from_population = sf.sample(200, with_replacement=False)
    medians.append(np.median(sample_from_population.column(2)))

# Number the repetitions with `repetitions` rather than a literal 100, so both
# columns keep the same length when the number of repetitions is changed.
Table().with_columns(
    'i', np.arange(repetitions),
    'median', medians,
).scatter('i')


# What do you see on this scatter plot?

In [None]:
#(back to slides)