In [None]:
#: the usual imports
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

plt.style.use('fivethirtyeight')

from notebook.services.config import ConfigManager

# Settings written to the "livereveal" config section (presumably read by the
# RISE slideshow extension -- confirm): slide width/height and scrolling.
livereveal_settings = {
    'width': 1200,
    'height': 700,
    'scroll': True,
}

cm = ConfigManager()
cm.update('livereveal', livereveal_settings)

## Announcements

* CAPE evaluations
* Final exam is in two days
    - Saturday 8-11am
    - seating chart on Piazza
    - bring student ID, pen or pencil, no calculators
    - reference sheet provided
* Studying
    - DSC Study Jam tonight 5:30-10pm
    - Practice finals on Piazza

In [None]:
# Load the restaurant inspection data from restaurants.csv into a Table,
# then show its column labels so we can pick the ones we need.
restaurants = Table.read_table('restaurants.csv')
restaurants.labels

In [None]:
# Narrow the table down to the columns used in the rest of the notebook.
columns_of_interest = [
    'business_name',
    'inspection_date',
    'inspection_score',
    'risk_category',
    'Neighborhoods',
    'Zip Codes',
]
restaurants = restaurants.select(*columns_of_interest)
restaurants

In [None]:
# Keep rows whose inspection score is above -1 (presumably -1 marks a missing
# score -- confirm against the data) and whose risk category is not the
# string 'nan', then plot the distribution of the remaining scores.
scored = restaurants.where('inspection_score', are.above(-1))
at_risk = scored.where('risk_category', are.not_equal_to('nan'))
at_risk.hist('inspection_score')

We want to compare high risk restaurants to low risk restaurants and see if their inspection scores are different. What technique should we use?

A. A/B testing  
B. Standard hypothesis testing  
C. Bootstrapping  
D. Confidence intervals

Can you state the null and alternative hypotheses?

In [None]:
# Restrict to the two groups being compared and keep only the score and the
# group label, in that column order.
risk_levels = ['High Risk', 'Low Risk']
high_low = (
    at_risk
    .where('risk_category', are.contained_in(risk_levels))
    .select('inspection_score', 'risk_category')
)
high_low

In [None]:
# Sample without replacement and without an explicit size. Per the
# datascience docs the size defaults to the number of rows, so this should
# display every row of high_low in a random order (a shuffle).
high_low.sample(with_replacement = False)

In [None]:
# Shuffle the rows, pull out the (now shuffled) risk_category labels, and
# attach them to the original table as a new 'Shuffled Label' column.
shuffled_rows = high_low.sample(with_replacement=False)
shuffled_labels = shuffled_rows.column(1)
original_and_shuffled = high_low.with_column(
    'Shuffled Label', shuffled_labels
)
original_and_shuffled

In [None]:
# Histogram of inspection scores, grouped by the original risk_category label.
original_and_shuffled.hist('inspection_score', group='risk_category')

In [None]:
# Same histogram, but grouped by the shuffled labels instead.
original_and_shuffled.hist('inspection_score', group='Shuffled Label')

It looks like the two groups in the first histogram (A and B) are substantially more different than the two groups in the second histogram (C and D). What test statistic(s) can we use to quantify the difference between the two groups displayed in a given histogram?

A. total variation distance  
B. difference in the means  
C. either of the above

In [None]:
# Apply np.mean to the other columns within each original risk_category group.
original_and_shuffled.group('risk_category', np.mean)

In [None]:
# Same aggregation, grouping by the shuffled labels instead.
original_and_shuffled.group('Shuffled Label', np.mean)

In [None]:
# Observed test statistic: mean score of the second group minus mean score of
# the first group, in the row order that group() returns for the original
# labels (presumably sorted by label, i.e. Low Risk minus High Risk -- confirm).
means_by_category = original_and_shuffled.group('risk_category', np.mean)
observed_means = means_by_category.column('inspection_score mean')
observed_difference = observed_means.item(1) - observed_means.item(0)
observed_difference

In [None]:
# The same statistic computed from the shuffled labels: one simulated value
# of the difference in group means.
means_by_shuffled = original_and_shuffled.group('Shuffled Label', np.mean)
simulated_means = means_by_shuffled.column('inspection_score mean')
simulated_difference = simulated_means.item(1) - simulated_means.item(0)
simulated_difference

In [None]:
def calculate_test_statistic():
    """Simulate one value of the test statistic with shuffled group labels.

    Shuffles the rows of the notebook-level ``high_low`` table, attaches the
    shuffled second column to the original rows as 'Shuffled Label', and
    computes the mean inspection score within each shuffled group.

    Returns:
        The mean of the second shuffled group minus the mean of the first,
        in the row order produced by ``group``.
    """
    permuted_rows = high_low.sample(with_replacement=False)
    relabeled = high_low.with_column('Shuffled Label', permuted_rows.column(1))
    group_means = relabeled.group('Shuffled Label', np.mean).column('inspection_score mean')
    return group_means.item(1) - group_means.item(0)

In [None]:
# One simulated value of the test statistic; the labels are reshuffled on
# every call, so the result changes from run to run.
calculate_test_statistic()

In [None]:
# Run the shuffle simulation 100 times, collecting one simulated difference
# per repetition.
simulated_stats = make_array()
for _ in range(100):
    simulated_stats = np.append(simulated_stats, calculate_test_statistic())

In [None]:
# How many of the simulated differences are strictly positive.
np.count_nonzero(simulated_stats>0)

In [None]:
# Empirical distribution of the simulated differences.
Table().with_column('Simulated Differences', simulated_stats).hist()

In [None]:
# Redisplay the observed statistic to compare it with the simulated values.
observed_difference

What's the p-value?

In [None]:
# Empirical p-value: the fraction of simulated differences that are at least
# as large as the observed difference. The comparison must use the whole
# array of simulated statistics (simulated_stats), not the single scalar
# simulated_difference, and the denominator is however many simulations ran.
np.count_nonzero(simulated_stats >= observed_difference) / len(simulated_stats)

You work as a family physician and you want to test the following hypotheses:

Null Hypothesis: Family physicians see an equal number of children and adults.

Alternative Hypothesis: Family physicians see an unequal number of children and adults.

You collect data and you find that in 6354 patients, 3115 were children and 3239 were adults.

Which test statistic(s) could be used for this hypothesis test? Which values of the test statistic point towards the alternative?

A. proportion of children seen   
B. number of children seen  
C. number of children minus number of adults seen  
D. absolute value of number of children minus number of adults seen  

What if we used a different alternative hypothesis? Which test statistics would work then? 

How do you generate one value of the test statistic?

In [None]:
# One simulated value under the null: sample 6354 patients from a 50/50
# two-category distribution, take the first category's proportion, and scale
# it back up to a count.
sample_proportions(6354, make_array(0.5, 0.5)).item(0)*6354

Can you do it without using sample_proportions?

In [None]:
# Same simulation without sample_proportions: draw 6354 rows with replacement
# from a two-row table of patient types, then count the 'C' rows.
patient_types = Table().with_column('Patient', make_array('C', 'A'))
results = patient_types.sample(6354, with_replacement=True).column('Patient')
np.count_nonzero(results == 'C')

Is this an example of bootstrapping?  
A. Yes, because we are sampling with replacement.  
B. No, this is not bootstrapping.

In [None]:
# Simulate the count in the first category 10,000 times under the 50/50 null.
test_stats = make_array()
null_proportions = make_array(0.5, 0.5)
for _ in range(10000):
    simulated_count = sample_proportions(6354, null_proportions).item(0) * 6354
    test_stats = np.append(test_stats, simulated_count)

In [None]:
# Empirical distribution of the simulated counts under the null.
Table().with_column('Number of Children', test_stats).hist()

Observed data: You collect data and you find that in 6354 patients, 3115 were children and 3239 were adults.

A. reject the null  
B. fail to reject the null  
C. not sure  

In [None]:
# Fraction of simulated child counts at or below the observed count of 3115.
# The simulated counts are stored in test_stats; kids_array is not defined
# anywhere in this notebook and raised a NameError.
# NOTE(review): this is the lower tail only. The stated alternative is
# two-sided ("unequal"), so consider counting simulated values at least as far
# from the expected 3177 as 3115 is.
np.count_nonzero(test_stats <= 3115) / len(test_stats)

The Central Limit Theorem

> The distribution of sums (and averages) of large random samples (drawn with replacement) is roughly normal, regardless of the distribution of the population from which the samples were drawn.

In [None]:
# Businesses whose name contains 'Bake' and whose inspection score is above -1.
name_matches = restaurants.where('business_name', are.containing('Bake'))
bakeries = name_matches.where('inspection_score', are.above(-1))
bakeries

In [None]:
# Draw 200 rows at random from bakeries (Table.sample presumably samples with
# replacement by default -- confirm against the datascience docs).
bakeries.sample(200)

In [None]:
# Mean inspection score of one random sample of 200 rows.
bakeries.sample(200).column('inspection_score').mean()

In [None]:
# Repeat the sampling 10,000 times, recording the mean inspection score of
# each sample of 200 rows.
sample_means = make_array()
for _ in range(10000):
    sampled_scores = bakeries.sample(200).column('inspection_score')
    sample_means = np.append(sample_means, sampled_scores.mean())

Distribution of the Sample Mean

In [None]:
# Empirical distribution of the 10,000 simulated sample means.
Table().with_column('Sample Mean', sample_means).hist()

In [None]:
# Center (mean) and spread (SD) of the simulated sample means.
np.mean(sample_means), np.std(sample_means)

Sample: A random 200 bakeries

In [None]:
# Fix a single random sample of 200 rows and plot the distribution of its
# inspection scores. Later cells reuse one_sample.
one_sample = bakeries.sample(200)
one_sample.hist('inspection_score')

In [None]:
# Mean and SD of the inspection scores within the single sample.
sample_scores = one_sample.column('inspection_score')
np.mean(sample_scores), np.std(sample_scores)

Population: All bakeries in San Francisco with an inspection score

In [None]:
# Distribution of inspection scores across the whole bakeries table.
bakeries.hist('inspection_score')

In [None]:
# Mean and SD of the inspection scores across the whole bakeries table.
population_scores = bakeries.column('inspection_score')
np.mean(population_scores), np.std(population_scores)

According to the Central Limit Theorem, the SD of the distribution of the sample mean is the population SD divided by the square root of the sample size:

In [None]:
# SD of the bakeries inspection scores divided by sqrt(200), where 200 is the
# sample size used in the sampling cells.
np.std(bakeries.column('inspection_score'))/np.sqrt(200)

In [None]:
# SD of the simulated sample means, for comparison with the SD/sqrt(n) value.
np.std(sample_means)

In [None]:
# Display the single sample of 200 rows again.
one_sample

Based on my one sample of 200 bakeries, how can we estimate the median inspection score of all bakeries in San Francisco with an inspection score? What technique should we use?

A. A/B testing  
B. Standard hypothesis testing  
C. Bootstrapping  
D. Confidence intervals

In [None]:
# Median of the column at index 2 of one_sample. Given the column order
# selected for restaurants, that is 'inspection_score'.
np.median(one_sample.column(2))

In [None]:
# One bootstrap resample: draw with replacement from the sample we actually
# have (one_sample), using the same number of rows as that sample, and take
# the median score. Resampling from bakeries would be drawing from the
# population itself, which the bootstrap assumes we cannot do.
np.median(one_sample.sample(one_sample.num_rows, with_replacement=True).column('inspection_score'))

In [None]:
# Bootstrap the median: 5000 times, resample one_sample with replacement
# (same size as the sample) and record the resample's median score.
# The resamples come from one_sample, not from bakeries, because the
# bootstrap estimates the population median using only the sample in hand.
boot_medians = make_array()

for i in np.arange(5000):
    resample = one_sample.sample(one_sample.num_rows, with_replacement=True)
    boot_medians = np.append(boot_medians, np.median(resample.column('inspection_score')))

In [None]:
# Empirical distribution of the bootstrapped medians.
Table().with_column('Bootstrapped Medians', boot_medians).hist()

In [None]:
# Left endpoint of the middle-95% interval: the 2.5th percentile.
percentile(2.5, boot_medians)

In [None]:
# Right endpoint of the middle-95% interval: the 97.5th percentile.
percentile(97.5, boot_medians)

Which of the following interpretations of this confidence interval is valid?  
A=True, B=False

1. 95% of SF bakeries have an inspection score between 83 and 86.  
2. 95% of the resamples have a median inspection score between 83 and 86.  
3. There is a 95% chance that our sample has a median inspection score between 83 and 86.  
4. There is a 95% chance that the median inspection score of all SF bakeries is between 83 and 86.  
5.  If we had taken 100 samples from the same population, about 95 of these samples would have a median inspection score between 83 and 86.  
6.  If we had taken 100 samples from the same population, about 95 of the confidence intervals created would contain the median inspection score of all SF bakeries.  

Probability Distribution vs. Empirical Distribution of a Statistic

In [None]:
# Simulate num_flips coin flips by choosing from 'H' and 'T'; np.random.choice
# picks uniformly from the given values when no probabilities are passed.
num_flips=100
np.random.choice(make_array('H', 'T'), num_flips)

In [None]:
# Store a fresh set of num_flips simulated coin flips in a one-column table.
flip_outcomes = np.random.choice(make_array('H', 'T'), num_flips)
flips = Table().with_column('outcome', flip_outcomes)
flips

Statistic: proportion of heads

In [None]:
# Group the flips by outcome. With no aggregation function this should give a
# count per outcome; the proportion of heads is the 'H' count / num_flips.
flips.group('outcome')

Histograms

In [None]:
# Display the bakeries table again before binning its scores.
bakeries

In [None]:
# Bin the inspection scores using custom bin edges of unequal width.
score_bin_edges = make_array(0, 50, 60, 80, 90, 95, 100)
binned = bakeries.bin('inspection_score', bins=score_bin_edges)
binned

Can you draw the histogram based on this?

In [None]:
# Add a 'percent' column: each entry of the second column (index 1) as a
# percentage of that column's total.
bin_counts = binned.column(1)
binned = binned.with_column('percent', bin_counts / bin_counts.sum() * 100)
binned

In [None]:
# Histogram of the scores using the same unequal-width bin edges as the
# binned table.
bakeries.hist('inspection_score', bins=make_array(0, 50, 60, 80, 90, 95, 100))

Galton's Method

In [None]:
# Load hybrid.csv into a Table and display it.
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
# Scatter plot of the 'acceleration' and 'msrp' columns.
hybrid.scatter('acceleration', 'msrp')

What would Galton's method predict for the MSRP of a car with acceleration of 20 units?  
A. 55,000  
B. 65,000  
C. 80,000  
D. 100,000  