In [None]:
import numpy as np
from datascience import *
%matplotlib inline

np.set_printoptions(threshold=50)
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Baby Weights

In [None]:
# http://inferentialthinking.com/notebooks/baby.csv
# Read a local copy of the CSV above into a Table and preview its first 3 rows.
births = Table.read_table('baby.csv')
births.show(3)

In [None]:
# Keep only the first two columns. They are selected by position, so this
# depends on the column order of the CSV. Then preview 3 rows.
babies = births.select(0, 1)
babies.show(3)

In [None]:
# Add a third column holding the element-wise ratio column 0 / column 1.
# The label suggests birth weight : gestational days - confirm against the CSV header.
ratios = babies.with_column(
    'Ratio BW:GD', babies.column(0) / babies.column(1)
)
ratios.show(3)

In [None]:
# Histogram of the column at index 2, i.e. the 'Ratio BW:GD' column appended to `ratios`.
ratios.hist(2)

In [None]:
# Median of the ratio column in the original sample (the statistic bootstrapped below).
np.median(ratios.column(2))

In [None]:
# 5000 bootstrap samples
# We've taken the median of each of these samples
# Plotted them on the histogram
# We've constructed the 95% confidence interval (in yellow)

def bootstrap_median(sample_from_population, label, repetitions):
    """Return a list of medians, one per bootstrap resample.

    sample_from_population: table-like object providing `.sample()` and
        `.column(label)`. With a datascience Table, `sample()` called with
        no arguments draws as many rows as the table has, with replacement.
    label: column label or index forwarded to `.column`.
    repetitions: number of resamples to draw.

    Returns a Python list of length `repetitions` holding the median of the
    chosen column in each resample.
    """
    return [
        np.median(sample_from_population.sample().column(label))
        for _ in np.arange(repetitions)
    ]

# Bootstrap the median of the ratio column (index 2) using 5000 resamples.
resampled_medians = bootstrap_median(ratios, 2, 5000)

# Middle 95% of the resampled medians: from the 2.5th to the 97.5th percentile.
interval_95 = make_array(
    percentile(2.5, resampled_medians),
    percentile(97.5, resampled_medians)
)

# Histogram of the resampled medians, with the interval drawn as a thick
# gold segment along y = 0, followed by the interval rounded to 4 decimals.
Table().with_column('Resampled median', resampled_medians).hist(0)
plots.plot(interval_95, [0, 0], color='gold', lw=8)
print('Approximate 95% Bootstrap Confidence Interval for the Population Median')
print(np.round(interval_95, 4))

In [None]:
# Middle 80% of the same resampled medians (10th to 90th percentile).
# This reuses `resampled_medians` from the previous cell; no new resampling happens here.
interval_80 = make_array(
    percentile(10, resampled_medians),
    percentile(90, resampled_medians)
)

# Same histogram as before, now marked with the 80% interval.
Table().with_column('Resampled median', resampled_medians).hist(0)
plots.plot(interval_80, [0, 0], color='gold', lw=8)
print('Approximate 80% Bootstrap Confidence Interval for the Population Median')

print(np.round(interval_80, 4))

In [None]:
def bootstrap_mean(sample_from_population, label, repetitions):
    """Return a list of means, one per bootstrap resample.

    sample_from_population: table-like object providing `.sample()` and
        `.column(label)`. With a datascience Table, `sample()` called with
        no arguments draws as many rows as the table has, with replacement.
    label: column label or index forwarded to `.column`.
    repetitions: number of resamples to draw.

    Returns a Python list of length `repetitions` holding the mean of the
    chosen column in each resample.
    """
    def one_resampled_mean():
        # Draw a fresh resample and summarise the requested column.
        drawn = sample_from_population.sample()
        return np.mean(drawn.column(label))

    return [one_resampled_mean() for _ in np.arange(repetitions)]

In [None]:
def bootstrap_ci_mean(sample_from_population, label, repetitions, level=95):
    """Plot and print an approximate bootstrap confidence interval for a population mean.

    Resamples via bootstrap_mean, takes the middle `level` percent of the
    resampled means as the interval, draws a histogram of the resampled means
    with the interval marked as a gold segment along y = 0, and prints the
    interval rounded to 3 decimals.

    sample_from_population: table passed through to bootstrap_mean.
    label: column label or index passed through to bootstrap_mean.
    repetitions: number of bootstrap resamples.
    level: confidence level in percent, strictly between 0 and 100.
        The default of 95 reproduces the original fixed 2.5 / 97.5 percentiles.

    Returns None; the results are the plot and the printed interval.
    Raises ValueError if `level` is not strictly between 0 and 100.
    """
    if not 0 < level < 100:
        raise ValueError('level must be strictly between 0 and 100')

    # Percent of the distribution cut off on each side of the interval.
    tail = (100 - level) / 2

    resampled_means = bootstrap_mean(sample_from_population, label, repetitions)

    interval = make_array(
        percentile(tail, resampled_means),
        percentile(100 - tail, resampled_means)
    )

    Table().with_column('Resampled mean', resampled_means).hist(0)
    plots.plot(interval, [0, 0], color='gold', lw=8)
    # The 'g' format prints 95 as "95" (not "95.0"), so the default message is unchanged.
    print('Approximate {:g}% Bootstrap Confidence Interval for Population Mean:'.format(level))
    print(np.round(interval, 3))

In [None]:
# Preview the full births table again; the next cells use its 'Maternal Age' column.
births.show(3)

In [None]:
# Median maternal age in the sample.
# NOTE(review): the next cell bootstraps the *mean* of this column - confirm
# that the median (rather than the mean) is the intended comparison here.
np.median(births.column("Maternal Age"))

In [None]:
# Bootstrap interval for mean maternal age from 5000 resamples; this call plots and prints.
bootstrap_ci_mean(births, 'Maternal Age', 5000)

<img src="ci.PNG" width=600> 

In [None]:
# Distribution of maternal age in the original sample (not of the resampled means).
births.hist('Maternal Age')

back to slides for more details on percentiles

# Percentile

In [None]:
# Small unsorted collection used for the percentile exercises that follow.
s = [1, 7, 3, 9, 5]
# Prediction exercise: the cell displays whether the 10th percentile of s equals 0.
percentile(10, s) == 0

In [None]:
# Prediction exercise: do the 39th and 40th percentiles of s coincide?
percentile(39, s) == percentile(40, s)

In [None]:
# Prediction exercise: do the 40th and 41st percentiles of s coincide?
percentile(40, s) == percentile(41, s)

In [None]:
# Prediction exercise: is the 50th percentile of s equal to 5?
percentile(50, s) == 5

# Mean: Basics

In [None]:
# Small example data set; note that the value 3 appears twice.
values = [2, 3, 3, 9]

#below are four ways to compute the average

In [None]:
# Way 1: NumPy's average (unweighted here, since no weights are passed).
np.average(values)


In [None]:
# Way 2: NumPy's mean.
np.mean(values)


In [None]:
# Way 3: the definition - total of the entries divided by how many there are.
sum(values) / len(values)


In [None]:
# Way 4: weighted sum of the distinct values, each weighted by the proportion
# of entries equal to it (2 and 9 are each 1 of 4 entries; 3 is 2 of 4).
2*(1/4) + 3*(1/2) + 9*(1/4)

In [None]:
# Display the local image file weights.png with width 600.
from IPython.display import Image
Image("weights.png", width=600)

## Means and Histograms

In [None]:
# Histogram of a distribution given as (value, proportion) pairs.

# Each distinct value paired with the proportion of entries equal to it.
t = Table().with_columns('Value', make_array(2, 3, 9),
                        'Prop.', make_array(0.25, 0.5, 0.25))

# Unit-wide bins with edges 1.5, 2.5, ..., 9.5, so each integer sits mid-bin.
# bin_column='Value' presumably makes 'Prop.' the bar weights - verify against Table.hist.
t.hist(bin_column = 'Value', bins = np.arange(1.5, 9.6, 1))
# Thick grey segment along y = 0 from x = 1.5 to x = 9.5.
plots.plot([1.5, 9.5], [0, 0], color = 'grey', lw = 8)


This is also a histogram for the values (2, 2, 3, 3, 3, 3, 9, 9). True/False?

A: True

B: False

In [None]:
# The balance point of the histogram is the average.

# Mean as a weighted sum: each value times its proportion, added up.
mean = sum (t.column("Value")*t.column("Prop."))
t.hist(bin_column = 'Value', bins = np.arange(1.5, 9.6, 1))
# Dark-blue triangle placed at x = mean, slightly below y = 0, like a pivot under the bar.
plots.scatter(mean, -0.009, marker = '^', color = 'darkblue', s=60)
plots.plot([1.5, 9.5], [0, 0], color = 'grey', lw = 8)

In [None]:
# A symmetric histogram balances on its line of symmetry.

# Proportions 0.25 / 0.5 / 0.25 on the values 2, 3, 4; the value 9 is listed with proportion 0.
t1 = Table().with_columns('Value', make_array(2, 3, 4, 9),
                        'Prop.', make_array(0.25, 0.5, 0.25, 0))

# Weighted-sum mean, computed the same way as for `t`.
mean1 = sum (t1.column("Value")*t1.column("Prop."))
# Bin edges here only run 1.5 .. 4.5; the x-range is widened to 1..10 afterwards
# so the picture is comparable with the other histograms.
t1.hist(bin_column = 'Value', bins = np.arange(1.5, 4.6, 1))
plots.scatter(mean1, -0.009, marker = '^', color = 'darkblue', s=60)
plots.plot([1.5, 9.5], [0, 0], color = 'grey', lw = 8)
plots.xlim(1, 10)


In [None]:
# Destroy the symmetry: move the 0.25 of mass from the value 4 to the value 9.

# Adds a second proportion column next to 'Prop.' in the same table.
t1 = t1.with_columns('Not_symmetric', make_array(0.25, 0.5, 0, 0.25))

# Mean under the new proportions, using the same weighted sum as before.
mean2 = sum (t1.column("Value")*t1.column("Not_symmetric"))
# NOTE(review): with two proportion columns in t1 this presumably overlays both
# distributions in one plot - verify against Table.hist.
t1.hist(bin_column = 'Value', bins = np.arange(1.5, 9.6, 1))
# Dark blue marks the mean of the symmetric version (mean1, from the previous cell),
# gold marks the mean of the asymmetric version.
plots.scatter(mean1, -0.009, marker = '^', color = 'darkblue', s=60)
plots.scatter(mean2, -0.009, marker = '^', color = 'gold', s=60)

plots.plot([1.5, 9.5], [0, 0], color = 'grey', lw = 8)


Mean gets pulled up by large values.



## Discussion Questions


In [None]:
# Display the local image file q17.png with width 600.
from IPython.display import Image
Image("q17.png", width=600)

In [None]:
# Display the local image file q16.png with width 600.
from IPython.display import Image
Image("q16.png", width=600)

In [None]:
# Let's try again. Which is larger, mean or median?

# Load the compensation data, keeping only rows whose 'Salaries' value is above 10000.
sf2015 = Table.read_table('san_francisco_2015.csv').where('Salaries', are.above(10000))

# Histogram of 'Total Compensation' with bins 25000 wide, starting at 10000.
sf2015.select('Total Compensation').hist(bins = np.arange(10000, 700000, 25000))

In [None]:
# Pull out the 'Total Compensation' column as an array.
compensation  = sf2015.column("Total Compensation")
# Median, taken as the 50th percentile of the array.
percentile (50, compensation)

In [None]:
# Mean total compensation, for comparison with the median from the previous cell.
np.mean(compensation)

back to slides for properties of the mean

## Measuring Variability


In [None]:
# Rebind `values` as an array (it was a plain list earlier) so that the
# element-wise arithmetic in the following cells works.
values = make_array(2, 3, 3, 9)
values

# how far are these numbers from the mean? 

In [None]:
# Step 1: what is the mean of the values?

average = np.average(values)
average

In [None]:
# Step 2: the deviations from average.
# How far is each value from the average? Subtracting the scalar `average`
# from the array gives one signed deviation per value.

deviations = values - average
deviations

In [None]:
# Put each value next to its deviation. This rebinds `t`, which held the
# value/proportion table in the histogram section.
t = Table().with_columns(
    'Value', values,
    'Deviation from Average', deviations,
)
t

Do the values tend to be far from the average? 

In [None]:
# Let's take the average of the deviations (column 1 of t).
# Positive and negative deviations cancel, so this is 0 and says nothing about spread.

np.average(t.column(1))

In [None]:
# Step 3: the squared deviations from average.
# Squaring makes every deviation non-negative, so they can no longer cancel out.

t = t.with_column('Squared Deviations', deviations ** 2)
t

In [None]:
# Step 4: Variance = the mean squared deviation from average.
# Units have changed: because of the squaring, variance is in squared units of the values.

variance = np.mean(t.column('Squared Deviations'))
variance

In [None]:
# Step 5: Standard deviation: root mean squared deviation from average
# Steps of calculation:         5    4     3        2             1 
# (read the name right to left: average, deviation, squared, mean, root).
# Taking the square root brings the units back to those of the values.
sd = np.sqrt(variance)
sd

In [None]:
# The bigger the standard deviation (or the variance), the more spread out the distribution.
# np.std performs the five steps in one call; by default it divides by n, matching the steps above.
np.std(values)

back to slides for recap and discussion questions

## Interpreting Standard Deviation

In [None]:
# http://inferentialthinking.com/notebooks/nba2013.csv
# Read a local copy of the CSV above and display the table.
nba13 = Table.read_table('nba2013.csv')
nba13

In [None]:
# Histogram of 'Height' with bins one unit wide and edges from 68 to 87.
nba13.select('Height').hist(bins=np.arange(68, 88, 1))

In [None]:
# Mean of the 'Height' column.
mean_height = np.mean(nba13.column('Height'))
mean_height

In [None]:
# Median of the 'Height' column, for comparison with the mean.
median_height = np.median(nba13.column('Height'))
median_height

In [None]:
# Mean (recomputed with the same expression as the earlier cell) and SD of 'Height',
# displayed together as a pair.
mean_height = np.mean(nba13.column('Height'))
sd_height = np.std(nba13.column('Height'))
(mean_height, sd_height)

In [None]:
# Tallest players: the first 3 rows after sorting by 'Height' in descending order.
nba13.sort('Height', descending=True).show(3)

In [None]:
# Shortest players: the first 3 rows after sorting by 'Height' in ascending order.
nba13.sort('Height').show(3)

In [None]:
# For the tallest player: how many inches above the mean?
# 87 is hard-coded - presumably the greatest height shown by the descending sort; confirm.
87 - mean_height

How many SDs above the mean? We calculated SD = 3.45.

In [None]:
# Look at the deviation relative to the SD: the deviation divided by sd_height
# says how many SDs above the mean the height 87 is.
(87 - mean_height)/sd_height


In [None]:
# Now for the shortest player:
# how many SDs below average? The result is negative for a below-average height.
# 69 is hard-coded - presumably the smallest height shown by the ascending sort; confirm.

(69 - mean_height)/sd_height

# We start working with Standard Units
# They measure how many SDs a value is above (positive) or below (negative) the average.

In [None]:
#conversion to standard units

def standard_units(numbers_array):
    """Convert any array of numbers to standard units.

    Each entry is replaced by its deviation from the array's mean, divided
    by the array's standard deviation (np.std, which divides by n by default).
    Returns an array with the same number of entries as the input.
    """
    center = np.mean(numbers_array)
    spread = np.std(numbers_array)
    deviations = numbers_array - center
    return deviations / spread

We can convert heights to standard units, from the original units of inches. 

We saw that 87 inches was about 2.3 SU, and 69 inches was about -2.9 SU.

What values are possible for SU? Can they all be positive? Can there be a value like 9 SU? 

In [None]:
# Add a column holding each height converted to standard units.
nba13 = nba13.with_column(
    'Height (Standard Units)', standard_units(nba13.column('Height'))
)
# Display the table sorted by its first column (selected by position).
nba13.sort(0)

In [None]:
# Display the local image file su.png with width 600.
from IPython.display import Image
Image("su.png", width=600)

## Standard Units and Chebyshev Bounds

In [None]:
# http://inferentialthinking.com/notebooks/united_summer2015.csv
# Read a local copy of the CSV above, then add a column holding each delay in standard units.
united = Table.read_table('united_summer2015.csv')
united = united.with_column(
    'Delay (Standard Units)', standard_units(united.column('Delay'))
)
united

In [None]:
# SD of the column at position 3.
# NOTE(review): presumably this is 'Delay' (the raw delays) - confirm against the
# CSV's column order; referencing the column by label would remove the ambiguity.
np.std(united.column(3))

In [None]:
# Largest delays first, to see how far out (in standard units) the extreme values are.
united.sort('Delay', descending=True)

In [None]:
# Rows whose delay is within 3 SDs of the mean, i.e. between -3 and 3 standard units.
# NOTE(review): are.between is presumably half-open (includes -3, excludes 3) - verify.
within_3_sd = united.where('Delay (Standard Units)', are.between(-3, 3))
# Proportion of all flights that fall in that range.
within_3_sd.num_rows/united.num_rows

In [None]:
# Histogram of delays in standard units, with bins 0.5 wide and edges from -5 to 15.
united.hist('Delay (Standard Units)', bins=np.arange(-5, 15.5, 0.5))
# Tick marks every 3 standard units from -6 to 15; the trailing ';' suppresses the text output.
plots.xticks(np.arange(-6, 17, 3));

Back to the slides for the Chebyshev bounds.

## Chebyshev Bounds: Another Example

In [None]:
# http://inferentialthinking.com/notebooks/baby.csv
# Reload the births table so this section does not depend on earlier cells.
births = Table.read_table('baby.csv')
# Histogram of the column at position 2.
# NOTE(review): presumably 'Maternal Age', which the following cells use - confirm.
births.hist(2)

In [None]:
# Mean and SD of maternal age. Both statistics reference the column by the same
# label, so they are guaranteed to describe the same column even if the CSV's
# column order changes (the SD was previously taken by position).
avg = np.mean(births.column('Maternal Age'))
sd = np.std(births.column('Maternal Age'))
avg, sd

In [None]:
# Proportion of rows whose maternal age lies within 2 SDs of the average.
# The column is referenced by label, matching how `avg` is computed, rather than by position.
(births
 .where('Maternal Age', are.between(avg - 2*sd, avg + 2*sd))
 .num_rows
 / births.num_rows)