In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

## Bootstrap review: Santa Barbara County Salaries in 2020

Data source: [Government Compensation in California](https://publicpay.ca.gov/Reports/Counties/County.aspx?entityid=42&year=2020).

Let's look at how much Santa Barbara County employees were paid in 2020.

In [None]:
# Load the 2020 Santa Barbara County salary records and keep only rows whose
# TotalWages exceed 10*40*52 = 20,800 (presumably $10/hour x 40 hours/week x
# 52 weeks, i.e. a full-time floor -- confirm the intended threshold).
sb_pop = Table.read_table('data/santabarbara-county-salaries-2020.csv')
sb_pop = sb_pop.where('TotalWages', are.above(10 * 40 * 52))

# The "population" parameter that the confidence intervals try to capture.
pop_median = percentile(50, sb_pop.column('TotalWages'))
print("Population median is $", pop_median)

# Draw one sample of 300 employees without replacement and report its median.
# percentile(50, ...) is used here as well, so the sample median follows the
# same definition as the population median and the bootstrap medians.
our_sample = sb_pop.sample(300, with_replacement = False)
print('original sample median', percentile(50, our_sample.column('TotalWages')))

# TotalWages includes regular pay, overtime pay, lump-sum pay, and other pay.
# It does NOT include benefits.

In [None]:
def one_bootstrap_median():
    """Return the median TotalWages of one bootstrap resample of `our_sample`.

    Reads the notebook-level `our_sample` at call time, so it resamples
    whichever sample was drawn most recently. `sample()` is called with no
    arguments, which in the datascience library draws as many rows as the
    table has, with replacement.
    """
    resampled_wages = our_sample.sample().column('TotalWages')
    return percentile(50, resampled_wages)

# Histogram bin edges for the bootstrap medians: 50000 through 100000 in steps of 2500.
med_bins = np.arange(50000, 100001, 2500)


In [None]:
## The whole process, start to finish:
## 1. Take a sample of 300 employees (without replacement) and note its median.

our_sample = sb_pop.sample(300, with_replacement=False)
our_sample_median = percentile(50, our_sample.column('TotalWages'))


## 2. Resample it 201 times, recording the median of every bootstrap sample.
##    dtype=float matches the float array that appending to make_array() produces.

bootstrap_medians = np.array(
    [one_bootstrap_median() for _ in np.arange(201)],
    dtype=float,
)


## 3. The middle 95% of the bootstrap medians is our confidence interval.

left, right = percentile(2.5, bootstrap_medians), percentile(97.5, bootstrap_medians)

## Histogram of the bootstrap medians, with the interval in gold,
## the population median in red, and this sample's median in blue.
Table().with_column('Bootstrap Medians', bootstrap_medians).hist('Bootstrap Medians', bins=med_bins)
plots.plot([left, right], [0, 0], color="gold", lw=4, zorder=1);
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);

## Bootstrap Review: Mean Maternal Age

In [None]:
# Load the births data and preview its first five rows.
births = Table.read_table('data/baby.csv')
births.show(5)

In [None]:
# Histogram of the 'Maternal Age' column.
births.hist('Maternal Age')

In [None]:
# Mean of the 'Maternal Age' column over the whole births table; this is the
# original-sample statistic that the bootstrap means are compared against.
mean_age = np.mean(births.column('Maternal Age'))
mean_age

In [None]:
# Can you repeat the steps we did in the previous section?

# Define a function that captures one bootstrap mean; call it one_bootstrap_mean().
def one_bootstrap_mean():
    """Return the mean 'Maternal Age' of one resample drawn from `births`.

    `births.sample()` is called with no arguments, which in the datascience
    library draws as many rows as the table has, with replacement.
    """
    resampled_ages = births.sample().column('Maternal Age')
    return np.mean(resampled_ages)

# Try it once; every call gives a slightly different mean.
one_bootstrap_mean()

In [None]:
# Draw 1000 bootstrap samples and record the mean of each one.
bootstrap_means = np.array(
    [one_bootstrap_mean() for _ in np.arange(1000)],
    dtype=float,
)



# Endpoints of the middle 95% of the bootstrap means:
# `left` and `right` are the two ends of the 95% confidence interval shown below.
left, right = percentile(2.5, bootstrap_means), percentile(97.5, bootstrap_means)


In [None]:
# Plot the distribution of the bootstrap means together with the
# 95% confidence interval and the original mean_age.

Table().with_column('Bootstrap means', bootstrap_means).hist('Bootstrap means')

# Mean of the original births sample, drawn as a blue dot on the x-axis.
plots.scatter(mean_age, 0, color="blue", zorder=2);
# Confidence interval, drawn as a gold bar underneath the dot.
# The trailing semicolons keep the cell from echoing matplotlib object reprs.
plots.plot([left, right], [0, 0], color="gold", lw=6, zorder=1);





## Average (Mean) ##

In [None]:
# Small example data set; note that 3 appears twice.
values = make_array(2, 3, 3, 9)

In [None]:
# Mean by definition: add up the entries and divide by how many there are.
sum(values)/len(values)

In [None]:
# The same quantity computed with np.average.
np.average(values)

In [None]:
# The same quantity computed with np.mean.
np.mean(values)

In [None]:
# The same calculation written out by hand for the four entries.
(2 + 3 + 3 + 9)/4

In [None]:
# The same mean as a weighted sum: each distinct value times the proportion
# of entries equal to it (2 once, 3 twice, 9 once, out of 4 entries).
2*(1/4) + 3*(2/4) + 9*(1/4)

In [None]:
# Put the values in a one-column table so they can be drawn as a histogram.
values_table = Table().with_columns('value', values)
values_table

In [None]:
# Bin edges at 0.5, 1.5, ..., 10.5: unit-width bins, each centred on an integer.
bins_for_display = np.arange(0.5, 10.6, 1)
# Histogram of the table's first (index 0) column using those bins.
values_table.hist(0, bins = bins_for_display)

In [None]:
## Build an array of ten 2s, twenty 3s, and ten 9s
## (the same proportions as `values`, with every count multiplied by 10).

new_vals = np.repeat(make_array(2, 3, 9), make_array(10, 20, 10))

In [None]:
# Histogram of new_vals, drawn with the same bins_for_display as before.
Table().with_column('value', new_vals).hist(bins = bins_for_display)

In [None]:
# Average of the original four values ...
np.average(values)

In [None]:
# ... and average of the 40-entry array with the same proportions of 2s, 3s, and 9s.
np.average(new_vals)

### Discussion Question

In [None]:
# Load the NBA table (presumably 2013 player data, judging by the file name) and display it.
nba = Table.read_table('data/nba2013.csv')
nba

In [None]:
# Histogram of 'Height' with unit-width bins whose edges run 65.5, 66.5, ..., 89.5.
nba.hist('Height', bins=np.arange(65.5, 90.5))

In [None]:
# can you compare the median height






In [None]:
# to the average height?



# which is bigger?

## Standard Deviation ##

In [None]:
# Start a table holding the example values; later cells add columns to it.
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# Average of the table's first (index 0) column, i.e. the 'Value' column.
average_value = np.average(sd_table.column(0))
average_value

In [None]:
# Deviation from average: each value minus the average of all the values.
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

In [None]:
# Sum of the deviations; for these values, -2.25 - 1.25 - 1.25 + 4.75 = 0.
sum(deviations)

In [None]:
# Square each deviation so that negative and positive deviations no longer cancel.
sd_table = sd_table.with_columns('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Variance of the data: the mean of the squared deviations

variance = np.mean(sd_table.column('Squared Deviation'))
variance

In [None]:
# Standard Deviation (SD) is the square root of the variance,
# which brings the measure of spread back to the units of the original values

sd = variance ** 0.5
sd

In [None]:
# numpy's built-in SD, for comparison with the step-by-step result
# (np.std divides by n by default, as the calculation above does).
np.std(values)