In [None]:
#: the usual imports
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

plt.style.use('fivethirtyeight')

from notebook.services.config import ConfigManager

# Slide-canvas settings stored under the "livereveal" config section
# (presumably read by the RISE slideshow extension -- confirm).
cm = ConfigManager()
cm.update("livereveal", dict(width=1200, height=900, scroll=True))

# Lecture 14

### CI's for Hypothesis Testing, Center and Spread

## Assignments

1. Lab due tomorrow
2. HW due Monday
3. Project 2 is out - get started!

## 95% CI's from 200 different bootstrap medians

![Bootstrap200.png](attachment:Bootstrap200.png)

## Discussion Question

About how many of these 200 confidence intervals do not contain the population median?    

A. 5  
B. 10  
C. 95  
D. 190  

# Confidence Intervals for Hypothesis Testing

## Using a CI for testing

* Null hypothesis: Population average = x
* Alternative hypothesis: Population average ≠ x
* Cutoff for P-value: p%
* Method:
    - Construct a (100-p)% confidence interval for the population average
    - If x is not in the interval, reject the null
    - If x is in the interval, can’t reject the null

### Average Total Pay of City of San Diego Employees

* Null hypothesis: Average total pay of all SD city employees is $\$73,000$  
* Alternative hypothesis: Average total pay of all SD city employees is not $\$73,000$
* Cutoff for P-value: 5%
* Method:
    - Construct a 95% confidence interval for the population average
    - If $\$73,000$ is not in the interval, reject the null
    - If $\$73,000$ is in the interval, can’t reject the null

In [None]:
#: read in the data
# Employee salary records, read from a CSV path relative to the notebook.
# Later cells use its 'Total Pay', 'Employee Name' and 'Job Title' columns.
population = Table.read_table('salaries.csv')
population

In [None]:
#: take a sample of size 500
# Draw one random sample of employees (without replacement) and record its mean.
sample = population.sample(500, with_replacement=False)
sample_mean = np.mean(sample.column('Total Pay'))

n_resamples = 5000

# Bootstrap the sample mean: resample the sample with replacement many times
# and collect the mean of each resample.
boot_means = make_array()
for i in range(n_resamples):
    # perform bootstrap resampling
    # A bootstrap resample must be exactly as large as the original sample, so
    # the size is read from `sample` instead of repeating the literal 500.
    resample = sample.sample(sample.num_rows, with_replacement=True)

    # compute the mean
    mean = np.mean(resample.column('Total Pay'))

    # tack it on to our list of means
    boot_means = np.append(boot_means, mean)

# The middle 95% of the bootstrapped means is the 95% confidence interval.
left = percentile(2.5, boot_means)
right = percentile(97.5, boot_means)
[left, right]

In [None]:
#: visualize
# Histogram of the bootstrapped means, with the interval [left, right] drawn
# as a thick lime bar along y = 0.
Table().with_column('Bootstrap Means', boot_means).hist(unit='$')
plt.plot([left, right], [0, 0], color='lime', linewidth=15, alpha=0.9, zorder=2)

### Among City of SD employees, do men and women make similar total pay?

* Null hypothesis: Median total pay of all employees is the same as the median total pay of male employees
* Alternative hypothesis: Median total pay of all employees is not the same as the median total pay of male employees
* Cutoff for P-value: 5%
* Method:
    - Construct a 95% confidence interval for the median total pay of all employees
    - If median total pay of males is not in the interval, reject the null
    - If median total pay of males is in the interval, can’t reject the null
    
Analogous hypothesis test for female employees

In [None]:
# Show the employee table again before deriving a sex column for it.
population

### How do we create a column labeling sex?
* Download a list of baby names from the internet!
* Join to the SD employee data
    - Caution: this join isn't perfect!

In [None]:
# Baby-name records; later cells use its 'name', 'sex' and 'number' columns.
names = Table.read_table('baby-names.csv')
names

## Making a list of names and associated sex
* Careful of unisex names!
* Use the sex that is most often associated to the name

## Approach:
* Count number of times each name is used for male/female; take most common.
* Trick: calculate # of times name was used as a male name minus # of times name was used as a female name
    - If positive, predominantly male
    - If negative, predominantly female

In [None]:
# +1 for rows whose 'sex' is exactly 'boy', -1 for every other value.
# NOTE(review): anything that is not the string 'boy' (other spelling/casing,
# missing value) is counted as female -- confirm the column holds only two values.
signs = np.where(names.column('sex')=='boy', 1, -1)
signs

In [None]:
# Attach the +1/-1 sign to each baby-name row.
names = names.with_column('sign', signs)
names

In [None]:
# Signed count: 'number' is kept positive for 'boy' rows and negated for all others.
names = names.with_column('signed number', names.column('number')*names.column('sign'))
names

In [None]:
# Keep only the two columns needed to decide the predominant sex per name.
name_map = names.select('name', 'signed number')
name_map

In [None]:
# Net count per name: sum the signed counts over all rows that share a name
# (result column is 'signed number sum'; positive means predominantly male).
# Built from `names` rather than from `name_map` itself so that re-running
# this cell yields the same table instead of grouping an already-grouped one.
name_map = names.select('name', 'signed number').group('name', np.sum)
name_map

In [None]:
# 'M' where the net count is strictly positive, otherwise 'F'.
# NOTE(review): a name whose male and female counts cancel to exactly 0 is labelled 'F'.
sexes = np.where(name_map.column('signed number sum') > 0, 'M', 'F')
sexes

In [None]:
# Table with one row per name, and predominant sex associated to that name.
# The summed-count column is dropped here; only 'name' and 'sex' remain.

name_map = name_map.with_column('sex', sexes).select('name', 'sex')
name_map

In [None]:
# Need to extract first names from SD data

def get_firstname(s):
    """Return the first whitespace-separated word of the name string `s`.

    Returns '' when `s` is empty or contains only whitespace, so a blank
    name produces an empty first name instead of raising IndexError.
    """
    words = s.split()
    return words[0] if words else ''

get_firstname("David P Gerboth")

In [None]:
# Add a First Name column to join SD data to names
# `population` is rebound here, so later cells that use `population`
# also see the 'First Name' column.
population = population.with_column('First Name', population.apply(get_firstname, 'Employee Name'))
population

In [None]:
# Join tables
# Match each employee's 'First Name' against `name_map`'s 'name' to attach the
# 'sex' column, then drop the helper column.
# NOTE(review): employees whose first name has no match in `name_map` are
# presumably absent from the result -- confirm how many rows are lost.
pop_with_sex = population.join('First Name', name_map, 'name' ).drop('First Name')
pop_with_sex

In [None]:
# Median 'Total Pay' within each 'sex' group: one row per sex label.
median_pay_breakdown = pop_with_sex.select('Total Pay', 'sex').group('sex', np.median)
median_pay_breakdown

In [None]:
# Look each median up by its 'sex' label rather than by row position, so the
# two values cannot be swapped if the row order of the grouped table changes.
# Column 1 holds the per-group median of 'Total Pay'.
women = median_pay_breakdown.where('sex', are.equal_to('F')).column(1).item(0)
men = median_pay_breakdown.where('sex', are.equal_to('M')).column(1).item(0)

###  Testing the Hypothesis

* Null hypothesis: Median total pay of all employees is the same as the median total pay of male employees
* Alternative hypothesis: Median total pay of all employees is not the same as the median total pay of male employees
* Cutoff for P-value: 5%
* Method:
    - Construct a 95% confidence interval for the median total pay of all employees
    - If median total pay of males is not in the interval, reject the null
    - If median total pay of males is in the interval, can’t reject the null
    
Analogous hypothesis test for female employees

In [None]:
#: take a sample of size 500
# Draw one random sample of employees (without replacement) and record its median.
sample = population.sample(500, with_replacement=False)
sample_median = np.median(sample.column('Total Pay'))

n_resamples = 5000

# Bootstrap the sample median: resample the sample with replacement many
# times and collect the median of each resample.
boot_medians = make_array()
for i in range(n_resamples):
    # perform bootstrap resampling
    # A bootstrap resample must be exactly as large as the original sample, so
    # the size is read from `sample` instead of repeating the literal 500.
    resample = sample.sample(sample.num_rows, with_replacement=True)

    # compute the median
    median = np.median(resample.column('Total Pay'))

    # tack it on to our list of medians
    boot_medians = np.append(boot_medians, median)

In [None]:
# 95% confidence interval: the middle 95% of the bootstrapped medians.
# Rebinds `left`/`right`, which earlier held the interval for the mean.
left = percentile(2.5, boot_medians)
right = percentile(97.5, boot_medians)
[left, right]

In [None]:
#: visualize
# Histogram of the bootstrapped medians, overlaid with the 95% confidence
# interval and the observed median pay of women and of men.
Table().with_column('Bootstrap Medians', boot_medians).hist(unit='$')
# Every overlay carries its own label, so each legend entry is tied to its
# artist explicitly. Passing a bare list of labels to plt.legend matches them
# to artists only by implicit order, which can attach a label to the wrong
# element. Stacking is controlled by zorder, not by call order.
plt.plot([left, right], [0, 0], color='lime', linewidth=5, zorder=1, label='95% CI')
plt.scatter(women, 0, color='blue', s=80, zorder=2, label='Women (Median)')
plt.scatter(men, 0, color='red', s=80, zorder=2, label='Men (Median)')
plt.legend()

### The median salaries of men/women are:
* significantly higher/lower than the population median.
* The test doesn't state *why* they're different.
* City employee salaries are fixed for any given job title and amount of experience.
    - Disparity is due to gender imbalance of job-types.
    - Question: FT/PT analysis?

### Job title: fire related

In [None]:
# Show the employee table again; the following cells filter on 'Job Title'.
population

In [None]:
# gender breakdown
# Row counts per sex among employees whose 'Job Title' contains 'Fire'.
# NOTE(review): this is a substring match, so it may also pick up titles that
# merely contain those letters -- confirm against the data.
pop_with_sex.where('Job Title', are.containing('Fire')).group('sex')

In [None]:
# median pay
# Median 'Total Pay' over the same 'Fire' job-title subset (both sexes together).
np.median(pop_with_sex.where('Job Title', are.containing('Fire')).column('Total Pay'))

### Job title: library related

In [None]:
# gender breakdown
# Row counts per sex among employees whose 'Job Title' contains 'Lib'.
# NOTE(review): this is a substring match, so it may also pick up non-library
# titles containing 'Lib' -- confirm against the data.
pop_with_sex.where('Job Title', are.containing('Lib')).group('sex')

In [None]:
# median pay
# Median 'Total Pay' over the same 'Lib' job-title subset (both sexes together).
np.median(pop_with_sex.where('Job Title', are.containing('Lib')).column('Total Pay'))

# Center and Spread

## Questions 
* How can we quantify natural concepts like “center” and “variability”?
* Why do many of the empirical distributions that we generate come out bell shaped?
* How is sample size related to the accuracy of an estimate?

## The Average (or Mean)

Given Data: $2, 3, 3, 9$, the average (or mean) is:
$$\text{Average} = \frac{2 + 3 + 3 + 9}{4} = 4.25$$

## The Average (or Mean)

* Need not be a value in the collection
* Need not be an integer even if the data are integers
* Somewhere between min and max, but not necessarily halfway in between
* Same units as the data.
* Smoothing operator: collect all the contributions in one big pot, then split evenly

### Discussion Question

Create a data set that has this histogram. (You can do it with a short list of whole numbers.) 

![image.png](attachment:image.png)

What are its median and mean?

### Discussion Question

Are the medians of these two distributions the same or different? Are the means the same or different? If you say “different,” then say which one is bigger.

![image.png](attachment:image.png)

A. same means and medians  
B. same means, different medians  
C. different means, same medians  
D. different means and medians  

### Answer

In [None]:
# Two nine-value data sets that differ only in their largest value (5 vs. 10).
tbl1 = Table().with_column('value', [1, 2, 2, 3, 3, 3, 4, 4, 5])

tbl2 = Table().with_column('value', [1, 2, 2, 3, 3, 3, 4, 4, 10])

In [None]:
#:
# Print the median and the mean of both tables, one statistic per line.
print(
    *(
        template % statistic(table.column('value'))
        for template, statistic, table in (
            ('median #1:\t%f', np.median, tbl1),
            ('median #2:\t%f', np.median, tbl2),
            ('mean #1:\t%f', np.mean, tbl1),
            ('mean #2:\t%f', np.mean, tbl2),
        )
    ),
    sep='\n'
)

## Comparing Mean and Median
* Mean: Balance point of the histogram
* Median: Half-way point of data; half the area of histogram is on either side of median
* If the distribution is symmetric about a value, then that value is both the average and the median.
* If the histogram is skewed, then the mean is pulled away from the median in the direction of the tail.

### Discussion Question
![image.png](attachment:image.png)

In [None]:
# Compare the mean and the median of the players' heights, then show their histogram.
nba = Table.read_table('nba2013.csv')
print(
    'mean:\t%f' % np.mean(nba.column('Height')),
    'median:\t%f' % np.median(nba.column('Height')),
    sep='\n'
)
nba.hist('Height')

# Standard Deviation: Measuring Spread

### How can we quantify how spread out a distribution is?

**Plan A:** “biggest value - smallest value”
* Doesn’t tell us much about the shape of the distribution
* Are extreme values rare or common?

**Plan B:**
* Measure variability around the mean
* Need to figure out a way to quantify this

### Deviations from the mean

In [None]:
# Small toy data set used to work through the standard deviation step by step.
values = make_array(2, 3, 3, 15)
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# average value
# Column 0 is the 'Value' column of the toy table.
average_value = np.mean(sd_table.column(0))
average_value

In [None]:
# Deviations from average: signed 
# (negative indicates left of mean, positive indicates right of mean)
deviations = values - average_value
# 'Mean' repeats the same average on every row so each deviation can be read off per row.
sd_table = sd_table.with_column('Mean', average_value).with_column('Deviation', deviations)
sd_table

In [None]:
# Sum of the deviations?
# Positive and negative deviations from the mean cancel, so this is 0
# (up to floating-point rounding) -- which is why the deviations are squared next.
sum(deviations)

In [None]:
# squared difference
# Squaring makes every deviation non-negative so they no longer cancel.
sd_table = sd_table.with_columns('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Variance of the data
# Mean of the squared deviations (divides by the number of values, not by one less).
variance = np.mean(sd_table.column('Squared Deviation'))
variance

In [None]:
# Standard Deviation (SD) is the square root of the variance
# Taking the root brings the measure back to the units of the data.
sd = variance ** 0.5
sd

## Standard Deviation
* numpy function: `np.std`
* Standard deviation (SD) measures roughly how far the data are from their average
* SD has the same units as the data

In [None]:
# Same quantity as the hand-computed `sd` of the toy values: with its default
# arguments np.std is the root of the mean squared deviation.
np.std(values)

## Why use the SD?

No matter what the shape of the distribution, the bulk of the data are in the range “average ± a few SDs”

## How many standard deviations away from average is each person's salary?

In [None]:
# One-column table holding only each employee's 'Total Pay'.
pay = population.select('Total Pay')
pay

In [None]:
# Average total pay over all employees (column 0 is 'Total Pay').
avg = pay.column(0).mean()
avg

In [None]:
# Signed distance of each salary from the average, in dollars
# (negative means below average).
pay = pay.with_column("Dollars Above Average", pay.column(0) - avg)
pay

In [None]:
# SD of total pay. This rebinds `sd`, which earlier held the SD of the toy values.
sd = np.std(pay.column(0))
sd

In [None]:
# Express each deviation in units of SDs; column 1 is "Dollars Above Average"
# when the preceding cells have been run in order.
pay = pay.with_column("SDs Above Average", pay.column(1)/sd)
pay

In [None]:
# Count salaries per one-SD-wide bin, using bin edges -3, -2, ..., 7.
# NOTE(review): salaries outside this range are presumably left out of the
# counts -- confirm against the column's min/max.
pay.bin('SDs Above Average', bins = np.arange(-3, 8, 1)).show()

## Chebyshev’s Inequality

No matter what the shape of the distribution, the proportion of values in the range “average ± z SDs” is at least 

$$1 - \frac{1}{z^2}$$

## Chebyshev's Bounds

|Range|Proportion|
|---|---|
|average ± 2 SDs|	at least 1 - 1/4   (75%)|
|average ± 3 SDs|	at least 1 - 1/9   (88.888…%)|
|average ± 4 SDs|	at least 1 - 1/16 (93.75%)|
|average ± 5 SDs|	at least 1 - 1/25  (96%)|

No matter what the distribution is!