In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Random Sampling

In [None]:
# Run this cell to load the flight table and put a 'Row' column (0, 1, 2, ...) in front
united = Table.read_table('united.csv')
row_numbers = np.arange(united.num_rows)
united = united.with_column('Row', row_numbers).move_to_start('Row')
united

### Some deterministic samples:

In [None]:
# A deterministic sample: the rows are chosen by a fixed condition on 'Destination'
# ('JFK'), so no chance is involved in this sample
united.where('Destination', 'JFK') 

In [None]:
# Running the same command again gives identical results.
# This is why we say this is a deterministic sample.
united.where('Destination', 'JFK') 

In [None]:
# Also deterministic: the row indices are written out explicitly, so the same rows come
# back on every run, even if it feels like we are choosing the numbers "randomly"
chosen_rows = make_array(34, 6321, 10040)
united.take(chosen_rows)

### A random sample:

In [None]:
# Systematic sample: pick a random starting row among 0..999,
# then take every 1000th row from that start to the end of the table
start = np.random.choice(np.arange(1000))
every_1000th = np.arange(start, united.num_rows, 1000)
systematic_sample = united.take(every_1000th)
systematic_sample.show()

## Empirical Distribution and the Probability Distribution

In [None]:
# Simulate a die roll, starting from a one-column table of the six possible faces (1 to 6)
die = Table().with_columns('Face', np.arange(1, 7))
die

In [None]:
# The table method 'sample' chooses rows at random. Sampling is with replacement
# (spelled out here; it is also the default), so a face can come up more than once.
die.sample(10, with_replacement=True)

In [None]:
# Bin edges at 0.5, 1.5, ..., 6.5, so that each face 1-6 sits in the middle of its own bin
die_bins = np.arange(1, 8) - 0.5

In [None]:
# The probability distribution of the die: the table lists each face exactly once
die.hist('Face', bins=die_bins)

In [None]:
# The empirical distribution of 10 die rolls.
# Every run of this cell draws new rolls, so the histogram changes each time.
ten_rolls = die.sample(10)
ten_rolls.hist(bins=die_bins)

In [None]:
# With many more rolls, the empirical distribution looks more like the probability distribution!
thousand_rolls = die.sample(1000)
thousand_rolls.hist(bins=die_bins)

## Statistics

In [None]:
# Run this cell to (re)load the flight table with a leading 'Row' column (0, 1, 2, ...)
united = Table.read_table('united.csv')
row_numbers = np.arange(united.num_rows)
united = united.with_column('Row', row_numbers).move_to_start('Row')

In [None]:
# (Population) Parameter
# The median value of 'Delay' over the entire "population" of flights in the dataset.
all_delays = united.column('Delay')
np.median(all_delays)

In [None]:
# (Sample) Statistic
# First draw a random sample of 10 flights, then take the median 'Delay' of that sample.
# This sample statistic is what we use to estimate the population parameter.
small_sample = united.sample(10)
np.median(small_sample.column('Delay'))

In [None]:
# (Sample) Statistic with a larger sample size (100 flights):
larger_sample = united.sample(100)
np.median(larger_sample.column('Delay'))

### Probability & Empirical Distributions of a Statistic

In [None]:
# A function that returns the median value of 'Delay' in a random sample of size 'size'
def sample_median(size):
    """Sample `size` rows at random from `united` and return the median of their 'Delay' values."""
    sampled_flights = united.sample(size)
    sampled_delays = sampled_flights.column('Delay')
    return np.median(sampled_delays)

In [None]:
# Try the function once with a sample of size 10; each call draws a new random sample
sample_median(10)

In [None]:
# How many times the simulation loops below repeat the sampling
num_simulations = 2000

In [None]:
# Let's run sample_median(10) 'num_simulations' times, and record all the results in an array.
# The array is allocated once and filled in place: growing it with np.append would
# copy every previously recorded result on each pass through the loop.
sample_medians = np.zeros(num_simulations)

for i in np.arange(num_simulations):
    sample_medians[i] = sample_median(10)

In [None]:
# Histogram (20 bins) of the simulated sample medians
medians_table = Table().with_column('Sample medians (size=10)', sample_medians)
medians_table.hist(bins=20)

In [None]:
# Let's instead run sample_median(1000) 'num_simulations' times.
# The array is allocated once and filled in place: growing it with np.append would
# copy every previously recorded result on each pass through the loop.
sample_medians = np.zeros(num_simulations)

for i in np.arange(num_simulations):
    sample_medians[i] = sample_median(1000)

In [None]:
# Histogram of the simulated sample medians for samples of size 1000
medians_table = Table().with_column('Sample medians (size=1K)', sample_medians)
medians_table.hist()

#### Empirical Distributions of a Statistic (Overlaid)

In [None]:
# Simulate the sample median for three sample sizes (10, 100, 1000) side by side.
num_simulations = 2000

# One slot per simulation, allocated up front and filled in place; growing the
# arrays with np.append would copy all earlier results on every iteration.
sample_medians_10 = np.zeros(num_simulations)
sample_medians_100 = np.zeros(num_simulations)
sample_medians_1000 = np.zeros(num_simulations)

for i in np.arange(num_simulations):
    # Within each iteration the three sizes are sampled in this fixed order
    sample_medians_10[i] = sample_median(10)
    sample_medians_100[i] = sample_median(100)
    sample_medians_1000[i] = sample_median(1000)

In [None]:
# Collect the three simulated arrays into one table, one column per sample size
sample_medians = (
    Table()
    .with_column('Size 10', sample_medians_10)
    .with_column('Size 100', sample_medians_100)
    .with_column('Size 1000', sample_medians_1000)
)

In [None]:
# Draw the three columns' histograms with shared unit-width bins starting at -5
median_bins = np.arange(-5, 30)
sample_medians.hist(bins=median_bins)

## Swain vs. Alabama ##

In [None]:
# Population proportions in Talladega County in 1965: 26% black, 74% non-black (in that order)
population_proportions = make_array(0.26, 0.74)
population_proportions

In [None]:
# Use the sample_proportions function to draw a random sample of size 100 from a
# population with the distribution above.
sample_proportions(100, population_proportions)

In [None]:
# A function that randomly samples from the population and returns the proportion of black jurors
def panel_proportion():
    """Simulate one panel of 100 from `population_proportions`; return the first category's proportion."""
    simulated_panel = sample_proportions(100, population_proportions)
    # Item 0 matches the first entry of population_proportions
    return simulated_panel.item(0)

In [None]:
# Simulate a single panel; the value is a proportion, and it varies from run to run
panel_proportion()

In [None]:
# Let's run this 10000 times and record the results (as counts out of 100):
num_panels = 10000

# Allocate the results array once and fill it in place; growing it with np.append
# would copy every earlier result on each pass through the loop.
panels = np.zeros(num_panels)

for i in np.arange(num_panels):
    # A proportion of a panel of 100, times 100, is the number of jurors
    panels[i] = panel_proportion() * 100

In [None]:
# Histogram of the simulated panels, with unit-width bins whose edges run from 5.5 to 39.5

panel_counts = Table().with_column(
    'Number of Black Men on Panel of 100', panels
)
count_bins = np.arange(5.5, 40.)
panel_counts.hist(bins=count_bins)

# Plotting details; ignore this code
# (sets the vertical range and marks the value 8 on the horizontal axis with a red dot)
plots.ylim(-0.002, 0.09)
plots.scatter(8, 0, color='red', s=30);

## Mendel and Pea Flowers ##

In [None]:
# Mendel had 929 plants, of which 705 had purple flowers (the remaining 224 were white).
# observed_purples is the observed proportion of purple-flowering plants.
observed_purples = 705 / 929
observed_purples

In [None]:
# Does the evidence Mendel collected count as evidence against the model that 75% of flowers are purple?
# Let's simulate one sample of 929 plants under that model (75% purple, 25% not purple)
predicted_proportions = make_array(0.75, 0.25)
sample_proportions(929, predicted_proportions)

In [None]:
def purple_flowers():
    """Simulate 929 plants under `predicted_proportions`; return the first category's share as a percent."""
    simulated = sample_proportions(929, predicted_proportions)
    # Item 0 matches the first entry of predicted_proportions
    purple_proportion = simulated.item(0)
    return purple_proportion * 100

In [None]:
# Simulate once; the value is a percent (0-100), and it varies from run to run
purple_flowers()

In [None]:
# Simulate the percent of purple flowers 10000 times and record every result.
num_samples = 10000

# Allocate the results array once and fill it in place; growing it with np.append
# would copy every earlier result on each pass through the loop.
purples = np.zeros(num_samples)

for i in np.arange(num_samples):
    purples[i] = purple_flowers()

In [None]:
# Empirical distribution of the simulated percentages
purples_table = Table().with_column('Percent of purple flowers in sample of 929', purples)
purples_table.hist()

In [None]:
# Distance of each simulated percentage from the model's 75%
discrepancies = np.abs(purples - 75)
Table().with_column('Discrepancy in sample of 929 if the model is true', discrepancies).hist()

In [None]:
# The same discrepancy for the observed data: distance of the observed percent from 75
observed_percent = observed_purples * 100
abs(observed_percent - 75)