In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 15 ##

## Simulating Coin Flips ##

In [None]:
# Let's simulate a coin flip by defining a function:

In [None]:
# Let's flip the coin 100 times, then count the number of heads.

In [None]:
# Let's define a function num_heads() which flips a coin 100 times, then counts the number of heads.

In [None]:
# Decide how many times you want to repeat the experiment

In [None]:
# Simulate that many outcomes by writing a for-loop which will run the experiment the number of times specified.

In [None]:
# Let's make a histogram!

## Random Sampling ##

We load in a dataset of all of United's domestic flights from 6/1/15 to 8/9/15, along with each flight's destination and how long it was delayed, in minutes.

In [None]:
# Run this cell to load the table.
# Reads the flight records from 'united.csv', then adds a 'Row' column holding
# each row's 0-based position and moves that column to the front, so that
# individual rows can be referred to by number (e.g. with `take`).
united = Table.read_table('united.csv')
united = united.with_column('Row', np.arange(united.num_rows)).move_to_start('Row')
united

### Some deterministic samples:

In [None]:
# No chance involved in this sample
united.where('Destination', 'JFK') 

In [None]:
# If I run the command again, I will get identical results.  This is why we say this is a deterministic sample
united.where('Destination', 'JFK') 

In [None]:
# Also deterministic, even if it feels like we are choosing the numbers "randomly"
united.take(make_array(34, 6321, 10040))

### A random sample:

In [None]:
# Systematic sample: choose a starting row at random from rows 0-999, then
# take every 1000th row from there up to the end of the table.
# The only chance involved is the choice of `start`, so re-running this cell
# can produce a different sample.
start = np.random.choice(np.arange(1000))
systematic_sample = united.take(np.arange(start, united.num_rows, 1000))
# Display the rows that were selected.
systematic_sample.show()

## Distributions ##

In [None]:
# Let's simulate a die roll.  First let's put the possible outcomes into a table:
die = Table().with_column('Face', np.arange(1, 7))
die

In [None]:
# With a table, we can use the method 'sample' to randomly choose rows from the table (with replacement)
die.sample(10)

In [None]:
# The probability distribution of the die
die.hist()

In [None]:
# The above is ugly because we are trying to use 10 bins for 6 outcomes.  Let's fix:
# Bin edges at 0.5, 1.5, ..., 6.5 give one unit-wide bin centered on each face 1-6.
# (The stop value 6.6 sits just past 6.5 so that np.arange includes the final edge.)
roll_bins = np.arange(0.5, 6.6, 1)

In [None]:
die.hist(bins=roll_bins)

In [None]:
# The empirical distribution of 10 die rolls.  Notice that this changes every time.
die.sample(10).hist(bins=roll_bins)

In [None]:
# When we increase the number of samples, the empirical distribution looks more like the probability distribution!
die.sample(1000).hist(bins=roll_bins)

In [None]:
# Same experiment with 100,000 rolls, drawn on the same one-bin-per-face edges.
die.sample(100000).hist(bins=roll_bins)

## Large Random Samples ##

In [None]:
united 

In [None]:
# Bin edges every 5 minutes from -20 to 200 (the stop value 201 makes np.arange
# include the final edge at 200).
# NOTE(review): negative delays presumably mean the flight left early -- confirm
# against the data source. Delays outside [-20, 200] are not expected to show up
# in this histogram; compare with the min/max of the 'Delay' column.
united_bins = np.arange(-20, 201, 5)
united.hist('Delay', bins = united_bins)

In [None]:
# Smallest value in the 'Delay' column. np.min is NumPy's vectorized reduction,
# matching the np.average / np.median calls used on this column elsewhere in
# the notebook, instead of Python's element-by-element builtin min.
np.min(united.column('Delay'))

In [None]:
# Largest value in the 'Delay' column. np.max is NumPy's vectorized reduction,
# matching the np.average / np.median calls used on this column elsewhere in
# the notebook, instead of Python's element-by-element builtin max.
np.max(united.column('Delay'))

In [None]:
# Mean of the 'Delay' column over every row in the table.
np.average(united.column('Delay'))

In [None]:
# Empirical distribution of delays in a random sample of 10 rows, drawn on the
# same bins as the full-table histogram. Re-running draws a new sample.
united.sample(10).hist('Delay', bins = united_bins)

In [None]:
# Same plot for a random sample of 1000 rows, on the same bins, so it can be
# compared with the full-table histogram.
united.sample(1000).hist('Delay', bins = united_bins)

## Simulating Statistics ##

In [None]:
# Median of the 'Delay' column across all rows of `united`.
np.median(united.column('Delay'))

In [None]:
# Median delay within one random sample of 10 rows; re-running this cell draws
# a new sample, so the value can change from run to run.
np.median(united.sample(10).column('Delay'))

In [None]:
def sample_median(size):
    """Return the median 'Delay' of a random sample of `size` rows from `united`.

    Each call draws a fresh sample with `united.sample(size)`, so repeated
    calls with the same `size` can return different values.
    """
    sampled_flights = united.sample(size)
    sampled_delays = sampled_flights.column('Delay')
    return np.median(sampled_delays)

In [None]:
# One simulated value of the statistic: the median delay in a sample of 10 rows.
sample_median(10)

In [None]:
# Simulate the statistic: 1000 times, draw a sample of 10 flights and record
# that sample's median delay.
# The medians are gathered in a list and converted to an array once, rather
# than calling np.append inside the loop, which copies the whole array on
# every iteration.
sample_medians = np.array([sample_median(10) for _ in range(1000)])

In [None]:
# Empirical distribution of the simulated sample medians, using unit-wide bins
# with edges at -10, -9, ..., 30.
Table().with_column('Sample medians', sample_medians).hist(bins = np.arange(-10,31))

In [None]:
# Repeat the simulation with a larger sample: 1000 times, draw a sample of
# 1000 flights and record that sample's median delay.
# The medians are gathered in a list and converted to an array once, rather
# than calling np.append inside the loop, which copies the whole array on
# every iteration.
sample_medians = np.array([sample_median(1000) for _ in range(1000)])

In [None]:
# Empirical distribution of the current `sample_medians`, drawn on unit-wide
# bins with edges at -10, -9, ..., 30.
Table().with_column(
    'Sample medians', sample_medians).hist(bins = np.arange(-10,31))