In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

# Lecture 25 #

## Average (Mean) ##

In [None]:
# Let's make a simple array
values = make_array(2, 3, 3, 9)
values

In [None]:
# Here's a pretty standard way to compute the average of the entries in an array:
sum(values)/len(values)

In [None]:
# Here's a faster way:
np.average(values)

In [None]:
# Here's an even shorter way (fewer keystrokes):
np.mean(values)

In [None]:
# You can do it by hand...
(2 + 3 + 3 + 9)/4

In [None]:
# Which, if you rewrite using rules of arithmetic, shows how we are "weighting" each unique entry in the array
# by the frequency with which it occurs in the array:
2*(1/4) + 3*(2/4) + 9*(1/4)

In [None]:
# Rewritten with decimals instead of fractions
2*0.25 + 3*0.5 + 9*0.25

In [None]:
# Let's make a table from our array:
values_table = Table().with_columns('value', values)
values_table

In [None]:
# Now we can make a histogram:
bins_for_display = np.arange(0.5, 10.6, 1)
values_table.hist('value', bins = bins_for_display)

In [None]:
## Make array of 10 2s, 20 3s, and 10 9s
# The *proportion* of each entry in this new array matches the proportion of
# that entry in the old array (2 -> 1/4, 3 -> 2/4, 9 -> 1/4).
# np.repeat copies each entry of the first array the number of times given by
# the matching entry of the second array, keeping the original order.
new_vals = np.repeat(make_array(2, 3, 9), make_array(10, 20, 10))

In [None]:
# Compute average of old array just to remind ourselves:
np.average(values)

In [None]:
# Compute average of new array:
np.average(new_vals)

In [None]:
# Make histogram:
Table().with_column('value', new_vals).hist(bins = bins_for_display)

#--

In [None]:
# You can think of the mean as telling you about the center of gravity of the distribution:
# the red triangle marks the point where the histogram would balance.
Table().with_column('value', new_vals).hist(bins = bins_for_display)
# Extend the y-axis slightly below 0 to leave room for the marker under the bars.
plots.ylim(-0.04, 0.5)
# Grey horizontal line at height 0, drawn from x=0 to x=10.
plots.plot([0, 10], [0, 0], color='grey', lw=2)
# Put the marker at the computed average (4.25 for this data) instead of a
# hard-coded x-position, so it stays correct if new_vals is ever changed.
plots.scatter(np.mean(new_vals), -0.015, marker='^', color='red', s=100)
plots.title('Average as a Center of Gravity');

## Standard Deviation ##

In [None]:
# Let's start building a table that we'll use to compute the standard deviation:
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# Let's compute the average (aka mean) of the values:
average_value = np.mean(values)
average_value

In [None]:
# Let's compute the *deviations* of each value.  This is simply each value minus the average value.
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

In [None]:
# Notice that the sum of deviations is 0.  This is ALWAYS the case (up to floating-point rounding), no matter what the data or distribution is.
sum(deviations)

In [None]:
# A better, more standard measure of variation of each value from the mean is the squared deviation.
# We are going to compute it by squaring the deviations:
sd_table = sd_table.with_column('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Variance of the data: mean squared deviation from average
# So we compute the mean of the squared deviations:

variance = np.mean(deviations ** 2)
variance

In [None]:
# Standard Deviation (SD): 
# root mean squared deviation from average
# = square root of the variance

sd = variance ** 0.5
sd

In [None]:
np.std(values)

## Chebyshev's Bounds ##

In [None]:
# The Chebyshev bounds tell us what proportion of the data is guaranteed to be within some number of 
# standard deviations from the mean, no matter what the data or distribution is!
births = Table.read_table('baby.csv').drop('Maternal Smoker')

In [None]:
births.labels

In [None]:
births.hist(overlay = False)

In [None]:
mpw = births.column('Maternal Pregnancy Weight')
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd

In [None]:
within_3_SDs = births.where(
    'Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# Proportion within 3 SDs of the mean

within_3_SDs.num_rows / births.num_rows

In [None]:
# Chebyshev's bound: 
# This proportion should be at least

1 - 1/3**2

In [None]:
births.labels

In [None]:
# See if Chebyshev's bounds work for distributions with various shapes.
# For every column of `births`, print the percent of rows whose value lies
# within z SDs of that column's average, for z = 2, 3, 4, 5.  Chebyshev's
# bound says this proportion should be at least 1 - 1/z**2.
#
# The loop uses its own names (feature_values, feature_mean, feature_sd) so
# that it does not overwrite the notebook-level `values`, `mean`, and `sd`
# assigned in earlier cells; re-running those earlier cells after this one
# therefore still gives the same results.

for feature in births.labels:
    feature_values = births.column(feature)
    feature_mean = np.mean(feature_values)
    feature_sd = np.std(feature_values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        lower = feature_mean - z*feature_sd
        upper = feature_mean + z*feature_sd
        chosen = births.where(feature, are.between(lower, upper))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')

## Standard Units ##

In [None]:
# Standard units is a way of re-expressing a set of data for a particular variable in units of standard deviation.
# Let's write a function to compute it:

def standard_units(x):
    """Re-express the numbers in array x in standard units.

    Each entry becomes the number of standard deviations it lies above
    (positive) or below (negative) the average of x.

    x: array of numbers.
    Returns: an array of the same length, (x - average of x) / SD of x,
    where the SD is computed with np.std.
    """
    center = np.mean(x)
    spread = np.std(x)
    deviations_from_center = x - center
    return deviations_from_center / spread

In [None]:
ages = births.column('Maternal Age')
ages

In [None]:
ages_standard_units = standard_units(ages)
ages_standard_units

In [None]:
# Notice that the mean of the data written in standard units is 0.  This is ALWAYS true.
# Notice also that the standard deviation of the data written in standard units is 1.  This is ALWAYS true.
np.mean(ages_standard_units), np.std(ages_standard_units)


## Discussion Question 

In [None]:
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both

In [None]:
np.mean(ages), np.std(ages)

In [None]:
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
plots.xlim(-2, 3.1);

## The SD and Bell Shaped Curves 

In [None]:
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))
plots.xticks(np.arange(57, 72, 2));

Estimates by eye

The average is approximately: 

Locate the point of inflection on the right. The SD is approximately:

In [None]:
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)