In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action='ignore',category=np.VisibleDeprecationWarning)

## Lecture 10 ##

## Prediction ##

In [None]:
# Load the family heights data. Later cells read its 'father', 'mother', 'child', and 'sex' columns.
families = Table.read_table('family_heights.csv')
families

In [None]:
# Summarize each family's two parents with a single number: the midpoint of the father's
# and the mother's heights. The result is an array with one average per row of `families`.
parent_avgs = 0.5 * (families.column('father') + families.column('mother'))

In [None]:
# Assemble the working table 'heights' one column at a time: the parents' average height,
# the child's height, and the child's sex.
heights = (
    Table()
    .with_column('Parent Average', parent_avgs)
    .with_column('Child', families.column('child'))
    .with_column('Sex', families.column('sex'))
)
heights

In [None]:
# What kind of plot should we use to see if there is an association between parent average and child?
# Both columns are numerical, so we draw a scatter plot: 'Parent Average' on the horizontal axis
# and 'Child' on the vertical axis.
heights.scatter('Parent Average', 'Child')

In [None]:
# Same scatter plot, now highlighting the data where the parent average height is between
# 67.5 and 68.5 in. A single plot() call draws both red vertical guides (each running from
# 50 to 85 on the vertical axis); the trailing semicolon hides the call's return value.
heights.scatter('Parent Average', 'Child')
plots.plot([67.5, 67.5], [50, 85], [68.5, 68.5], [50, 85], color='red', lw=2);

In [None]:
# Keep only the rows whose parent average lies in the 67.5-to-68.5 band, then take the mean
# of the children's heights in those rows.
nearby = heights.where('Parent Average', are.between(67.5, 68.5))
nearby_mean = np.mean(nearby.column('Child'))
nearby_mean

In [None]:
# Mark the band's average child height with a red dot at the middle of the band (68).
# That average is a good *prediction* of a child's height when all we know is that the
# parents' average height is between 67.5 and 68.5.
heights.scatter('Parent Average', 'Child')
plots.plot([67.5, 67.5], [50, 85], [68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='red', s=50);

In [None]:
# Let's make similar predictions for the other data in the heights table.
# We'll begin by defining a function `predict()` which does what we did above.
def predict(h, window=1):
    """Predict a child's height from the parents' average height `h`.

    The prediction is the mean 'Child' height over the rows of `heights` whose
    'Parent Average' is at least `h - window/2` and below `h + window/2`.

    h: parents' average height, in the same units as the 'Parent Average' column.
    window: total width of the band around `h`. The default of 1 reproduces the
        67.5-to-68.5 band used above when `h` is 68.

    Returns the mean child height inside the band. When no row falls inside the
    band there is nothing to average, so the result is not a usable prediction.
    """
    half = window / 2
    nearby = heights.where('Parent Average', are.between(h - half, h + half))
    return np.average(nearby.column('Child'))

In [None]:
# We can call predict() on many inputs:
# (68 reuses the 67.5-to-68.5 band from before, so this should equal nearby_mean)
predict(68)

In [None]:
# Parents averaging 70 in.: the band is now 69.5 to 70.5.
predict(70)

In [None]:
# though we may get some errors if there is no data
# (presumably no family has a parent average within half an inch of 120)
predict(120)

In [None]:
# Here is where we will use the apply method!  Call the function predict on every entry of `Parent Average` column
# The result is an array holding one predicted height per row of `heights`.
predicted_heights = heights.apply(predict, 'Parent Average')

In [None]:
# Let's add our predictions to the heights table
# (with_column returns a new table, so we rebind `heights` to the version with the extra column)
heights = heights.with_column('Prediction', predicted_heights)

In [None]:
# Let's make a scatter plot which includes our predictions.
# Only 'Parent Average' is named (as the horizontal axis), so the other two selected columns,
# 'Child' and 'Prediction', are each plotted against it. 'Sex' is left out by the select.
heights.select('Parent Average', 'Child', 'Prediction').scatter('Parent Average')

## Prediction Accuracy ##

In [None]:
# One way to measure the error in our prediction is to simply take the difference between our prediction of 
# a child's height, and the actual height:


In [None]:
# Let's compute this error for every row in the table using apply, then add a column of errors in the table:


In [None]:
# Let's do a simple histogram of the errors.


In [None]:
# We can plot a pair of histograms, one for each value of 'Sex', to see if there are differences in errors for
# these two groups.


# Discussion Question

In [None]:
# We see there is a difference in the errors of female children's height and male children's height.
# We can improve our predictions by incorporating information about Sex.
def predict_smarter(h, s):
    """Predict a child's height from the parents' average height `h` and the child's sex `s`.

    Restricts `heights` to rows whose 'Parent Average' is at least `h - 1/2` and
    below `h + 1/2`, keeps only the rows whose 'Sex' equals `s` (for example
    'female' or 'male'), and returns the mean 'Child' height of what remains.
    """
    nearby = heights.where('Parent Average', are.between(h - 1/2, h + 1/2))
    nearby_same_sex = nearby.where('Sex', are.equal_to(s))
    return np.average(nearby_same_sex.column('Child'))


In [None]:
# Sex-aware prediction for a female child whose parents average 68 in.
# (relies on predict_smarter having been defined in an earlier cell)
predict_smarter(68, 'female')

In [None]:
# Same parents' average, but for a male child, to compare with the female prediction.
# (relies on predict_smarter having been defined in an earlier cell)
predict_smarter(68, 'male')

In [None]:
# We can use the apply method with two arguments by specifying two columns of the table in order:


In [None]:
# Let's again analyze the error, but now of our smarter prediction:


## Grouping by One Column ##

In [None]:
# Read the ice cream cone data, then discard its 'Color' column and keep the rest.
cones = Table.read_table('cones.csv')
cones = cones.drop('Color')
cones

In [None]:
# With no second argument, group counts how many rows there are for each distinct flavor.
cones.group('Flavor')

In [None]:
# Passing a function aggregates instead of counting: here, the average of the remaining
# columns within each flavor.
cones.group('Flavor', np.average)

In [None]:
# Any function that reduces an array to one value works; np.min gives the smallest value per flavor.
cones.group('Flavor', np.min)

## Grouping By One Column: NBA Salaries ##

In [None]:
# Load the NBA salary data and preview just its first 3 rows.
# Later cells use its 'salary', 'position', and 'season' columns.
nba = Table.read_table('nba_salaries.csv')
nba.show(3)

In [None]:
# Histogram of the values in the 'salary' column, to see how salaries are distributed.
nba.hist('salary')

In [None]:
# Let's group players by position and compute the averages of the other columns
# (one row per position; the averaged salary column is labeled 'salary average', which later cells use)
by_position = nba.group('position', np.average)
by_position 

In [None]:
# Let's make a bar chart showing salary average by position
# (horizontal bars: one bar per position, with length given by 'salary average')
by_position.barh('position', 'salary average')

In [None]:
# Reorder the rows so the position with the highest average salary comes first.
by_position = by_position.sort('salary average', descending=True)
by_position

In [None]:
# Redraw the bar chart from the sorted table so the bars follow the sorted row order.
by_position.barh('position', 'salary average')

In [None]:
# Have NBA salaries increased by year?
# Group by 'season' and average the other columns, giving one row per season.
nba_by_year = nba.group('season', np.average)
nba_by_year

In [None]:
# Plot!
# Line plot of 'salary average' (vertical axis) against 'season' (horizontal axis).
nba_by_year.plot('season', 'salary average')

## Lists

In [None]:
# A list may hold values of different types: here two ints, a string, and a float.
[1, 5, 'hello', 5.0]

In [None]:
# A list element can even be an entire array.
[1, 5, 'hello', 5.0, make_array(1,2,3)]

## Grouping by Two Columns ##

In [None]:
# Let's try grouping by TWO columns.
# The list of labels asks for one group per (position, season) pair; show() with no argument
# displays every row of the result.
nba.group(['position', 'season'], np.average).show()

In [None]:
# This breaks the dataset up into many groups, one group for each possible value of position AND season.
