In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 10 ##

## Prediction ##

In [None]:
#The data below are Galton's carefully collected measurements on the heights of parents and their adult children. 
#Each row corresponds to one adult child. The variables are a numerical code for the family, the heights (in inches) 
#of the father and mother, a "midparent height" which is a weighted average of the height of the two parents, 
#the number of children in the family, as well as the child's birth rank (1 = oldest), gender, and height.
galton = Table.read_table('galton.csv')
galton

In [None]:
#The goal is to predict the adult height of a child born to parents similar to those in the dataset. 
#Let us try to do this, using midparent height as the variable on which to base our prediction. 
#Thus midparent height is our predictor variable.
galton.scatter('midparentHeight', 'childHeight')
#galton.scatter('midparentHeight', 'childHeight',color=['red','green'])


In [None]:
#What would be a good way to go about predicting a child's height, given that the midparent height was, 
#say, 68 inches?

#One reasonable approach would be to base the prediction on all the points that correspond to a midparent height of 
#around 68 inches. The prediction equals the average child's height calculated from those points alone.
#We will take "close" to mean "within half an inch". The figure below shows all the points corresponding to a 
#midparent height between 67.5 inches and 68.5 inches. 

galton.scatter('midparentHeight', 'childHeight')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);
#plots.scatter(68, 66.24, color='gold', s=40);

In [None]:
#In order to calculate exactly where the gold dot should be, we first need to identify all the points in the strip. 
#These correspond to the rows where midparentHeight is between 67.5 inches and 68.5 inches.
nearby = galton.where('midparentHeight', are.between(67.5, 68.5))

#The predicted height of a child who has a midparent height of 68 inches is the average height of the 
#children in these rows. That's 66.24 inches.

nearby_mean = nearby.column('childHeight').mean()
nearby_mean

In [None]:
galton.scatter('midparentHeight', 'childHeight')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='gold', s=50);

In [None]:
def predict(h, half_width=1/2):
    """Predict a child's height from a midparent height of h inches.

    The prediction is the mean 'childHeight' over the rows of the global
    galton table whose 'midparentHeight' is close to h.

    h: the midparent height (in inches) to predict for.
    half_width: how far (in inches) a midparent height may be from h and
        still count as "close". The default of half an inch gives a strip
        one inch wide.

    Returns the average child height of the rows in the strip.
    """
    #are.between keeps the lower bound and excludes the upper bound, so the
    #strip is h - half_width <= midparentHeight < h + half_width.
    in_strip = galton.where('midparentHeight', are.between(h - half_width, h + half_width))
    return in_strip.column('childHeight').mean()

In [None]:
predict(68)

In [None]:
predict(70)

In [None]:
predict(73)

In [None]:
predicted_heights = galton.apply(predict, 'midparentHeight')
predicted_heights

In [None]:
galton = galton.with_column('predictedHeight', predicted_heights)
galton

In [None]:
#Plot both the actual and the predicted child heights against midparent height.
galton.select('midparentHeight', 'childHeight', 'predictedHeight').scatter('midparentHeight')

## Prediction Accuracy ##

In [None]:
def difference(x, y):
    """Return x minus y (used below as actual height minus predicted height)."""
    gap = x - y
    return gap

In [None]:
#Prediction error for each child: the actual height minus the predicted height.
pred_errs = galton.apply(difference, 'childHeight', 'predictedHeight')
pred_errs

In [None]:
#The 'errors' column is not added to galton until the next cell, so on a fresh
#kernel galton.hist('errors') would fail here. Plot from a temporary table instead:
#with_column returns a new table and leaves galton itself unchanged.
galton.with_column('errors', pred_errs).hist('errors')

In [None]:
galton = galton.with_column('errors',pred_errs)
galton


In [None]:
galton.hist('errors')

In [None]:

galton.hist('errors', group='gender')

# Discussion Question

In [None]:
def predict_smarter(h, g, half_width=1/2):
    """Predict a child's height from a midparent height h and the child's gender g.

    Like predict, this averages 'childHeight' over the rows of the global
    galton table whose 'midparentHeight' is close to h, but it only uses the
    rows whose 'gender' equals g.

    h: the midparent height (in inches) to predict for.
    g: the value of the 'gender' column to restrict to, e.g. 'female' or 'male'.
    half_width: how far (in inches) a midparent height may be from h and
        still count as "close". The default of half an inch gives a strip
        one inch wide.

    Returns the average height of the same-gender children in the strip.
    """
    #are.between keeps the lower bound and excludes the upper bound, so the
    #strip is h - half_width <= midparentHeight < h + half_width.
    in_strip = galton.where('midparentHeight', are.between(h - half_width, h + half_width))
    #Passing a plain value to where keeps the rows whose 'gender' equals that value.
    same_gender = in_strip.where('gender', g)
    return same_gender.column('childHeight').mean()

In [None]:
predict_smarter(68, 'female')

In [None]:
predict_smarter(68, 'male')

In [None]:
smarter_predicted_heights = galton.apply(predict_smarter, 'midparentHeight', 'gender')
galton = galton.with_column('smartPredictedHeight', smarter_predicted_heights)

In [None]:
smarter_pred_errs = galton.apply(difference, 'childHeight', 'smartPredictedHeight')
galton = galton.with_column('smartErrors', smarter_pred_errs)

In [None]:
galton.hist('smartErrors', group='gender')

## Grouping by One Column ##

In [None]:
cones = Table.read_table('cones.csv')

In [None]:
cones

In [None]:
#There are two distinct categories, chocolate and strawberry. The call to group creates a table of counts in each category. 
#The column is called count by default, and contains the number of rows in each category.
cones.group('Flavor')

In [None]:
cones.drop('Color').group('Flavor', np.average)

In [None]:
#Why did we drop Color in the previous cell? Compare the result when Color is kept.
cones.group('Flavor', np.average)

In [None]:
cones.drop('Color').group('Flavor', min)

##The group method also allows us to classify individuals according to multiple variables. This is called cross-classifying.##


In [None]:
#A table of six cones in which every cone has a Color in addition to its Flavor and Price.
more_cones = Table().with_columns(
    'Flavor', make_array('strawberry', 'chocolate', 'chocolate', 'strawberry', 'chocolate', 'bubblegum'),
    'Color', make_array('pink', 'light brown', 'dark brown', 'pink', 'dark brown', 'pink'),
    'Price', make_array(3.55, 4.75, 5.25, 5.25, 5.25, 4.75)
)
more_cones


In [None]:
more_cones.group('Flavor')

In [None]:
#But now each cone has a color as well. To classify the cones by both flavor and color, 
#we will pass a list of labels as an argument to group. 
#The resulting table has one row for every unique combination of values that appear together 
#in the grouped columns. 
#Although there are six cones, there are only four unique combinations of flavor and color. 
#Two of the cones were dark brown chocolate, and two pink strawberry.

more_cones.group(['Flavor', 'Color'])

In [None]:
#A second argument aggregates all other columns that are not in the list of grouped columns.
more_cones.group(['Flavor', 'Color'], sum)

In [None]:
#What happens if you don't include Color in the list of grouping columns?
more_cones.group(['Flavor'], sum)

## Lists

In [None]:
[1, 5, 'hello', 5.0]

In [None]:
[1, 5, 'hello', 5.0, make_array(1,2,3)]

## Pivot Tables

Pivot tables, also known as contingency tables, make it easier to work with data that 
have been classified according to two variables.


In [None]:
more_cones.group(['Flavor', 'Color'])

In [None]:
more_cones.pivot('Flavor', 'Color')
#try switching the order

In [None]:
#pivot being used to find the total price of the cones in each cell.
more_cones.pivot('Flavor', 'Color', values='Price', collect=sum)

In [None]:
#here is group doing the same thing.
more_cones.group(['Flavor', 'Color'], sum)


#Though the numbers in both tables are the same, the table produced by pivot is easier to read and lends 
#itself more easily to analysis. The advantage of pivot is that it places grouped values into adjacent columns, 
#so that they can be combined and compared.
