# Lecture 18: Decisions and Uncertainty

In [None]:
from datascience import *
import numpy as np

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' style sheet to every figure drawn in this notebook.
plots.style.use('fivethirtyeight')

In [None]:
import warnings
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)
from IPython.core.display import HTML
# The HTML(...) call below injects web-font links, custom CSS rules and a
# MathJax configuration script into the rendered notebook page.
HTML('''<link href='http://fonts.googleapis.com/css?family=Lora:400,700,400i,700i' rel='stylesheet'><link href='https://fonts.googleapis.com/css?family=Lato:300,400,700,300i,400i,700i' rel='stylesheet'><link href='https://fonts.googleapis.com/css?family=Inconsolata:400' rel='stylesheet'><link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css"><style>h1, h2, h3, h4, h5 { font-family: 'Lato', sans-serif; } h5 { font-style: normal; } kbd { font-family: Lato, serif; } hr { border-width: 2px; border-color: #a9a9a9; } .cite { font-size: 85%; text-align: right; margin-top: 10px; } .note { font-family: Lora, serif; font-size: 10pt; font-weight: 400; margin-top: 0; margin-bottom: 0; } h5.prehead { font-family: Lato, serif; font-style: normal; font-size: 14pt; font-weight: 300; margin-bottom: 15px; margin-top: 30px; } h5.lesson { font-family: Lato, serif; font-weight: 400; font-size: 15pt; font-style: normal; margin-top: 0px; margin-bottom: 5px; } h1.lesson_title { font-family: Lato, serif; font-weight: 300; font-size: 32pt; line-height: 110%; color:#CD2305; margin-top: 0px; margin-bottom: 15px; } div.cell{ max-width: 1120px; margin-left: auto; margin-right: auto; } div.text_cell_render { font-family: Lora, serif; line-height: 160%; font-size: 13pt; } .rendered_html pre, .rendered_html code  { font-family: Inconsolata, monospace !important; font-size: 13pt; } div.CodeMirror, div.output_area pre, div.prompt { font-family: Inconsolata, monospace !important; font-size: 125%; } .rendered_html ul li { margin-top: 0.75em; margin-bottom: 0.75em; } .rendered_html ul li ul li { margin-top: 0.5em; margin-bottom: 0.5em; } .rred { color: #a00000; } </style> <script> MathJax.Hub.Config({ TeX: { extensions: ["AMSmath.js"] }, tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ] }, displayAlign: 'center', // Change this to 'center' to center equations. 
"HTML-CSS": { styles: {'.MathJax_Display': {"margin": "0.75em 0"}} } }); </script>''')

# Student performance

Let's look at a set of student scores.  The table we'll load was downloaded [from Kaggle](https://www.kaggle.com/spscientist/students-performance-in-exams). 

In [None]:
# Load the Kaggle exam-score data into a Table; the bare name on the last
# line displays a preview of it.
student = Table.read_table('StudentsPerformance.csv')
student

In [None]:
# Mean math score across every student in the table.
np.mean(student.column('math score'))

In [None]:
# Average math score for each race/ethnicity group: keep only the two columns
# of interest, then aggregate the remaining column with np.average per group.
student.select('race/ethnicity', 'math score').group('race/ethnicity', np.average)

###  Null hypothesis: The low math score average for group C was due only to chance; that is, if we had picked a group of the same size at random from the population of students, we could have gotten an average like this one.

###  Alternative hypothesis: The low math score average for group C was not due to chance.  There were some other factors that were making these scores low.  

In [None]:
# Observed statistic: the mean math score among the students in group C.
group_c_math_scores = student.where('race/ethnicity', are.equal_to('group C')).column('math score')
observed_average = np.mean(group_c_math_scores)

In [None]:
# Size of group C; this is the sample size used for every simulated sample.
group_c_rows = student.where('race/ethnicity', are.equal_to('group C'))
sample_size = group_c_rows.num_rows
sample_size

### Statistic: average math score of a sample of 319 students

In [None]:
# Draw one random group of students of the same size as group C.
# Table.sample draws WITH replacement by default, which would allow the same
# student to appear more than once; a real group consists of distinct
# students, so sample without replacement to match the null hypothesis.
student_sample = student.sample(sample_size, with_replacement=False)
student_sample

In [None]:
def student_random_sample_math():
    """Simulate one value of the test statistic under the null hypothesis.

    Draws `sample_size` distinct students at random from `student` and
    returns the average of their math scores.

    Sampling is done without replacement: the null hypothesis describes
    picking a group of students from the population, and a group cannot
    contain the same student twice. (Table.sample defaults to sampling
    with replacement, so the flag must be passed explicitly.)
    """
    random_sample = student.sample(sample_size, with_replacement=False)
    return np.average(random_sample.column('math score'))

In [None]:
# Test student_random_sample_math function

student_random_sample_math()

In [None]:
# Simulate the statistic `repetitions` times and collect the results in an array.
# The values are gathered in a list and converted once: calling np.append inside
# the loop copies the whole array on every iteration, which is quadratic work.

repetitions = 10000
sample_averages = np.array(
    [student_random_sample_math() for _ in np.arange(repetitions)],
    dtype=float,
)

In [None]:
# Empirical distribution of the simulated averages, with the observed group C
# average marked as a red dot just below the horizontal axis.
histogram_bins = np.arange(62, 70, 0.2)
averages_tbl = Table().with_column('Random Sample Average', sample_averages)
averages_tbl.hist(bins=histogram_bins)
plots.scatter(observed_average, -0.001, color='red', s=40);

We can quantify the position of the observed data in the histogram by calculating what percentage of the histogram area lies at or to the left of it. This percentage is the proportion of simulated statistic values that are less than or equal to the observed value.

In [None]:
# Empirical P-value: the proportion of simulated averages that are at or below
# the observed average. Dividing by the actual number of simulated values
# (instead of a hard-coded 10000) keeps this correct if the number of
# repetitions is ever changed.
np.count_nonzero(sample_averages <= observed_average) / len(sample_averages)

This chance is the **observed significance level** of the test. It's also commonly called the **P-value** of the test.