In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## The GSI's Defense ##

In [None]:
# Load the class roster; later cells read its 'Section' and 'Midterm' columns.
scores = Table.read_table('data/scores_by_section.csv')
# NOTE(review): by default a notebook cell only displays its last expression,
# so this bare name most likely shows nothing.
scores
# Roster ordered from the highest midterm score to the lowest.
scores.sort('Midterm', descending = True)
# max score of 25 on midterm

In [None]:
# Mean midterm score across the entire class.
np.mean(scores.column('Midterm'))

In [None]:
# Group the roster by 'Section' with no aggregation function and display every row
# (presumably one row per section with its student count -- see the note below).
scores.group('Section').show()
# section 3 which complained about their low midterm score
# has 27 students

In [None]:
# Average midterm score within each section, listed from lowest to highest.
section_averages = scores.group('Section', np.average)
section_averages.sort('Midterm average').show()

In [None]:
# Hard-coded (rounded) value rather than recomputed from `scores`;
# presumably read off the per-section averages shown earlier -- verify if the data changes.
observed_average = 13.6667 
# observed test statistic
# avg midterm score for section 3

# GOAL:
# simulate under the null (the grad student's perspective)
# and see if we can ever simulate the observed test statistic
# which is avg midterm score of 13.6667


# if we are able to simulate the obs. test statistic a bunch, this makes us want to side with the grad student
# this means our null hypothesis and our data are consistent


# if we aren't able to simulate the obs. test. stat. at all, it'll make us lean towards the UG perspective
# this means our null hypothesis and our data are INCONSISTENT


In [None]:
# Draw 27 students (section 3's size) from the roster without replacement,
# so no student can appear twice in the simulated section.
random_sample = scores.sample(27, with_replacement=False)
random_sample
# simulate a new section 3 from entire class roster

In [None]:
# Average midterm score of the one simulated section drawn above.
np.mean(random_sample.column('Midterm'))

In [None]:
# Simulate one value of the test statistic 
# under the hypothesis that the section is like a random sample from the class

def random_sample_midterm_avg(sample_size=27):
    """Simulate one value of the test statistic under the null hypothesis.

    Draws `sample_size` students from the class roster `scores` without
    replacement and returns the average of their 'Midterm' scores.

    sample_size: number of students to draw; defaults to 27, the size of
        section 3, so existing zero-argument calls behave as before.
    """
    # Local name differs from the notebook-level `random_sample` to avoid shadowing it.
    simulated_section = scores.sample(sample_size, with_replacement=False)
    return np.average(simulated_section.column('Midterm'))

In [None]:
# Simulate 50,000 copies of the test statistic

num_simulations = 50000

# Collect the draws in a list and convert to an array once: calling np.append
# inside the loop copies the whole array on every iteration (quadratic work).
sample_averages = np.array(
    [random_sample_midterm_avg() for _ in range(num_simulations)]
)

In [None]:
# How far each simulated average falls from the observed average, ignoring direction.
dist = np.abs(sample_averages - observed_average)
dist


In [None]:
# Compare the simulated distribution of the statistic
# and the actual observed statistic

#averages_tbl = Table().with_column('Random Sample Average', sample_averages)
distance_label = 'Abs Distance of Random Sample Average from Observed Average'
averages_tbl = Table().with_column(distance_label, dist)

averages_tbl.hist(bins=20)
# The observed average is at distance 0 from itself, so it is marked at the origin.
plots.scatter(0, 0, s=40, color='red');

# were we able to simulate the observed test statistic?

# is our data consistent with our null hypothesis?


# who do you side with? Grads or UGs?
# not sure yet....


# were we able to simulate the observed test statistic enough times for us to side with Grads? (null hypothesis)



In [None]:
# test statistic = average midterm score
# obs test stat = 13.6667

# what values of our test statistic make us lean toward the null?
# high values (to the RIGHT of our graph)


# what values of our test statistic make us lean toward the alternative?
# low (to the LEFT of our graph)



In [None]:
# Center of the simulated distribution of section averages.
np.mean(sample_averages)

In [None]:
# Number of simulated averages at or below the observed average.
# np.count_nonzero counts the True entries in one vectorized pass, whereas the
# builtin sum() walks the 50,000-element array one Python object at a time.
np.count_nonzero(sample_averages <= observed_average)

In [None]:
# Empirical p-value: the proportion of simulated averages at or below the observed one.
# Dividing by len(sample_averages) keeps the proportion correct if the number of
# simulations is ever changed, instead of relying on a hard-coded 50000.
np.count_nonzero(sample_averages <= observed_average) / len(sample_averages)

In [None]:
# Display the table built earlier. NOTE(review): it holds the absolute distances
# (`dist`) from the observed average, not the raw simulated averages.
averages_tbl

In [None]:
# 5% of 50,000 = 2500

five_percent_point = averages_tbl.sort(0).column(0).item(2500)
five_percent_point

In [None]:
averages_tbl.hist(bins = 20)
plots.plot([five_percent_point, five_percent_point], [0, 0.35], color='gold', lw=2)
plots.title('Area to the left of the gold line: 5%');