In [1]:
import pandas as pd
import numpy as np

In [None]:
# Synthetic demo data: three products (A, B, C) with one row per month over
# 2018-2020, i.e. 36 rows. Seeding the generator with 0 keeps the values
# reproducible between runs.
g = np.random.default_rng(0)
month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
df = pd.DataFrame(g.integers(0, 100, size=(36, 3)), columns=["A", "B", "C"])
# Each year is repeated 12 times so it lines up with the cycling month names.
df["year"] = [year for year in (2018, 2019, 2020) for _ in range(12)]
df["month"] = month_names * 3
df.head()

Unnamed: 0,A,B,C,year,month
0,85,63,51,2018,Jan
1,26,30,4,2018,Feb
2,7,1,17,2018,Mar
3,81,64,91,2018,Apr
4,50,60,97,2018,May


In [None]:
# Promote year and month to a two-level (multi) index so rows can be selected
# by year, by month, or by both together.
index_levels = ["year", "month"]
df = df.set_index(index_levels)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jan,85,63,51
2018,Feb,26,30,4
2018,Mar,7,1,17
2018,Apr,81,64,91
2018,May,50,60,97


In [None]:
# A (year, month) tuple addresses both index levels at once; the second
# argument picks the columns.
jan_2018 = (2018, "Jan")
df.loc[jan_2018, ["A"]]

A    85
Name: (2018, Jan), dtype: int64

In [None]:
# A plain list of labels works too: it matches the first index level (year)
# and returns every month of the listed years.
selected_years = [2018, 2020]
df.loc[selected_years]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jan,85,63,51
2018,Feb,26,30,4
2018,Mar,7,1,17
2018,Apr,81,64,91
2018,May,50,60,97
2018,Jun,72,63,54
2018,Jul,55,93,27
2018,Aug,81,67,0
2018,Sep,39,85,55
2018,Oct,3,76,72


In [None]:
# pd.IndexSlice spells out the row selector: ":" takes every year, and the
# list keeps only the January and June rows. Columns A through C are kept.
rows = pd.IndexSlice[:, ["Jan", "Jun"]]
df.loc[rows, "A":"C"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Jan,85,63,51
2019,Jan,8,29,48
2020,Jan,42,48,71
2018,Jun,72,63,54
2019,Jun,38,46,99
2020,Jun,33,76,39


# SAT scores
1. Read in the scores file using Year, State.Code, Total.Math, Total.Test-takers, Total.Verbal fields
2. Build a multi-index from the year and state code
3. How many people took the SAT in 2005?
4. What is the average SAT math score in 2010 for NY, NJ, MA, and IL?
5. What is the average SAT verbal score from 2012-2015 for AZ, CA, and TX?

In [None]:
# Load only the five fields the exercises need and build the (Year, State.Code)
# multi-index while reading the file.
sat_fields = ["Year", "State.Code", "Total.Math", "Total.Test-takers", "Total.Verbal"]
sat_index = ["Year", "State.Code"]
df = pd.read_csv("../data/sat-scores.csv", usecols=sat_fields, index_col=sat_index)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total.Math,Total.Test-takers,Total.Verbal
Year,State.Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005,AL,559,3985,567
2005,AK,519,3996,523
2005,AZ,530,18184,526
2005,AR,552,1600,563
2005,CA,522,186552,504


In [None]:
# Total number of SAT test takers in 2005: pick the column first, then the
# 2005 rows via the first index level, and add them up.
takers = df["Total.Test-takers"]
takers.loc[2005].sum()

np.int64(1344824)

In [19]:
# Mean SAT math score in 2010 across NY, NJ, MA, and IL
states_2010 = ["NY", "NJ", "MA", "IL"]
math_2010 = df.loc[(2010, states_2010), "Total.Math"]
math_2010.mean()

np.float64(535.25)

In [20]:
# Average verbal score from 2012-2015 in AZ, CA, TX
years_2012_2015 = list(range(2012, 2016))  # 2012 through 2015 inclusive
verbal_subset = df.loc[(years_2012_2015, ["AZ", "CA", "TX"]), "Total.Verbal"]
verbal_subset.mean()

np.float64(497.3333333333333)

# Extension questions
1. What were the average math and verbal scores from Florida, Indiana, and Idaho from all these years? (Don't break out the values by state)
2. Which state received the highest verbal score, and in which year?
3. Was the average math score in 2005 higher or lower than in 2015?

In [21]:
# Extension 1: mean math and verbal scores for FL, IN, and ID pooled over
# every year (":" on the first index level), not broken out by state.
pooled_rows = pd.IndexSlice[:, ["FL", "IN", "ID"]]
df.loc[pooled_rows, ["Total.Math", "Total.Verbal"]].mean()

Total.Math      507.090909
Total.Verbal    504.606061
dtype: float64

In [24]:
# Extension 2: which state received the highest verbal score, and in which year?
verbal_scores = df["Total.Verbal"]
# The book's approach: a boolean mask keeps every row that matches the maximum,
# so ties would all be listed. display() is needed because only a cell's last
# expression is rendered automatically; without it this result is discarded.
display(df.loc[verbal_scores == verbal_scores.max()])
# idxmax returns the (Year, State.Code) label of the first maximum directly.
verbal_scores.idxmax()

(np.int64(2013), 'ND')

In [None]:
# Extension 3: was the average math score in 2005 higher than in 2015?
math_mean_2005 = df.loc[2005, "Total.Math"].mean()
math_mean_2015 = df.loc[2015, "Total.Math"].mean()
# The book subtracts the two means instead (math_mean_2005 - math_mean_2015);
# a positive difference is the same conclusion as this comparison being True.
math_mean_2005 > math_mean_2015

np.True_