In [4]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv(
    "../data/sat-scores.csv",
    usecols=[
        "Year",
        "State.Code",
        "Total.Math",
        "Family Income.Less than 20k.Math",
        "Family Income.Between 20-40k.Math",
        "Family Income.Between 40-60k.Math",
        "Family Income.Between 60-80k.Math",
        "Family Income.Between 80-100k.Math",
        "Family Income.More than 100k.Math",
    ],
)
df = df.rename(
    columns={
        "State.Code": "State",
        "Family Income.Less than 20k.Math": "Income 0-20k",
        "Family Income.Between 20-40k.Math": "Income 20-40k",
        "Family Income.Between 40-60k.Math": "Income 40-60k",
        "Family Income.Between 60-80k.Math": "Income 60-80k",
        "Family Income.Between 80-100k.Math": "Income 80-100k",
        "Family Income.More than 100k.Math": "Income >100k",
    }
)
df.head()

Unnamed: 0,Year,State,Total.Math,Income 20-40k,Income 40-60k,Income 60-80k,Income 80-100k,Income 0-20k,Income >100k
0,2005,AL,559,513,539,550,566,462,588
1,2005,AK,519,492,517,513,528,464,541
2,2005,AZ,530,498,520,524,534,485,554
3,2005,AR,552,513,543,553,570,489,572
4,2005,CA,522,477,506,521,535,451,566


1. Find the average SAT math score for each income level, grouped and then sorted by year
2. For each year in the dataset, determine how much better each income group did, on average, than the next poorer group of students.  
Do you see (just by looking at the data) any income group that did worse, in any year, than the next poorer students?
3. Which income backet, on average, had the greatest advantage over the next poorer income bracket?
4. Can we find, in a calculated and automated way, which income levels consistently (i.e. across all years) do worse than the next-poorest group?

In [47]:
# 1. average SAT math score for each income level, grouped and sorted by year
yearly_sats = df.groupby("Year")[
    [
        # "Total.Math",
        "Income 0-20k",
        "Income 20-40k",
        "Income 40-60k",
        "Income 60-80k",
        "Income 80-100k",
        "Income >100k",
    ]
].mean()
yearly_sats

Unnamed: 0_level_0,Income 0-20k,Income 20-40k,Income 40-60k,Income 60-80k,Income 80-100k,Income >100k
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2005,427.596154,488.653846,522.673077,536.076923,548.942308,572.173077
2006,461.019231,502.923077,523.769231,534.903846,550.461538,572.519231
2007,457.924528,494.849057,519.490566,533.188679,545.698113,565.169811
2008,478.641509,523.622642,547.471698,549.188679,557.641509,564.566038
2009,482.058824,527.823529,550.980392,553.941176,565.333333,585.784314
2010,477.039216,499.27451,522.0,534.235294,547.627451,569.27451
2011,460.45283,494.886792,513.415094,528.660377,541.849057,563.245283
2012,458.773585,492.056604,512.45283,525.773585,538.301887,557.320755
2013,469.358491,490.132075,511.377358,520.320755,537.396226,556.339623
2014,459.415094,497.641509,514.943396,527.169811,543.132075,555.433962


In [48]:
# book version:
df.groupby("Year").mean(numeric_only=True).sort_index()
# the numeric only part makes this easier than what I did, but includes the Total.Math field, which
# complicates the next step (at least, the way I did it)

Unnamed: 0_level_0,Total.Math,Income 20-40k,Income 40-60k,Income 60-80k,Income 80-100k,Income 0-20k,Income >100k
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005,535.653846,488.653846,522.673077,536.076923,548.942308,427.596154,572.173077
2006,537.480769,502.923077,523.769231,534.903846,550.461538,461.019231,572.519231
2007,535.339623,494.849057,519.490566,533.188679,545.698113,457.924528,565.169811
2008,535.981132,523.622642,547.471698,549.188679,557.641509,478.641509,564.566038
2009,540.803922,527.823529,550.980392,553.941176,565.333333,482.058824,585.784314
2010,540.843137,499.27451,522.0,534.235294,547.627451,477.039216,569.27451
2011,533.226415,494.886792,513.415094,528.660377,541.849057,460.45283,563.245283
2012,533.603774,492.056604,512.45283,525.773585,538.301887,458.773585,557.320755
2013,532.622642,490.132075,511.377358,520.320755,537.396226,469.358491,556.339623
2014,534.283019,497.641509,514.943396,527.169811,543.132075,459.415094,555.433962


In [49]:
# 2. For each year in the dataset determine how much better each income group did, on average than the next poorer group of students.
yearly_sats.T.pct_change() * 100

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Income 0-20k,,,,,,,,,,,
Income 20-40k,14.279289,9.089392,8.063453,9.397666,9.493594,4.661104,7.478282,7.254781,4.425953,8.32067,9.857908
Income 40-60k,6.961826,4.144998,4.979601,4.554627,4.387236,4.551702,3.743948,4.145098,4.334604,3.476777,4.505853
Income 60-80k,2.56448,2.125863,2.636836,0.31362,0.537367,2.343926,2.969387,2.599411,1.748884,2.374322,2.603841
Income 80-100k,2.399914,2.908503,2.346155,1.539149,2.056564,2.50679,2.494736,2.382832,3.28172,3.027917,2.82769
Income >100k,4.231915,4.007127,3.568218,1.241753,3.617508,3.952881,3.948743,3.533123,3.525033,2.26499,3.947368


In [81]:
# 3. Which income bracket had the greatest advantage over the previous incoming bracket?
advantage_numerical = (
    yearly_sats.T.sort_index()
    .pct_change()  # previous call calculating rolling change year by year
    .T.mean()  # re-transpose the data so we're now looking at columns of overall advantage
    .dropna()  # get rid of the lowest income bracket of NaNs
    .sort_values(ascending=False)  # show top-to-bottom advantage amounts
)
advantage_numerical
# this shows the 20-40k bracket has the best comparative advantage of the brackets
# at 8.3%


Income 20-40k     0.083929
Income 40-60k     0.045260
Income >100k      0.034399
Income 80-100k    0.025247
Income 60-80k     0.020744
dtype: float64

In [53]:
# book: the other way to do this without transposing is to change the axis in the mean
# call
yearly_sats.T.pct_change().mean(axis="columns")

Income 0-20k           NaN
Income 20-40k     0.083929
Income 40-60k     0.045260
Income 60-80k     0.020744
Income 80-100k    0.025247
Income >100k      0.034399
dtype: float64

In [54]:
# 4. Can we show which group consistently performs more poorly than the next lowest bracket?
agg_change = (
    yearly_sats.T.pct_change()  # previous call calculating rolling change year by year
    .T.mean()  # re-transpose the data so we're now looking at columns of overall advantage
    .dropna()  # get rid of the lowest income bracket of NaNs
)
agg_change[agg_change < 0]

Series([], dtype: float64)

# Extension questions
1. Calculate descriptive statistics for all the changes in income brackets. Where do you see the largest difference between income brackets?
2. Which five states have the greatest SAT gap in Math scores between the richest and poorest students?
3. Perform the same analysis as the Math scores on Verbal scores. Do wealthier students generally still do better than poorer students? Do any income brackets do worse than the next poorer bracket?

In [62]:
# 1. descriptive statistics on changes in income brackets
yearly_sats.T.pct_change().T.describe()
# looking at the mean, the 20-40k bracket has the biggest improvement over the previous bracket year over year
# but, the standard deviation is also quite high compared to other years
# it also saw the highest improvement year on year at 14.3%

Unnamed: 0,Income 0-20k,Income 20-40k,Income 40-60k,Income 60-80k,Income 80-100k,Income >100k
count,0.0,11.0,11.0,11.0,11.0,11.0
mean,,0.083929,0.04526,0.020744,0.025247,0.034399
std,,0.026723,0.009055,0.008745,0.004821,0.008947
min,,0.04426,0.034768,0.003136,0.015391,0.012418
25%,,0.073665,0.04145,0.019374,0.023645,0.035291
50%,,0.083207,0.043872,0.023743,0.024947,0.036175
75%,,0.094456,0.045532,0.026016,0.028681,0.039508
max,,0.142793,0.069618,0.029694,0.032817,0.042319


In [None]:
# 2. Calculate the greatest wealth gap for Math scores based on state averages
df["wealth_gap"] = df["Income >100k"] - df["Income 0-20k"]
df.groupby("State")["wealth_gap"].mean().sort_values(ascending=False).head()

State
ND    341.909091
WY    246.454545
DC    208.818182
SD    157.000000
MS    140.000000
Name: wealth_gap, dtype: float64

In [71]:
# 3. Analysis on verbal scores
dfv = pd.read_csv(
    "../data/sat-scores.csv",
    usecols=[
        "Year",
        "State.Code",
        "Family Income.Less than 20k.Verbal",
        "Family Income.Between 20-40k.Verbal",
        "Family Income.Between 40-60k.Verbal",
        "Family Income.Between 60-80k.Verbal",
        "Family Income.Between 80-100k.Verbal",
        "Family Income.More than 100k.Verbal",
    ],
)
dfv = dfv.rename(
    columns={
        "State.Code": "State",
        "Family Income.Less than 20k.Verbal": "Income 0-20k",
        "Family Income.Between 20-40k.Verbal": "Income 20-40k",
        "Family Income.Between 40-60k.Verbal": "Income 40-60k",
        "Family Income.Between 60-80k.Verbal": "Income 60-80k",
        "Family Income.Between 80-100k.Verbal": "Income 80-100k",
        "Family Income.More than 100k.Verbal": "Income >100k",
    }
)
dfv.head()

Unnamed: 0,Year,State,Income 20-40k,Income 40-60k,Income 60-80k,Income 80-100k,Income 0-20k,Income >100k
0,2005,AL,527,551,564,577,474,590
1,2005,AK,500,522,519,534,467,544
2,2005,AZ,495,518,523,533,474,546
3,2005,AR,526,555,570,580,486,589
4,2005,CA,458,494,511,525,421,551


In [78]:
yearly_verbal = dfv.groupby("Year").mean(numeric_only=True).sort_index()
yearly_verbal.T.sort_index()


Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Income 0-20k,457.826923,452.461538,452.396226,474.830189,476.921569,467.647059,451.90566,450.849057,461.641509,454.811321,442.396226
Income 20-40k,501.096154,502.153846,496.584906,522.924528,526.843137,497.647059,493.207547,491.113208,490.056604,499.603774,494.207547
Income 40-60k,524.961538,523.961538,522.433962,543.698113,547.0,520.568627,514.169811,512.886792,513.018868,515.396226,516.679245
Income 60-80k,536.25,533.326923,533.169811,545.283019,550.019608,532.607843,529.245283,526.226415,522.415094,533.471698,530.886792
Income 80-100k,548.403846,544.865385,542.811321,552.90566,562.568627,545.823529,538.113208,534.320755,536.075472,543.188679,542.132075
Income >100k,567.519231,563.269231,559.981132,557.471698,576.607843,564.392157,559.377358,550.830189,553.490566,552.09434,561.320755


In [80]:
# 3. Which income bracket had the greatest advantage over the previous incoming bracket?
advantage_verbal = (
    yearly_verbal.T.sort_index()
    .pct_change()  # previous call calculating rolling change year by year
    .T.mean()  # re-transpose the data so we're now looking at columns of overall advantage
    .dropna()  # get rid of the lowest income bracket of NaNs
    .sort_values(ascending=False)  # show top-to-bottom advantage amounts
)
advantage_verbal
# wealthier students still have comparative advantage over poorer students


Income 20-40k     0.093635
Income 40-60k     0.043448
Income >100k      0.029290
Income 60-80k     0.020700
Income 80-100k    0.020152
dtype: float64

In [None]:
# book's version of the bracket negative change code
change = (
    dfv.groupby("Year")[
        [
            "Income 0-20k",
            "Income 20-40k",
            "Income 40-60k",
            "Income 60-80k",
            "Income 80-100k",
            "Income >100k",
        ]
    ]
    .mean()
    .sort_index()
    .T.pct_change()
)
change[change <= 0].dropna()
# the book's solution reckons there's a negative change for 80-100k, and running the same code, looks like
# they're wrong

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Income 0-20k,,,,,,,,,,,
Income 20-40k,0.09451,0.109827,0.097677,0.101287,0.104675,0.064151,0.091395,0.089307,0.061552,0.098486,0.117115
Income 40-60k,0.047626,0.043428,0.052054,0.039726,0.03826,0.04606,0.042502,0.044335,0.046856,0.03161,0.04547
Income 60-80k,0.021503,0.017874,0.02055,0.002915,0.00552,0.023127,0.02932,0.026009,0.018316,0.035071,0.027498
Income 80-100k,0.022665,0.021635,0.018083,0.013979,0.022816,0.024813,0.016756,0.015382,0.026149,0.018215,0.021182
Income >100k,0.034856,0.033777,0.031631,0.008258,0.024956,0.034019,0.039516,0.030898,0.032486,0.016395,0.035395


In [None]:
# let's look at how pronounced the advantage is for verbal compared to numerical
math_vs_verbal = pd.DataFrame({"math": advantage_numerical, "verbal": advantage_verbal})
math_vs_verbal

Unnamed: 0,math,verbal
Income 20-40k,0.083929,0.093635
Income 40-60k,0.04526,0.043448
Income 60-80k,0.020744,0.0207
Income 80-100k,0.025247,0.020152
Income >100k,0.034399,0.02929


In [None]:
advantage_numerical - advantage_verbal
# very slight difference in mean achievement between verbal and mathematical
# for the income brackets, so jumping wealth bracket is very slightly less of an advantage for
# math than verbal unless you're really poor

Income 20-40k    -0.009706
Income 40-60k     0.001812
Income 60-80k     0.000043
Income 80-100k    0.005095
Income >100k      0.005109
dtype: float64

In [None]:
# what about for just general performance of different wealth brackets for math vs verbal?
yearly_sats.mean() - yearly_verbal.mean()
# this is a bit more interesting. wealth bracket makes more of a difference to math vs verbal
# for the lowest and highest wealth brackets

Income 0-20k      3.280250
Income 20-40k    -1.088263
Income 40-60k    -0.222397
Income 60-80k    -0.210117
Income 80-100k    2.473880
Income >100k      5.355097
dtype: float64

In [91]:
# what about state based performance ranges for verbal?
dfv["wealth_gap"] = dfv["Income >100k"] - dfv["Income 0-20k"]
dfv.groupby("State")["wealth_gap"].mean().sort_values(ascending=False).head()

State
ND    315.363636
DC    216.090909
WY    215.818182
MD    126.818182
AL    124.181818
Name: wealth_gap, dtype: float64