In [None]:
from scipy.stats import zscore

In [None]:
def calc_non_param_ci(x1, x2, alpha=0.05):
	'''Calc confidence interval for 2 group median test
	Process:
	* Find all pairwise diffs
	* Sort diffs
	* Find appropriate value of k
	* Choose lower bound from diffs as: diffs[k]
	* Choose upper bound from diffs as: diffs[-k]
	Based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2545906/

	:param x1: sample 1
	:param x2: sample 2
	:param alpha: significance level
	:return: (tuple) confidence interval bounds
	'''

	x1 = np.array(x1)
	x2 = np.array(x2)

	n1 = x1.size
	n2 = x2.size
	cv = stats.norm.ppf(1 - alpha / 2)

	# Find pairwise differences for every datapoint in each group
	diffs = (x1[:, None] - x2).flatten()
	diffs.sort()

	# For an approximate (1-a)% confidence interval first calculate K:
	k = int(round(n1 * n2 / 2 - (cv * (n1 * n2 * (n1 + n2 + 1) / 12) ** 0.5)))

	# The Kth smallest to the Kth largest of the n x m differences
	# n1 and n2 should be > ~20
	ci_lo = diffs[k]
	ci_hi = diffs[-k]

	return ci_lo, ci_hi

In [None]:
def cles_ind(x1, x2):
	'''Calc common language effect size
	Interpret as the probability that a score sampled
	at random from one distribution will be greater than
	a score sampled from some other distribution.
	Based on: http://psycnet.apa.org/doi/10.1037/0033-2909.111.2.361
	:param x1: sample 1
	:param x2: sample 2
	:return: (float) common language effect size
	'''
	x1 = np.array(x1)
	x2 = np.array(x2)
	diff = x1[:, None] - x2
	cles = max((diff < 0).sum(), (diff > 0).sum()) / diff.size
	return cles

In [None]:
from tqdm.notebook import tqdm as status

In [None]:
def get_95_ci(x1, x2):
	#  Calculate a 95% CI for 2 1d numpy arrays
	signal = x1.mean() - x2.mean()
	noise = np.sqrt(x1.var() / x1.size + x2.var() / x2.size)
	ci_lo = signal - 1.96 * noise
	ci_hi = signal + 1.96 * noise
	return ci_lo, ci_hi

In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
%reload_ext nb_black
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'kickstarterprojects'
conn_str = f'postgresql://{postgres_user}:{postgres_pw}@{postgres_host}:{postgres_port}/{postgres_db}'
query = '''
SELECT *
FROM youtube
'''
youtube_df = pd.read_sql_query(query, con=conn_str)

In [None]:
%reload_ext nb_black
import pandas as pd
from scipy import stats
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.multitest import multipletests
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from scipy.stats import f_oneway as anova
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd as tukey

In [None]:
from scipy import stats

In [None]:
%reload_ext nb_black
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

In [None]:
%reload_ext nb_black
from matplotlib import pyplot as plt
import plotly
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
%matplotlib inline

In [None]:
from statsmodels.graphics.gofplots import qqplot

In [None]:
%reload_ext nb_black
import pandas as pd
import numpy as np

Data & Imports

In [1]:
%reload_ext nb_black
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import plotly
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

<IPython.core.display.Javascript object>

In [2]:
postgres_user = "dsbc_student"
postgres_pw = "7*.8G9QH21"
postgres_host = "142.93.121.174"
postgres_port = "5432"
postgres_db = "useducation"
conn_str = f"postgresql://{postgres_user}:{postgres_pw}@{postgres_host}:{postgres_port}/{postgres_db}"

query = """
SELECT *
FROM useducation
"""

df = pd.read_sql_query(query, con=conn_str)
df.head()

Unnamed: 0,PRIMARY_KEY,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
0,1992_ALABAMA,ALABAMA,1992,,2678885.0,304177.0,1659028.0,715680.0,2653798.0,1481703.0,...,57948.0,58025.0,41167.0,471564.0,196386.0,676174.0,208.327876,252.187522,207.963517,
1,1992_ALASKA,ALASKA,1992,,1049591.0,106780.0,720711.0,222100.0,972488.0,498362.0,...,9748.0,8789.0,6714.0,79117.0,30847.0,112335.0,,,,258.859712
2,1992_ARIZONA,ARIZONA,1992,,3258079.0,297888.0,1369815.0,1590376.0,3401580.0,1435908.0,...,55433.0,49081.0,37410.0,437127.0,175210.0,614881.0,215.253932,265.366278,206.212716,262.169895
3,1992_ARKANSAS,ARKANSAS,1992,,1711959.0,178571.0,958785.0,574603.0,1743022.0,964323.0,...,34632.0,36011.0,27651.0,281338.0,123113.0,405259.0,210.206028,256.31209,208.634458,264.619665
4,1992_CALIFORNIA,CALIFORNIA,1992,,26260025.0,2072470.0,16546514.0,7641041.0,27138832.0,14358922.0,...,418418.0,363296.0,270675.0,3286034.0,1372011.0,4717112.0,208.398961,260.892247,196.764414,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Determine all the variable types and find the fraction of the missing values for each variable.

In [3]:
# variable types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1492 entries, 0 to 1491
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PRIMARY_KEY                   1492 non-null   object 
 1   STATE                         1492 non-null   object 
 2   YEAR                          1492 non-null   int64  
 3   ENROLL                        1229 non-null   float64
 4   TOTAL_REVENUE                 1280 non-null   float64
 5   FEDERAL_REVENUE               1280 non-null   float64
 6   STATE_REVENUE                 1280 non-null   float64
 7   LOCAL_REVENUE                 1280 non-null   float64
 8   TOTAL_EXPENDITURE             1280 non-null   float64
 9   INSTRUCTION_EXPENDITURE       1280 non-null   float64
 10  SUPPORT_SERVICES_EXPENDITURE  1280 non-null   float64
 11  OTHER_EXPENDITURE             1229 non-null   float64
 12  CAPITAL_OUTLAY_EXPENDITURE    1280 non-null   float64
 13  GRA

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# fraction of null values for each variable
df.isna().mean().sort_values(ascending=False)

AVG_READING_8_SCORE             0.666220
AVG_MATH_8_SCORE                0.643432
AVG_READING_4_SCORE             0.642761
AVG_MATH_4_SCORE                0.640751
OTHER_EXPENDITURE               0.176273
ENROLL                          0.176273
TOTAL_REVENUE                   0.142091
FEDERAL_REVENUE                 0.142091
STATE_REVENUE                   0.142091
LOCAL_REVENUE                   0.142091
TOTAL_EXPENDITURE               0.142091
INSTRUCTION_EXPENDITURE         0.142091
SUPPORT_SERVICES_EXPENDITURE    0.142091
CAPITAL_OUTLAY_EXPENDITURE      0.142091
GRADES_PK_G                     0.115952
GRADES_ALL_G                    0.115952
GRADES_KG_G                     0.088472
GRADES_4_G                      0.087802
GRADES_8_G                      0.087802
GRADES_12_G                     0.087802
GRADES_1_8_G                    0.087802
GRADES_9_12_G                   0.087802
YEAR                            0.000000
STATE                           0.000000
PRIMARY_KEY     

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

2. Notice that the data has a time dimension (year). For this assignment, forget about time and treat all the observations as if they're from the same year. Choose a strategy to deal with the missing values for each variable. For which variables would filling in the missing values with some value make sense? For which might tossing out the records entirely make sense?

In [5]:
columns = [
    "AVG_MATH_4_SCORE",
    "AVG_MATH_8_SCORE",
    "AVG_READING_4_SCORE",
    "AVG_READING_8_SCORE",
]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
df.dropna(inplace=True, subset=columns)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# df.isna().sum()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
df.describe()

Unnamed: 0,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,SUPPORT_SERVICES_EXPENDITURE,OTHER_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
count,474.0,416.0,423.0,423.0,423.0,423.0,423.0,423.0,423.0,416.0,...,423.0,423.0,423.0,423.0,423.0,420.0,474.0,474.0,474.0,474.0
mean,2008.305907,941483.4,10362500.0,931493.2,4796499.0,4634505.0,10467290.0,5409149.0,3059024.0,479449.0,...,71800.108747,72316.938534,63612.397163,578489.4,283253.3,874811.8,236.541353,279.511266,219.305086,263.772645
std,6.096588,1120275.0,13002480.0,1317133.0,6114588.0,6043418.0,13243640.0,6945857.0,3697200.0,585606.8,...,85693.801644,85301.129298,76249.079764,687973.8,337609.0,1026837.0,9.159956,9.695064,7.698038,6.834157
min,1992.0,44179.0,645233.0,41007.0,0.0,28531.0,638784.0,318260.0,194915.0,21910.0,...,3423.0,2860.0,2188.0,26762.0,9970.0,36823.0,187.13467,232.83151,178.557612,236.379102
25%,2005.0,278124.8,2638304.0,248242.0,1367580.0,925137.5,2602185.0,1395808.0,789404.5,127727.8,...,20449.0,20806.0,18508.5,166115.5,81690.0,259372.0,232.062602,274.436276,215.044071,259.762432
50%,2009.0,663941.5,5991731.0,534510.0,2945175.0,2335986.0,6024747.0,3053380.0,1814927.0,323398.0,...,50408.0,49723.0,40927.0,405017.0,184303.0,623118.0,238.289254,281.369105,220.928176,265.136
75%,2013.0,1015914.0,11679610.0,1009335.0,5905138.0,5153868.0,11411560.0,5947704.0,3526303.0,547073.5,...,79003.5,80846.0,74640.5,638244.0,321060.0,968510.5,242.557845,286.154375,224.488094,268.275364
max,2017.0,6307022.0,78248040.0,9990221.0,42360470.0,34941510.0,78365960.0,41954260.0,23501850.0,3759373.0,...,493415.0,500143.0,498403.0,3929869.0,2011865.0,5926631.0,253.420961,300.568235,236.773867,280.49913


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Now, take into account the time factor. Replicate your second answer, but this time fill in the missing values by using a statistic that is calculated within the year of the observation. For example, if you want to fill in a missing value for a variable with the mean of that variable, calculate the mean by using only the observations for that specific year.

In [9]:
df2 = pd.read_sql_query(query, con=conn_str)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
columns = [
    "OTHER_EXPENDITURE",
    "ENROLL",
    "TOTAL_REVENUE",
    "FEDERAL_REVENUE",
    "STATE_REVENUE",
    "LOCAL_REVENUE",
    "TOTAL_EXPENDITURE",
    "INSTRUCTION_EXPENDITURE",
    "SUPPORT_SERVICES_EXPENDITURE",
    "CAPITAL_OUTLAY_EXPENDITURE",
    "GRADES_PK_G",
    "GRADES_ALL_G",
    "GRADES_KG_G",
    "GRADES_4_G",
    "GRADES_8_G",
    "GRADES_12_G",
    "GRADES_1_8_G",
    "GRADES_9_12_G",
]

yrs = df2.YEAR.unique().tolist()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
for year in yrs:
    for column in columns:
        year_mean = df2[df2.YEAR == year][column].mean()
        df2.loc[df2.YEAR == year, column] = df2[df2.YEAR == year][column].fillna(year_mean)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
df2.describe()

Unnamed: 0,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,SUPPORT_SERVICES_EXPENDITURE,OTHER_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
count,1492.0,1385.0,1441.0,1441.0,1441.0,1441.0,1441.0,1441.0,1441.0,1385.0,...,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,536.0,532.0,533.0,498.0
mean,2004.433646,916165.4,9084108.0,766274.9,4213597.0,4104236.0,9190557.0,4758243.0,2677770.0,428650.7,...,64590.152441,64331.744106,54418.075582,519620.0,247493.2,802917.8,234.768293,278.414711,218.866154,263.661132
std,7.393983,1003532.0,11102040.0,1085057.0,5237195.0,5181475.0,11320840.0,5947042.0,3170760.0,504101.3,...,79451.944548,78083.976028,67187.337112,636893.2,304494.3,945616.2,10.221511,10.159722,7.769616,6.803725
min,1992.0,43866.0,465650.0,31020.0,0.0,22093.0,481665.0,265549.0,139963.0,11541.0,...,633.0,437.0,311.0,4878.0,1808.0,7254.0,187.13467,232.83151,178.557612,236.379102
25%,1998.0,286759.0,2425600.0,206853.0,1319582.0,874706.0,2427316.0,1306105.0,735036.0,119934.0,...,14190.5,13698.25,11161.75,112971.2,52703.0,193978.5,229.694352,272.761598,214.663401,259.533171
50%,2004.0,737401.0,5778507.0,446180.0,2857497.0,2398344.0,5858385.0,3033016.0,1720824.0,307022.0,...,44909.5,44756.5,37541.5,361170.5,170652.5,598392.0,237.238552,280.618803,220.416034,265.010912
75%,2011.0,962488.0,11168630.0,895642.0,5187230.0,4995971.0,11169700.0,5672620.0,3333043.0,522347.5,...,75149.75,75016.25,67026.0,605224.2,288852.2,904556.2,241.995486,285.347428,223.999337,268.190121
max,2017.0,6307022.0,89217260.0,9990221.0,50904570.0,36105260.0,85320130.0,43964520.0,26058020.0,3995951.0,...,493415.0,500143.0,498403.0,3929869.0,2013687.0,5944746.0,253.420961,300.568235,236.773867,280.49913


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# df.isna().sum()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# This time, fill in the missing values using interpolation (extrapolation).

In [14]:
df3 = pd.read_sql_query(query, con=conn_str)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
for year in yrs:
    for column in columns:
        df3.loc[df3.YEAR == year, column] = df3[df3.YEAR == year][column].interpolate()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
df3.describe()

Unnamed: 0,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,SUPPORT_SERVICES_EXPENDITURE,OTHER_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
count,1492.0,1385.0,1441.0,1441.0,1441.0,1441.0,1441.0,1441.0,1441.0,1385.0,...,1390.0,1390.0,1390.0,1390.0,1390.0,1389.0,536.0,532.0,533.0,498.0
mean,2004.433646,827549.2,8245375.0,692303.0,3833809.0,3719263.0,8336934.0,4315352.0,2431959.0,386220.3,...,63313.416547,63044.577698,53226.568345,509347.7,242336.3,776736.2,234.768293,278.414711,218.866154,263.661132
std,7.393983,1035312.0,11336870.0,1099649.0,5342626.0,5287804.0,11559750.0,6070526.0,3241094.0,517680.3,...,79901.750909,78541.805164,67560.332598,640531.5,306225.9,954382.1,10.221511,10.159722,7.769616,6.803725
min,1992.0,43866.0,465650.0,31020.0,0.0,22093.0,481665.0,265549.0,139963.0,11541.0,...,633.0,437.0,311.0,4878.0,1808.0,7254.0,187.13467,232.83151,178.557612,236.379102
25%,1998.0,149801.0,1684706.0,140242.0,880916.0,622991.0,1675477.0,846872.0,504838.0,64896.0,...,12412.5,12529.5,10628.25,98787.25,48534.25,171220.0,229.694352,272.761598,214.663401,259.533171
50%,2004.0,532433.0,4330089.0,350797.0,2259753.0,1660527.0,4363308.0,2236724.0,1257432.0,229847.0,...,42089.5,42805.5,35979.5,339606.5,160144.0,526514.0,237.238552,280.618803,220.416034,265.010912
75%,2011.0,962488.0,9670620.0,773014.0,4517924.0,4108728.0,9779642.0,5148806.0,2865861.0,479352.0,...,75149.75,75016.25,67026.0,605224.2,288852.2,909280.0,241.995486,285.347428,223.999337,268.190121
max,2017.0,6307022.0,89217260.0,9990221.0,50904570.0,36105260.0,85320130.0,43964520.0,26058020.0,3995951.0,...,493415.0,500143.0,498403.0,3929869.0,2013687.0,5944746.0,253.420961,300.568235,236.773867,280.49913


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
# df.isna().sum()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Compare your results for the second, third, and fourth questions. Do you find any meaningful differences?

In [20]:
df_compare = pd.concat([df.describe(), df2.describe(), df3.describe()])
df_compare

Unnamed: 0,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,SUPPORT_SERVICES_EXPENDITURE,OTHER_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
count,474.0,416.0,423.0,423.0,423.0,423.0,423.0,423.0,423.0,416.0,...,423.0,423.0,423.0,423.0,423.0,420.0,474.0,474.0,474.0,474.0
mean,2008.305907,941483.4,10362500.0,931493.2,4796499.0,4634505.0,10467290.0,5409149.0,3059024.0,479449.0,...,71800.108747,72316.938534,63612.397163,578489.4,283253.3,874811.8,236.541353,279.511266,219.305086,263.772645
std,6.096588,1120275.0,13002480.0,1317133.0,6114588.0,6043418.0,13243640.0,6945857.0,3697200.0,585606.8,...,85693.801644,85301.129298,76249.079764,687973.8,337609.0,1026837.0,9.159956,9.695064,7.698038,6.834157
min,1992.0,44179.0,645233.0,41007.0,0.0,28531.0,638784.0,318260.0,194915.0,21910.0,...,3423.0,2860.0,2188.0,26762.0,9970.0,36823.0,187.13467,232.83151,178.557612,236.379102
25%,2005.0,278124.8,2638304.0,248242.0,1367580.0,925137.5,2602185.0,1395808.0,789404.5,127727.8,...,20449.0,20806.0,18508.5,166115.5,81690.0,259372.0,232.062602,274.436276,215.044071,259.762432
50%,2009.0,663941.5,5991731.0,534510.0,2945175.0,2335986.0,6024747.0,3053380.0,1814927.0,323398.0,...,50408.0,49723.0,40927.0,405017.0,184303.0,623118.0,238.289254,281.369105,220.928176,265.136
75%,2013.0,1015914.0,11679610.0,1009335.0,5905138.0,5153868.0,11411560.0,5947704.0,3526303.0,547073.5,...,79003.5,80846.0,74640.5,638244.0,321060.0,968510.5,242.557845,286.154375,224.488094,268.275364
max,2017.0,6307022.0,78248040.0,9990221.0,42360470.0,34941510.0,78365960.0,41954260.0,23501850.0,3759373.0,...,493415.0,500143.0,498403.0,3929869.0,2011865.0,5926631.0,253.420961,300.568235,236.773867,280.49913
count,1492.0,1385.0,1441.0,1441.0,1441.0,1441.0,1441.0,1441.0,1441.0,1385.0,...,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,536.0,532.0,533.0,498.0
mean,2004.433646,916165.4,9084108.0,766274.9,4213597.0,4104236.0,9190557.0,4758243.0,2677770.0,428650.7,...,64590.152441,64331.744106,54418.075582,519620.0,247493.2,802917.8,234.768293,278.414711,218.866154,263.661132


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
# differences in means of the dataframes
# the means of the 2nd and 3rd df were raised and lowered respectively 
pd.concat([df.mean(), df2.mean(), df3.mean()], axis=1)

Unnamed: 0,0,1,2
YEAR,2008.306,2004.434,2004.434
ENROLL,941483.4,916165.4,827549.2
TOTAL_REVENUE,10362500.0,9084108.0,8245375.0
FEDERAL_REVENUE,931493.2,766274.9,692303.0
STATE_REVENUE,4796499.0,4213597.0,3833809.0
LOCAL_REVENUE,4634505.0,4104236.0,3719263.0
TOTAL_EXPENDITURE,10467290.0,9190557.0,8336934.0
INSTRUCTION_EXPENDITURE,5409149.0,4758243.0,4315352.0
SUPPORT_SERVICES_EXPENDITURE,3059024.0,2677770.0,2431959.0
OTHER_EXPENDITURE,479449.0,428650.7,386220.3


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
# differences in medians
# the medians of the 2nd and 3rd df were raised and lowered respectively 
pd.concat([df.median(), df2.median(), df3.median()], axis=1)

Unnamed: 0,0,1,2
YEAR,2009.0,2004.0,2004.0
ENROLL,663941.5,737401.0,532433.0
TOTAL_REVENUE,5991731.0,5778507.0,4330089.0
FEDERAL_REVENUE,534510.0,446180.0,350797.0
STATE_REVENUE,2945175.0,2857497.0,2259753.0
LOCAL_REVENUE,2335986.0,2398344.0,1660527.0
TOTAL_EXPENDITURE,6024747.0,5858385.0,4363308.0
INSTRUCTION_EXPENDITURE,3053380.0,3033016.0,2236724.0
SUPPORT_SERVICES_EXPENDITURE,1814927.0,1720824.0,1257432.0
OTHER_EXPENDITURE,323398.0,307022.0,229847.0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
# stds of each dataframe
# the first df seems to have decently higher deviations than the 2nd two 
pd.concat([df.std(), df2.std(), df3.std()], axis=1)

Unnamed: 0,0,1,2
YEAR,6.096588,7.393983,7.393983
ENROLL,1120275.0,1003532.0,1035312.0
TOTAL_REVENUE,13002480.0,11102040.0,11336870.0
FEDERAL_REVENUE,1317133.0,1085057.0,1099649.0
STATE_REVENUE,6114588.0,5237195.0,5342626.0
LOCAL_REVENUE,6043418.0,5181475.0,5287804.0
TOTAL_EXPENDITURE,13243640.0,11320840.0,11559750.0
INSTRUCTION_EXPENDITURE,6945857.0,5947042.0,6070526.0
SUPPORT_SERVICES_EXPENDITURE,3697200.0,3170760.0,3241094.0
OTHER_EXPENDITURE,585606.8,504101.3,517680.3


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>