In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
from rpy2.robjects.packages import importr
Rstats = importr('stats')

In [3]:
counts = pd.read_csv('status-alddist-counts.csv', index_col=0)
counts = counts[['0.0', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0', '10.0', '11.0', '12.0', '13.0', '14.0']]
counts = counts.transpose()
counts = counts.iloc[:,1:4] # only use the 3 biggest bc significant
counts.head()

Status,Assignment Completed,Service in Progress,Advised
0.0,2856,2298,839
1.0,5115,4479,1971
2.0,3943,1634,732
3.0,8422,5144,1248
4.0,6759,6158,2363


In [4]:
# convert the table to percents out of 100
pcents = counts.copy()
for idx, row in pcents.iterrows():
    total = row.sum()
    for col in pcents.columns:
        pcents.loc[idx, col] = 100*pcents.loc[idx, col] / total
pcents

Status,Assignment Completed,Service in Progress,Advised
0.0,47.655598,38.344736,13.999666
1.0,44.228275,38.728923,17.042802
2.0,62.498019,25.899509,11.602473
3.0,56.851627,34.72391,8.424463
4.0,44.234293,40.301047,15.46466
5.0,35.389726,39.423419,25.186855
6.0,52.739362,39.281915,7.978723
7.0,55.451128,27.130326,17.418546
8.0,47.136957,42.744726,10.118317
9.0,38.188254,50.891453,10.920294


In [5]:
# run a chi squared test on the entire table
chi2 = stats.chi2_contingency(pcents)
chi2[1]

0.0003247578985153647

In [6]:
# confirming the correct amount of data
chi2[3].shape

(15, 3)

In [7]:
# confirm all cells are >= 5
expected = chi2[3]
len(expected[expected < 5])

0

In [8]:
# run a one-way chi squared on each district
pvals = {}
for idx, row in pcents.iterrows():
    pvals[idx] = stats.chisquare(row).pvalue
pvals

{'0.0': 0.00011616702066361919,
 '1.0': 0.0020336896448768294,
 '2.0': 1.0535884581236781e-09,
 '3.0': 2.1995354896228095e-08,
 '4.0': 0.0006755104786539918,
 '5.0': 0.19884192856254684,
 '6.0': 1.343876820753753e-07,
 '7.0': 8.17590869942372e-06,
 '8.0': 4.686755171944214e-06,
 '9.0': 3.678401561091171e-06,
 '10.0': 1.3712346062492844e-05,
 '11.0': 0.0001383410372096823,
 '12.0': 5.543045399040552e-10,
 '13.0': 0.009311772016755421,
 '14.0': 2.3677812853322705e-08}

In [11]:
def std_from_mean(data):
    mean = sum(data)/len(data)
    
    total = 0
    for value in data:
        total += (value - mean)**2
    std = ((1/len(data)*total))**(1/2)
    
    col = [0] * len(data)
    for i, value in enumerate(data):
        difference_from_mean = abs(value - mean)
        std_from_mean = round(difference_from_mean/std, 3)
        col[i] = std_from_mean
        #print('Ald Dist '+str(i+1)+': '+str(std_from_mean)+' standard deviations from mean')
    return col

In [13]:
sdevs = {}
for col in pcents.columns:
    sdcol = std_from_mean(pcents[col].tolist())
    sdevs[col] = sdcol
df = pd.DataFrame(data=sdevs)
df

Unnamed: 0,Assignment Completed,Service in Progress,Advised
1,0.294,0.303,0.072
2,0.746,0.365,0.678
3,1.66,1.71,0.406
4,0.916,0.282,1.039
5,0.745,0.62,0.364
6,1.91,0.478,2.301
7,0.375,0.455,1.128
8,0.732,1.511,0.753
9,0.363,1.015,0.701
10,1.541,2.333,0.541
