This script takes Texas Education Agency data about school district demographics and disciplinary actions, and puts them together in one GeoJSON file for the Texas Appleseed "School to Prison Pipeline" map. See http://www.texasdisciplinelab.org/

To use the script, follow these instructions:

1. For every year that you want to cover, download all 20 of the region files from http://rptsvr1.tea.texas.gov/adhocrpt/Disciplinary_Data_Products/Download_Region_Districts.html and put them in the directory '../data/from_agency/by_region/'

2. For every year that you want to cover, download the "District and Charter Detail Data" Snapshot Data File (comma-delimited *.dat)" from https://rptsvr1.tea.texas.gov/perfreport/snapshot/download.html. The website automatically delivers these files with the same filename: district.dat. You will need to rename them to have different names by adding the year after "district". For instance, "district2016.dat"

3. This script needs a GeoJSON file of district shapes. Make sure it can find that file at '../geojson/base_districts.geojson'

4. Change the first_year and last_year variables below to reflect the years you want your file to cover.

5. Run the notebook with "Kernel -> Restart and Run All"

6. Wait a while for it to finish. 4:21

In [23]:
import pandas as pd
import numpy as np
import scipy.stats as stats

pd.options.display.max_columns = 999

first_year = 2006 # the year 2006 is the first year on the TEA site
last_year = 2016


def formatDF(apple, year_col):
    
    # Removes rows and columns not needed for the map
    
    apple = apple.drop(["AGGREGATION LEVEL","REGION","DISTNAME"], axis = 1)
    
    # Adding a totals row by adding up actions against special ed students and non-special ed students
    
    non_special = {"D06": ("D05","D-EXPULSION ACTIONS"), 
                   "D09": ("D08","E-DAEP PLACEMENTS"), 
                   "D12": ("D11", "F-OUT OF SCHOOL SUSPENSIONS"), 
                   "D15": ("D14", "G-IN SCHOOL SUSPENSIONS")}
    
    # This will be inefficient because it makes a dict list first instead of staying in pandas
    
    all_actions = []
    
    # if it was a .csv, the headers would be ["DISTRICT", "SECTION", "HEADING", "HEADING NAME", year_col]
    
    unfound = []
    
    for d in apple.index.get_level_values(0).unique():
        for key in non_special:
            try: 
                a = apple.loc[(d, key)][year_col]
                b = apple.loc[(d, non_special[key][0])][year_col]
            except KeyError:
                unfound.append((d,key))
                # print("Failed to find {} {}.".format(d, key))
            if a < 0: # in case of dummy values like -999
                a = 1
            if b < 0:
                b = 1
            total = a + b
            all_actions.append({"DISTRICT": d, "HEADING": key, "SECTION": non_special[key][1], 
                                "HEADING NAME": "ALL", year_col: total})
    
    # print(all_actions[:10])
    new = pd.DataFrame(all_actions)
    new = new.set_index(["DISTRICT", "HEADING"])

    # Keeping only the rows that categorize students by protected class.
    patternIn = 'WHITE|BLACK OR AFRICAN AMERICAN|AMERICAN INDIAN OR ALASKA NAT|HISPANIC|NATIVE HAWAIIAN|ASIAN|TWO OR MORE RACES|SPEC. ED|ECO. DISAD|ECO DISAD.|TOTAL'
    apple = apple[apple["HEADING NAME"].str.contains(patternIn)]
    
    # Getting rid of rows that count students instead of incidents, or non-disadvantaged kids.
    patternOut = 'SPEC. ED. STUDENTS| SPEC. ED. EXPULSIONS TO JJAEP|ECO DISAD. STUDENTS|ECO. DISAD. STUDENTS|AT RISK|NON AT|UNKNOWN AT|NON SPEC. ED.|NON ECO DISAD.|NON ECO. DISAD.'
    apple = apple[apple["HEADING NAME"].str.contains(patternOut) == False]

    # Delete rows appearing to double-count the same expulsions.
    JJAEPReplace = {"SECTION": {
                        'M-ECO\. DISADV\. JJAEP PLACEMENTS|H-SPEC\. ED\. JJAEP EXPULSIONS': 'C-JJAEP EXPULSIONS'}}
    apple = apple.replace(to_replace=JJAEPReplace, regex=True)
    apple = apple[apple["SECTION"].str.contains("JJAEP EXPULSIONS|DISCIPLINE ACTION COUNTS") == False]
                    
    # apple = apple.reset_index
    apple = apple.append(new)
    
    # Consolidating some of the descriptors into broader categories
    appleReplace = {year_col:
                        {-99999999: 1, -999999: 1, -999: 1},
                    "SECTION": {
                        'D-EXPULSION ACTIONS|N-ECO\. DISADV\. EXPULSIONS|I-SPEC\. ED\. EXPULSIONS': 'EXP',
                        'E-DAEP PLACEMENTS|O-ECO\. DISADV\. DAEP PLACEMENTS|J-SPEC\. ED\. DAEP PLACEMENTS': 'DAE',
                        'F-OUT OF SCHOOL SUSPENSIONS|P-ECO\. DISADV\. OUT OF SCHOOL SUS.|K-SPEC\. ED\. OUT OF SCHOOL SUS\.': 'OSS',
                        'G-IN SCHOOL SUSPENSIONS|Q-ECO\. DISADV\. IN SCHOOL SUS\.|L-SPEC\. ED\. IN SCHOOL SUS\.': 'ISS'},
                    "HEADING NAME": {'SPEC\. ED.*$': 'SPE',
                                     'ECO?. DISAD.*$': 'ECO',
                                     'HISPANIC': 'HIS',
                                     'HIS/LATINO': 'HIS',
                                     'HISPANIC/LATINO': 'HIS',
                                     'BLACK OR AFRICAN AMERICAN': 'BLA',
                                     'BLACK/AFRICAN AMERICAN': 'BLA',
                                     'WHITE': 'WHI',
                                     'AMERICAN INDIAN OR ALASKA NAT': 'IND',
                                     'ASIAN': 'ASI',
                                     'NATIVE HAWAIIAN/OTHER PACIFIC': 'PCI',
                                     'TWO OR MORE RACES': 'TWO',
                                    }
                    }

    apple = apple.replace(to_replace=appleReplace, regex=True)
    
    return apple

def getYear(year):
    year_col = "YR{}".format(str(year)[-2:])
    apple_path = '../data/from_agency/by_region/REGION_{}_DISTRICT_summary_{}.csv'
    one_year = [pd.read_csv(apple_path.format(str(region).zfill(2),str(year)[-2:]), 
                            index_col = ["DISTRICT","HEADING"], dtype = {year_col: int})
                for region in range(1,21)]
    a = pd.concat(one_year)
    
    # a = a.set_index(["DISTRICT","HEADING"] )
    a = a[~a.index.duplicated(keep='last')]  # a single row was causing a non-unique multiindex error 
    # print(a.loc[31901])
    a = formatDF(a, year_col)
    return a


In [2]:
def populations(districtPath):
    district = pd.read_csv(districtPath, index_col="DISTRICT")

    district = district.rename(columns = {"SNAPDIST": 'DISTNAME'})
    
    sometimes_missing = [ 'DPETINDP', 'DPETASIP', 'DPETPCIP', 'DPETTWOP']
    
    for c in sometimes_missing:
        if c not in district.columns:
            district[c] = np.nan
    
    # deleting redundant columns
    
    district = district[['DISTNAME', 'REGION', 'DPETALLC', 
                         'DPETBLAP', 'DPETHISP', 'DPETWHIP', 'DPETINDP',
                         'DPETASIP', 'DPETPCIP', 'DPETTWOP', 'DPETECOP', 
                         'DPETSPEP']] # 'DISTRICT' not listed because it's the index

    return district




def getRatio(distPop, racePop, all_punishments, group_punishments):
    # Calculating ratio of punishments for the demographic group compared to the punishments for the student population
    # as a whole. For instance, "0.505" in the disparity column indicates the group got the punishment 50.5% as often
    # as average for the student population.

    """
    >>> getRatio(200, 20, 20, 10)
    4.0
    >>> getRatio(200, 20, 20, 2)
    0.0
    >>> print(getRatio(200, 0, 20, 0))
    None
    """

    if max(racePop, group_punishments) == 0 or None:
        return None
    elif all_punishments == 0 or None:
        return 0
    else:
        disparity = (group_punishments / (max(all_punishments, group_punishments))
                     / (max(racePop, group_punishments) / distPop)) - 1
        disparity = Decimal(disparity)
        disparity = disparity.quantize(Decimal('0.01'))
    return float(disparity)

In [3]:
def impossible(distPop, raceP, all_punishments, group_punishments):

    """
    >>> print(impossible(50, 20, 20, 100))
    1
    >>> impossible(20, 0, 20, 0)
    0
    """

    # The "RecordError" column flags implausible data entries. Some of them could still be true if school administrators
    # applied different standards different standards to determine which students belong to which demographic group.
    # Or some could be the result of students not being counted because of the time they moved in and out of district.

    if group_punishments > max(all_punishments,8): # eight because TEA could report 2 masked columns with 4 each
        return 1
    if raceP == 0 and group_punishments > 0:
        return 1
    if distPop == 0 or None or np.nan:
        return 1
    return 0


def getFisher(distPop, racePop, all_punishments, group_punishments):

    """
    >>> getFisher(20, 5, 20, 10)
    2
    >>> getFisher(20, 0, 20, 0)
    None
    """
    
    # I don't know if this is a valid way to report the Fisher's exact test statistic, but the idea is that if getFisher returns a
    # positive number over .95, there's a 95% chance that the group's better-than-average treatment is not due to chance.
    # If it returns a number under -.95, there's a 95% chance that the group's worse-than-average treatment is not due to chance.
    # I think it should be easier to create a color scale to show the scores on a map this way.

    # The getFisher function assumes wrongly that everyone can have only one punishment (of each type). If the number of
    # punishments exceeds the number of kids, it reduces the number of punishments (and assumes wrongly that every
    # kid has been punished) But maybe the results are still close enough to correct to use for scaling?

    
    if max(racePop, group_punishments) == 0 or None:
        return None
    if distPop == 0:
        return None
    elif max(group_punishments, all_punishments) == 0 or None:
        return 0
    else:
        try: 
            oddsratio, pvalueG = stats.fisher_exact([[racePop, max(distPop - racePop, 0)],
                                                 [group_punishments, max(all_punishments - group_punishments, 0)]],
                                                alternative='greater')
        except ValueError:
            print(distPop, racePop, all_punishments, group_punishments)
        oddsratio, pvalueL = stats.fisher_exact([[racePop, max(distPop - racePop, 0)],
                                                 [group_punishments, max(all_punishments - group_punishments, 0)]],
                                                alternative='less')
        if pvalueL < pvalueG:
            pv = 1 - pvalueL
        else:
            pv = pvalueG - 1
        
        # To save space in the JSON, this simplifies the decimal values to an integer from -6 to 6
        # It should replace similar code in txappleseedmap/js/index.js
        
        scale = -6
        scale_colors = (-0.99999,-0.9984,-0.992,-0.96,-0.8,-0.2,0.2,0.8,0.96,0.992,0.9984,0.99999)
        
        for v in scale_colors:
            if pv > v:
                scale += 1
        
        # pv = Decimal(pv)
        # pv = pv.quantize(Decimal('0.000001'))
    return scale

print(getFisher(20, 5, 20, 10))

2


In [4]:
import json
with open("../geojson/base_districts.geojson") as json_data:
    district_map = json.load(json_data)
    json_data.close()

In [5]:
shapeIDs = set()

for shape in district_map["features"]:
    shape["id"] = shape["properties"]["DISTRICT_N"]
    assert shape["id"] not in shapeIDs, "id already in list: %r" % shape["id"]
    shapeIDs.add(shape["id"])
    
    # These two fields look redundant. Let's try deleting them.
    
    shape["properties"].pop("DISTRICT_1", None)
    shape["properties"].pop("OBJECTID_1", None)


In [6]:
type(district_map["features"][1]['geometry']['coordinates'][0][1][1])

float

In [7]:
# district.loc[67908]["DPETALLC"]

In [8]:
# len(apple["DISTRICT"].unique())

In [8]:
# For districts overall, need columns that show what percentage of the state population they have
# and what percentage of the punishments?

def getLE(x):
    
    # Collects the correct values from the dataframes called "apple" and "district"
    # and calls the "impossible" function, which looks for data errors
    
    distPop = x["DPETALLC"]
    if distPop == None or np.nan or 0:
        return 1
    elif x["HEADING NAME"] == "ALL":
        return 0
    else:    
        all_punishments = x["all_punishments"]
        # all_punishments = apple.loc[x["DISTRICT"]][x["SECTION"]]["ALL"]
        group_punishments = x[year_col]
        # trying to make this run faster by returning info for two columns, then splitting them
        raceP = x["DPET{}P".format(x["HEADING NAME"][:3])]
        return impossible(distPop, raceP, all_punishments, group_punishments)


In [9]:
def getScale(x, punishment_totals, statewide_students_count):
    
    """
    This function does something very different for the "HEADING NAME == ALL" rows than for the other rows.
    For the "ALL" rows it uses the whole state population as the "distPop" and uses the entire district population
    as the "racePop". For the other rows, the entire district population is used as "distPop", not "racePop".
    
    And this function calls getFisher for the real calculation.
    """
    
    group_punishments = x[year_col]
    if x["HEADING NAME"] == "ALL":
        distPop = statewide_students_count
        racePop = x["DPETALLC"]
        all_punishments = punishment_totals[x["SECTION"]]
    else:
        distPop = x["DPETALLC"]
        racePop = x["DPET{}P".format(x["HEADING NAME"])] * distPop * .01
        if pd.isna(racePop):
            return None
        all_punishments = x["all_punishments"]
    # all_punishments = apple.loc[x["DISTRICT"]][x["SECTION"]]["ALL"]
    # print((distPop, racePop, all_punishments, group_punishments))
    if all_punishments == np.nan or None:
        print(x)
    if distPop == np.nan or None:
        print(x)
    return getFisher(distPop, racePop, all_punishments, group_punishments)

def getPercentage(x):
    return x[year_col] / x["all_punishments"] * 100

In [1]:
# Need to merge columns of apple and district.

years = [x for x in range(first_year, last_year + 1)] # change back to first_year

pop_stats = ("DPETALLC","DPETBLAP","DPETHISP","DPETWHIP","DPETINDP","DPETASIP","DPETPCIP",
             "DPETTWOP","DPETECOP","DPETSPEP")

demos = ('ALL','SPE', 'ECO','HIS','BLA', 'WHI','IND', 'ASI','PCI', 'TWO')

punishments = ('EXP','DAE','OSS','ISS')

fail = {} # for testing
noScale = {}

for year in years:
    print("starting year " + str(year))

    apple = getYear(year)
    # the path to the files in the district demographics directory
    districtPath = '../data/from_agency/districts/district{}.dat'.format(year)
    district = populations(districtPath)

    statewide_students_count = district["DPETALLC"].sum()
    year_col = "YR" + str(year)[-2:]

    apple = apple.reset_index()
    apple = apple.merge(district, how = "left", left_on = "DISTRICT", right_index = True)
    
    apple = apple[apple["DPETALLC"].notnull()]
    
    punishment_totals = {}
    for p in apple["SECTION"].unique():
        punishment_totals[p] = apple[apple["SECTION"] == p][apple["HEADING NAME"] == "ALL"][year_col].sum()
        
    # print(apple[18460:18470]) # previous problem rows, gone because of the .notnull()
    # This line will run slowly because for each row, it searches the entire dataframe
    apple["all_punishments"] = apple.apply(lambda x: 
                                           apple[apple["DISTRICT"] == x["DISTRICT"]][apple["SECTION"] == x["SECTION"]][apple["HEADING NAME"] == "ALL"][year_col].values[0], axis=1)

    
    apple["LikelyError"] = apple.apply(getLE, axis=1)
    apple["Scale"] = apple.apply(lambda x: getScale(x, punishment_totals, statewide_students_count), axis=1)
    apple["Percentage"] = apple.apply(getPercentage, axis=1).round(2)
    
    apple = apple.set_index(["DISTRICT","SECTION","HEADING NAME"])
    apple = apple.sort_index() # trying to improve speed
    print(apple[:20])
    for entry in district_map["features"]:
        entry["properties"][year] = {}
        for stat in pop_stats:
            # This will give NaN (numpy.float64) when empty
            try:
                entry["properties"][year][stat] = district.loc[entry["id"]][stat]
            except KeyError:
                # for when the map has a district not in the TEA's data
                print("no stats for " + str(year) + " " + str(entry["id"]))
                entry["properties"][year][stat] = None
        for punishment in punishments:
            entry["properties"][year][punishment] = {}
            for demo in demos:
                entry["properties"][year][punishment][demo] = {} # changed since last run
                try:
                    entry["properties"][year][punishment][demo]["C"] = int(apple.loc[entry["id"],punishment,demo][year_col])
                    entry["properties"][year][punishment][demo]["E"] = int(apple.loc[entry["id"],punishment,demo]["LikelyError"])
                    entry["properties"][year][punishment][demo]["P"] = int(apple.loc[entry["id"],punishment,demo]["Percentage"])
                except:
                    fail[entry["id"]] = (year,punishment,demo)
                try:
                    entry["properties"][year][punishment][demo]["S"] = int(apple.loc[entry["id"],punishment,demo]["Scale"])
                except:
                    noScale[entry["id"]] = (year,punishment,demo)
                    # print("Nothing for {} {} {}".format(entry["id"],punishment,demo))
                    # impossible(distPop, racePop, all_punishments, group_punishments)

NameError: name 'first_year' is not defined

In [51]:
int(apple.loc[entry["id"],punishment,demo][year_col])

20

In [53]:
apple.loc[entry["id"],punishment,demo]["Scale"][0]

-3

In [43]:
district_map["features"][2]

{'geometry': {'coordinates': [[[-96.46278835314988, 31.229404451711943],
    [-96.46107335146424, 31.228522452156774],
    [-96.45706035109677, 31.22231445110417],
    [-96.45896635051133, 31.221328449988],
    [-96.45746535100247, 31.218612450212767],
    [-96.45965335180645, 31.21643444999884],
    [-96.46935835367148, 31.212188447957075],
    [-96.4630813515068, 31.201666446642122],
    [-96.46832535285193, 31.198995445171388],
    [-96.47131335362265, 31.19848944536083],
    [-96.47498935356965, 31.195426444633284],
    [-96.47804835381092, 31.193795444272293],
    [-96.47910535456226, 31.191731443783556],
    [-96.484743356229, 31.19086844385786],
    [-96.49201935771202, 31.187761442354365],
    [-96.49267935846122, 31.186593442621557],
    [-96.4976593597751, 31.184624442100752],
    [-96.49930836013114, 31.185913442028163],
    [-96.51403036404001, 31.17854843950351],
    [-96.51298136230784, 31.176915440202517],
    [-96.52417936618637, 31.171562438348143],
    [-96.5150573615

In [31]:
with open('../geojson/districts_with_data.geojson', 'w') as fp:
    json.dump(district_map, fp, sort_keys=True, default=int)
    fp.close()

TypeError: '<' not supported between instances of 'int' and 'str'

In [None]:
district_map["features"][3]

In [None]:
district.loc[67908]

In [None]:
len(fail)

In [None]:
len(noScale)

In [None]:
apple.loc[67908, "DAE", "WHI"]

In [None]:
apple[2901:2902]

In [None]:
entry["properties"]#[year][punishment][demo]

In [None]:
# This puts all the years in one dataframe
# But I may not use this, and instead load the years into the GeoJSON from separate dataframes

for year in range(first_year + 1, last_year + 1):
    new_year = getYear(year)
    new_year = new_year.drop(["HEADING NAME","SECTION"], axis = 1)
    apple = pd.merge(apple, new_year, how='outer', left_index = True, right_index = True,
                     suffixes = (str(year-1), str(year))).fillna(value = 0) # left_index = True, right_index=True, 

In [None]:
apple[1050:1065]

In [None]:
district.head()

In [None]:
# df[df['A'] > 0]

q = district_map["features"][900]["properties"]["DISTRICT_N"]

district.loc[q]

# district[district["DISTRICT"] == 167903]