In [5]:
# import libraries you might need
import pandas as pd;
import numpy as np;
import math;
import re; # regex

# Format new campus and district entity names

In `scuole`, we have an `entities.csv` file at the district and county level that contains details used to create District and County models. Each year, we need to update it because districts and schools can get renamed/removed/added.

Here's the data we're looking at:
- A new entities file for districts: take the newest district `reference.csv` and rename it to `entities_district.csv`.
- A new entities file for campuses: take the newest campus `reference.csv` and rename it to `entities_campus.csv`.

We take the campus and district names from these files and format them. The formatted names are written to a new `XX_CLEAN` column: `DISTNAME_CLEAN` for districts and `CAMPNAME_CLEAN` for campuses.

In [None]:
# initial format of the reference files

def zero_pad_id_column(source_csv, id_column, width, output_csv):
    """Left-pad the IDs in one reference CSV with zeros and save a copy.

    Reads `source_csv`, converts every value in `id_column` to a string
    padded with leading zeros to `width` characters, writes the result to
    `output_csv` (without the index) and returns the edited DataFrame.
    """
    reference = pd.read_csv(source_csv)
    reference[id_column] = reference[id_column].apply(lambda x: str(x).zfill(width))
    reference.to_csv(output_csv, index=False)
    return reference


# campus IDs are padded to 9 characters, district IDs to 6
reference_campus_edited = zero_pad_id_column(
    'reference-campus-19-20.csv', 'CAMPUS', 9, 'reference-campus-edited.csv')
reference_district_edited = zero_pad_id_column(
    'reference-district-19-20.csv', 'DISTRICT', 6, 'reference-district-edited.csv')

# Names the previous version of this cell left behind, bound to the same
# final values (the unedited district file and the padded district frame)
# in case another cell still reads them. The campus frame used to be loaded
# into `reference_19_20` first and was immediately overwritten, so that
# redundant read is gone.
reference_19_20 = pd.read_csv('reference-district-19-20.csv')
reference_edited_rating = reference_district_edited

In [6]:
# If the name is MCKINLEY
# We want to make sure McKinley is shown
def uppercase_mc(name_portion):
    """Fix the capitalisation of every Mc/Mac-prefixed word in a name.

    A word qualifies when it starts (case-insensitively) with "mc" or "mac",
    is not followed by "hin" (so "Machine" is left alone) and has at least
    two more word characters after the prefix. For each such word the prefix
    is title-cased and the next letter upper-cased, e.g. "Mckinley" ->
    "McKinley"; the remainder of the word is kept as it was.

    Args:
        name_portion: the string to correct.

    Returns:
        The corrected string; the input is returned unchanged when no word
        qualifies.
    """
    mc_re = r'(?i)\b(?P<mc>ma?c)(?!hin)(?P<first_letter>\w)(?P<rest>\w+)'

    def fix_case(match):
        # Rebuild the matched word piece by piece so only this word changes.
        return (
            match.group('mc').title()
            + match.group('first_letter').upper()
            + match.group('rest')
        )

    # One substitution pass with a callback handles every qualifying word,
    # not only the first one found, and never touches text outside a match.
    return re.sub(mc_re, fix_case, name_portion)

In [9]:
# DISTRICT ENTITIES
# Load the district entities file into a DataFrame. Per the notes at the top
# of this notebook, entities_district.csv is the newest district
# reference.csv under a new name.
entities_district = pd.read_csv('entities_district.csv')

In [10]:
# add the modified name in column 'DISTNAME_CLEAN'

# Suffix abbreviations that title-casing lower-cases and that have to be
# restored. Each (pattern, replacement) pair is applied to the title-cased
# name, in this order. `(?!.)` only lets a pattern match when no further
# character follows it. Raw strings are used so that `\s` reaches the regex
# engine as written instead of being an invalid string escape.
# NOTE(review): the trailing `*` binds only to the last letter, so `Isd*`
# also matches a name ending in " Is". Matching is left exactly as it was so
# the output does not change -- confirm whether a strict suffix was intended.
DISTRICT_SUFFIX_FIXES = [
    (r'\s+Isd*(?!.)', ' ISD'),
    (r'\s+Cisd*(?!.)', ' CISD'),
    (r'\s+Csd*(?!.)', ' CSD'),
    (r'\s+Llc*(?!.)', ' LLC'),
]


def clean_district_name(raw_name):
    """Return the display form of one raw DISTNAME value.

    Steps, in order: title-case the name, restore the upper-case suffixes
    listed in DISTRICT_SUFFIX_FIXES, turn "'S" back into "'s" (title-casing
    capitalises the letter after an apostrophe) and correct names with 'Mc'
    in them (i.e. Mckinley -> McKinley).
    """
    name = raw_name.title()
    for pattern, replacement in DISTRICT_SUFFIX_FIXES:
        name = re.sub(pattern, replacement, name)
    name = name.replace("'S", "'s")  # needs to come after title casing
    return uppercase_mc(name)


entities_district['DISTNAME_CLEAN'] = entities_district['DISTNAME'].apply(clean_district_name)

# NEED TO FORMAT THE COUNTY AND REGION
entities_district['COUNTY'] = entities_district['COUNTY'].astype(int)
entities_district['REGION'] = entities_district['REGION'].astype(int)

# errors='ignore' keeps this cell re-runnable after D_RATING has been dropped
entities_district = entities_district.drop(columns=['D_RATING'], errors='ignore')

# write to a CSV
# entities_district.head()
entities_district.to_csv('entities_district_edited.csv', index=False)

In [13]:
# CAMPUS ENTITIES
# Load the campus entities file into a DataFrame. Per the notes at the top
# of this notebook, entities_campus.csv is the newest campus reference.csv
# under a new name.
entities_campus = pd.read_csv('entities_campus.csv')

In [14]:
# for campuses with no modified name, add the modified name in column 'CAMPNAME_CLEAN'

# Abbreviations at the end of a raw campus name and what they expand to.
# The pairs are applied in this order to the name as it appears in CAMPNAME
# (before title-casing); the multi-word "... CTR" entries must stay ahead of
# the bare "CTR" entry. `(?!.)` only lets a pattern match when no further
# character follows it. Raw strings are used so that `\s` reaches the regex
# engine as written instead of being an invalid string escape.
# NOTE(review): the trailing `*` binds only to the last letter, so e.g.
# `INT*` also matches a name ending in " IN". Matching is left exactly as it
# was so the output does not change -- confirm whether that was intended.
CAMPUS_ABBREVIATION_FIXES = [
    (r'\s+H S*(?!.)', ' High School'),
    (r'\s+J H*(?!.)', ' Junior High'),
    (r'\s+EL*(?!.)', ' Elementary School'),
    (r'\s+PRI*(?!.)', ' Primary School'),
    (r'\s+LRN CTR*(?!.)', ' Learning Center'),
    (r'\s+DETENT CTR*(?!.)', ' Detention Center'),
    (r'\s+EDUC CTR*(?!.)', ' Education Center'),
    (r'\s+MIDDLE*(?!.)', ' Middle School'),
    (r'\s+INT*(?!.)', ' Intermediate School'),
    (r'\s+AEC*(?!.)', ' Alternative Education Center'),
    (r'\s+JUSTICE C*(?!.)', ' Justice Center'),
    (r'\s+CTR*(?!.)', ' Center'),
    (r'\s+CEN*(?!.)', ' Center'),
]

# Terms excluded from title casing: JJAEP, DAEP, PK-8, J J A E P.
# Each pair maps the title-cased spelling back to the wanted one.
CAMPUS_ACRONYM_FIXES = [
    ('Jjaep', 'JJAEP'),
    ('Daep', 'DAEP'),
    ('Pk-8', 'PK-8'),
    ('Pk - 8', 'PK-8'),
    ('J J A E P', 'JJAEP'),
]


def clean_campus_name(raw_name):
    """Return the display form of one raw CAMPNAME value.

    Steps, in order: expand the trailing abbreviations listed in
    CAMPUS_ABBREVIATION_FIXES, title-case the result, restore the terms in
    CAMPUS_ACRONYM_FIXES, turn "'S" back into "'s" (title-casing capitalises
    the letter after an apostrophe) and correct names with 'Mc' in them
    (i.e. Mckinley -> McKinley).
    """
    name = raw_name
    for pattern, replacement in CAMPUS_ABBREVIATION_FIXES:
        name = re.sub(pattern, replacement, name)

    # title case everything, then undo it for the excluded terms
    name = name.title()
    for title_cased, wanted in CAMPUS_ACRONYM_FIXES:
        name = name.replace(title_cased, wanted)

    # replace 'S with 's -- needs to come after title casing
    name = name.replace("'S", "'s")

    return uppercase_mc(name)


entities_campus['CAMPNAME_CLEAN'] = entities_campus['CAMPNAME'].apply(clean_campus_name)

# NEED TO FORMAT THE COUNTY AND DISTRICT
entities_campus['COUNTY'] = entities_campus['COUNTY'].astype(int)
entities_campus['DISTRICT'] = entities_campus['DISTRICT'].astype(int)

# errors='ignore' keeps this cell re-runnable after C_RATING has been dropped
entities_campus = entities_campus.drop(columns=['C_RATING'], errors='ignore')

# write to a CSV
entities_campus.to_csv('entities_campus_edited.csv', index=False)

In [25]:
# helpful to read about python apply lambda functions, iloc, and loc