In [1]:
import orca
import pandana as pdna
import pandas as pd
import scipy.stats as st
import numpy as np
import os

from urbansim.utils import networks
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep

from urbansim.utils import misc

from difflib import SequenceMatcher

import warnings
warnings.filterwarnings("ignore")

# Loading the data

In [33]:
# Set data directory: start from the default location and override it with
# the orca 'data_directory' injectable when one has been registered.
d = '/home/data/fall_2018/'

if 'data_directory' in orca.list_injectables():
    d = orca.get_injectable('data_directory')

# School database.
# Read the identifier-like columns as text: 'Zip' is sliced as a string during
# cleaning and 'CDSCode' is compared against a string literal after the merge,
# so letting pandas infer a numeric dtype would break both steps.
schools = pd.read_csv('schools.csv', dtype={'Zip': str, 'CDSCode': str})

# CHTS data
@orca.table(cache=True)
def persons():
    """Load the CHTS persons file from the data directory.

    Returns:
        pandas.DataFrame read from 'chts_persons_w_zone_ids.csv' under the
        module-level directory `d`, indexed by the ("SAMPN", "PERNO") columns.
    """
    # os.path.join works whether or not the data directory ends with a path
    # separator, unlike plain string concatenation.
    csv_path = os.path.join(d, 'chts_persons_w_zone_ids.csv')
    return pd.read_csv(csv_path, index_col=["SAMPN", "PERNO"])

# School-name standardization workbook; its columns are relabelled on load
# with the names listed below.
_s_name_columns = [
    'SAMPN', 'PERNO',
    'SNAME_lookup_y', 'SNAME',
    'SZIP_lookup_y', 'SZIP',
    'SCITY_lookup_y', 'SCITY',
    'similarity', 'Matched?',
]
s_name = pd.read_excel('chts school cleaning.xlsx', names=_s_name_columns)
# os.listdir("/home/data/CHTS_csv_format/data")

# Cleaning school database

In [34]:
# School database: normalize the text fields used for matching against CHTS.
schools['School'] = schools['School'].str.lower()
schools['City'] = schools['City'].str.lower()

# Keep only the first 5 characters of the ZIP code; cast to str first so that
# a numerically-typed column does not break the slicing.
schools['Zip'] = schools['Zip'].astype(str).str[:5]

# Single matching key "<school> <city> <zip>". Every part is separated by a
# space so the key has the same layout as students['NAME_CITY_ZIP_lookup'].
schools['school_city_zip'] = schools['School'] + ' ' + schools['City'] + ' ' + schools['Zip']

# Cleaning CHTS 

In [35]:
# Materialize the orca 'persons' table as a dataframe.
persons = orca.get_table('persons').to_frame()

# Select the persons that are students with a usable school name.
is_student = persons['STUDE'].isin([1, 2])   # full time & part time students
is_k12 = persons['SCHOL'].isin([3,            # Kindergarten to grade 8
                                4])           # Grades 9 to 12
has_school_name = persons['SNAME_lookup'].notna() & (persons['SNAME_lookup'] != "DK/RF")
students = persons.loc[is_student & is_k12 & has_school_name]

# Standardizing school names: outer join with the lookup table on the person key.
students = students.merge(s_name, how="outer", on=['SAMPN', 'PERNO'])

# Drop rows whose SNAME is 'DELETE' or whose 'Matched?' flag is 'INCLUDE'.
keep = (students['SNAME'] != 'DELETE') & (students['Matched?'] != 'INCLUDE')
students = students[keep].reset_index()

# CLEANING:
# School and city names in lowercase.
for column in ['SNAME', 'SCITY', 'SNAME_lookup', 'SCITY_lookup']:
    students[column] = students[column].str.lower()

# ZIP codes as strings truncated to 5 characters.
for column in ['SZIP', 'SZIP_lookup']:
    students[column] = students[column].astype(str).str[:5]

# Remove the word "school" from the reported school name.
students['SNAME_lookup'] = students['SNAME_lookup'].map(
    lambda reported_name: reported_name.replace(' school', '')
)

# School name + city + ZIP code combined in one column.
students['NAME_CITY_ZIP_lookup'] = (
    students['SNAME_lookup'] + ' ' + students['SCITY_lookup'] + ' ' + students['SZIP_lookup']
)

# #CREATING A CHTS SCHOOL DATABASE
# # schools_chts = students.groupby(by=["SNAME_lookup", "SZIP_lookup"]).agg({"RELAT": 'count'})
# schools_chts = students.groupby(by=["SNAME", "SZIP", "SCITY"]).size().reset_index(name='enrollment')
# schools_chts
students.shape

(3385, 203)

## Merging CHTS and school dataset

In [63]:
# Attach each school's CDSCode by matching the standardized school name and ZIP.
school_keys = schools[['School', 'Zip', 'CDSCode']]
students_standarize_school_name = students.merge(
    school_keys,
    how="left",
    left_on=['SNAME', 'SZIP'],
    right_on=['School', 'Zip'],
)

# Checking the number of unique schools
students_standarize_school_name.groupby('SNAME').size().shape

(1110,)

In [64]:
# Keys Family Day School: the same school name covers different grades, so
# students are assigned by age. Rows pairing CDSCode 43696416141899 with a
# student older than 10 are dropped.
is_keys_code = students_standarize_school_name['CDSCode'] == '43696416141899'
is_older_than_10 = students_standarize_school_name['AGE'] > 10
mask = is_keys_code & is_older_than_10
students_standarize_school_name = students_standarize_school_name.loc[~mask]

In [65]:
# Export the merged table to a CSV file.
export_path = 'students_standarize_school_name.csv'
students_standarize_school_name.to_csv(export_path)

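## Additional code
#### Do not run!! The cell below fuzzy-matches CHTS school names against the school database and writes `merge_school.csv` for manual review; it appears to be kept for reference only.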

In [69]:
def similar(a, b):
    """Return the difflib similarity ratio between sequences `a` and `b`.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    No junk filter is applied.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

# Compute, for every CHTS "school + city + zipcode" key, its similarity to
# every key of the school database. Per CHTS row, keep the best score and the
# positions of the (up to) 10 most similar schools, sorted ascending by score.
max_probabilities = []
max_probabilities_index = []

for school in students['NAME_CITY_ZIP_lookup']:
    prob = [similar(name, school) for name in schools['school_city_zip']]
    max_probabilities.append(np.max(prob))
    max_probabilities_index.append(np.argsort(prob)[-10:])

len(max_probabilities)

# Assign to each CHTS observation the school with the greatest similarity index.
# np.argsort sorts ascending, so the best candidate is the LAST entry of each
# candidates array; taking [-1] instead of a fixed [9] also works when fewer
# than 10 schools are available. The argsort values are positions, so the rows
# are fetched with .iloc rather than label-based indexing.
best_positions = [candidates[-1] for candidates in max_probabilities_index]
best_matches = schools.iloc[best_positions]
school_name = best_matches['School'].tolist()
school_zipcode = best_matches['Zip'].tolist()
school_city = best_matches['City'].tolist()

# Add the assigned values to the students dataframe, one column per result.
assigned_columns = {
    'SNAME_similarity_resulst': school_name,
    'ZipCode': school_zipcode,
    'CityName': school_city,
    'similarity': max_probabilities,
}
for column_name, column_values in assigned_columns.items():
    students[column_name] = column_values

# Export the result to a CSV file for human verification.
review_columns = [
    'SNAME_lookup', 'SZIP_lookup', 'SCITY_lookup',
    'SNAME_similarity_resulst', 'ZipCode', 'CityName', 'similarity',
]
students[review_columns].to_csv('merge_school.csv')