In [207]:
import orca
import pandana as pdna
import pandas as pd
import scipy.stats as st
import numpy as np

from urbansim.utils import networks
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep

from urbansim.utils import misc


import warnings
warnings.filterwarnings("ignore")

## Loading the data

In [208]:
# Set data directory
d = '/home/data/fall_2018/'

if 'data_directory' in orca.list_injectables():
    d = orca.get_injectable('data_directory')
    
# School database 
schools = pd.read_csv('schools.csv')

@orca.table(cache=True)
def persons():
    df = pd.read_csv(
        d + 'chts_persons_w_zone_ids.csv',
        index_col = ["SAMPN", "PERNO"]
    )
    return df

In [254]:
import os
os.listdir("/home/data/CHTS_csv_format/data")

['Deliv_HH.csv',
 'Deliv_ACTIVITY.csv',
 'Deliv_LD.csv',
 'LookUp_LD.csv',
 'LookUp_Home.csv',
 'ASSN_TravelDate.csv',
 'LookUp_PLACE.csv',
 'LookUp_PER.csv',
 'Deliv_PER.csv',
 'Deliv_VEH.csv',
 'Deliv_PLACE.csv']

## Cleaning school database for merging 

In [228]:
# School database 
schools['School'] = schools['School'].str.lower()
schools['City'] = schools['City'].str.lower()
schools.Zip = [x[:5] for x in schools.Zip]
schools['school_city_zip'] = schools['School'] + ' '+ schools['City'] + '' + schools['Zip']
schools.head(5)

Unnamed: 0.1,Unnamed: 0,CDSCode,School,District,County,Street,City,Zip,State,Latitude,...,Total Enrollment,sw_rank_2108,sw_rank_2017,sw_rank_2016,ss_rank_2018_,ss_rank_2017,ss_rank_2016,nodeID,school_zip,school_city_zip
0,0,1100170112607,envision academy for arts & technology,Alameda County Office of Education,Alameda,1515 Webster Street,oakland,94612,CA,37.80452,...,403.0,4.0,3.0,2.0,,5.0,4.0,53082889,envision academy for arts & technology 94612,envision academy for arts & technology oakland...
1,1,1100170123968,community school for creative education,Alameda County Office of Education,Alameda,2111 International Boulevard,oakland,94606,CA,37.784648,...,208.0,2.0,1.0,2.0,,1.0,5.0,53087346,community school for creative education 94606,community school for creative education oaklan...
2,2,1100170124172,yu ming charter,Alameda County Office of Education,Alameda,1086 Alcatraz Avenue,oakland,94608,CA,37.847375,...,357.0,10.0,10.0,10.0,,6.0,6.0,53126420,yu ming charter 94608,yu ming charter oakland94608
3,3,1100170125567,urban montessori charter,Alameda County Office of Education,Alameda,5328 Brann Street,oakland,94619,CA,37.778352,...,374.0,6.0,6.0,6.0,,1.0,1.0,53097173,urban montessori charter 94619,urban montessori charter oakland94619
4,4,1100170130401,alameda county juvenile hall/court,Alameda County Office of Education,Alameda,2500 Fairmont Avenue,san leandro,94578,CA,37.712878,...,87.0,,,,,,,718372425,alameda county juvenile hall/court 94578,alameda county juvenile hall/court san leandro...


##  Cleaning CHTS 

In [252]:

#Create a dataframe selects person that are students. 
students = persons.loc[persons['STUDE'].isin([1, 2])]# full time & part time students
#                  & persons['SCHOL'].isin([3, # Kindergarten to grade 8
#                                           4])  # Grades 9 to 12 
#                  ]
students.shape

(5944, 191)

In [231]:
#Create a dataframe for persons data. 
persons = orca.get_table('persons').to_frame()

#Create a dataframe selects person that are students. 
students = persons.loc[persons['STUDE'].isin([1, 2])# full time & part time students
                 & persons['SCHOL'].isin([3,  # Kindergarten to grade 8
                                          4])  # Grades 9 to 12 
                 & (~persons['SNAME_lookup'].isna()) 
                 & (persons['SNAME_lookup'] != "DK/RF")]

#CLEANING: 
#School and city name in lowercase 
students['SNAME_lookup'] = students['SNAME_lookup'].str.lower()
students['SCITY_lookup'] = students['SCITY_lookup'].str.lower()

#String zipcode with 5 characters. 
students['SZIP_lookup'] = students['SZIP_lookup'].astype(str)
students['SZIP_lookup'] = [x[:5] for x in students['SZIP_lookup']]

#Deleting the word school from school name
students['SNAME_lookup'] = [x.replace(' school', '') for x in students['SNAME_lookup']]

#Creting school + zipcode in 1 column
students['NAME_CITY_ZIP_lookup'] = students['SNAME_lookup'] + ' ' +students['SCITY_lookup']+' ' + students['SZIP_lookup']

#CREATING A CHTS SCHOOL DATABASE
# schools_chts = students.groupby(by=["SNAME_lookup", "SZIP_lookup"]).agg({"RELAT": 'count'})
schools_chts = students.groupby(by=["SNAME_lookup", "SZIP_lookup"]).size().reset_index(name='enrollment')
students

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,index,RELAT,GEND,AGE,AGEB,HISP,RACE1,RACE2,RACE3,...,HYCORD,HBLOCK,HTRACT,HCTFIP,HPrimaryCity,parcel_id_home,parcel_id_work,zone_id_home,zone_id_work,NAME_CITY_ZIP_lookup
SAMPN,PERNO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1037952,2,6,42,3,1,17,,1,1.0,,,...,37.467578,60816119006001,611900,81,PALO ALTO,1195607,,333,,good side high menlo park 94025
1050385,3,33,172,3,1,14,,2,1.0,,,...,37.537021,60816103042015,610304,81,REDWOOD CITY,1257253,,311,,charles armstrong belmont 94002
1050385,4,34,173,3,1,10,,2,1.0,,,...,37.537021,60816103042015,610304,81,REDWOOD CITY,1257253,,311,,st charles san carlos 94070
1051388,3,41,185,3,1,10,,2,1.0,,,...,37.767279,60750171011002,17101,75,SAN FRANCISCO,913377,,89,,presidio hill san francisco 94118
1051795,3,44,193,3,1,13,,2,1.0,,,...,37.757638,60750326021008,32602,75,SAN FRANCISCO,938197,,177,,presidio san francisco 94129
1059276,2,52,265,3,1,15,,1,1.0,,,...,38.046522,60411050001015,105000,41,NOVATO,814151,,1407,,novato high novato 94947
1173970,3,55,5920,3,1,8,,1,1.0,,,...,38.045329,60411050001015,105000,41,NOVATO,814151,,1407,,hamilton elementary novato 94949
1173970,4,56,5921,3,2,6,,1,1.0,,,...,38.045329,60411050001015,105000,41,NOVATO,814151,,1407,,hamilton elementary novato 94949
1064644,3,66,335,3,1,15,,1,1.0,,,...,37.692202,60816012004001,601200,81,DALY CITY,1052397,,193,,capatino high san bruno 94030
1088852,3,99,430,3,1,11,,2,1.0,,,...,37.915146,60133383011027,338301,13,WALNUT CREEK,396616,,1140,,walnut acres walnut creek 94598


In [216]:
list(a)

['Unnamed: 0',
 'index',
 'RELAT',
 'GEND',
 'AGE',
 'AGEB',
 'HISP',
 'RACE1',
 'RACE2',
 'RACE3',
 'RACE4',
 'O_RACE',
 'NTVTY',
 'CNTRY',
 'LIC',
 'USER',
 'TRANS',
 'TPTYP1',
 'TPTYP2',
 'TPTYP3',
 'TPTYP4',
 'TPTYP5',
 'TPTYP6',
 'TPTYP7',
 'O_TPTYP',
 'CLIP1',
 'CLIP2',
 'CLIP3',
 'COMP',
 'MET',
 'PASSTL',
 'FLEX',
 'EMPLY',
 'WKSTAT',
 'O_WKSTAT',
 'JOBS',
 'WLOC',
 'WNAME_persons',
 'WCITY_persons',
 'WSTAT',
 'WZIP_persons',
 'WXST1_persons',
 'WXST2_persons',
 'WXCORD_persons',
 'WYCORD_persons',
 'WDAYS',
 'WDAY1',
 'WDAY2',
 'WDAY3',
 'WDAY4',
 'WDAY5',
 'WDAY6',
 'WDAY7',
 'HOURS',
 'WSCHED',
 'COMPR',
 'WMODE',
 'INDUS',
 'O_INDUS',
 'OCCUP',
 'O_OCCUP',
 'WLOC2',
 'WNAME2_persons',
 'WCITY2_persons',
 'WSTAT2',
 'WZIP2_persons',
 'WXST2_1_persons',
 'WXST2_2_persons',
 'WDAYS2',
 'DISAB',
 'DTYPE1',
 'DTYPE2',
 'DTYPE3',
 'DTYPE4',
 'DTYPE5',
 'DTYPE6',
 'DTYPE7',
 'O_DTYPE',
 'DSLIC',
 'EDIS',
 'TTRIP',
 'TRNSUB',
 'SUBAMT',
 'SUBUNT',
 'O_SUBUNT',
 'WTRIP',
 'BTRIP',


## Merging CHTS and school dataset

In [135]:
merge_shcool = pd.merge(schools, schools_chts, 
                    how="right", left_on=['School', 'Zip'],
                       right_on = ['SNAME_lookup', 'SZIP_lookup'])

merge_shcool.loc[:,['SNAME_lookup', 'SZIP_lookup', 'CDSCode']]
merge_shcool.head()

# merge_shcool.to_csv('merge_shcool.csv') #Exporting the database to excel

Unnamed: 0.1,Unnamed: 0,CDSCode,School,District,County,Street,City,Zip,State,Latitude,...,sw_rank_2108,sw_rank_2017,sw_rank_2016,ss_rank_2018_,ss_rank_2017,ss_rank_2016,nodeID,SNAME_lookup,SZIP_lookup,enrollment
0,20.0,1611190130229,alameda high,Alameda Unified,Alameda,2201 Encinal Avenue,Alameda,94501,CA,37.764294,...,9.0,10.0,10.0,,5.0,7.0,53127640.0,alameda high,94501,6
1,30.0,1611196090039,franklin elementary,Alameda Unified,Alameda,1433 San Antonio Avenue,Alameda,94501,CA,37.769036,...,9.0,9.0,9.0,,7.0,5.0,295969400.0,franklin elementary,94501,3
2,31.0,1611196090047,henry haight elementary,Alameda Unified,Alameda,2025 Santa Clara Avenue,Alameda,94501,CA,37.769332,...,8.0,7.0,7.0,,7.0,6.0,53127640.0,henry haight elementary,94501,2
3,32.0,1611196090054,lincoln middle,Alameda Unified,Alameda,1250 Fernside Boulevard,Alameda,94501,CA,37.752242,...,10.0,10.0,10.0,,8.0,7.0,2430995000.0,lincoln middle,94501,2
4,38.0,1611270130450,albany high,Albany City Unified,Alameda,603 Key Route Boulevard,Albany,94706,CA,37.896661,...,10.0,10.0,10.0,,9.0,10.0,261736300.0,albany high,94706,16


In [232]:
aa = [similar(a,x) for x in schools['School']]

## Be careful, this codes takes ages to run. 
max_probabilities = []
max_probabilities_index = []

for school in students['NAME_CITY_ZIP_lookup']:
    prob = []
    for name in schools['school_city_zip']:
        similar_probaility = similar(name, school)
        prob.append(similar_probaility)
    
    max_prob = np.max(prob)
    max_prob_index = np.argsort(prob)[-10:]
    max_probabilities.append(max_prob)
    max_probabilities_index.append(max_prob_index)

len(max_probabilities)

3583

In [233]:
# len(school_zipcode)
len(max_probabilities)

3583

In [245]:
max_probabilities_index[0]

schools['School'][max_probabilities_index[0][9]]
school_name = [schools['School'][max_probabilities_index[x][9]] for x in range(len(max_probabilities_index))]
school_zipcode = [schools['Zip'][max_probabilities_index[x][9]] for x in range(len(max_probabilities_index))]
school_city = [schools['City'][max_probabilities_index[x][9]] for x in range(len(max_probabilities_index))]

In [248]:
students['SNAME_similarity_resulst'] = school_name
students['ZipCode'] = school_zipcode
students['CityName'] = school_city
students['similarity'] = max_probabilities
students.loc[:,['SNAME_lookup', 'SZIP_lookup','SCITY_lookup' ,'SNAME_similarity_resulst', 'ZipCode', 'CityName','similarity' ]].to_csv('merge_school.csv')

students.loc[:,['SNAME_lookup', 'SZIP_lookup','SCITY_lookup' ,'SNAME_similarity_resulst', 'ZipCode', 'CityName','similarity' ]]

Unnamed: 0_level_0,Unnamed: 1_level_0,SNAME_lookup,SZIP_lookup,SCITY_lookup,SNAME_similarity_resulst,ZipCode,CityName,similarity
SAMPN,PERNO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1037952,2,good side high,94025,menlo park,beechwood school,94025,menlo park,0.698413
1050385,3,charles armstrong,94002,belmont,charles armstrong school,94002,belmont,0.882353
1050385,4,st charles,94070,san carlos,st. charles elementary,94070,san carlos,0.800000
1051388,3,presidio hill,94118,san francisco,presidio hill school,94118,san francisco,0.888889
1051795,3,presidio,94129,san francisco,presidio middle,94121,san francisco,0.838710
1059276,2,novato high,94947,novato,novato high,94947,novato,0.978723
1173970,3,hamilton elementary,94949,novato,lu sutton elementary,94947,novato,0.812500
1173970,4,hamilton elementary,94949,novato,lu sutton elementary,94947,novato,0.812500
1064644,3,capatino high,94030,san bruno,capuchino high,94066,san bruno,0.827586
1088852,3,walnut acres,94598,walnut creek,walnut acres elementary,94598,walnut creek,0.833333


In [82]:
# schools_raw.reset_index(inplace=True)  
# schools_raw['SNAME_lookup'] = schools_raw['SNAME_lookup'].str.lower()
# schools_raw

# schools_raw.SNAME_lookup.value_counts()

In [83]:
# school_name = [x.replace(' school', '') for x in schools_raw.SNAME_lookup]
# schools_raw['SNAME_lookup'] = school_name
# schools_raw.SNAME_lookup.value_counts()
# schools_raw[schools_raw.SNAME_lookup == 'santa rosa junior college']

In [31]:
# merge_shcool = pd.merge(schools, schools_raw, 
#                     how="inner", left_on=['School', 'Zip'],
#                        right_on = ['SNAME_lookup', 'SZIP_lookup'])

# merge_shcool = pd.merge(schools, schools_raw, 
#                     how="inner", left_on=['School'],
#                        right_on = ['SNAME_lookup'])

merge_shcool = pd.merge(schools, schools_raw, 
                    how="inner", left_on=['School'],
                       right_on = ['SNAME_lookup'])

merge_shcool

#look schools that are similar 

aa = [similar(a,x) for x in schools['School']]

for school in 

Unnamed: 0.1,Unnamed: 0,CDSCode,School,District,County,Street,City,Zip,State,Latitude,...,sw_rank_2016,ss_rank_2018_,ss_rank_2017,ss_rank_2016,nodeID,index,SCHOL,SNAME_lookup,SZIP_lookup,enrollment
0,20,1611190130229,alameda high,Alameda Unified,Alameda,2201 Encinal Avenue,Alameda,94501,CA,37.764294,...,10.0,,5.0,7.0,53127645,140,4.0,alameda high,94501,3
1,30,1611196090039,franklin elementary,Alameda Unified,Alameda,1433 San Antonio Avenue,Alameda,94501,CA,37.769036,...,9.0,,7.0,5.0,295969404,42,3.0,franklin elementary,94501,3
2,283,1612596001820,franklin elementary,Oakland Unified,Alameda,915 Foothill Boulevard,Oakland,94606,CA,37.793794,...,5.0,,2.0,3.0,53038428,42,3.0,franklin elementary,94501,3
3,959,41688826043541,franklin elementary,Burlingame Elementary,San Mateo,2385 Trousdale Drive,Burlingame,94010,CA,37.588922,...,10.0,,7.0,8.0,65497027,42,3.0,franklin elementary,94501,3
4,1288,43694506047195,franklin elementary,Franklin-McKinley Elementary,Santa Clara,420 Tully Road,San Jose,95111,CA,37.305453,...,8.0,,5.0,7.0,313095361,42,3.0,franklin elementary,94501,3
5,38,1611270130450,albany high,Albany City Unified,Alameda,603 Key Route Boulevard,Albany,94706,CA,37.896661,...,10.0,,9.0,10.0,261736298,141,4.0,albany high,94706,6
6,38,1611270130450,albany high,Albany City Unified,Alameda,603 Key Route Boulevard,Albany,94706,CA,37.896661,...,10.0,,9.0,10.0,261736298,142,4.0,albany high,94706,10
7,40,1611276090161,albany middle,Albany City Unified,Alameda,1259 Brighton Avenue,Albany,94706,CA,37.897851,...,10.0,,7.0,7.0,53024965,3,3.0,albany middle,94706,4
8,41,1611276095376,marin elementary,Albany City Unified,Alameda,1001 Santa Fe Avenue,Albany,94706,CA,37.888559,...,10.0,,4.0,5.0,53056317,73,3.0,marin elementary,94706,5
9,44,1611430131177,berkeley high,Berkeley Unified,Alameda,1980 Allston Way,Berkeley,94704,CA,37.868913,...,9.0,,3.0,8.0,53116966,157,4.0,berkeley high,94704,3


In [32]:
merge_shcool.shape

(225, 53)

In [100]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [122]:
a = 'georgina p blach'
b = 'waldorf school of the peninsula'

similar(a,b)

0.2553191489361702

In [131]:
aa = [similar(a,x) for x in schools['School']]
aa[1350]

0.47619047619047616

In [124]:
schools['School'][np.argsort(aa)][-10:]

1350        george c. payne elementary
322     american indian public charter
2026                     orion academy
1892                 mentoring academy
2412              regina caeli academy
220        american indian public high
2353                   granada islamic
2246        gloria dei lutheran school
2025                    orinda academy
1333     georgina p. blach junior high
Name: School, dtype: object