# Calculation of distance to school, metro and jail

## The code is structured as follows
- Calculating distances to a top 20% school
    - Scraping the ranked list of all schools
    - Use *`geocode`* to get coordinates
    - Calculate distances from each apartment
    - Construct list of minimum distance to top school
- Calculating distances to a metro
- Calculating distances to a jail

In [2]:
import os

import numpy as np
import pandas as pd

## School data
In the first part of this code, we retrieve data on municipalities and on which zip codes belong to which municipality.

Because many of the columns are intertwined, we need to split and merge.

In [3]:
# Get zip codes and municipalities from DST
url_post = 'https://www.dst.dk/ext/4393839853/0/kundecenter/Tabel-Postnumre-kommuner-og-regioner--xlsx'
df_muni = pd.read_excel(url_post)

# Skip the first four rows of the sheet and name the three columns.
# .copy() makes df2_muni an independent frame rather than a slice of df_muni.
df2_muni = df_muni[4:].copy()
df2_muni.columns = ['Zip', 'Municipality','Region']


# Split data: we want to separate zip code and village as well as municipality number and municipality.
# `n` is passed by keyword because pandas 2 no longer accepts it positionally.
zip_split = pd.DataFrame(df2_muni.Zip.str.split(' ', n=1).tolist(),
                                   columns = ['Zip','Village'])

mun_split = pd.DataFrame(df2_muni.Municipality.str.split(' ', n=1).tolist(),
                                   columns = ['Mun. no.','Municipality'])

# Merge data back together (both halves share the same 0-based index)
merge = pd.concat([zip_split, mun_split], axis=1, sort=False)

# Take an explicit copy before adding a column; assigning into a bare column
# selection is what triggers pandas' SettingWithCopyWarning.
mun_zip = merge[['Zip','Municipality']].copy()

# Construct new variable that only contains municipalities with zip code below 3000
mun_zip['Int zip'] = mun_zip['Zip'].astype(int)
our_sample = mun_zip[(mun_zip['Int zip'] < 3000)]

# Drop duplicates so we have a simple list of the municipalities we are interested in
municipalities = our_sample['Municipality'].drop_duplicates().reset_index()

# List of our chosen municipalities
municip = pd.DataFrame(municipalities['Municipality'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
# Get table of school ranking.
# pd.read_html returns a list with one DataFrame per HTML table found on the page;
# a later cell uses the first one (html[0]).
url_school = 'https://www.sondagsavisen.dk/familien/2015-08-22-se-hele-listen-her-er-danmarks-bedste-og-vaerste-skole/'
html = pd.read_html(url_school)

In [155]:
# The first row of the scraped table holds the column names:
# promote it to the header and keep the remaining rows as data.
df_school = pd.DataFrame(html[0])
head_school = df_school.rename(columns = df_school.iloc[0])
school = head_school[1:]

# Only include schools within our chosen municipalities
schools = school[school['Kommune'].isin(municip['Municipality'])]
n_school = schools.groupby(['Kommune']).size().reset_index(name='Count')


# Find threshold for best schools: the number of schools in a municipality is
# floor-divided by five to get the size of its top 20%. Municipalities with fewer
# than five schools would get 0, so the threshold is raised to at least 1.
# This is done in one vectorized step; writing through threshold['Count'][i] is
# chained assignment, which pandas does not guarantee to reach the frame.
threshold = n_school.assign(Count=(n_school['Count'] // 5).clip(lower=1))

# Merge threshold to our schools, so we can exclude schools with ranking above threshold
schools_merge = pd.merge(schools, threshold, how='left',
        left_on='Kommune', right_on='Kommune')

# The scraped ranking is cast to int so it can be compared with the threshold
schools_merge['Ranking'] = schools_merge['Placering Kommune'].astype(int)

# Keep schools whose municipal ranking is within the municipality's threshold
school_drop = schools_merge[schools_merge['Ranking'] <= schools_merge['Count']].reset_index(drop = True)
school_final = school_drop.drop(columns=['Placering Kommune', 'UE 2014', 'Placering landsplan','Privat/ Offentlig','Count'])

# Rename schools that cannot be found using geocode.
# Keys are fragments looked for inside the scraped school name; values are the
# search string (an alternative spelling or a street address) handed to the geocoder
# instead. The order matters: the first fragment that matches wins.
GEOCODE_OVERRIDES = {
    'Østerhøjskolen': 'Østerhøj skole',
    'Kaptajn Johnsens Skole': 'Lykkesholms Alle 3A',
    'Sct. Joseph Søstrenes Skole S/I': 'Skovkrogen 19',
    'Atheneskolen – skolen for børn med særlige forudsætninger': 'Rosenkæret 22A',
    'Bagsværd Gymnasiums Grundskole': 'Aldershvilevej 138',
    'Greve Privatskole': 'Hundige Bygade 2',
    'Tjørnelyskolen': 'Lillevangsvej 48',
    'Skt. Pauls Skole': 'Sankt Pauls Skole',
    'Ådalens Privatskole': 'Skovvej 15',
    'Rungsted Private Realskole': 'Vallerød Banevej 23',
    'Hay Skolen': 'Sankt Hans Gade 25',
    'Amager’s International School': 'Engvej 141, 2300 København',
    'Jinnah International School': 'Skjulhøj Alle 59',
    'Iqra Privatskole': 'Hermodsgade 28',
    'Copenhagen Euro School': 'Gammel Kongevej 15',
    'Nørre Fælled Skole': 'Biskop Krags Vænge 7',
    'Al Hikma Skolen': 'Ellebjergvej 50',
    'Øresunds Internationale Skole': 'Engvej 153, 2300 København',
    'Sjællands Privatskole': 'Nattergalevej 32',
    'Baunehøjskolen': 'Baunegårdsvej',
    'Dronninggårdskolen': 'Rønnebærvej 33',
    'Uglegårdsskolen – Uglegård afdeling': 'Tingsryds Alle 25',
}


def _geocodable_name(school_name):
    """Return the search string to geocode for one scraped school name.

    Uses the override of the first matching fragment in GEOCODE_OVERRIDES;
    otherwise keeps the part of the name before its first comma.
    """
    for fragment, search_string in GEOCODE_OVERRIDES.items():
        if fragment in school_name:
            return search_string
    return school_name.split(',', 1)[0] # split once, keep 1st part


# Iterate over the values directly: Series.iteritems() no longer exists in pandas 2.
new_name = [_geocodable_name(school_name) for school_name in school_final['Skolenavn']]

school_final.insert(loc=0, column='School name', value=new_name)

In [156]:
# [Getting latitude and longitude for schools]
# Import packages
!pip3 install tqdm
!pip3 install geopy
import numpy as np
import pandas as pd
import time, tqdm
import geopy.geocoders  # GeoPy - see https://pypi.org/project/geopy/
from geopy.geocoders import Nominatim # retrieve coordinates from addresses etc.
geopy.geocoders.options.default_user_agent = 'my_app/1'
geopy.geocoders.options.default_timeout = 15

geolocator = Nominatim()
# geolocator.headers  # check header
# geolocator.timeout  # check time_out
latitude = []
longitude = []
address = []

for row in tqdm.tqdm(school_final['School name']):
    row_string = str(row)
    location = geolocator.geocode(row_string)
    if isinstance(location, geopy.location.Location):
        latitude.append(float(location.latitude))
        longitude.append(float(location.longitude))
    else:
        print('Not found: ',row_string)
        latitude.append(None)
        longitude.append(None)
school_final.insert(loc=0, column='Latitude', value=latitude)
school_final.insert(loc=0, column='Longitude', value=longitude)

/bin/sh: pip3: command not found
/bin/sh: pip3: command not found



  0%|          | 0/57 [00:00<?, ?it/s][A
  2%|▏         | 1/57 [00:00<00:17,  3.19it/s][A
  4%|▎         | 2/57 [00:00<00:12,  4.56it/s][A
  5%|▌         | 3/57 [00:00<00:10,  5.26it/s][A
  7%|▋         | 4/57 [00:00<00:09,  5.67it/s][A
  9%|▉         | 5/57 [00:00<00:08,  6.02it/s][A
 11%|█         | 6/57 [00:00<00:08,  6.28it/s][A
 12%|█▏        | 7/57 [00:01<00:07,  6.48it/s][A
 14%|█▍        | 8/57 [00:01<00:07,  6.60it/s][A
 16%|█▌        | 9/57 [00:01<00:07,  6.76it/s][A
 18%|█▊        | 10/57 [00:01<00:06,  6.88it/s][A
 19%|█▉        | 11/57 [00:01<00:06,  6.97it/s][A
 21%|██        | 12/57 [00:01<00:06,  7.05it/s][A
 23%|██▎       | 13/57 [00:01<00:06,  7.07it/s][A
 25%|██▍       | 14/57 [00:01<00:06,  7.13it/s][A
 26%|██▋       | 15/57 [00:02<00:05,  7.15it/s][A
 28%|██▊       | 16/57 [00:02<00:05,  7.20it/s][A
 30%|██▉       | 17/57 [00:02<00:05,  7.25it/s][A
 32%|███▏      | 18/57 [00:02<00:05,  7.26it/s][A
 33%|███▎      | 19/57 [00:02<00:05,  7.29it/s]

### Calculate distances to school from apartments

In [466]:
# School coordinates.
# Schools the geocoder could not find have missing coordinates and cannot take part
# in a distance calculation, so they are dropped here. The index is renumbered from 0
# because the distance loop addresses rows by position.
# Rows of school_coord therefore no longer line up one-to-one with school_final.
school_coord = (
    school_final[['Longitude', 'Latitude']]
    .dropna()
    .reset_index(drop=True)
)

##############
# FINAL DATASET FOR APARTMENTS
################

# Apartment coordinates.
# The file location can be overridden with the APARTMENT_CSV environment variable;
# without it the original local path is used.
APARTMENT_CSV = os.environ.get('APARTMENT_CSV', '/Users/Ngottschalck/Desktop/Mathias/Python/priser.csv')
data_apart = pd.read_csv(APARTMENT_CSV)
apartment_coord = data_apart[['Longitude', 'Latitude']].copy()

In [467]:
# Geodesic distance in km from every apartment to every school.
# The result is one flat list ordered apartment by apartment: all schools for the
# first apartment, then all schools for the second, and so on.
from geopy.distance import geodesic as dist

school_points = list(zip(school_coord['Latitude'], school_coord['Longitude']))

school_distance = []
for apartment_point in zip(apartment_coord['Latitude'], apartment_coord['Longitude']):
    school_distance.extend(
        dist(apartment_point, school_point).km for school_point in school_points
    )

In [468]:
# Helper that cuts a flat sequence into consecutive chunks. It is used to regroup the
# flat distance list so that split_dist[i] holds the distances for apartment i.

def splitDataFrameIntoSmaller(df, chunkSize = 10000):
    """Cut `df` into consecutive slices of `chunkSize` items each.

    The number of slices is len(df) // chunkSize + 1, so when len(df) is an exact
    multiple of chunkSize the returned list ends with one empty slice.
    """
    sliceCount = len(df) // chunkSize + 1
    return [df[k * chunkSize:(k + 1) * chunkSize] for k in range(sliceCount)]

# Regroup the flat distance list into one chunk per apartment (chunk size = number of schools).
# The list length is an exact multiple of the chunk size, so the helper's result
# ends with one extra, empty chunk.
split_dist = splitDataFrameIntoSmaller(school_distance,len(school_coord))

In [484]:
# Shortest distance to a top school for each apartment.
# The last chunk of split_dist is left out of the calculation.
min_dist_school = [min(chunk) for chunk in split_dist[:-1]]

## Jail data

In [509]:
# Get jail data from github. Every value in the file is divided by 1,000,000
# (presumably the coordinates are stored as integers scaled by that factor - confirm against the CSV).
jail_data = pd.read_csv('https://raw.githubusercontent.com/thornoe/sds_2018/master/CPH/Data/jail.csv', sep = ';')/1000000

# Distance in km from each apartment to each jail, as one flat list ordered apartment by apartment
jail_points = list(zip(jail_data['Lat'], jail_data['Long']))
jail_distance = [
    dist(apartment_point, jail_point).km
    for apartment_point in zip(apartment_coord['Latitude'], apartment_coord['Longitude'])
    for jail_point in jail_points
]

# Regroup the flat list into one chunk per apartment
split_jail = splitDataFrameIntoSmaller(jail_distance,len(jail_data))

# Shortest distance to a jail for each apartment; the last chunk is left out
min_dist_jail = [min(chunk) for chunk in split_jail[:-1]]


## Metro data

In [525]:
# Get metro data from github. Every value in the file is divided by 1,000,000
# (presumably the coordinates are stored as integers scaled by that factor - confirm against the CSV).
metro_data = pd.read_csv('https://raw.githubusercontent.com/thornoe/sds_2018/master/CPH/Data/metro.csv', sep = ';')/1000000

# Distance in km from each apartment to each metro entry, as one flat list ordered apartment by apartment
metro_points = list(zip(metro_data['lat'], metro_data['long']))
metro_distance = [
    dist(apartment_point, metro_point).km
    for apartment_point in zip(apartment_coord['Latitude'], apartment_coord['Longitude'])
    for metro_point in metro_points
]

# Regroup the flat list into one chunk per apartment
split_metro = splitDataFrameIntoSmaller(metro_distance,len(metro_data))

# Shortest distance to a metro for each apartment; the last chunk is left out
min_dist_metro = [min(chunk) for chunk in split_metro[:-1]]