In [264]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os.path
from os import path
from geopy.geocoders import Nominatim
import math
import time

In [270]:
#Function to extract Neighborhoods from Wikipedia
def extractFromWikipedia():
    """Scrape the list of Los Angeles neighborhoods from Wikipedia.

    Downloads the "List of districts and neighborhoods of Los Angeles" page,
    keeps every anchor whose markup contains ', Los Angeles' and builds one
    row per distinct neighborhood name (the link text, cut at the first ', ').

    Returns:
        pandas.DataFrame with the columns 'Neighborhood', 'Link', 'Lat' and
        'Lon'. 'Lat' and 'Lon' are NaN placeholders. The frame is also
        written to disk through saveToFile() before it is returned.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    url = "https://en.wikipedia.org/wiki/List_of_districts_and_neighborhoods_of_Los_Angeles"
    #A timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    #Fail loudly instead of silently scraping an error page
    response.raise_for_status()
    parsedHtml = BeautifulSoup(response.text, "html.parser")

    #Extract the name and target of the links we are interested in
    nbhs = []
    for anchor in parsedHtml.findAll('a'):
        if ', Los Angeles' not in str(anchor):
            continue
        href = anchor.get('href')
        if href is None:
            #An anchor without a target cannot be turned into a link
            continue
        neighborhood = anchor.text
        comma = neighborhood.find(', ')
        if comma > 0:
            neighborhood = neighborhood[0:comma]
        nbhs.append([neighborhood, 'https://en.wikipedia.org' + href])
        print(neighborhood)

    #Naming the columns up front keeps the frame well-formed even when no
    #link matched. keep='first' retains one row per neighborhood; keep=False
    #would drop every neighborhood that is linked more than once on the page.
    #NaN (not '') is the placeholder so numeric NaN checks work on a fresh frame.
    nb_df = (
        pd.DataFrame(nbhs, columns=['Neighborhood', 'Link'])
        .drop_duplicates(subset='Neighborhood', keep='first')
        .assign(Lat=np.nan, Lon=np.nan)
    )
    #Save results
    saveToFile(nb_df)
    print('Data saved to LosAngeles_tmp.csv')
    return nb_df

In [271]:
def saveToFile(nb_df, filename='LosAngeles_tmp.csv'):
    """Write the neighborhood frame to a CSV file without the index column.

    Args:
        nb_df: pandas.DataFrame to persist.
        filename: destination path; defaults to the working file
            'LosAngeles_tmp.csv', so existing one-argument calls are unchanged.
    """
    nb_df.to_csv(filename, index=False)

In [272]:
def getAllCoordinates(nb_df):
    """Fill in 'Lat'/'Lon' for every row of nb_df that has no latitude yet.

    The frame is updated in place. A neighborhood that getCoordinates()
    cannot resolve gets -999 in both columns. The frame is written to disk
    through saveToFile() after every lookup.

    Args:
        nb_df: pandas.DataFrame with 'Neighborhood', 'Lat' and 'Lon' columns.

    Returns:
        Number of rows that were looked up during this call; 0 means no row
        was missing a latitude.
    """
    count = 0
    for index, row in nb_df.iterrows():
        lat = row['Lat']
        #A missing latitude may be NaN (e.g. a blank CSV cell) or a blank
        #string placeholder; math.isnan() raises TypeError on strings, so
        #both forms are checked explicitly.
        is_missing = pd.isna(lat) or (isinstance(lat, str) and lat.strip() == '')
        if not is_missing:
            continue
        count = count + 1
        geo = getCoordinates(row['Neighborhood'])
        if geo:
            nb_df.at[index,'Lat'] = geo['lat']
            nb_df.at[index,'Lon'] = geo['lon']
        else:
            #Sentinel for "looked up but not found", so the row is not retried
            nb_df.at[index,'Lat'] = -999
            nb_df.at[index,'Lon'] = -999
        #Save the file each time we get new coordinates (geo service fails frequently)
        saveToFile(nb_df)
        #Pause between lookups, presumably to avoid overloading the geo service
        time.sleep(1)
    return count

In [273]:
def getCoordinates(neighborhood):
    """Geocode a single Los Angeles neighborhood with Nominatim.

    Args:
        neighborhood: neighborhood name; ", Los Angeles" is appended to it
            to form the query string.

    Returns:
        A dict {'lat': ..., 'lon': ...} taken from the geocoder result, or
        False when the geocoder returns no match.
    """
    print("Getting coordinates from " + neighborhood)
    query = neighborhood + ", Los Angeles"
    match = Nominatim(user_agent="LosAngelesBrowser").geocode(query)
    #Guard clause: report the miss and hand back the False marker
    if match is None:
        print("Unable to get coordinates from "+query)
        return False
    return {'lat':match.latitude, 'lon':match.longitude}
        

In [275]:
#Resume from the working file when it exists; otherwise scrape the list again
if path.exists('LosAngeles_tmp.csv'):
    print('Reading neighborhoods from file')
    nb_df = pd.read_csv('LosAngeles_tmp.csv')
else:
    print('Extracting neighborhoods from Wikipedia')
    nb_df = extractFromWikipedia()
#getAllCoordinates() only returns once every row holds coordinates or the
#-999 marker, so the final file can be written right after it, without
#running the cell a second time. If a lookup raises, nothing below runs and
#the cell can be re-run to continue from the working file.
getAllCoordinates(nb_df)
print('Complete!')
#Drop the neighborhoods the geocoder could not resolve
nb_def_df = nb_df[nb_df['Lat'] != -999]
nb_def_df.to_csv('LosAngeles.csv', index=False)

Reading neighborhoods from file
Complete!
