**This notebook builds a table of Chicago neighborhoods and their coordinates for the Final Capstone Project**

**(1) Imports**

In [3]:
from urllib.parse import parse_qs, urlsplit

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Installs geopy from the conda-forge channel via a shell escape.
# NOTE(review): the trailing comment says "uncomment this line", but the line
# is already active, so the install runs every time this cell is executed.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
# NOTE(review): in the cells shown here, Nominatim is only referenced from
# commented-out code.
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# NOTE(review): requests is already imported in the first import cell; this
# second import is redundant but harmless.
import requests # library to handle requests
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Installs folium pinned to version 0.5.0 from conda-forge. As above, the line
# is active despite the "uncomment" wording in its trailing comment.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.



**(2) Scrape the Wikipedia List of Chicago Neighborhoods**  

In [51]:
# Download the Wikipedia list of Chicago neighborhoods and parse its HTML.
page_link = 'https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Chicago'
page_response = requests.get(page_link, timeout=5)
# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would leave the table-extraction cells below with no usable data.
page_response.raise_for_status()
page_content = BeautifulSoup(page_response.content, "html.parser")

In [5]:
# The table has 2 columns in total, so only the first two <th> header cells
# are used as column names (newlines removed from their text).
header_cells = page_content.find_all("th")[:2]
col_names = []
for header_cell in header_cells:
    col_names.append(header_cell.text.replace('\n', ''))

# Empty frame carrying just the scraped column names.
df = pd.DataFrame(columns=col_names)

In [6]:
# Text of every <td> cell on the page. The cells are consumed as consecutive
# pairs: (first column, second column) of the neighborhood table.
tbl_content = [c.text for c in page_content.find_all("td")]

# Collect all rows first and build the DataFrame once. Appending with
# df.loc[len(df)] is quadratic and, worse, not idempotent: re-running the cell
# would append every row a second time to the already-filled frame.
rows = []
for first_cell, second_cell in zip(tbl_content[0::2], tbl_content[1::2]):
    if first_cell == '':  # hack: the string after last line of the table is blank, thus one can use this to judge the end of the table
        break
    rows.append([first_cell.replace('\n', ''), second_cell.replace('\n', '')])

df = pd.DataFrame(rows, columns=col_names)

**(3) Use Google Search to get Longitude and Latitude for each Neighborhood**

In [63]:
# Use Google Search to get the Coordinates
GOOGLE_SEARCH_URL = 'https://www.google.com/search'
GOOGLE_MAPS_PREFIX = 'https://maps.google.com/maps?'
# 'ei' value carried over unchanged from the original hand-built search URL.
GOOGLE_SEARCH_EI = 'GTT3W6_kKY_vjwTlg66oDQ'


def _lat_lon_from_links(links):
    """Return ``(lat, lon)`` as floats from the first usable Google Maps link.

    Parameters
    ----------
    links : iterable of BeautifulSoup ``<a>`` tags
        Anchors found on a search result page.

    Returns
    -------
    tuple of float
        Latitude and longitude parsed from the ``ll=<lat>,<lon>`` query
        parameter of the first link whose ``href`` starts with
        ``GOOGLE_MAPS_PREFIX``.

    Raises
    ------
    LookupError
        If no link carries an ``ll`` parameter.
    ValueError
        If the ``ll`` value is not two comma-separated numbers.
    """
    for link in links:
        # Anchors without an href must be skipped, not abort the whole lookup.
        href = link.get('href') or ''
        if not href.startswith(GOOGLE_MAPS_PREFIX):
            continue
        # Parse the query string properly: this finds ``ll`` wherever it sits
        # in the URL and does not confuse it with ``sll``.
        ll_values = parse_qs(urlsplit(href).query).get('ll')
        if ll_values:
            lat, lon = ll_values[0].split(',')
            return float(lat), float(lon)
    raise LookupError('no Google Maps link with an ll= parameter')


neighborhoods = []
community_areas = []
latitudes = []
longitudes = []
for neighborhood, community_area in zip(df['Neighborhood'], df['Community area']):
    label = ','.join([neighborhood, community_area])
    try:
        # Let requests build and URL-encode the query string. Dedicated
        # variable names keep the Wikipedia page_response / page_content used
        # by the earlier cells intact.
        search_response = requests.get(
            GOOGLE_SEARCH_URL,
            params={
                'ei': GOOGLE_SEARCH_EI,
                'q': '%s %s chicago coordinates' % (neighborhood, community_area),
            },
            timeout=5,
        )
        # An HTTP error page must count as a failure rather than being parsed.
        search_response.raise_for_status()
        search_soup = BeautifulSoup(search_response.content, "html.parser")
        # The coordinates come only from THIS page's links; nothing is carried
        # over from a previous iteration, so a page without a matching link
        # can no longer inherit the previous neighborhood's coordinates.
        lat, lon = _lat_lon_from_links(search_soup.find_all('a'))
        print("Success for %s" % label)
    except (requests.RequestException, LookupError, ValueError):
        # Expected failure modes only; KeyboardInterrupt and programming
        # errors are no longer swallowed as they were by a bare ``except``.
        lat, lon = None, None
        print("Fail for %s" % label)
    latitudes.append(lat)
    longitudes.append(lon)
    neighborhoods.append(neighborhood)
    community_areas.append(community_area)

df_raw_google = pd.DataFrame({'Neighborhood': neighborhoods, 'Community area': community_areas, 'Latitude': latitudes, 'Longitude': longitudes})

Success for Albany Park,Albany Park
Success for Altgeld Gardens,Riverdale
Success for Andersonville,Edgewater
Success for Archer Heights,Archer Heights
Success for Armour Square,Armour Square
Success for Ashburn,Ashburn
Success for Ashburn Estates,Ashburn
Success for Auburn Gresham,Auburn Gresham
Success for Avalon Park,Avalon Park
Success for Avondale,Avondale
Fail for Avondale Gardens,Irving Park
Fail for Back of the Yards,New City
Fail for Belmont Central,Belmont Cragin
Fail for Belmont Gardens,Hermosa
Fail for Belmont Heights,Dunning
Fail for Belmont Terrace,Dunning
Success for Beverly,Beverly
Success for Beverly View,Ashburn
Success for Beverly Woods,Morgan Park
Success for Big Oaks,Norwood Park
Success for Boystown,Lake View
Success for Bowmanville,Lincoln Square
Success for Brainerd,Washington Heights
Success for Brickyard,Belmont Cragin
Success for Bridgeport,Bridgeport
Success for Brighton Park,Brighton Park
Success for Bronzeville,Douglas
Success for Bucktown,Logan Square
Suc

In [57]:
# Manually fill the failed cases.
# For some of the areas below, searching without the community area, i.e.
# 'https://www.google.com/search?ei=GTT3W6_kKY_vjwTlg66oDQ&q=%s+chicago+coordinates' % neighborhood,
# does return a result. For Avondale Gardens, the result parameter starts with
# "sll" instead of "ll".
#
# Each entry: (Neighborhood, Community area, Latitude, Longitude).
MANUAL_COORDINATES = [
    ('Avondale Gardens', 'Irving Park', 41.950931, -87.71768),
    ('Back of the Yards', 'New City', 41.807533, -87.666163),
    ('Belmont Central', 'Belmont Cragin', 41.9261544, -87.7777071),
    ('Belmont Gardens', 'Hermosa', 41.9353935, -87.7296983),
    ('Belmont Heights', 'Dunning', 41.9430001, -87.8177243),
    ('Belmont Terrace', 'Dunning', 41.9414, -87.8329),
    ('Lake View East', 'Lake View', 41.9413585, -87.6443089),
]

for neighborhood, community_area, lat, lon in MANUAL_COORDINATES:
    row_mask = (
        (df_raw_google['Neighborhood'] == neighborhood)
        & (df_raw_google['Community area'] == community_area)
    )
    if not row_mask.any():
        # Previously a key that matched no row was a silent no-op; report it
        # so a typo or a renamed neighborhood does not go unnoticed.
        print("No row found for %s,%s" % (neighborhood, community_area))
        continue
    df_raw_google.loc[row_mask, 'Latitude'] = lat
    df_raw_google.loc[row_mask, 'Longitude'] = lon

In [120]:
# Persist the scraped and manually patched coordinates next to the notebook.
# The DataFrame index is written as well (default to_csv behaviour).
output_csv = 'Chicago Neighborhoods.csv'
df_raw_google.to_csv(output_csv)

In [13]:
# # We also tried Nominatim (code below) to get the coordinates. However, it fails for many neighborhoods, so we use Google Search (above) instead.
# geolocator = Nominatim()

# neighborhoods_fail = []
# community_areas_fail = []

# for i in range(len(df)):
#     try:
#         address = '%s, %s'%(df['Neighborhood'].iloc[i], df['Community area'].iloc[i])
#         location = geolocator.geocode(address)
#         latitudes.extend([location.latitude])
#         longitudes.extend([location.longitude])
#         neighborhoods.extend([df['Neighborhood'].iloc[i]])
#         community_areas.extend([df['Community area'].iloc[i]])
#         print("Success for %s"%address)
#     except: 
#         print("Fail for %s"%address)
#         latitudes.extend([None])
#         longitudes.extend([None])
#         neighborhoods.extend([df['Neighborhood'].iloc[i]])
#         community_areas.extend([df['Community area'].iloc[i]])
# #             pass



Success for Albany Park, Albany Park
Success for Altgeld Gardens, Riverdale
Success for Andersonville, Edgewater
Success for Archer Heights, Archer Heights
Fail for Armour Square, Armour Square
Success for Ashburn, Ashburn
Fail for Ashburn Estates, Ashburn
Fail for Auburn Gresham, Auburn Gresham
Success for Avalon Park, Avalon Park
Success for Avondale, Avondale
Fail for Avondale Gardens, Irving Park
Success for Back of the Yards, New City
Fail for Belmont Central, Belmont Cragin
Success for Belmont Gardens, Hermosa
Fail for Belmont Heights, Dunning
Success for Belmont Terrace, Dunning
Success for Beverly, Beverly
Success for Beverly View, Ashburn
Fail for Beverly Woods, Morgan Park
Fail for Big Oaks, Norwood Park
Success for Boystown, Lake View
Success for Bowmanville, Lincoln Square
Fail for Brainerd, Washington Heights
Fail for Brickyard, Belmont Cragin
Success for Bridgeport, Bridgeport
Success for Brighton Park, Brighton Park
Success for Bronzeville, Douglas
Fail for Bucktown, Log

**(4) Note: even after the steps above, the latitudes and longitudes scraped from Google Search still had to be cleaned/corrected manually. The cleaned/corrected data is saved in "Chicago Neighborhoods_WZ.csv".**