# This notebook is used for the Toronto assignment of the Coursera Capstone Project

## 1. Creating the Pandas Data Frame with the Required Data

In [3]:
#Importing libraries
from pathlib import Path

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [26]:
# Scrape the Wikipedia page and locate the table of Toronto ("M") postal codes.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
wiki_page = response.text

soup = BeautifulSoup(wiki_page, 'html.parser')
req_table = soup.find('table', class_="wikitable sortable")
# Fail here with a clear message rather than with an AttributeError in a later cell.
if req_table is None:
    raise ValueError("No 'wikitable sortable' table found at " + url)
# Show the header cells as a quick check that the right table was selected.
req_table.find_all('th')

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3B
</td>
<td>

In [47]:
# Parse the HTML table rows into three parallel lists, one per table column.
PC=[]
Borough=[]
Neighborhood=[]

for data in req_table.find_all('tr'):
    values = data.find_all('td')
    # Skip the header row (no <td> cells) and any row too short to hold all
    # three columns, which would otherwise raise IndexError below.
    if len(values) < 3:
        continue
    # get_text() gathers all text inside the cell (including text nested in
    # child tags) and returns '' for an empty cell, whereas find(text=True)
    # returns only the first text node and None when the cell is empty.
    PC.append(values[0].get_text().strip())
    Borough.append(values[1].get_text().strip())
    Neighborhood.append(values[2].get_text().strip())
        

In [55]:
# Assemble the three scraped column lists into a single DataFrame in one step.
postal_df = pd.DataFrame({
    'PostalCode': PC,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})
postal_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [58]:
# Remove rows whose Borough is 'Not assigned'. drop=True (a real boolean, not the
# string 'True') discards the old index instead of inserting it as a column.
postal_df=postal_df[postal_df['Borough']!='Not assigned'].reset_index(drop=True)

In [59]:
# Check whether any Neighborhood is still 'Not assigned' after the Borough filter;
# an empty result means nothing further needs cleaning.
postal_df[postal_df['Neighborhood']=='Not assigned'].reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood


In [61]:
# Display the (rows, columns) shape of postal_df to see how many postal codes it holds.
postal_df.shape

(103, 3)

In [64]:
#print df head
postal_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## 2. Adding Lat Lon columns to the Pandas Data Frame

In [67]:
import geocoder # import geocoder

In [95]:
# Geocoding lookup wrapped in a function. The number of requests per postal code
# is capped (50 by default) so a lookup that never succeeds cannot loop forever.
def getLatLon(row, max_tries=50):
    """Look up the latitude/longitude of one postal code via ``geocoder.google``.

    Parameters
    ----------
    row : object with a ``PostalCode`` attribute
        For example a row passed in by ``DataFrame.apply(..., axis=1)``.
    max_tries : int, optional
        Maximum number of geocoding requests before giving up. Defaults to 50,
        the limit that was previously hard-coded.

    Returns
    -------
    The ``latlng`` value reported by the geocoder on success, or the string
    ``"Not found -- Exiting"`` when no coordinates were obtained within
    ``max_tries`` requests.
    """
    postal_code = row.PostalCode
    print(postal_code)
    query = '{}, Toronto, Ontario'.format(postal_code)
    # NOTE(review): geocoder.google presumably needs a Google API key to return
    # results -- confirm before relying on this lookup.
    for tries in range(1, max_tries + 1):
        lat_lng_coords = geocoder.google(query).latlng
        # Check for success before giving up, so a hit on the final allowed
        # attempt is returned instead of being reported as "Not found".
        if lat_lng_coords is not None:
            print(lat_lng_coords, tries)
            return lat_lng_coords
    print("Tried "+str(max_tries)+" times")
    return "Not found -- Exiting"

In [96]:
# Run getLatLon on every row (axis=1) to gauge how long the geocoding lookups take;
# the result is only displayed, not stored in a variable.
postal_df.apply(getLatLon, axis=1)

M3A
Tried 50 times
Tried 100 times
Tried 150 times
Tried 200 times
Tried 250 times


KeyboardInterrupt: 

In [None]:
### Since it takes too long to get values for any postal code, we download the given CSV instead

In [97]:
# Read the downloaded coordinates CSV from the current user's Downloads folder.
# Path.home() replaces the hard-coded user directory, so the cell is not tied to
# one account and the notebook no longer embeds a local username.
latLon_df=pd.read_csv(Path.home() / 'Downloads' / 'Geospatial_Coordinates.csv')
latLon_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [101]:
# Attach coordinates to each postal code: give the key column of the coordinates
# frame the same name as in postal_df, then join the two frames on that column.
postal_df2 = postal_df.merge(
    latLon_df.rename(columns={'Postal Code': 'PostalCode'}),
    on='PostalCode',
)
postal_df2.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
