In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

## Extraction of table from wiki

In [3]:
# Source page: Wikipedia's list of Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per HTML table it finds; keep the first one,
# using the table's first row as the column header.
page_tables = pd.read_html(url, header=0)
df = page_tables[0]
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


## Data wrangling/Cleaning

In [4]:
# Keep only the postal codes that have a borough assigned.
has_borough = df['Borough'] != 'Not assigned'

# Build the cleaned frame in one chain so the cell is idempotent and never
# assigns into a filtered view of `df`:
#   1. drop the 'Not assigned' rows and renumber the index from 0,
#   2. swap the '/' separators in Neighborhood for ','.
# regex=False forces a literal replacement, so the result does not depend on
# the installed pandas version's default for the `regex` argument.
df_cleaned = (
    df.loc[has_borough]
    .reset_index(drop=True)
    .assign(
        Neighborhood=lambda frame: frame['Neighborhood'].str.replace('/', ',', regex=False)
    )
)
df_cleaned.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [5]:
# Spot-check a single postal code: show the row(s) whose Postalcode is 'M5A'.
is_m5a = df_cleaned['Postalcode'] == 'M5A'
df_cleaned.loc[is_m5a]

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"


In [6]:
# Preview the first five rows of the cleaned table.
df_cleaned.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


### Explanation of the above code

1. I read the table from the Wikipedia page with `pd.read_html`, taking the first table on the page and using its first row as the header (`header=0`).
2. All the entries/rows with 'Borough' as 'Not assigned' are removed.
3. All the separators ('/') in the Neighborhood column are replaced with (',') as per the instructions.
4. Checked (using postal code M5A as an example) that neighborhoods sharing a postal code are combined into a single row.

In [7]:
# Report (rows, columns) of the cleaned table.
cleaned_shape = df_cleaned.shape
print('Shape of the cleaned dataframe/table is :', cleaned_shape)

Shape of the cleaned dataframe/table is : (103, 3)


### Merging the cleaned dataframe with the dataframe from the given CSV file to get the coordinates

In [10]:
# Load the postal-code coordinate table from the provided CSV file.
geo_coords = pd.read_csv('Geospatial_Coordinates.csv')

# Attach the coordinate columns to every cleaned row by postal code.
# validate='one_to_one' makes pandas raise a MergeError if a postal code is
# repeated on either side, instead of silently duplicating rows.
neighborhoods = pd.merge(
    left=df_cleaned,
    right=geo_coords,
    how='inner',
    left_on='Postalcode',
    right_on='Postal Code',
    validate='one_to_one',
)

# An inner join silently drops postal codes that have no coordinates;
# fail loudly here rather than lose rows unnoticed.
assert len(neighborhoods) == len(df_cleaned), 'some postal codes have no matching coordinates'

# After the join 'Postal Code' duplicates 'Postalcode', so drop it.
neighborhoods = neighborhoods.drop(['Postal Code'], axis=1)
print(neighborhoods.shape)
neighborhoods.head()

(103, 5)


Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [12]:
# Count distinct boroughs (NaN, if present, counts as one value) and total rows.
borough_count = neighborhoods['Borough'].nunique(dropna=False)
row_count = len(neighborhoods)
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods.
