# Building the DataFrame from a Wikipedia page

## Importing required libraries

In [19]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Getting the wiki page and creating a BeautifulSoup object

In [20]:
# Download the Wikipedia list of Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of
# silently parsing an error page further down.
wiki_page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_page.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser.
wiki_page_bs = BeautifulSoup(wiki_page.content, 'html.parser')

## Getting the table that holds the data from the Wikipedia page

In [21]:
# find() returns the first <table> element in the document, or None when the
# page contains no table at all. Fail here with a clear message rather than
# letting str(None) reach pd.read_html in the next cell.
table_with_data = wiki_page_bs.find('table')
if table_with_data is None:
    raise ValueError('No <table> element found on the Wikipedia page')

In [22]:
# pd.read_html returns a list of DataFrames (one per table it parses); keep the first.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases (2.1+).
df = pd.read_html(StringIO(str(table_with_data)))[0]

In [23]:
# Preview the parsed table. Note that the header text (Postcode, Borough,
# Neighbourhood) arrives as data row 0 and the columns are labelled 0, 1, 2.
df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


## Assigning appropriate column names

In [24]:
# The scraped header text sits in the first data row, so use that row's
# values as the column labels.
header_row = df.iloc[0]
df.columns = header_row

In [25]:
# Work on a new frame that keeps every row except label 0, which only
# holds the header text.
df_to_work = df.drop(index=0)

In [26]:
# Standardise the column labels used by the rest of the notebook.
# The renamed frame is rebound instead of using inplace=True, so the cell is a
# plain expression with no hidden mutation of an existing object.
df_to_work = df_to_work.rename(
    columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}
)

In [27]:
# Check that the header row is gone and the columns carry the new names.
df_to_work.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


## Dropping rows whose Borough is "Not assigned"

In [28]:
# Collect the index labels of rows whose borough is the placeholder
# 'Not assigned', then drop exactly those rows.
is_unassigned = df_to_work['Borough'] == 'Not assigned'
unassigned_labels = df_to_work[is_unassigned].index
df_to_work = df_to_work.drop(unassigned_labels)

## Changing Neighborhood to Borough if Neighborhood is not assigned

In [29]:
# Where a row has no neighbourhood ('Not assigned'), fall back to its borough name.
# Series.where keeps the values where the condition is True and substitutes the
# second argument elsewhere. It is vectorised, unlike a row-by-row
# DataFrame.apply(axis=1), and it also behaves correctly on an empty frame.
has_neighborhood = df_to_work['Neighborhood'] != 'Not assigned'
df_to_work = df_to_work.assign(
    Neighborhood=df_to_work['Neighborhood'].where(has_neighborhood, df_to_work['Borough'])
)

## Merging neighborhoods that share the same postal code

In [30]:
def join_with_comma(data):
    """Return a copy of ``data`` with every 'Neighborhood' value set to one joined string.

    All values of the 'Neighborhood' column are converted to ``str`` and joined
    with ', '; the joined string is written to every row of the returned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Neighborhood' column (e.g. one group from ``groupby``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``data``. The input is
        left untouched: mutating the object handed in by ``groupby.apply`` is
        discouraged by pandas and can produce unexpected results.
    """
    joined = ', '.join(map(str, data['Neighborhood'].values))
    # assign() returns a new DataFrame and broadcasts the scalar to every row.
    return data.assign(Neighborhood=joined)

In [31]:
# For every row, compute the comma-joined neighbourhoods of its postal code.
# GroupBy.transform returns a Series aligned with the original rows, so no
# group frame has to be mutated and the grouping column is not routed through
# DataFrameGroupBy.apply (operating on grouping columns there is deprecated
# in pandas 2.2+).
joined_neighborhoods = df_to_work.groupby('PostalCode')['Neighborhood'].transform(
    lambda names: ', '.join(map(str, names))
)

# Rows of one postal code are now identical, so drop_duplicates() leaves one
# row per distinct (PostalCode, Borough, Neighborhood) in first-seen order.
df_to_work = (
    df_to_work
    .assign(Neighborhood=joined_neighborhoods)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [32]:
# Preview the result: postal codes with several neighbourhoods now show them
# as one comma-separated string.
df_to_work.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


## Printing out the shape of the final DataFrame

In [33]:
# (rows, columns) of the cleaned frame.
df_to_work.shape

(103, 3)

## Reading in geospatial coordinates

In [36]:
# Load the coordinates file and rename its key column so it matches the
# 'PostalCode' column of df_to_work for the merge below.
df_gc = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)

## Merging with previously built dataframe on PostalCode

In [37]:
# Inner join on the postal code: only codes present in both frames are kept.
df_full = pd.merge(df_to_work, df_gc, how='inner', on='PostalCode')

In [39]:
# Preview the merged frame; Latitude and Longitude come from the coordinates file.
df_full.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [40]:
# Persist the merged frame. The row index is written too, as the first
# (unnamed) column of the CSV file.
df_full.to_csv('df_with_ll.csv', index=True)

In [41]:
# (rows, columns) after the merge; an inner join would show fewer rows here
# than df_to_work if any postal code had no coordinates.
df_full.shape

(103, 5)