# Scraping the data from wikipedia using BeautifulSoup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from bs4 import BeautifulSoup

Wikipedia URL

In [2]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html=urlopen(url)

In [3]:
soup=BeautifulSoup(html,"lxml")
type(soup)

bs4.BeautifulSoup

In [4]:
title=soup.title
print(title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [48]:
rows=soup.find_all("tr")


In [49]:
for row in rows:
    row_td = row.find_all('td')


In [51]:
import re

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)

# Loading the Scraped data into a DataFrame

In [9]:
df = pd.DataFrame(list_rows)
df.head(10)

Unnamed: 0,0
0,[]
1,"[M1A, Not assigned, Not assigned\n]"
2,"[M2A, Not assigned, Not assigned\n]"
3,"[M3A, North York, Parkwoods\n]"
4,"[M4A, North York, Victoria Village\n]"
5,"[M5A, Downtown Toronto, Harbourfront\n]"
6,"[M5A, Downtown Toronto, Regent Park\n]"
7,"[M6A, North York, Lawrence Heights\n]"
8,"[M6A, North York, Lawrence Manor\n]"
9,"[M7A, Queen's Park, Not assigned\n]"


# Data Cleaning

In [10]:
df=df[0].str.split(",",expand=True)
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,[],,,,,,,,,,...,,,,,,,,,,
1,[M1A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
2,[M2A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
3,[M3A,North York,Parkwoods\n],,,,,,,,...,,,,,,,,,,
4,[M4A,North York,Victoria Village\n],,,,,,,,...,,,,,,,,,,
5,[M5A,Downtown Toronto,Harbourfront\n],,,,,,,,...,,,,,,,,,,
6,[M5A,Downtown Toronto,Regent Park\n],,,,,,,,...,,,,,,,,,,
7,[M6A,North York,Lawrence Heights\n],,,,,,,,...,,,,,,,,,,
8,[M6A,North York,Lawrence Manor\n],,,,,,,,...,,,,,,,,,,
9,[M7A,Queen's Park,Not assigned\n],,,,,,,,...,,,,,,,,,,


In [11]:
df=df[[0 ,1 ,2]]

In [12]:
df.head()

Unnamed: 0,0,1,2
0,[],,
1,[M1A,Not assigned,Not assigned\n]
2,[M2A,Not assigned,Not assigned\n]
3,[M3A,North York,Parkwoods\n]
4,[M4A,North York,Victoria Village\n]


In [13]:
df=df.iloc[1:-5,:]
df.head()

Unnamed: 0,0,1,2
1,[M1A,Not assigned,Not assigned\n]
2,[M2A,Not assigned,Not assigned\n]
3,[M3A,North York,Parkwoods\n]
4,[M4A,North York,Victoria Village\n]
5,[M5A,Downtown Toronto,Harbourfront\n]


In [16]:
df.reset_index(drop=True).head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned\n]
1,M2A,Not assigned,Not assigned\n]
2,M3A,North York,Parkwoods\n]
3,M4A,North York,Victoria Village\n]
4,M5A,Downtown Toronto,Harbourfront\n]


In [15]:
df[0]=df[0].str.lstrip('[')
df.head(10)
df=df.reset_index(drop=True)

In [17]:
df[2]=df[2].apply(lambda x: x[:-2])

In [18]:
df.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [19]:
df.columns=["PostalCode","Borough","Neighbourhood"]

In [20]:
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [21]:
df1=df[df["Borough"]!=" Not assigned"]

In [22]:
replacer=df1[df1["Neighbourhood"]== " Not assigned"]["Borough"].values

In [23]:
df1["Neighbourhood"].replace(" Not assigned",replacer[0],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [24]:
(df1["Neighbourhood"]== " Not assigned").value_counts()

False    211
Name: Neighbourhood, dtype: int64

In [25]:
df1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [32]:
uni=df1["PostalCode"].unique()
uni

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

In [33]:
def filtering(df1,uni):
    bor=[]
    neigh=[]
    df2=pd.DataFrame()
    for col in df1.columns:
        df2[col]=uni
    for ps in uni:
        bor.append(df1[df1["PostalCode"]==ps]["Borough"].unique()[0])
        neigh.append(df1[df1["PostalCode"]==ps]["Neighbourhood"].values)
    df2["Borough"]=bor
    df2["Neighbourhood"]=neigh
    return df2

In [34]:
df2=filtering(df1,uni)
df2.shape

(103, 3)

In [35]:
df2.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,[ Parkwoods]
1,M4A,North York,[ Victoria Village]
2,M5A,Downtown Toronto,"[ Harbourfront, Regent Park]"
3,M6A,North York,"[ Lawrence Heights, Lawrence Manor]"
4,M7A,Queen's Park,[ Queen's Park]


In [36]:
df2.Neighbourhood=df2.Neighbourhood.apply(lambda x: ",".join(x))

In [37]:
df2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [38]:
df2=df2.sort_values("PostalCode").reset_index(drop=True)

In [39]:
df2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [40]:
df2.shape

(103, 3)

# Part II

# Loading Latitude and Longitude

In [43]:
cord=pd.read_csv(r"C:\Users\DEVIL\Downloads\Geospatial_Coordinates.csv")

In [44]:
cord.rename(columns={"Postal Code":"PostalCode"},inplace=True)
cord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [45]:
cord.shape

(103, 3)

In [46]:
df3=df2.merge(cord,on="PostalCode")

In [47]:
df3.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
