# Segmenting and Clustering Neighborhoods in Toronto - Part-1

## Scraping the Wikipedia page and transforming its data into a pandas DataFrame

#### Importing Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as req

#### Sending a request to the Wikipedia page using urllib

In [2]:
# Download the raw HTML of the Wikipedia page that lists the "M" postal codes.
# The context manager closes the HTTP response as soon as the body is read.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
with req.urlopen(url) as response:
    html = response.read()

#### Using Beautiful Soup to scrape the Wikipedia page and fetch the data

In [3]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html, features='html.parser')

FINDING THE TABLE

In [4]:
# Locate the first <table> whose class attribute is "wikitable sortable".
table = soup.find('table', class_='wikitable sortable')
# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in a later cell.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

Storing the data of each column in a separate list

In [5]:
# Collect the text of each table column into its own list:
#   pc    -> first cell of every data row
#   b     -> second cell
#   neigh -> third cell
pc = []
b = []
neigh = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are data rows; anything else
    # (presumably the header row) is skipped.
    if len(cells) == 3:
        # get_text() returns the cell's complete text, even when it is split
        # across several nodes (e.g. links), and '' rather than None for an
        # empty cell. Trailing newlines are kept; they are removed later.
        pc.append(cells[0].get_text())
        b.append(cells[1].get_text())
        neigh.append(cells[2].get_text())

## Converting the data into a DataFrame

In [6]:
# Assemble the three scraped lists into one DataFrame, one column per list.
df = pd.DataFrame(
    data={
        'Postalcode': pc,
        'Borough': b,
        'Neighborhood': neigh,
    }
)

In [7]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [8]:
# Remove the newline characters left over from the scraped cell text.
# regex=False makes the literal match explicit, so the result does not depend
# on the pandas version's default for `regex`.
for column in ['Postalcode', 'Borough', 'Neighborhood']:
    df[column] = df[column].str.replace('\n', '', regex=False)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Finding the total number of rows with "Not assigned" in the Borough column

In [9]:
# Count the rows whose Borough is the placeholder value "Not assigned".
df['Borough'].eq('Not assigned').sum()

77

#### removing rows with "Not assigned" in Borough column

In [10]:
# Keep only the rows whose Borough is something other than "Not assigned".
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]

Checking that no rows with "Not assigned" in the Borough column remain

In [11]:
# Re-count the "Not assigned" boroughs after filtering.
df['Borough'].eq('Not assigned').sum()

0

## Grouping of multiple Neighborhoods having the same postal code

In [12]:
# Collapse rows that share a (Postalcode, Borough) pair, joining the values of
# the remaining column into one ", "-separated string per group.
(
    df
    .groupby(['Postalcode', 'Borough'], as_index=False)
    .agg(', '.join)
)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [13]:
# Build the grouped table: one row per (Postalcode, Borough) pair, with the
# remaining column's values joined into a single ", "-separated string.
toronto_grp = (
    df
    .groupby(['Postalcode', 'Borough'], as_index=False)
    .agg(lambda names: ", ".join(names))
)
# Replace any '/' in the neighborhood names with ','. regex=False makes the
# literal match explicit, so the result does not depend on the pandas
# version's default for `regex`.
toronto_grp['Neighborhood'] = toronto_grp['Neighborhood'].str.replace('/', ',', regex=False)

In [14]:
# Preview the first five grouped rows.
toronto_grp.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [15]:
# (rows, columns) of the grouped table; rows = number of distinct
# (Postalcode, Borough) pairs produced by the groupby.
toronto_grp.shape

(103, 3)

In [16]:
# Persist the grouped table to disk without the integer row index.
toronto_grp.to_csv(path_or_buf='Toronto_1.csv', index=False)