**Segmenting and Clustering Neighborhoods in Toronto**


# Step 1


**Importing libraries**

In [218]:
import requests
import lxml.html as lh
import pandas as pd

*Scraping the data from the webpage*

In [219]:
# Wikipedia page holding the list of Canadian postal codes that start with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the table.
page = requests.get(url, timeout=30)
page.raise_for_status()
# Parse the HTML and collect every <tr> element in the document.
doc = lh.fromstring(page.content)
tr_elements = doc.xpath('//tr')


*Storing header and creating columns to store the data*

In [220]:
# Build one (header text, empty value list) pair per cell of the first row,
# printing each header with its 1-based position as it is read.
col = []
for position, header_cell in enumerate(tr_elements[0], start=1):
    header_text = header_cell.text_content()
    print('%d:%s' % (position, header_text))
    col.append((header_text, []))


1:Postal Code

2:Borough

3:Neighborhood



*preprocessing the data to convert it into dataframe*

In [221]:
# Walk the rows after the header (row 0) and append each cell's text to the
# matching value list in `col`.
for j in range(1, len(tr_elements)):
    T = tr_elements[j]
    # Stop at the first row that does not have exactly 3 cells.
    if len(T) != 3:
        break
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # Store purely numeric cells as int; any other text stays a string.
        # Only ValueError is caught so that unrelated failures (and Ctrl-C)
        # are not silently swallowed.
        try:
            data = int(data)
        except ValueError:
            pass
        col[i][1].append(data)


*Converting the list into dictionary and converting it into dataframe*

In [222]:
# `col` is a list of (title, values) pairs, so dict() turns it directly into
# a {title: values} mapping.
dic = dict(col)
# Build the DataFrame from that mapping and preview the first five rows.
df = pd.DataFrame(dic)
df.head()


Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


*Cleaning the data*

In [223]:
# Give the three columns their working names.
df.columns = ['Postcode', 'Borough', 'Neighbourhood']
# For every (Postcode, Borough) pair, join its neighbourhood strings with commas,
# then turn the group keys back into ordinary columns.
neighbourhoods_by_code = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
df = neighbourhoods_by_code.apply(','.join).reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,\n,Canadian postal codes\n,\n
1,M1A\n,Not assigned\n,Not assigned\n
2,M1B\n,Scarborough\n,"Malvern, Rouge\n"
3,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek\n"
4,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill\n"


In [224]:
# Trim leading and trailing whitespace from every Neighbourhood value.
df = df.assign(Neighbourhood=df['Neighbourhood'].str.strip())

In [225]:
# Delete every newline character found in any cell of the table.
df = df.replace(to_replace='\n', value='', regex=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,Canadian postal codes,
1,M1A,Not assigned,Not assigned
2,M1B,Scarborough,"Malvern, Rouge"
3,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
4,M1E,Scarborough,"Guildwood, Morningside, West Hill"


In [226]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,Canadian postal codes,
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn


In [227]:
# Find the index labels of rows with an empty-string postcode and drop them.
# The remaining rows keep their existing index labels.
blank_postcode_rows = df.index[df['Postcode'] == '']
df = df.drop(index=blank_postcode_rows)

In [228]:
# Preview the first rows of the current table.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


In [229]:
# Where the neighbourhood is 'Not assigned', use that row's postcode instead.
unassigned = df['Neighbourhood'] == 'Not assigned'
df['Neighbourhood'] = df['Neighbourhood'].mask(unassigned, df['Postcode'])

In [230]:
# Preview the first rows of the current table.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


In [231]:
# (rows, columns) of the cleaned table.
df.shape

(103, 3)

# Step 2


*Getting longitude and latitude data*

In [234]:
# Load the latitude/longitude table from the remote CSV and preview it.
geo_csv_url = 'http://cocl.us/Geospatial_data'
df1 = pd.read_csv(geo_csv_url)
df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


*changing the column name*

In [239]:
# Rename 'Postal Code' to 'Postcode' so the coordinate table shares its key
# column name with df. Renaming by name (instead of overwriting every label
# by position) cannot mislabel columns if their order in the CSV changes.
df1 = df1.rename(columns={'Postal Code': 'Postcode'})
df1.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [241]:
# Attach latitude/longitude to each postcode row by joining on 'Postcode'
# (pandas' default join type, which keeps only postcodes present in both tables).
df_new = df.merge(df1, on='Postcode')
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [244]:
# Write the merged table to a CSV file in the working directory.
output_csv = 'df_final.csv'
df_new.to_csv(output_csv)