# Peer-graded Assignment:
# Segmenting and Clustering Neighborhoods in Toronto

### Importing the required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

### Initializing the target web page to scrape

In [2]:
# Download the page. The timeout keeps the notebook from hanging indefinitely
# on an unresponsive host, and raise_for_status() stops here on an HTTP error
# instead of silently parsing an error page.
response = requests.get('http://www.wikizeroo.net/index.php?q=aHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvTGlzdF9vZl9wb3N0YWxfY29kZXNfb2ZfQ2FuYWRhOl9N', timeout=30)
response.raise_for_status()
source = response.text
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(source, 'lxml')

### Scraping the required table

In [3]:
# Locate the postal-code table by its CSS classes.
areas = soup.find('table', class_ = "wikitable sortable")
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the rows are iterated later.
if areas is None:
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")

### Writing the scraped table rows to a CSV file

In [4]:
# Write every three-cell table row to cms_scrape.csv.
# newline='' is required by the csv module so that line endings inside and
# between records are not translated by the platform; utf-8 matches the
# default encoding pandas uses when reading the file back. The with-block
# closes the file even if a row fails to write.
with open('cms_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header row: the column names of the resulting table
    csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])

    for row in areas.find_all('tr'):
        cells = row.find_all('td')
        # Rows without exactly three <td> cells (e.g. the <th> header row)
        # are skipped.
        if len(cells) != 3:
            continue
        # Cell text is written unchanged: postcode, borough, neighbourhood.
        csv_writer.writerow([cell.text for cell in cells])

### Exploring the data

In [5]:
# Load the scraped rows back from disk and preview the first five.
df = pd.read_csv('cms_scrape.csv')
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\r\n
1,M2A,Not assigned,Not assigned\r\n
2,M3A,North York,Parkwoods\r\n
3,M4A,North York,Victoria Village\r\n
4,M5A,Downtown Toronto,Harbourfront\r\n


In [6]:
# Preview the last rows of the dataframe.
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
284,M8Z,Etobicoke,Mimico NW\r\n
285,M8Z,Etobicoke,The Queensway West\r\n
286,M8Z,Etobicoke,Royal York South West\r\n
287,M8Z,Etobicoke,South of Bloor\r\n
288,M9Z,Not assigned,Not assigned\r\n


In [7]:
# Dimensions of the dataframe as (rows, columns).
df.shape

(289, 3)

### Cleaning the data

In [8]:
# Remove leading/trailing carriage-return and newline characters from every
# column (all columns are treated as strings), then preview the result.
stripped = df.apply(lambda column: column.str.strip('\r\n'))
df[df.columns] = stripped
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Ignoring the cells with a borough that is Not assigned.

In [9]:
# Keep only the rows whose borough has actually been assigned.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combining the neighbourhoods that share a postal code

In [10]:
# A postal code area can contain several neighbourhoods. For example, M5A
# appears twice in the table on the Wikipedia page, once for Harbourfront and
# once for Regent Park.
# Rows sharing the same postcode and borough are merged into a single row
# whose remaining values are joined with ', '.
by_postcode = df.groupby(['Postcode', 'Borough'])
df_agg = by_postcode.agg(', '.join)

### Setting unassigned neighbourhoods to the name of their borough

In [11]:
# If a row has a borough but a 'Not assigned' neighbourhood, the
# neighbourhood becomes the same as that row's own borough.
# The borough is a level of the index produced by the groupby, so it is
# looked up per row. Taking it from the first matching row only would give
# every unassigned row the same borough and would raise IndexError when no
# row is unassigned.
unassigned = df_agg['Neighbourhood'] == 'Not assigned'
borough_per_row = df_agg.index.get_level_values('Borough').to_series(index=df_agg.index)
df_agg['Neighbourhood'] = df_agg['Neighbourhood'].mask(unassigned, borough_per_row)

In [12]:
# Move the group keys held in the index back into ordinary columns.
df = df_agg.reset_index()

In [13]:
# Spot-check: display the rows whose borough is Queen's Park.
is_queens_park = df['Borough'] == "Queen's Park"
df[is_queens_park]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [14]:
# Dimensions of the cleaned dataframe as (rows, columns).
df.shape

(103, 3)

In [15]:
# Save the cleaned table to disk; the row index is written as the first
# CSV column.
df.to_csv('Toronto.csv', index=True)