# Capstone Project Week 3 Assignment - Toronto Segmentation and Clustering

# Assignment Part 1

### (1) Use Beautiful Soup to scrape data from the Wikipedia page

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np

In [2]:
# Wikipedia page to scrape: "List of postal codes of Canada: M".
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Fetch the page and parse it with BeautifulSoup using the 'lxml' parser.
# The `with` block closes the HTTP response once parsing is done.
with urlopen(url) as html_file:
    soup = BeautifulSoup(html_file, 'lxml')

### Scrape the data and put it into a DataFrame
"Not assigned" cells are converted to `np.nan` so they are easy to filter out later.

In [4]:
    # Locate the postal-code table and turn every data row into a list of cell
    # texts, replacing the "Not assigned" placeholder with NaN.
    table = soup.find('table', class_='wikitable sortable')
    if table is None:
        # Fail with a clear message instead of an AttributeError on None when
        # the page no longer contains a table matching the selector.
        raise ValueError("Could not find a 'wikitable sortable' table on the page")

    all_rows = []
    for row in table.find_all('tr'):
        thisrow = []
        for column in row.find_all('td'):
            val = column.get_text().strip()
            thisrow.append(np.nan if val == 'Not assigned' else val)

        # Rows without any <td> cell (e.g. a header row made of <th> cells)
        # yield an empty list and are skipped.
        if thisrow:
            all_rows.append(thisrow)

    df = pd.DataFrame(all_rows, columns=['Postal Code', 'Borough', 'Neighborhood'], dtype=str)
    df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### (2) Clean the data in the dataframe
#### Remove rows whose Borough is not assigned (row counts shown before and after)

In [5]:
# Discard every row whose Borough is missing, reporting the frame's shape
# immediately before and after the drop.
print(f"Rows before drop : {df.shape}")
df.dropna(axis=0, subset=['Borough'], inplace=True)
print(f"Rows after drop : {df.shape}")

Rows before drop : (288, 3)
Rows after drop : (211, 3)


#### Rows with an unassigned neighborhood get the borough name as their neighborhood (if the borough is assigned), e.g. M7A

In [6]:
# Show the rows that have a Borough but no Neighborhood.
df.loc[df['Neighborhood'].isna() & df['Borough'].notna()]

Unnamed: 0,Postal Code,Borough,Neighborhood
8,M7A,Queen's Park,


In [7]:
# Rows that have a borough but no neighborhood.
thisCondition = df['Neighborhood'].isnull() & df['Borough'].notnull()
# Index labels of those rows, taken straight from the index so the labels keep
# their own dtype (no cast to int).
rowsToChange = df.index[thisCondition]
# Copy the borough name into the missing neighborhood cells with a single
# boolean-mask .loc on each side, avoiding chained indexing.
df.loc[thisCondition, 'Neighborhood'] = df.loc[thisCondition, 'Borough']

In [8]:
# Confirm the fill: show the row(s) for postal code M7A.
df.loc[df['Postal Code'].eq('M7A')]

Unnamed: 0,Postal Code,Borough,Neighborhood
8,M7A,Queen's Park,Queen's Park


In [9]:
# Combine neighborhoods that share a postal code; M5A is the example shown
# here before the rows are combined.
print("Before combine neighborhood for same postal code")
df.loc[df['Postal Code'] == 'M5A']

Before combine neighborhood for same postal code


Unnamed: 0,Postal Code,Borough,Neighborhood
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park


In [10]:
# Collapse the frame to one row per (Postal Code, Borough) pair, joining the
# neighborhood names of each group into one comma-separated string.
newdf = (
    df.groupby(['Postal Code', 'Borough'])['Neighborhood']
      .apply(lambda names: ', '.join(names.astype(str)))
      .reset_index()
)

print("After combine neighborhood for same postal code")
newdf.loc[newdf['Postal Code'] == 'M5A']

After combine neighborhood for same postal code


Unnamed: 0,Postal Code,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


## Shape after cleaning data

In [11]:
# Preview the first rows of the cleaned, grouped frame.
newdf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Dimensions (rows, columns) of the cleaned frame.
newdf.shape

(103, 3)

# Assignment Part 2

In [None]:
### Geocoder didn't quite work, so the coordinates are read from the provided CSV file instead

In [13]:
# Download the postal-code coordinates CSV with the standard library instead of
# shelling out to wget: this works where wget is not installed, and a failed
# download raises here instead of being followed by a misleading success message.
with urlopen('http://cocl.us/Geospatial_data') as response, open('geospatial.csv', 'wb') as out_file:
    out_file.write(response.read())
print ("Data Downloaded")

Data Downloaded


In [15]:
# Load the downloaded CSV of per-postal-code coordinates into a DataFrame.
df_location  = pd.read_csv('geospatial.csv')

In [16]:
# Preview the first rows of the coordinates frame.
df_location.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [None]:
### Merge the Toronto neighborhood dataframe with the location dataframe (left join on Postal Code)

In [17]:
# Left join: keep every row of newdf and attach the coordinates that share
# its 'Postal Code'.
df_merged = newdf.merge(df_location, how='left', on='Postal Code')

In [19]:
# Dimensions (rows, columns) of the merged frame.
df_merged.shape

(103, 5)

In [20]:
# Preview the first rows of the merged frame.
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
