### PART 1: Scrap the Webpage "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" and create a cleaned dataframe

In [1]:
import pandas as pd # library for data analysis

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)

In [3]:
# Get first table from the webpage into a DataFrame                                                                                                       
df = dfs[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [4]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

In [5]:
# instantiate the dataframe for cleaning
neighborhoods = pd.DataFrame(columns=column_names)

In [6]:
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


In [7]:
# Create a dataframe with assigned Borough only
print(len(df))
for i in range(len(df)):   
    if df['Borough'][i] != "Not assigned":
        neighborhoods = neighborhoods.append({'PostalCode': df['Postal Code'][i],
                                              'Borough': df['Borough'][i],
                                              'Neighborhood': df['Neighbourhood'][i]},
                                              ignore_index=True)

180


In [8]:
neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
for i in range(len(neighborhoods)):
    if neighborhoods['Neighborhood'][i] == "Not assigned":
        print(i) # there are no Neighborhoods with the value "Not assigned" in the dataframe. Otherwise we would assign the same value as Borough using
        # neighborhoods['Neighborhood'][i] == neighborhoods['Borough'][i]

In [10]:
neighborhoods.shape

(103, 3)

### PART 2: Dataframe with latitude and the longitude coordinates of each neighborhood

In [11]:
# As Geocoder package could not produce reliable results, I'm using a csv file that has the geographical coordinates of each postal code
neighborhoods2 = pd.read_csv('http://cocl.us/Geospatial_data')

In [12]:
#Sort both dataframes the same way
neighborhoods2.sort_values(by='Postal Code', ascending=True)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [13]:
neighborhoods.sort_values(by='PostalCode', ascending=True)

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae
...,...,...,...
64,M9N,York,Weston
70,M9P,Etobicoke,Westmount
77,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
89,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [14]:
# Append latitude and longitude information to the neighborhoods dataframe

neighborhoods['Latitude']=neighborhoods2['Latitude']
neighborhoods['Longitude']=neighborhoods2['Longitude']
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.706876,-79.518188
99,M4Y,Downtown Toronto,Church and Wellesley,43.696319,-79.532242
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.688905,-79.554724
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.739416,-79.588437
