# Toronto Capstone Project

## Part 1 of Assignment

#### Importing and processing data

In [50]:
import io
import urllib.request

import bs4 as bs4
import lxml.html as lh
import numpy as np
import pandas as pd
import requests

In [42]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
link = requests.get(url, timeout=30)
link.raise_for_status()  # stop on an HTTP error instead of parsing an error page
soup = bs4.BeautifulSoup(link.content, 'lxml')
# The postal-code table is taken to be the first <table> element on the page.
table = soup.find_all('table')[0]
# Wrap the markup in StringIO: newer pandas releases deprecate passing a
# literal HTML string to read_html.
tor = pd.read_html(io.StringIO(str(table)))
# read_html already returns DataFrames, so the first one is used directly;
# no JSON round trip is needed.
toronto = tor[0]

In [68]:
# Keep only the three columns the rest of the notebook works with.
toronto = toronto.loc[:, ['Postal Code', 'Borough', 'Neighbourhood']]
toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Getting the original size of the data set

In [69]:
# Number of rows before any cleaning.
len(toronto)

180

#### Replacing "Not assigned" values with NaN and dropping the rows that contain NaN

In [70]:
# Treat the placeholder text "Not assigned" as missing data. replace() returns
# a new frame, so the name is rebound instead of mutating the frame in place.
toronto = toronto.replace("Not assigned", np.nan)
toronto.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [71]:
# Drop every row that has a missing value in any column.
toronto2 = toronto.dropna(axis=0, how='any')

In [72]:
# Preview the first five rows that survived the drop.
toronto2.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [112]:
# (rows, columns) of the cleaned table.
len(toronto2.index), len(toronto2.columns)

(103, 3)

#### Checking for duplicates in Postal Code in order to merge values of neighbourhoods

In [113]:
# Flag each row whose postal code already appeared in an earlier row
# (the first occurrence is not flagged).
toronto2.duplicated(subset=['Postal Code'])

2      False
3      False
4      False
5      False
6      False
8      False
9      False
11     False
12     False
13     False
14     False
17     False
18     False
20     False
21     False
22     False
23     False
26     False
27     False
30     False
31     False
32     False
36     False
39     False
40     False
41     False
45     False
46     False
47     False
48     False
49     False
50     False
54     False
55     False
56     False
57     False
58     False
59     False
63     False
64     False
65     False
66     False
67     False
68     False
72     False
73     False
74     False
75     False
76     False
77     False
80     False
81     False
82     False
83     False
84     False
85     False
86     False
89     False
90     False
91     False
92     False
93     False
94     False
95     False
98     False
99     False
100    False
102    False
103    False
104    False
107    False
108    False
109    False
111    False
112    False
113    False
114    False

#### Checking for missing values in the Neighbourhood column (a missing neighbourhood would be set to its borough)

In [114]:
# True if at least one neighbourhood entry is missing.
toronto2['Neighbourhood'].isna().to_numpy().any()

False

#### Final size of the data set (no further merging of neighbourhood values was needed)

In [115]:
# Report the dimensions of the cleaned table.
size = toronto2.shape
row_count, column_count = size
print('The row size of this clean data set is now: ', row_count, 'and there are', column_count, 'columns.')

The row size of this clean data set is now:  103 and there are 3 columns.


## Part 2 of Assignment


#### Retrieving geo data

In [124]:
# Load the CSV of postal-code coordinates from its URL.
geo_csv_url = 'http://cocl.us/Geospatial_data'
geo = pd.read_csv(geo_csv_url)

In [125]:
# Preview the first five rows of the coordinate table.
geo.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Creating the new dataframe and merging both data sets

In [126]:
# Copy the three descriptive columns of the cleaned table into their own frame
# in a single step; the row index of toronto2 is carried over unchanged.
neighborhoods = toronto2[['Postal Code', 'Borough', 'Neighbourhood']].copy()

neighborhoods.shape

(103, 3)

In [127]:
# Attach coordinates to each postal code. The join type is spelled out, and
# validate makes pandas raise a MergeError if 'Postal Code' is duplicated on
# either side instead of silently multiplying rows.
merged_df = pd.merge(
    neighborhoods,
    geo,
    on='Postal Code',
    how='inner',
    validate='one_to_one',
)

#### Resulting merged data set

In [131]:
# Preview the first five rows of the merged table.
merged_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [129]:
# (rows, columns) of the merged table.
len(merged_df.index), len(merged_df.columns)

(103, 5)