<h4>Reading the table from the Wikipedia web page:</h4>

In [1]:
import pandas as pd

# Wikipedia page with the postal-code table used in the rest of the notebook.
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list of DataFrames; header=0 takes the first row as column names.
toronto_data = pd.read_html(WIKI_URL, header=0)

# Keep the first table from the returned list.
Toronto = toronto_data[0]

<h4>Looking at the result:</h4>

In [3]:
# Preview the first rows only instead of dumping the whole table into the output.
Toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [6]:
# (rows, columns) of the raw table; a bare last expression is displayed by the notebook.
Toronto.shape

(180, 3)


<h4>Replacing the "Not assigned" placeholders with missing values (NaN):</h4>

In [10]:
import numpy as np

# Turn the "Not assigned" placeholder cells into real missing values.
# Rebind rather than mutate in place: `Toronto` is the same object as
# `toronto_data[0]`, so an in-place replace would also alter the raw parsed table.
Toronto = Toronto.replace("Not assigned", np.nan)

In [11]:
# First five rows after the placeholder replacement.
Toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<h4>Removing rows with no value in the "Borough" column:</h4>

In [12]:
# Drop, in place, every row whose "Borough" is missing; the original row labels are kept.
Toronto.dropna(axis="index", subset=["Borough"], inplace=True)

In [13]:
# (rows, columns) after dropping rows without a borough; shown as the cell's result.
Toronto.shape

(103, 3)


In [14]:
# First five remaining rows (row labels are not renumbered by the drop).
Toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h4>Grouping the data frame by Postal Code in order to avoid duplicates:</h4>

In [19]:
# Collapse rows that share a postal code into a single row per code.
# `.sum()` on text columns glues duplicate values together with no separator
# (e.g. "ScarboroughScarborough"), so each column is aggregated explicitly:
#   Borough      - take the first value in the group;
#   Neighborhood - join the group's names with ", " (missing names are skipped).
# The `axis=0` argument is dropped: grouping rows is the default and the
# keyword is deprecated in recent pandas releases.
# "Postal Code" becomes the index of the result, as before.
neighborhood = Toronto.groupby("Postal Code").agg(
    {
        "Borough": "first",
        "Neighborhood": lambda names: ", ".join(names.dropna().astype(str)),
    }
)

In [20]:
# Preview the first grouped rows only instead of dumping the whole table.
neighborhood.head(10)

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
M1N,Scarborough,"Birch Cliff, Cliffside West"


In [21]:
# Move the "Postal Code" group key from the index back to a regular column.
# Guarded so that re-running this cell is a no-op instead of a KeyError:
# after the first run "Postal Code" is no longer an index level.
if "Postal Code" in neighborhood.index.names:
    neighborhood = neighborhood.reset_index("Postal Code")

<h4>Checking the result:</h4>

In [22]:
# Preview the first rows only; "Postal Code" should now appear as a column.
neighborhood.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


<h4>Check if we have missing data:</h4>

In [23]:
# Boolean mask with the same shape as `neighborhood`: True where a cell is missing.
missing_data = neighborhood.isna()
missing_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


<h4>We don't have missing data:</h4>

In [24]:
# For every column, print its name followed by how many cells are missing (True)
# and present (False), then an empty line as a separator.
for column, null_flags in missing_data.items():
    print(column)
    print(null_flags.value_counts())
    print()

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64



<h4>Shape of the resulting data frame:</h4>

In [25]:
# (rows, columns) of the grouped table; shown as the cell's result.
neighborhood.shape

(103, 3)


<h4>Reading geographical coordinates from a CSV file:</h4>

In [30]:
# Location of the CSV file with one latitude/longitude pair per postal code.
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'

geogr = pd.read_csv(GEOSPATIAL_CSV_URL)

In [31]:
# First five rows of the coordinates table.
geogr.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h4>Create a new data frame that will contain the combined information:</h4>

In [32]:
# Column layout of the combined table: the postal-code attributes first,
# followed by the two coordinate columns.
column_names = ['Postal Code', 'Borough', 'Neighborhood'] + ['Latitude', 'Longitude']

# Empty frame that carries only the headers.
# NOTE(review): the later merge cell rebinds `Toronto_neighborhoods`, so this
# placeholder only documents the intended layout.
Toronto_neighborhoods = pd.DataFrame(columns=column_names)

In [33]:
# Show the (still empty) frame: only the column headers are rendered.
Toronto_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude


<h4>Merge the first data frame "neighborhood" with the geographical coordinates data frame "geogr":</h4>

In [39]:
# Attach the coordinates to each postal code.
# The join key, join type and cardinality are spelled out instead of inferred:
#   on="Postal Code"       - do not depend on whichever column names happen to overlap;
#   how="left"             - keep every row of `neighborhood`; a code without coordinates
#                            then shows up as NaN in the missing-data check below
#                            instead of silently disappearing (the default is an inner join);
#   validate="one_to_one"  - raise if either table repeats a postal code, which
#                            would otherwise duplicate rows without warning.
Toronto_neighborhoods = neighborhood.merge(
    geogr,
    on="Postal Code",
    how="left",
    validate="one_to_one",
)


<h4>Verify the result:</h4>

In [40]:
# First five rows of the merged table.
Toronto_neighborhoods.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [41]:
# Last five rows of the merged table.
Toronto_neighborhoods.tail(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
102,M9W,Etobicoke,"Northwest, West Humber - Clairville",43.706748,-79.594054


In [42]:
# (rows, columns) of the merged table; shown as the cell's result.
Toronto_neighborhoods.shape

(103, 5)


<h4>Check if we have missing data:</h4>

In [47]:
# Boolean mask with the same shape as the merged table: True where a cell is missing.
missing_data = Toronto_neighborhoods.isna()
missing_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


<h4>No missing data:</h4>

In [48]:
# Per column: its name, the tally of missing (True) / present (False) cells,
# and a trailing empty line, each on its own line.
for column in list(missing_data.columns):
    flag_counts = missing_data[column].value_counts()
    print(column, flag_counts, "", sep="\n")

Postal Code
False    103
Name: Postal Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64

Latitude
False    103
Name: Latitude, dtype: int64

Longitude
False    103
Name: Longitude, dtype: int64

