### Data Science Capstone Course

In [0]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [0]:
# Fetch the raw HTML of the Wikipedia page listing the postal codes of Canada starting with "M"
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Fail loudly on 4xx/5xx so an error page is never parsed as if it were the postal-code table
response.raise_for_status()
data = response.text

In [0]:
# Preview only the beginning of the HTML; showing the full page would flood the notebook output
data[:500]

In [0]:
# Parse the downloaded HTML text into a BeautifulSoup object using Python's built-in parser
soup = BeautifulSoup(data, features='html.parser')

In [0]:
# Preview only the start of the parsed document; the full tree is too large to display usefully
print(soup.prettify()[:1000])

In [0]:
# Collect every table row ('tr') of the first table on the page;
# the 'td' (table data) elements inside a row are its cells.
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; report that clearly instead of
    # failing with an AttributeError on the next call.
    raise ValueError("No <table> element found in the downloaded page")
rows = table.find_all('tr')
# Show just the first few rows rather than the whole table
rows[:5]

In [0]:
# Inspect one sample row to see how its cells are laid out
row = rows[5]
cells = row.find_all('td')
# Third cell of the sample row
cells[2]

<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td>

In [0]:
# Walk the table rows and collect the text of the first three cells of each row
# into three parallel lists (postal code, borough, neighbourhood).
postalCodeList = []
boroughList = []
neighborhoodList = []
for row in rows:
    cells = row.find_all('td')
    # Skip rows without data cells (e.g. a header row) and any row that has fewer
    # than the three cells read below, which would otherwise raise IndexError.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace/newlines from every cell, not just the last one,
    # so later exact comparisons such as == "Not assigned" are reliable.
    postalCodeList.append(cells[0].text.strip())
    boroughList.append(cells[1].text.strip())
    neighborhoodList.append(cells[2].text.strip())

In [0]:
# Assemble the three scraped lists into one table, one row per scraped table row
toronto_df = pd.DataFrame(
    list(zip(postalCodeList, boroughList, neighborhoodList)),
    columns=["PostalCode", "Borough", "Neighborhood"],
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [0]:
# Keep only the rows whose Borough is something other than "Not assigned".
# reset_index returns a new frame, so its result must be assigned for the
# renumbered index to actually be kept.
toronto_df_drop = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df_drop.head()

In [0]:
# Group the rows on the first two columns (postal code and borough)
toronto_df_drop_groupby = toronto_df_drop.groupby(by=["PostalCode", "Borough"], as_index=False)


In [0]:
# Combine the Neighborhood values of each (PostalCode, Borough) group into one
# comma-separated string.
# The grouping is rebuilt from toronto_df_drop here so that this cell gives the same
# result every time it is run: the name toronto_df_drop_groupby is rebound to the
# aggregated DataFrame below, and re-aggregating that DataFrame would join whole columns.
toronto_df_drop_groupby = (
    toronto_df_drop
    .groupby(["PostalCode", "Borough"], as_index=False)["Neighborhood"]
    .agg(", ".join)
)


In [0]:
# Size of the aggregated table as (rows, columns)
toronto_df_drop_groupby.shape

(103, 3)

In [0]:
# Wherever Neighborhood is "Not assigned", use the Borough value of the same row instead.
# A masked .loc assignment is used because writing to the row objects yielded by
# DataFrame.iterrows() is not guaranteed to change the underlying DataFrame.
not_assigned = toronto_df_drop_groupby["Neighborhood"] == "Not assigned"
toronto_df_drop_groupby.loc[not_assigned, "Neighborhood"] = toronto_df_drop_groupby.loc[not_assigned, "Borough"]

toronto_df_drop_groupby.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [0]:
# Boolean mask of the rows whose Neighborhood is identical to their Borough
na_neigh_rows = toronto_df_drop_groupby["Neighborhood"].eq(toronto_df_drop_groupby["Borough"])

In [0]:
# Show the rows selected by the mask
toronto_df_drop_groupby.loc[na_neigh_rows]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [0]:
# Re-check the table size as (rows, columns)
toronto_df_drop_groupby.shape

(103, 3)

In [0]:
# Download the coordinates CSV with requests instead of a shell call to wget, so the
# cell works on any platform and a failed download raises before the success message.
response = requests.get("http://cocl.us/Geospatial_data", timeout=30)
response.raise_for_status()
with open("toronto_coordinates.csv", "wb") as csv_file:
    csv_file.write(response.content)
print('Coordinates downloaded!')


Coordinates downloaded!


In [0]:
# Load the downloaded coordinates file into a DataFrame
coordinates = pd.read_csv('toronto_coordinates.csv')

In [89]:
# Print the (rows, columns) size, then display the first rows
print(coordinates.shape)
coordinates.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [90]:
# Rename the "Postal Code" column to "PostalCode"
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
# Attach the coordinate columns to the neighbourhood table by matching on PostalCode;
# a left join keeps every row of the neighbourhood table.
toronto_df_new = pd.merge(toronto_df_drop_groupby, coordinates, how="left", on="PostalCode")

In [92]:
# Preview the first rows of the merged table
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [0]:
# Column names used to create the check-list table
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]

In [98]:
# Create an empty table that has only the column headers
check_list_df = pd.DataFrame(columns=column_names)
check_list_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude


In [0]:
# Postal codes to look up, in the order they should appear in the result
check_postcodes = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
# Build the table with a single pd.concat instead of growing it with DataFrame.append
# inside a loop: append was removed in pandas 2.0, and rebuilding from scratch means
# re-running this cell does not add duplicate rows.
check_list_df = pd.concat(
    [toronto_df_new[toronto_df_new.PostalCode == postcode] for postcode in check_postcodes],
    ignore_index=True,
)

In [100]:
# Display the collected check-list rows
check_list_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
