# Part 1 - Scraping Data from Wikipedia with BeautifulSoup & Pandas

In [37]:
from bs4 import BeautifulSoup
import requests

# Scrape the Wikipedia "postal codes of Canada: M" page into a list of dicts
# (one dict per assigned postal code) stored in `table_contents`.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on a hung connection or an HTTP error page instead of parsing it.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

# The first <table> on the page is used as the postal-code grid; look it up once.
table = soup.find('table')
tables = table  # kept so any later reference to `tables` still resolves
if table is None:
    raise ValueError('No <table> element found at {}'.format(url))

table_contents = []

for row in table.findAll('td'):
    # Cells without the expected <p>/<span> markup cannot be parsed; skip them
    # rather than raising AttributeError on `None`.
    if row.span is None or row.p is None:
        continue

    label = row.span.text
    if label == 'Not assigned':
        continue

    # `label` looks like "Borough(Neighborhood / Neighborhood)".
    parts = label.split('(')
    if len(parts) > 1:
        neighborhood = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised neighborhood list: fall back to the borough text
        # instead of raising IndexError.
        neighborhood = parts[0]

    cell = {}
    cell['PostalCode'] = row.p.text[:3]  # first three characters of the <p> text
    cell['Borough'] = parts[0]
    cell['Neighborhood'] = neighborhood
    table_contents.append(cell)

# print(table_contents)

In [89]:
import pandas as pd

# Turn the scraped records into a frame and pin the column order for display.
column_order = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(table_contents).reindex(column_order, axis='columns')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [68]:
# Sanity check: (number of rows, number of columns) of `df`.
df.shape

(103, 3)

In [67]:
# Borough labels to normalise: keys are the raw values found in `df['Borough']`,
# values are the cleaned-up names that replace them.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

# Import Latitude & Longitude

In [103]:
# Load the postal-code coordinates and rename the key column so it matches
# the "PostalCode" column used by `df`.
reading = (
    pd.read_csv("/Users/wesley/Downloads/Geospatial_Coordinates.csv")
    .rename(columns={"Postal Code": "PostalCode"})
)
reading.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [102]:
# Attach Latitude/Longitude to each postal code. This is the default inner join,
# so postal codes missing from either frame are dropped.
merge = df.merge(reading, on="PostalCode")
merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# Explore and cluster the neighborhoods in Toronto

In [104]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/wesley/anaconda3

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    ca-certificates-2020.12.5  |       h033912b_0         138 KB  conda-forge
    certifi-2020.12.5          |   py37hf985489_1         143 KB  conda-forge
    conda-4.10.1               |   py37hf985489_0         3.0 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1k             |       h0d85af4_0         1.9 MB  conda-forge
    python_abi-3.7             |          1_cp37m           4 KB  conda-forge
    vincent-0.4.4              |        

In [106]:
# pip install folium
import folium

# Keep only the rows whose Neighborhood text contains "Toronto".
# NOTE(review): this matches on the neighborhood name, not the Borough column —
# confirm that is the intended filter.
is_toronto = merge["Neighborhood"].str.contains("Toronto")
mapping = merge[is_toronto]
mapping.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
36,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
42,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
73,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
80,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
88,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores",43.605647,-79.501321


In [110]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs
print('Libraries imported.')

Libraries imported.


In [107]:
!pip install geopy # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/0c/67/915668d0e286caa21a1da82a85ffe3d20528ec7212777b43ccd027d94023/geopy-2.1.0-py3-none-any.whl (112kB)
[K    100% |████████████████████████████████| 112kB 821kB/s ta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-2.1.0
The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [813]:
# Create the map, centred on the current `latitude` / `longitude` values.
map_can = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of `mapping`, with a "<neighborhood>, <borough>" popup.
marker_rows = zip(mapping['Latitude'], mapping['Longitude'], mapping['Borough'], mapping['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_can)

map_can


In [1035]:
import json

# Load the MRT station entrance GeoJSON and build a frame of
# (Names, Latitudes, Longtitudes), one row per feature.
url = "/Users/wesley/Downloads/臺北捷運車站出入口座標.geojson"

# Read as UTF-8 explicitly: the property keys are Chinese, and relying on the
# platform default encoding breaks on systems where that default is not UTF-8.
with open(url, "r", encoding="utf-8") as read_file:
    data = json.load(read_file)
data1 = data["features"]

# The original prepended each record (list.insert(0, ...)), so the resulting
# order is the reverse of the file order; reversed() keeps that order without
# the quadratic inserts. Using a comprehension also avoids overwriting the
# global `latitude` set by the geocoding cell.
records = [
    (
        feature["properties"]["出入口名稱"],
        feature["properties"]["緯度"],
        feature["properties"]["經度"],
    )
    for feature in reversed(data1)
]

df = pd.DataFrame(records, columns=['Names', 'Latitudes', 'Longtitudes'])

# Single-column frames kept under their original names; the Taipei map cell
# reads latitudes["Latitudes"] and longtitudes["Longtitudes"].
names = df[['Names']]
latitudes = df[['Latitudes']]
longtitudes = df[['Longtitudes']]
df.head(50)


Unnamed: 0,Names,Latitudes,Longtitudes
0,新北產業園區站出口,25.061548,121.459926
1,幸福站出口2,25.049545,121.460593
2,幸福站出口1,25.050126,121.460146
3,新埔民生站出口,25.026125,121.466839
4,板橋站出口5,25.015502,121.464312
5,板橋站出口4,25.015105,121.464938
6,板新站出口,25.014491,121.472212
7,中原站出口,25.00841,121.484159
8,橋和站出口,25.004413,121.490461
9,中和站出口,25.001981,121.496052


In [1030]:
# pd.DataFrame.from_records(element)

# Reverse `element` so that its last entry comes first, then take the "items"
# list of that entry.
items = list(element)[::-1]

test = items[0]["items"]

# Print name, coordinates and first category of every venue, separated by a blank line.
for hit in test:
    venue = hit["venue"]
    print(venue["name"])
    print(venue["location"]["lat"], venue["location"]["lng"])
    print(venue["categories"][0]["name"], end="\n\n")  # category

Starbucks (星巴克)
24.947804803639915 121.3797005513654
Coffee Shop

RT-Mart (大潤發)
24.963831659246708 121.43333496060222
Shopping Mall

Carrefour (家樂福土城福益店)
24.983403055110664 121.45835235824546
Supermarket

TRA Shulin Station (臺鐵樹林車站)
24.99143224005543 121.42456077548228
Train Station

山泉水手工豆花
24.934984722860726 121.37279130195732
Dessert Shop

摩斯漢堡 MOS Burger
24.946252259039774 121.38279075368085
Fast Food Restaurant

Starbucks (星巴克)
24.988516671458232 121.42204382108581
Coffee Shop

誠品書店 Eslite Bookstore
24.99702702799921 121.45265579223631
Bookstore

阿城鵝肉 土城總店
24.981874821003018 121.4569265697076
Chinese Restaurant

厚切牛排
24.99521436078072 121.45150008206743
Steakhouse

Amo Cafe 阿默輕食蛋糕
24.964896455931804 121.43826633378494
Café

星巴克
24.986246 121.464665
Coffee Shop

STARBUCKS (星巴克)
24.978663 121.444332
Coffee Shop

Starbucks (星巴克)
24.94434837382297 121.37744450286719
Coffee Shop

7-Eleven
24.96769282304793 121.43737909313374
Convenience Store

7-ELEVEN 承天門市
24.96668274251929 121.436034

In [1064]:
# Load the open-data CSV; the Taipei map cell reads its Latitudes, Longtitudes,
# people_total and site_id columns.
opendata = "/Users/wesley/Downloads/opendata108N010.csv"
read_opendata = pd.read_csv(filepath_or_buffer=opendata)
read_opendata


Unnamed: 0,site_id,people_total,area,population_density,Latitudes,Longtitudes
0,台北市松山區,204193.0,9.29,21985.0,25.0542,121.5639
1,台北市信義區,220021.0,11.21,19631.0,25.0409,121.572
2,台北市大安區,307631.0,11.36,27077.0,25.0249,121.5434
3,台北市中山區,227387.0,13.68,16619.0,25.0792,121.5427
4,台北市中正區,158014.0,7.61,20772.0,25.0421,121.5199
5,台北市大同區,126043.0,5.68,22185.0,25.0627,121.5113
6,台北市萬華區,187076.0,8.85,21133.0,25.0263,121.497
7,台北市文山區,271806.0,31.51,8626.0,24.9929,121.5713
8,台北市南港區,120297.0,21.84,5507.0,25.0312,121.6112
9,台北市內湖區,285795.0,31.58,9050.0,25.0689,121.5909


In [1067]:
# Map of Taipei with four layers: MRT entrances, a city-centre disc,
# the venues in `test`, and one population-scaled circle per row of `read_opendata`.
TAIPEI_CENTER = (25.0478, 121.5170)  # used for both the map centre and the centre marker

m = folium.Map(TAIPEI_CENTER, zoom_start=12)

# MRT entrances: zip pairs each latitude with its longitude.
for entrance_lat, entrance_lng in zip(latitudes["Latitudes"], longtitudes["Longtitudes"]):
    folium.CircleMarker(
        location=[entrance_lat, entrance_lng],
        radius=20,
        popup='MRT System in Taipei',
        color='#078ccc',
        fill=True,
        fill_color='#078ccc'
    ).add_to(m)

# Large disc marking the map centre.
folium.CircleMarker(
    location=list(TAIPEI_CENTER),
    radius=150,
    popup='Center of the City',
    color='#FFFFFF',
    fill=True,
    fill_color='#FF60AF'
).add_to(m)

# Venues: the popup shows the actual venue name. The original passed the
# expression as a quoted string, so every popup showed the literal source text.
# parse_html=True matches the Toronto map cell and keeps names with quotes or
# non-ASCII characters from breaking the generated HTML.
for hit in test:
    venue = hit["venue"]
    folium.CircleMarker(
        location=[venue["location"]["lat"], venue["location"]["lng"]],
        radius=10,
        popup=folium.Popup(venue["name"], parse_html=True),
        color='#9F4D95',
        fill=True,
        fill_color='#9F4D95'
    ).add_to(m)

# Districts: radius is people_total / 10000; the popup shows the site_id value
# (the original passed the literal string 'g').
district_rows = zip(
    read_opendata["Latitudes"],
    read_opendata["Longtitudes"],
    read_opendata["people_total"],
    read_opendata["site_id"],
)
for district_lat, district_lng, people_total, site_id in district_rows:
    folium.CircleMarker(
        location=[district_lat, district_lng],
        radius=people_total / 10000,
        popup=folium.Popup(str(site_id), parse_html=True),
        color='#FF60AF',
        fill=True,
        fill_color='#FF60AF'
    ).add_to(m)

m

In [1039]:
# Displays the current value of `url1`.
# NOTE(review): in the visible notebook `url1` is only assigned inside the
# commented-out loop below, so this cell raises NameError on a fresh kernel — confirm.
# SECURITY: the Foursquare client id/secret that used to be hardcoded in the
# commented-out URL have been replaced with environment-variable lookups;
# the previously committed credentials should be rotated.
url1

# df["Latitudes"]
# df["Longtitudes"]
# url2 = []


# for a, b in zip(df["Latitudes"], df["Longtitudes"]):   
#     url1 = "https://api.foursquare.com/v2/venues/explore?client_id=" + os.environ["FOURSQUARE_CLIENT_ID"] + "&client_secret=" + os.environ["FOURSQUARE_CLIENT_SECRET"] + "&v=20170511&ll=" + str(a) + "," + str(b)
#     url2.insert(0, url1)
    

# import requests # HTTP request library
# from bs4 import BeautifulSoup # HTML parsing library
# import re
    
# url3 = []
# ii = len(url2)
# for iii in range(ii):
#     uu = requests.get(url2[iii])
#     soup = BeautifulSoup(uu.text,'html.parser')
#     # ele = soup.find('pre')
#     print(soup)

    
# #     url3.insert(0, url2[iii])


# # import json
# # with open(url3[1], "r") as read_file1:
# #     data55 = json.load(read_file1)
# #     print(data55)
    
    
#     #     data1 = data["features"]


# # reading1 = []
# # for iiii in url3:  
# #     reading = pd.read_json(iiii)


# # reading1 = []
# # for iiii in url3:
# #     reading = pd.read_json(iiii)
    
    
    
# #     reading1.insert(0, reading)
    
    
# # print(reading1)

# #     reading1.insert(0, reading)
# # reading1
    

# # for i in url3:
# #     reading = pd.read_json(url3[i])
# #     print(reading)

    
    
# #     element = reading["response"].loc["groups"]
# #     element1 = pd.read_json(element) 




'https://api.foursquare.com/v2/venues/explore?client_id=<REDACTED>&client_secret=<REDACTED>&v=20170511&ll=25.0144914,121.4722124'