# 成為資料分析師 | Python 與資料科學應用

> 網頁資料擷取：隨堂練習參考解答

## 郭耀仁

In [None]:
import requests
from lxml import etree
from io import BytesIO
from bs4 import BeautifulSoup
import time
import random

## 隨堂練習：2019-2020 球季 NBA 有幾支球隊？

In [None]:
def number_of_nba_teams(request_url):
    """
    Count the NBA franchises listed in a teams JSON feed.

    Sends a GET request to ``request_url``, decodes the JSON body, reads the
    team records stored under ``["league"]["standard"]`` and counts the
    records whose ``isNBAFranchise`` value is truthy.

    >>> number_of_nba_teams("http://data.nba.net/prod/v2/2019/teams.json")
    30
    """
    payload = requests.get(request_url).json()
    standard_league = payload["league"]["standard"]
    # One tally per record flagged as an NBA franchise.
    return sum(1 for team in standard_league if team["isNBAFranchise"])

## 隨堂練習：divName 為 Atlantic 與 Southwest 的球隊有哪些？

In [None]:
def find_atlantic_southwest_teams(request_url):
    """
    Group team full names by division name.

    Sends a GET request to ``request_url``, decodes the JSON body and walks
    the team records stored under ``["league"]["standard"]``. Each record's
    ``fullName`` is appended to the list kept under its ``divName``.

    The returned dict has one key per ``divName`` found in the feed (not only
    ``'Atlantic'`` and ``'Southwest'``); names keep the order in which the
    feed lists them.

    >>> atlantic_southwest_teams = find_atlantic_southwest_teams("http://data.nba.net/prod/v2/2019/teams.json")
    >>> atlantic_southwest_teams['Atlantic']
    ['Boston Celtics', 'Brooklyn Nets', 'New York Knicks', 'Philadelphia 76ers', 'Toronto Raptors']
    >>> atlantic_southwest_teams['Southwest']
    ['Dallas Mavericks', 'Houston Rockets', 'Memphis Grizzlies', 'New Orleans Pelicans', 'San Antonio Spurs']
    """
    response = requests.get(request_url)
    teams = response.json()["league"]["standard"]
    team_dict = {}
    for team in teams:
        # setdefault creates the division's list on first sight, then reuses it.
        team_dict.setdefault(team["divName"], []).append(team["fullName"])
    return team_dict

## 隨堂練習：擷取台北市所有 7-11 商店資訊

In [None]:
def get_tpe_711_stores(request_url):
    """
    Collect 7-11 store records for every town returned for city id "01".

    First POSTs a ``GetTown`` command to ``request_url`` and reads every
    ``TownName`` element from the XML reply. Then, for each town, POSTs a
    ``SearchStore`` command and reads the ``POIID``, ``POIName``, ``X``,
    ``Y`` and ``Address`` elements from the reply. ``X`` and ``Y`` are
    divided by 1000000 and stored as ``Longitude`` and ``Latitude``.

    Returns a dict mapping each town name to a list of store dicts. The
    function sleeps a random 1-5 seconds after each town's request.

    >>> tpe_711_stores = get_tpe_711_stores("https://emap.pcsc.com.tw/EMapSDK.aspx")
    >>> tpe_711_stores["松山區"][0]
    {'POIID': '170945', 'POIName': '上弘', 'Longitude': 121.548287390895, 'Latitude': 25.056390968531797, 'Address': '台北市松山區敦化北路168號B2'}
    >>> tpe_711_stores["信義區"][0]
    {'POIID': '167651', 'POIName': '一零一', 'Longitude': 121.565077, 'Latitude': 25.033373, 'Address': '台北市信義區信義路五段7號35樓'}
    >>> tpe_711_stores["大安區"][0]
    {'POIID': '153319', 'POIName': '大台', 'Longitude': 121.53261437826, 'Latitude': 25.0179598345753, 'Address': '台北市大安區羅斯福路三段283巷14弄16號1樓'}
    """
    field_names = ("POIID", "POIName", "Longitude", "Latitude", "Address")

    # Step 1: ask the service for the list of towns.
    town_query = {"commandid": "GetTown", "cityid": "01"}
    town_reply = requests.post(request_url, data=town_query)
    town_xml = etree.parse(BytesIO(town_reply.content))
    town_names = [node.text for node in town_xml.xpath("//TownName")]

    # Step 2: query the stores town by town.
    stores_by_town = {}
    for town in town_names:
        store_query = {"commandid": "SearchStore", "city": "台北市", "town": town}
        store_reply = requests.post(request_url, data=store_query)
        store_xml = etree.parse(BytesIO(store_reply.content))
        # One list per field, in the same order as field_names; zipping them
        # pairs up the n-th value of every field into one store record.
        columns = (
            [node.text.strip() for node in store_xml.xpath("//POIID")],
            [node.text for node in store_xml.xpath("//POIName")],
            [float(node.text) / 1000000 for node in store_xml.xpath("//X")],
            [float(node.text) / 1000000 for node in store_xml.xpath("//Y")],
            [node.text for node in store_xml.xpath("//Address")],
        )
        stores_by_town[town] = [dict(zip(field_names, row)) for row in zip(*columns)]
        # Random pause between requests.
        time.sleep(random.randint(1, 5))
    return stores_by_town

## 隨堂練習：以 `requests` 搭配 `bs4` 擷取 [Avengers: Endgame (2019)](https://www.imdb.com/title/tt4154796) 的劇情類型

In [None]:
def find_endgame_genre(request_url):
    """
    Scrape the genre labels from a movie page.

    Sends a GET request to ``request_url``, parses the HTML and collects the
    text of every element matching the CSS selector ``.subtext a``. The last
    match is dropped and the remaining texts are returned as a list. An empty
    list is returned when the selector matches nothing.

    >>> find_endgame_genre("https://www.imdb.com/title/tt4154796")
    ['Action', 'Adventure', 'Drama']
    """
    response = requests.get(request_url)
    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser is installed, so results can differ across machines.
    # "lxml" is already a dependency of this file.
    soup = BeautifulSoup(response.text, "lxml")
    link_texts = [e.text for e in soup.select(".subtext a")]
    # Drop the trailing link (presumably the release date, not a genre --
    # verify against the page). Slicing, unlike pop(), is safe on an empty list.
    return link_texts[:-1]

## 隨堂練習：以 `requests` 搭配 `bs4` 擷取 [Avengers: Endgame (2019)](https://www.imdb.com/title/tt4154796) 的演員陣容

In [None]:
def find_endgame_cast(request_url):
    """
    Scrape the cast names from a movie page.

    Sends a GET request to ``request_url``, parses the HTML and returns the
    whitespace-stripped text of every element matching the CSS selector
    ``.primary_photo+ td a``, in document order.

    >>> find_endgame_cast("https://www.imdb.com/title/tt4154796")
    ['Robert Downey Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Don Cheadle', 'Paul Rudd', 'Benedict Cumberbatch', 'Chadwick Boseman', 'Brie Larson', 'Tom Holland', 'Karen Gillan', 'Zoe Saldana', 'Evangeline Lilly']
    """
    response = requests.get(request_url)
    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser is installed, so results can differ across machines.
    # "lxml" is already a dependency of this file.
    soup = BeautifulSoup(response.text, "lxml")
    return [e.text.strip() for e in soup.select(".primary_photo+ td a")]

In [None]:
%load ../test_cases/test_cases_01.py