In [25]:
import requests
import re
import json
from bs4 import BeautifulSoup


Y_MOVIE_URL = 'https://tw.movies.yahoo.com/movie_thisweek.html'

# Append "/id=MOVIE_ID" to any of the URLs below to reach that movie's page.
Y_INTRO_URL = 'https://tw.movies.yahoo.com/movieinfo_main.html'     # detailed information
Y_PHOTO_URL = 'https://tw.movies.yahoo.com/movieinfo_photos.html'   # stills
Y_TIME_URL = 'https://tw.movies.yahoo.com/movietime_result.html'    # showtimes


def get_web_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text, or ``None`` when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Keep the "return None on failure" contract for network errors too.
        print('Request failed:', url, err)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text


def get_movies(dom):
    """Parse the listing page markup into a list of movie dictionaries.

    Every ``div.release_info_text`` element yields one dictionary with the
    keys ``expectation``, ``ch_name``, ``eng_name``, ``movie_id``,
    ``poster_url``, ``release_date``, ``intro`` and ``trailer_url``.

    Args:
        dom: HTML text of the page.

    Returns:
        A list of dictionaries, one per movie, in document order.
    """
    soup = BeautifulSoup(dom, 'html5lib')
    results = []
    for info in soup.find_all('div', 'release_info_text'):
        # The title container holds the Chinese name, English name and link.
        title_box = info.find('div', 'release_movie_name')
        record = {
            'expectation': info.find('div', 'leveltext').span.text.strip(),
            'ch_name': title_box.a.text.strip(),
            'eng_name': title_box.find('div', 'en').a.text.strip(),
            'movie_id': get_movie_id(title_box.a['href']),
            # The poster lives in a sibling block preceding the info's parent.
            'poster_url': info.parent.find_previous_sibling('div', 'release_foto').a.img['src'],
            'release_date': get_date(info.find('div', 'release_movie_time').text),
            'intro': info.find('div', 'release_text').text.replace(u'詳全文', '').strip(),
        }
        # The second link in the button box is taken as the trailer link;
        # it may lack an href, in which case an empty string is stored.
        button_box = info.find_next_sibling('div', 'release_btn color_btnbox')
        trailer_link = button_box.find_all('a')[1]
        record['trailer_url'] = trailer_link.get('href', '')
        results.append(record)
    return results


def get_date(date_str):
    """Extract the first ``digits-digits-digits`` date from ``date_str``.

    Args:
        date_str: Text that may contain a date such as ``2017-03-23``,
            possibly preceded by a label.

    Returns:
        The matched date substring, or ``date_str`` unchanged when no
        date-like substring is present.
    """
    # e.g. "<label>：2017-03-23" -> "2017-03-23"
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    match = re.search(r'\d+-\d+-\d+', date_str)
    return date_str if match is None else match.group(0)


def get_movie_id(url):
    """Extract the numeric movie id from a movie page URL.

    Examples:
        ``https://movies.yahoo.com.tw/movieinfo_main.html/id=6981`` -> ``"6981"``
        ``https://movies.yahoo.com.tw/movie_thisweek.html`` -> returned as is

    Args:
        url: URL that may contain a ``/id=<digits>`` segment.

    Returns:
        The digits following ``/id=`` as a string, or ``url`` unchanged
        when the URL has no such segment.
    """
    # Raw string avoids the invalid "\d" escape; the group captures the
    # digits directly so no prefix needs stripping afterwards.
    match = re.search(r'/id=(\d+)', url)
    return url if match is None else match.group(1)


def get_trailer_url(url):
    """Strip the redirect prefix from a trailer link.

    Redirect links have the form ``<tracking prefix>*<target url>``, e.g.
    ``https://tw.rd.yahoo.com/referurl/movie/thisweek/trailer/*https://tw.movies.yahoo.com/video/x.html``.

    Args:
        url: Trailer link, with or without a ``*``-separated prefix.

    Returns:
        Everything after the first ``*``, or ``url`` unchanged when it
        contains no ``*`` (the link is already a direct URL).
    """
    # partition never raises and keeps any later "*" inside the target URL.
    _prefix, separator, target = url.partition('*')
    return target if separator else url


def main():
    """Fetch the listing page, print each movie and save all to movie.json.

    Does nothing when the page could not be downloaded.
    """
    html = get_web_page(Y_MOVIE_URL)
    if not html:
        return

    movie_list = get_movies(html)
    for item in movie_list:
        print(item)

    # ensure_ascii=False keeps non-ASCII titles readable in the output file.
    with open('movie.json', 'w', encoding='utf-8') as out:
        json.dump(movie_list, out, indent=2, sort_keys=True, ensure_ascii=False)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

{'expectation': '71%', 'ch_name': '星際叛將：歐西里斯之子', 'eng_name': 'The Osiris Child: Science Fiction Volume One', 'movie_id': '6985', 'poster_url': 'https://movies.yahoo.com.tw/x/r/w420/i/o/production/movies/December2017/zZtBpEFeIK4DdPP2yhfl-1984x2834.JPG', 'release_date': '2018-01-05', 'intro': "★《星際異攻隊2》《鋼鐵英雄》特效團隊，打造全新科幻史詩鉅作！\n\xa0\n★當《星際特工瓦雷諾：千星之城》強碰《越獄風雲》，激戰異星球！\n\xa0\n★“愛死這部片了！動作場面超屌、演技出色、特效更驚人” - Ain't It C...", 'trailer_url': 'https://movies.yahoo.com.tw/video/%E6%98%9F%E9%9A%9B%E5%8F%9B%E5%B0%87-%E6%AD%90%E8%A5%BF%E9%87%8C%E6%96%AF%E4%B9%8B%E5%AD%90-%E4%B8%AD%E6%96%87%E9%A0%90%E5%91%8A-130404298.html?movie_id=6985'}
{'expectation': '89%', 'ch_name': '最黑暗的時刻', 'eng_name': 'Darkest Hour', 'movie_id': '7015', 'poster_url': 'https://movies.yahoo.com.tw/x/r/w420/i/o/production/movies/December2017/A3q6bEXTwBI4zgcffBzo-5942x8800.JPG', 'release_date': '2018-01-05', 'intro': '一個讓人振奮和啟發的真實故事在第二次世界大戰前夕展開，在溫斯頓邱吉爾（奧斯卡金獎提名蓋瑞歐德曼 飾）成為英國首相之後的幾天內，就必須面對他最具爭議與挑戰性的考驗：該與德國納粹談判和平協定，亦或是堅守一個國家的立國理念，...', 't