In [5]:
import requests
from bs4 import BeautifulSoup
import re
import json
import time

In [6]:
def get_webpage_section(soup, tag, attr_type, val):
    """Look up elements in a parsed page by tag name plus an id or class value.

    Args:
        soup: Parsed document exposing ``find`` and ``find_all`` (the callers
            in this notebook pass a ``BeautifulSoup`` object).
        tag: Tag name to match, e.g. ``"div"``.
        attr_type: Which attribute ``val`` refers to: ``"id"`` or ``"class"``.
        val: The id or class value to match.

    Returns:
        For ``attr_type == "id"``: the result of ``soup.find(tag, id=val)``,
        i.e. a single element (or ``None`` when nothing matches).
        For ``attr_type == "class"``: the result of
        ``soup.find_all(tag, class_=val)``, i.e. a list-like of matches.
        Note that the two modes return different shapes.

    Raises:
        ValueError: If ``attr_type`` is neither ``"id"`` nor ``"class"``.
            (Previously this case fell through and failed with an
            ``UnboundLocalError`` on the return statement.)
    """
    if attr_type == "id":
        return soup.find(tag, id=val)
    if attr_type == "class":
        return soup.find_all(tag, class_=val)
    raise ValueError(
        "attr_type must be 'id' or 'class', got %r" % (attr_type,)
    )

In [7]:
def get_movies_url(menuUrl):
    """Collect [name, url] pairs from a paginated movie listing.

    Starting at ``menuUrl``, each page is downloaded and every
    ``div.release_movie_name`` contributes the text (whitespace removed) and
    ``href`` of its first anchor. The ``li.nexttxt`` anchor, when present, is
    followed to the next page.

    Args:
        menuUrl: URL of the first listing page.

    Returns:
        A list of ``[movieChiName, movieUrl]`` lists, in page order.
    """
    moviePages = []
    visitedUrls = set()
    # Stop if a "next" link ever leads back to a page already scanned;
    # otherwise a self-referencing link would loop forever.
    while menuUrl not in visitedUrls:
        visitedUrls.add(menuUrl)
        r = requests.get(menuUrl)
        soup = BeautifulSoup(r.text, "html.parser")
        for nameBox in get_webpage_section(soup, "div", "class", "release_movie_name"):
            anchors = nameBox.select("a")
            if not anchors:
                # A name box without a link carries no usable URL; skip it
                # instead of failing with IndexError.
                continue
            firstAnchor = anchors[0]
            movieChiName = re.sub(r'\s', '', firstAnchor.text)
            moviePages.append([movieChiName, firstAnchor['href']])

        # Scan every page: follow the "next page" link when there is one.
        nextPage = get_webpage_section(soup, "li", "class", "nexttxt")
        nextAnchor = nextPage[0].find("a") if nextPage else None
        if nextAnchor is None:
            break
        menuUrl = nextAnchor['href']
    return moviePages

In [22]:
def get_movies_info(movieInfo):
    """Download one movie page and extract its details.

    Args:
        movieInfo: Sequence whose first two items are the movie's Chinese
            name and the URL of its detail page, e.g.
            ``['自殺突擊隊：集結', 'https://movies.yahoo.com.tw/movieinfo_main/...']``.
            It is only read; the caller's list is no longer appended to.

    Returns:
        A new list ``[chineseName, englishName, categories, releaseDate, story]``.
        ``categories`` is a list of strings. Any piece that cannot be found
        on the page is reported as the string ``'None'`` (``['None']`` for
        categories when the category box is absent).
    """
    chiName, movieUrl = movieInfo[0], movieInfo[1]
    r = requests.get(movieUrl)
    soup = BeautifulSoup(r.text, "html.parser")

    # The English name (<h3>) and the release date (<span>) are both read
    # from the first "movie_intro_info_r" block, so look it up once.
    introSec = get_webpage_section(soup, "div", "class", "movie_intro_info_r")
    introBox = introSec[0] if introSec else None

    # English name
    engNameTag = introBox.find("h3") if introBox is not None else None
    engName = engNameTag.text if engNameTag is not None else 'None'

    # Category
    movieClassSec = get_webpage_section(soup, "div", "class", "level_name_box")
    if movieClassSec:
        categories = [re.sub(r'\s', '', movieCls.text)
                      for movieCls in movieClassSec[0].select("a")]
    else:
        categories = ['None']

    # Release date: the span text looks like "上映日期：2021-08-05"; dropping
    # the first five characters removes the label.
    relTimeTag = introBox.find("span") if introBox is not None else None
    releaseDate = relTimeTag.text[5:] if relTimeTag is not None else 'None'

    # Story introduction
    movieStory = get_webpage_section(soup, "span", "id", "story")
    story = re.sub(r'\s', '', movieStory.text) if movieStory is not None else 'None'

    return [chiName, re.sub(r'\s', '', engName), categories, releaseDate, story]
    # print(movieClass)
    # input()

In [9]:
def pack_to_json(movieInfoList):
    """Turn a list of movie records into a JSON-ready dictionary.

    Each record's first five positions are labelled, in order, as
    'Chinese Name', 'English Name', 'Movie Category', 'Release Date' and
    'Story Intro'. Positions past the fifth are ignored, and a shorter
    record simply yields fewer keys.

    Args:
        movieInfoList: List of per-movie records (lists).

    Returns:
        A dict keyed by the record's position as a string ("0", "1", ...),
        each value being that record's labelled dict.
    """
    fieldNames = (
        'Chinese Name',
        'English Name',
        'Movie Category',
        'Release Date',
        'Story Intro',
    )
    return {
        str(position): dict(zip(fieldNames, record))
        for position, record in enumerate(movieInfoList)
    }

In [10]:
def get_actors(movieList, ActorSet, maxActors=3000):
    """Collect the URLs of actors not seen before from a list of movie pages.

    For every movie, the page is downloaded and each anchor inside the first
    ``ul.starlist`` is examined. Actors are de-duplicated by their ``href``
    (previously the anchor element itself was used as the key, which made
    de-duplication depend on the anchor's complete markup).

    Args:
        movieList: Iterable of movie records whose second item is the movie
            page URL (``[name, url, ...]``).
        ActorSet: Set of actor URLs already seen. Updated in place with every
            newly discovered URL.
        maxActors: Stop visiting further movies once ``ActorSet`` holds at
            least this many entries. The check happens before each movie, so
            the final size can exceed the cap by one movie's cast.

    Returns:
        List of newly discovered actor URLs, in discovery order.
    """
    ActorsUrlList = []
    for movie in movieList:  # movie: [Name, Url]
        if len(ActorSet) >= maxActors:
            break
        r = requests.get(movie[1])
        soup = BeautifulSoup(r.text, "html.parser")
        # Actor URLs
        Actors = get_webpage_section(soup, "ul", "class", "starlist")
        if not Actors:
            continue
        for actor in Actors[0].find_all("a"):
            actorUrl = actor.get('href')
            # Skip anchors without a link (would have raised KeyError) and
            # actors that were already recorded.
            if actorUrl is None or actorUrl in ActorSet:
                continue
            ActorSet.add(actorUrl)
            ActorsUrlList.append(actorUrl)
    return ActorsUrlList

In [11]:
def get_movie_url_from_actors(actorsUrl, movieSet):
    """Fetch an actor page and return the movies on it not seen before.

    Every anchor inside each ``ul.trailer_list`` yields a
    ``(name, url)`` pair, where ``name`` is the anchor text with all
    whitespace removed. Pairs already present in ``movieSet`` are skipped.

    Args:
        actorsUrl: URL of the actor's page.
        movieSet: Set of ``(name, url)`` tuples already known. Updated in
            place with every newly found pair.

    Returns:
        List of ``[name, url]`` lists for the newly found movies, in page
        order.
    """
    response = requests.get(actorsUrl)
    page = BeautifulSoup(response.text, "html.parser")
    freshMovies = []
    for trailerBlock in get_webpage_section(page, "ul", "class", "trailer_list"):
        for link in trailerBlock.select("a"):
            key = (re.sub(r'\s', '', link.text), link['href'])
            if key in movieSet:
                continue
            movieSet.add(key)
            freshMovies.append(list(key))
    return freshMovies

In [23]:
def _expand_through_actors(seedMovies, ActorSet, movieSet, reportActorCount=False):
    """Discover more movies by following the actors of ``seedMovies``.

    Collects new actor URLs from the seed movies, gathers the unseen movies
    listed on each actor page (printing the running total after every actor),
    then downloads the details of every movie found.

    Args:
        seedMovies: Movie records (``[name, url, ...]``) to take actors from.
        ActorSet: Shared set of already-seen actors; updated in place.
        movieSet: Shared set of already-seen ``(name, url)`` tuples; updated
            in place.
        reportActorCount: When true, print ``len(ActorSet)`` after the actor
            URLs have been collected.

    Returns:
        ``(newMovies, newMovieInfos)``: the newly found ``[name, url]``
        records and the detail records obtained for them.
    """
    actorUrls = get_actors(seedMovies, ActorSet)
    if reportActorCount:
        print(len(ActorSet))
    newMovies = []  # movie URLs found on actor pages
    for url in actorUrls:
        newMovies += get_movie_url_from_actors(url, movieSet)
        print(str(len(newMovies)))
    newMovieInfos = [get_movies_info(m) for m in newMovies]
    return newMovies, newMovieInfos


def main():
    """Crawl the site's movie listings and write the results to movie_info.txt.

    Steps:
      1. Read the main menu of https://movies.yahoo.com.tw/ and keep the
         links whose text contains '本週新片', '上映中' or '即將上映'.
      2. Collect every movie on those listings and download its details.
      3. Expand twice through actor pages: actors of the known movies lead to
         more movies, whose actors lead to more movies again.
      4. Dump all collected records as JSON to ``movie_info.txt`` (UTF-8) and
         print the elapsed time.

    Raises:
        RuntimeError: If the home page has no ``ul#mainmenu`` element.
    """
    start = time.time()
    r = requests.get("https://movies.yahoo.com.tw/")
    soup = BeautifulSoup(r.text, "html.parser")
    menu = get_webpage_section(soup, "ul", "id", "mainmenu")
    if menu is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("could not find the 'mainmenu' element on the home page")

    mainmenu = dict()
    for s in menu.select("a"):
        for t in ['本週新片', '上映中', '即將上映']:
            if t in s.text:  # link belongs to one of the wanted categories
                mainmenu[t] = s['href']

    movieList = list()
    for movieUrl in mainmenu.values():
        movieList += get_movies_url(movieUrl)
    movieSet = set(tuple(movie) for movie in movieList)
    movieInfoList = [get_movies_info(m) for m in movieList]  # m: [name, url]

    ActorSet = set()
    # First level of expansion through actors' other works.
    movieUrlListFromActors, movieInfoListFromActors = _expand_through_actors(
        movieList, ActorSet, movieSet)
    # Second level of expansion, starting from the movies found above.
    _, movieInfoListFromActors_2 = _expand_through_actors(
        movieUrlListFromActors, ActorSet, movieSet, reportActorCount=True)

    movieJson = pack_to_json(
        movieInfoList + movieInfoListFromActors + movieInfoListFromActors_2)
    # The context manager closes the file even if serialisation fails.
    with open("movie_info.txt", "w", encoding="utf-8") as file:
        json.dump(movieJson, file, indent=4, ensure_ascii=False)

    end = time.time()
    print("Time: %f 秒" % (end - start))
# Run the whole crawl when this cell executes: main() performs the HTTP
# requests, prints progress counters and writes movie_info.txt.
main()

21
45
54
74
81
82
105
108
120
137
139
150
151
157
165
165
169
169
175
179
200
219
221
222
234
236
248
252
265
275
288
297
319
330
331
344
347
350
352
352
352
352
352
352
358
360
363
368
384
413
417
425
429
435
455
456
463
464
464
464
465
484
500
505
512
521
524
527
538
550
568
569
570
571
587
596
612
613
635
636
653
671
680
687
694
716
737
737
737
743
757
757
757
758
758
775
780
797
808
813
814
819
824
828
853
859
864
867
868
869
880
888
889
894
903
905
917
939
951
956
965
968
968
970
972
973
996
1006
1008
1013
1018
1024
1027
1029
1031
1034
1044
1067
1076
1079
1084
1085
1087
1090
1093
1093
1103
1112
1117
1120
1129
1140
1143
1146
1148
1166
1171
1186
1191
1191
1195
1195
1195
1198
1204
1209
1214
1218
1238
1261
1265
1265
1272
1274
1275
1286
1301
1302
1308
1326
1326
1331
1342
1344
1345
1345
1346
1348
1351
1352
1356
1365
1365
1365
1374
1397
1418
1420
1420
1423
1432
1444
1445
1447
1450
1451
1452
1459
1459
1459
1460
1465
1479
1485
1485
1485
1487
1490
1492
1514
1523
1523
1528
1529
1530
1534
153