# 載入所需套件

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent import futures
from tqdm import tqdm
import os
from urllib.request import urlretrieve

# 定義爬取個別IMDb電影資訊細節函數

In [2]:
def details_crawler(link_list, timeout=30):
    """Fetch one IMDb title page and return its overview element.

    Parameters
    ----------
    link_list : sequence
        Two-item sequence ``[link, i]``: the URL of the title page and the
        index the caller wants handed back alongside the result.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot block
        the calling thread forever. Defaults to 30 seconds.

    Returns
    -------
    tuple
        ``(i, details)`` where ``details`` is the first element matching
        the CSS selector ``div.heroic-overview`` in the fetched page.

    Raises
    ------
    IndexError
        If the fetched page contains no ``div.heroic-overview`` element.
    """
    link=link_list[0]
    i=link_list[1]
    r=requests.get(link,timeout=timeout)
    s=BeautifulSoup(r.text,'html.parser')

    overview=s.select('div.heroic-overview')
    if not overview:
        # Same exception type as indexing an empty list, but the message
        # tells the caller which page was missing the expected markup.
        raise IndexError('no div.heroic-overview element found at {}'.format(link))

    return i,overview[0]

# 定義爬取IMDb排名電影資訊函數

In [3]:
def imdb_crawler(url, max_workers=8, timeout=30):
    """Scrape an IMDb chart page and the detail page of every listed title.

    The chart page is parsed for rank, title, year, link and rating (one
    ``tbody>tr`` row per title). Every title link is then fetched through
    ``details_crawler`` on a thread pool, and the returned overview element
    is parsed for credits, summary, genre, release, runtime and poster.

    Parameters
    ----------
    url : str
        URL of the IMDb chart page.
    max_workers : int, optional
        Number of threads used to fetch the title pages. Defaults to 8.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` for the chart page request so a
        stalled connection cannot hang the call. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per chart entry with the columns
        '排名' (rank), '片名' (title, with ':' replaced by '-'),
        '發行年份' (digits extracted from the parenthesised year),
        '片長' (runtime), '類型' (first genre link), '分數' (rating, float),
        '演員' (stars), '導演' (director), '編劇' (writers),
        '首映' (release), '簡介' (summary), '網址' (title URL) and
        '照片' (poster image URL).

    Raises
    ------
    IndexError
        If a chart row or a title page lacks one of the expected elements.
    """
    response=requests.get(url,timeout=timeout)
    soup=BeautifulSoup(response.text,'html.parser')

    ranks,titles,years,links,points=[],[],[],[],[]
    for row in soup.select('tbody>tr'):
        # Select the title cell and its anchor once per row and reuse them.
        title_cell=row.select('td.titleColumn')[0]
        anchor=title_cell.select('a')[0]

        ranks.append(title_cell.text.replace('\n','').replace('  ','').split('.')[0])
        titles.append(anchor.text.replace(':','-'))
        years.append(title_cell.select('span')[0].text)
        links.append('https://www.imdb.com{}'.format(anchor['href']))
        points.append(float(row.select('td.ratingColumn.imdbRating')[0].text.replace('\n','')))

    count=len(links)
    summarys=[0]*count
    directors=[0]*count
    writers=[0]*count
    stars=[0]*count
    lengths=[0]*count
    types=[0]*count
    releases=[0]*count
    imgs=[0]*count

    # Each task carries its own index so the result can be written back to
    # the matching position.
    link_list=[[link,i] for i,link in enumerate(links)]

    # Visit every title link (thread-level concurrent tasks).
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results=list(tqdm(executor.map(details_crawler,link_list),total=len(link_list)))

    for i,details in results:
        # Credit items are read positionally: director, writers, stars.
        credits=details.select('div.credit_summary_item')
        subtext_links=details.select('div.subtext>a')

        directors[i]=credits[0].text.replace('\n','').replace('  ','').split(':')[1].split('|')[0]
        writers[i]=credits[1].text.replace('\n','').replace('  ','').split(':')[1].split('|')[0]
        stars[i]=credits[2].text.replace('\n','').split(':')[1].split('|')[0]
        summarys[i]=details.select('div.summary_text')[0].text.replace('\n','').replace('  ','')
        types[i]=subtext_links[0].text
        releases[i]=subtext_links[-1].text.replace('\n','')
        lengths[i]=details.select('time')[0].text.replace('\n','').replace('  ','')
        imgs[i]=details.select('img')[0]['src']

    df=pd.DataFrame({
        '排名':ranks,
        '片名':titles,
        '發行年份':years,
        '片長':lengths,
        '類型':types,
        '分數':points,
        '演員':stars,
        '導演':directors,
        '編劇':writers,
        '首映':releases,
        '簡介':summarys,
        '網址':links,
        '照片':imgs
    })

    # Raw string: '\(' and '\d' are invalid escapes in a normal string
    # literal and trigger a warning on recent Python versions.
    # expand=False returns a Series, matching the column being assigned.
    df['發行年份']=df['發行年份'].str.extract(r'\((\d+)\)',expand=False)

    return df

# 抓取IMDb電影排名資訊

In [4]:
# Scrape the IMDb Top 250 chart; `df` holds one row per ranked title.
df = imdb_crawler(
    'https://www.imdb.com/chart/top?ref_=nv_mv_250'
)

100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [01:48<00:00,  2.30it/s]


In [5]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,排名,片名,發行年份,片長,類型,分數,演員,導演,編劇,首映,簡介,網址,照片
0,1,The Shawshank Redemption,1994,2h 22min,Drama,9.2,"Tim Robbins, Morgan Freeman, Bob Gunton",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...",10 March 1995 (Taiwan),Two imprisoned men bond over a number of years...,https://www.imdb.com/title/tt0111161/,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,2,The Godfather,1972,2h 55min,Crime,9.1,"Marlon Brando, Al Pacino, James Caan",Francis Ford Coppola,"Mario Puzo (screenplay by), Francis Ford Coppo...",24 March 1972 (USA),The aging patriarch of an organized crime dyna...,https://www.imdb.com/title/tt0068646/,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,3,The Godfather- Part II,1974,3h 22min,Crime,9.0,"Al Pacino, Robert De Niro, Robert Duvall",Francis Ford Coppola,"Francis Ford Coppola (screenplay by), Mario Pu...",20 December 1974 (USA),The early life and career of Vito Corleone in ...,https://www.imdb.com/title/tt0071562/,https://m.media-amazon.com/images/M/MV5BMWMwMG...
3,4,The Dark Knight,2008,2h 32min,Action,9.0,"Christian Bale, Heath Ledger, Aaron Eckhart",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...",16 July 2008 (Taiwan),When the menace known as The Joker emerges fro...,https://www.imdb.com/title/tt0468569/,https://m.media-amazon.com/images/M/MV5BMTMxNT...
4,5,12 Angry Men,1957,1h 36min,Drama,8.9,"Henry Fonda, Lee J. Cobb, Martin Balsam",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)",10 April 1957 (USA),A jury holdout attempts to prevent a miscarria...,https://www.imdb.com/title/tt0050083/,https://m.media-amazon.com/images/M/MV5BMWU4N2...


# 利用分數條件篩選IMDb電影資訊

In [6]:
# Keep only titles rated 9 or higher and show the last (up to five) of them.
df.loc[df['分數'] >= 9].tail(n=5)

Unnamed: 0,排名,片名,發行年份,片長,類型,分數,演員,導演,編劇,首映,簡介,網址,照片
0,1,The Shawshank Redemption,1994,2h 22min,Drama,9.2,"Tim Robbins, Morgan Freeman, Bob Gunton",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...",10 March 1995 (Taiwan),Two imprisoned men bond over a number of years...,https://www.imdb.com/title/tt0111161/,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,2,The Godfather,1972,2h 55min,Crime,9.1,"Marlon Brando, Al Pacino, James Caan",Francis Ford Coppola,"Mario Puzo (screenplay by), Francis Ford Coppo...",24 March 1972 (USA),The aging patriarch of an organized crime dyna...,https://www.imdb.com/title/tt0068646/,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,3,The Godfather- Part II,1974,3h 22min,Crime,9.0,"Al Pacino, Robert De Niro, Robert Duvall",Francis Ford Coppola,"Francis Ford Coppola (screenplay by), Mario Pu...",20 December 1974 (USA),The early life and career of Vito Corleone in ...,https://www.imdb.com/title/tt0071562/,https://m.media-amazon.com/images/M/MV5BMWMwMG...
3,4,The Dark Knight,2008,2h 32min,Action,9.0,"Christian Bale, Heath Ledger, Aaron Eckhart",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...",16 July 2008 (Taiwan),When the menace known as The Joker emerges fro...,https://www.imdb.com/title/tt0468569/,https://m.media-amazon.com/images/M/MV5BMTMxNT...


# 抓取IMDb電影排名照片

In [7]:
def _safe_filename(name):
    """Return `name` with characters that are unsafe in file names replaced by '-'."""
    return ''.join('-' if ch in '\\/:*?"<>|' else ch for ch in name)


titles=df['片名'].tolist()
imgs=df['照片'].tolist()

# Create the output folder for the posters if it does not exist yet.
directory='IMDb電影劇照'
os.makedirs(directory,exist_ok=True)

# To keep the run short, only the posters of the top 10 titles are downloaded.
for title,img in zip(titles[:10],imgs[:10]):
    print(title)
    # Titles come from scraped text, so sanitize them before using them as
    # file names; a path separator or '?' would otherwise break the path.
    urlretrieve(img,os.path.join(directory,'{}.jpg'.format(_safe_filename(title))))

The Shawshank Redemption
The Godfather
The Godfather- Part II
The Dark Knight
12 Angry Men
Schindler's List
The Lord of the Rings- The Return of the King
Pulp Fiction
Huang hun san biao ke
Fight Club
