# WEB SCRAPING
## INTRO
### 웹페이지 유형

정적(static) 웹페이지: 저장되어있는 html, css을 그대로 사용하여 렌더링이 항상 일정한 페이지 (e.g. 소개 페이지) \
동적(dynamic) 웹페이지: 클라이언트의 요청, 외부 변수에 따라 다르게 렌더링되는 페이지 (e.g. 마이 페이지)

### 크롤링 유형

||정적 크롤링|동적 크롤링|
|---|---|---|
|정보 수집 라이브러리|requests, urllib|selenium|
|html 파싱 라이브러리|bs4|bs4, selenium|
|페이지 조작|X|O|
|속도|빠름|느림||

## PRACTICE

### LIBRARY

In [27]:
!pip install joblib
!pip install bs4
!pip install selenium
!pip install requests
!pip install pandas
!pip install tqdm



In [28]:
import itertools
import datetime
import re
import time
import requests as rq
import pandas as pd
import numpy as np
import joblib as jl
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

### EXAMPLE 1 
네이버 영화 정보 (정적 크롤링)

#### 영화 랭킹 정보를 담고 있는 페이지 URL 준비

In [29]:
DATE = datetime.datetime.today().strftime('%Y%m%d')
NUM_PAGES_RANKING = 5
NUM_PAGES_COMMENT = 3

list_rank_page_url = []
for i in range(1, NUM_PAGES_RANKING + 1):
    url = f'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date={DATE}&page={i}'
    list_rank_page_url.append(url)

In [30]:
list_rank_page_url

['https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date=20220502&page=1',
 'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date=20220502&page=2',
 'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date=20220502&page=3',
 'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date=20220502&page=4',
 'https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&date=20220502&page=5']

#### 각 영화 페이지 URL 수집

In [31]:
def get_id_and_text(row):
    url = row.get_attribute_list('href')[0]
    if url == '#':
        onclick = row.get_attribute_list('onclick')[0]
        id = re.findall(r'\d{8}', onclick)[0]
    elif url == None:
        id = np.nan
    else:
        id = re.search(r'(?<=code=).*', url).group()
    title = row.text.strip()
    return (id, title)

list_total_movie = []
for page_url in tqdm(list_rank_page_url):
    response = rq.get(page_url)
    
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        list_row = soup.select('#old_content > table > tbody > tr > td.title > div > a')
        list_movie = list(map(lambda row: get_id_and_text(row), list_row))
        list_total_movie.extend(list_movie)
    else:
        print('error:', page_url)

df_id_title = pd.DataFrame(list_total_movie, columns=['id_movie', 'title_movie'])

  0%|          | 0/5 [00:00<?, ?it/s]

In [32]:
df_id_title

Unnamed: 0,id_movie,title_movie
0,191613,클라우스
1,171539,그린 북
2,174830,가버나움
3,186114,밥정
4,213746,장민호 드라마 최종회
...,...,...
245,10173,사랑과 영혼
246,36666,태극기 휘날리며
247,10018,이티
248,172980,12번째 솔저


#### 영화 정보 수집


In [33]:
def extract_text(list_tag):
    result = [tag.text.strip() for tag in list_tag]
    return result

def concat(list_tag):
    if list_tag == []:
        result = np.nan
    else:
        concat = ''.join(extract_text(list_tag))
        result = concat.strip()
    return result

def scrape_basic(id):
    page_url = f'https://movie.naver.com/movie/bi/mi/basic.naver?code={id}'
    response = rq.get(page_url)
    
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        
        title_kor = soup.select_one('#content > div.article > div.mv_info_area > div.mv_info > h3 > a').text
        title_eng = soup.select_one('#content > div.article > div.mv_info_area > div.mv_info > strong').text[:-6]
        
        score_audience = float(concat(soup.select('#actualPointPersentBasic > div > em')))
        score_critic = float(concat(soup.select('#content > div.article > div.mv_info_area > div.mv_info > div.main_score > div > div > a > div > em')))
        score_netizen = float(concat(soup.select('#pointNetizenPersentBasic > em')))

        # extract_text(soup.select('#content > div.article > div.mv_info_area > div.mv_info > dl > dd > p > span > a')) # bad case
        basic_info = soup.select('#content > div.article > div.mv_info_area > div.mv_info > dl > dd > p > span')
        genre = extract_text(basic_info[0].select('a'))
        nation = extract_text(basic_info[1].select('a'))
        run_time = basic_info[2].text.strip()
        try:
            href = basic_info[3].select('a')[-1].get_attribute_list('href')[0]
            day_opening = re.search(r'(?<=open=)\d{8}', href).group()
        except:
            day_opening = '00000000'
        
        info = [id, title_kor, title_eng, score_audience, score_critic, score_netizen, genre, nation, run_time, day_opening]
        return info
    else:
        print('error:', page_url)
        return

list_movie_basic = jl.Parallel(n_jobs=-1, prefer="threads")(jl.delayed(scrape_basic)(id) for id, title in tqdm(list_total_movie))

df_movie_basic = pd.DataFrame(list_movie_basic, columns=['id_movie', 'title_kor', 'title_eng', 'score_audience', 'score_critic', 'score_netizen', 'genre', 'nation', 'run_time', 'day_opening'])

  0%|          | 0/250 [00:00<?, ?it/s]

In [34]:
df_movie_basic.to_excel('df_movie_basic.xlsx')
df_movie_basic

Unnamed: 0,id_movie,title_kor,title_eng,score_audience,score_critic,score_netizen,genre,nation,run_time,day_opening
0,191613,클라우스,Klaus,,,9.81,"[애니메이션, 코미디, 가족]","[스페인, 영국]",96분,20191115
1,171539,그린 북,Green Book,9.55,7.29,9.60,[드라마],[미국],130분,20190109
2,174830,가버나움,"Capharnaum, Capernaum",9.54,7.33,9.59,[드라마],"[레바논, 프랑스]",126분,20190124
3,186114,밥정,The Wandering Chef,9.70,7.00,9.56,"[다큐멘터리, 드라마]",[한국],82분,20201007
4,213746,장민호 드라마 최종회,,9.89,,9.55,[공연실황],[한국],106분,20220124
...,...,...,...,...,...,...,...,...,...,...
245,10173,사랑과 영혼,Ghost,,6.00,9.23,"[멜로/로맨스, 드라마]",[미국],127분,19901124
246,36666,태극기 휘날리며,TaeGukGi: Brotherhood Of War,,,9.21,"[전쟁, 드라마, 액션]",[한국],145분,20040205
247,10018,이티,"The Extra-Terrestrial, E.T.",,9.50,9.22,"[SF, 판타지, 가족, 모험]",[미국],110분,19840623
248,172980,12번째 솔저,"Den 12. mann, The 12th Man",9.50,7.00,9.24,[전쟁],[노르웨이],135분,20190411


#### 배우/감독 정보 수집


In [35]:
def scrape_detail(id_movie):
    page_url = f'https://movie.naver.com/movie/bi/mi/detail.naver?code={id_movie}'
    response = rq.get(page_url)
    
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        
        list_actor = soup.select('#content > div.article > div.section_group.section_group_frst > div.obj_section.noline > div > div.lst_people_area.height100 > ul > li > div')
        list_actor_info = []
        for i in range(len(list_actor)):
            id_actor, name_kor = get_id_and_text(list_actor[i].find(class_='k_name'))
            name_eng = list_actor[i].find(class_='e_name').text
            part = list_actor[i].find(class_='p_part').text
            info_temp = [id_movie, id_actor, name_kor, name_eng, part]
            list_actor_info.append(info_temp)
        
        list_director = soup.select('#content > div.article > div.section_group.section_group_frst > div > div > div.dir_obj > div')
        list_director_info = []
        for i in range(len(list_director)):
            id_director, name_kor = get_id_and_text(list_director[i].find(class_='k_name'))
            name_eng  = list_director[i].find(class_='e_name').text
            info_temp = [id_movie, id_director, name_kor, name_eng]
            list_director_info.append(info_temp)
        
        return list_actor_info, list_director_info
    else:
        print('error:', page_url)
        return

list_movie_detail = jl.Parallel(n_jobs=-1, prefer="threads")(jl.delayed(scrape_detail)(id) for id, title in tqdm(list_total_movie))

list_actor, list_director = np.array(list_movie_detail).T
list_actor = list(itertools.chain.from_iterable(list_actor))
list_director = list(itertools.chain.from_iterable(list_director))

df_actor = pd.DataFrame(list_actor, columns=['id_movie', 'id_actor', 'name_kor', 'name_eng', 'part'])
df_director = pd.DataFrame(list_director, columns=['id_movie', 'id_actor', 'name_kor', 'name_eng'])

  0%|          | 0/250 [00:00<?, ?it/s]

  list_actor, list_director = np.array(list_movie_detail).T


In [36]:
df_actor.to_excel('df_actor.xlsx')
df_actor

Unnamed: 0,id_movie,id_actor,name_kor,name_eng,part
0,191613,4688,제이슨 슈왈츠먼,Jason Schwartzman,주연
1,191613,4651,J.K. 시몬스,J.K. Simmons,주연
2,191613,50339,라시다 존스,Rashida Jones,주연
3,191613,144698,윌 사쏘,Will Sasso,조연
4,191613,444550,네다 마르그레테 라바,Neda Margrethe Labba,조연
...,...,...,...,...,...
2577,190244,368851,조세핀 자피,Josephine Japy,주연
2578,190244,377233,벤자민 라베른헤,Benjamin Lavernhe,조연
2579,190244,344748,까밀리에 를르쉐,Camille Lellouche,조연
2580,190244,407140,아마우리 드 크레양쿠르,Amaury de Crayencour,조연


In [37]:
df_director.to_excel('df_director.xlsx')
df_director

Unnamed: 0,id_movie,id_actor,name_kor,name_eng
0,191613,180101,서지오 파블로스,Sergio Pablos
1,191613,435871,카를로스 마르티네즈 로페즈,Carlos Martinez Lopez
2,171539,2049,피터 패럴리,Peter Farrelly
3,174830,130535,나딘 라바키,Nadine Labaki
4,186114,429911,박혜령,Park Hye-ryoung
...,...,...,...,...
277,10173,6067,제리 주커,Jerry Zucker
278,36666,2369,강제규,Kang Je Kyu
279,10018,1495,스티븐 스필버그,Steven Spielberg
280,172980,531,해럴드 즈워트,Harald Zwart


#### 평점/리뷰 수집

In [38]:
def extract_comment(list_tag):
    result = []
    for tag in list_tag:
        if tag.select_one('span > a'):
            tag = tag.select_one('span > a')
            text = tag.get_attribute_list('data-src')[0].strip()
        else:
            text = tag.text.strip()
        result.append(text)
    return result

def scrape_comment(id_movie, num_of_pages=10):
    list_score = []
    list_comment = []
    for i in range(1, num_of_pages + 1):
        page_url = f'https://movie.naver.com/movie/bi/mi/pointWriteFormList.naver?code={id_movie}&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page={i}'
        response = rq.get(page_url)
        
        if response.status_code == 200:
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            
            list_score_temp = extract_text(soup.select('body > div > div > div.score_result > ul > li > div.star_score > em'))
            list_comment_temp = extract_comment(soup.find_all(id=re.compile(r'_filtered_ment_\d')))
            
            list_score.extend(list_score_temp)
            list_comment.extend(list_comment_temp)
        else:
            print('error:', page_url)
    list_id_movie = [id_movie] * len(list_score)
    return list_id_movie, list_score, list_comment

list_movie_comment = jl.Parallel(n_jobs=-1, prefer="threads")(jl.delayed(scrape_comment)(id, NUM_PAGES_COMMENT) for id, title in tqdm(list_total_movie))

list_movie_comment[0]
list_id_movie = np.array(list_movie_comment)[:, 0]
list_score = np.array(list_movie_comment)[:, 1]
list_comment = np.array(list_movie_comment)[:, 2]

list_id_movie = list(itertools.chain.from_iterable(list_id_movie))
list_score = list(itertools.chain.from_iterable(list_score))
list_comment = list(itertools.chain.from_iterable(list_comment))

len(list_id_movie) == len(list_score) == len(list_comment)

df_comment = pd.DataFrame({'id_movie': list_id_movie, 'score': list_score, 'comment': list_comment})


  0%|          | 0/250 [00:00<?, ?it/s]

In [39]:
df_comment.to_excel('df_comment.xlsx')
df_comment

Unnamed: 0,id_movie,score,comment
0,191613,10,이 영화 완내스!!! 박박!! 절레절레동화!! 허니잼 꿀잼 개꿀!
1,191613,10,완벽합니다....눈물줄줄 흘렸습니다....겨울에 알맞은 영화...
2,191613,10,생각없이 봤다가 크게 감동받은 크리스마스 선물 같은 영화
3,191613,10,인류애 되찾는 영화.. 연말영화로 추천합니다!
4,191613,10,가슴이 왠지 먹먹해지는 행복한 영화였어요
...,...,...,...
7495,190244,10,넘나 달달하고 아련하고 ㅜㅜ.. 겨울에 보기 딱 좋은 로코였음!
7496,190244,10,"프랑스 영화가 처음이라 낯설 것 같았지만, 생각보다 개그 요소도 많고 특이한 설정덕..."
7497,190244,10,제 인생 영화가 되었어요.
7498,190244,10,평행세계에서도 같은 사람과 다시 사랑에? 보는 내내 부럽더라
