# 네이버 영화 크롤링 3

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
import math

In [None]:
def get_review_info(movie_nm):
    """Find a movie on Naver Movie and locate its review list.

    Searches Naver Movie for ``movie_nm``, takes the first link inside the
    ``search_list_1`` result list, extracts the movie code from that link and
    reads the total review count from the movie's review-list page.

    Args:
        movie_nm: Movie title to search for.

    Returns:
        A ``(review_url, num_pages)`` tuple. ``review_url`` is ``None`` when
        the search has no result list or the first link carries no movie
        code. ``num_pages`` is ``0`` in those cases and also when the review
        page has no ``score_total`` section; otherwise it is the review count
        divided by 10 (reviews per page), rounded up.
    """
    review_url, num_pages = None, 0

    # Let requests build the query string so that titles containing reserved
    # characters such as '&', '#' or '+' are escaped instead of corrupting it.
    search_resp = requests.get(
        "https://movie.naver.com/movie/search/result.nhn",
        params={"query": movie_nm, "section": "all", "ie": "utf8"},
    )
    search_html = BeautifulSoup(search_resp.content, 'html.parser')

    search_list = search_html.find('ul', {'class': 'search_list_1'})
    if search_list is None:  # the search returned no result for this title
        return review_url, num_pages

    # Take every digit after "code=" so longer codes are not truncated, and
    # treat a missing link or code as "no result" rather than raising.
    code_match = re.search(r'code=([0-9]+)', str(search_list.find('a')))
    if code_match is None:
        return review_url, num_pages

    movie_code = code_match.group(1)
    review_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=" + movie_code
    review_html = BeautifulSoup(requests.get(review_url).content, 'html.parser')
    review_score = review_html.find('div', {'class': 'score_total'})
    if review_score is not None:  # the movie has a review page (domestic release)
        count_text = review_score.find('strong').findChildren('em')[-1].getText()
        review_count = int(count_text.replace(',', ''))
        num_pages = math.ceil(review_count / 10)
    return review_url, num_pages

In [None]:
def get_movie_review_subset(movie_nm, url):
    """Scrape one page of a movie's review list into a DataFrame.

    Args:
        movie_nm: Movie title; stored in the ``movie_nm`` column of every row.
        url: URL of a single review-list page.

    Returns:
        A DataFrame with one row per ``<li>`` of the page's ``score_result``
        block and the columns ``movie_nm``, ``nickname``, ``review``,
        ``score``, ``like``, ``dislike``, ``created at`` and ``watch_movie``.
        An empty DataFrame is returned when the page has no ``score_result``
        block or the block has no items. Rows are numbered 0..n-1.
    """
    resp = requests.get(url)
    html = BeautifulSoup(resp.content, 'html.parser')
    score_result = html.find('div', {'class': 'score_result'})
    if score_result is None:
        # No review block on this page: nothing to collect.
        return pd.DataFrame()

    # Built once; strips line breaks and tabs from the review text.
    strip_whitespace = str.maketrans({"\n": "", "\r": "", "\t": ""})

    # Collect plain dicts and build the frame once instead of growing it
    # with pd.concat on every item.
    rows = []
    for li in score_result.findAll('li'):
        nickname_tag = li.findAll('a')[0].find('span')
        # The last <em> inside <dt> holds the timestamp text.
        created_at = datetime.strptime(li.find('dt').findAll('em')[-1].getText(), "%Y.%m.%d %H:%M")
        btn_likes = li.find('div', {'class': 'btn_area'}).findAll('strong')
        rows.append({
            "movie_nm": movie_nm,
            "nickname": nickname_tag.getText() if nickname_tag is not None else None,
            "review": li.find('p').getText().translate(strip_whitespace),
            "score": li.find('em').getText(),
            "like": btn_likes[0].getText(),
            "dislike": btn_likes[1].getText(),
            "created at": created_at,
            # True when the item carries an 'ico_viewer' marker span.
            "watch_movie": li.find('span', {'class': 'ico_viewer'}) is not None,
        })
    return pd.DataFrame(rows)

In [None]:
# Load the movie list (comma-separated, EUC-KR encoded); the file's own header
# row is replaced by the column names given here.
mv2019 = pd.read_csv(
    "~/MovieTrends/data/movie_list_200602.csv",
    encoding="EUC-KR",
    header=0,
    names=["Title", "Subtitle", "TitleEn", "OpenDate", "Count"],
)
# Parse the yyyymmdd release dates into pandas Timestamps.
mv2019["OpenDate"] = pd.Series(
    [pd.Timestamp(datetime.strptime(str(raw_date), '%Y%m%d')) for raw_date in mv2019.OpenDate]
)
# Only the title and release date are used from here on.
mv2019 = mv2019.loc[:, ["Title", "OpenDate"]]

In [61]:
# Crawl reviews for every title in mv2019, in chunks of `piece_size` movies.
# Each chunk ends up as one DataFrame in `movie_reviews_list`.
movie_reviews_list = []
piece_size = 100
# Pages hold 10 reviews each, so 10 pages caps collection at 100 reviews per
# movie. Raise this limit to collect more (or all) reviews.
max_pages_per_movie = 10
titles = mv2019["Title"].tolist()
for piece_idx in range(0, len(titles), piece_size):
    movie_list = titles[piece_idx:piece_idx + piece_size]
    piece_frames = []
    # enumerate gives each movie its true position in the list, without
    # rebuilding and scanning the title list per movie (which also reported
    # the wrong position for duplicate titles).
    for movie_idx, movie_nm in enumerate(movie_list, start=piece_idx):
        review_url, num_pages = get_review_info(movie_nm)
        print(movie_idx, "|", movie_nm, "|", num_pages, "pages |", review_url)
        for i in range(1, min(num_pages, max_pages_per_movie) + 1):
            piece_frames.append(get_movie_review_subset(movie_nm, review_url + '&page=' + str(i)))
    # One concat per chunk; pd.concat rejects an empty list, so fall back to
    # an empty frame when the chunk produced no review pages.
    movie_reviews = pd.concat(piece_frames, ignore_index=True) if piece_frames else pd.DataFrame()
    movie_reviews_list.append(movie_reviews)

598 | 퍼펙트 스트레인저 | 24 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=66498
599 | 퍼펙트 타겟 | 0 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=25134
600 | 퍼펙트맨 | 469 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=180390
601 | 평일 오후 3시의 연인 | 5 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=159743
602 | 포 핸즈 | 1 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=165111
603 | 포레스트 헌터스 워 | 0 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=190242
604 | 폭설 | 0 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=179305
605 | 폴라로이드 | 10 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=162203
606 | 프란치스코 교황: 맨 오브 히스 워드 | 15 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=174748
607 | 프렌드 존 | 0 pages | https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=71580
608 | 프렌즈: 둥지탈출 | 4 p

In [63]:
# Merge the per-chunk frames; ignore_index renumbers the rows so the combined
# frame (and the CSV written from it) has no repeated index labels.
movie_review_final = pd.concat(movie_reviews_list, ignore_index=True)

In [66]:
# Write the combined reviews to disk as CSV.
movie_review_final.to_csv(path_or_buf="~/MovieTrends/output/movie_review_final.csv")

---

### DEBUG

In [None]:
# Show the rows of the movie list whose title matches exactly.
mv2019.loc[mv2019["Title"] == "공포의 묘지"]

In [None]:
# Show rows at positions 40-44 of the movie list.
mv2019.iloc[40:45]

In [None]:
# Look up a single title by hand and print its page count and review URL.
movie_nm = '공포의 묘지: 망자의 저주'
review_url, num_pages = get_review_info(movie_nm)
print(movie_nm, " | ", num_pages, "pages | ", review_url)

In [None]:
# Collect every review page for the current movie_nm / review_url
# (unlike the crawl loop, there is no page cap here).
movie_review = pd.DataFrame()
for i in range(1, num_pages+1):
    #print(movie_nm + " | " + str(i) + "/" + str(num_pages))
    movie_review_subset = get_movie_review_subset(movie_nm, review_url + '&page=' + str(i))
    movie_review = pd.concat([movie_review, movie_review_subset])
# Append this movie's reviews to the `movie_reviews` frame already in the session.
movie_reviews = pd.concat([movie_reviews, movie_review])

In [None]:
# Cell-level copy of the body of get_movie_review_subset, so the intermediate
# values can be inspected one by one.
# NOTE(review): `url` is not assigned in any visible cell; presumably it was
# set by hand (e.g. to review_url + '&page=' + str(i)) -- confirm before running.
resp = requests.get(url)
html = BeautifulSoup(resp.content, 'html.parser')
score_result = html.find('div', {'class': 'score_result'})
lis = score_result.findAll('li')
lis_df = pd.DataFrame()
for li in lis:
    # Nickname is the <span> inside the first <a>; None when that span is missing.
    nickname = li.findAll('a')[0].find('span').getText() if li.findAll('a')[0].find('span') is not None else None
    # The last <em> inside <dt> holds the timestamp text.
    created_at = datetime.strptime(li.find('dt').findAll('em')[-1].getText(), "%Y.%m.%d %H:%M")
    # Strip line breaks and tabs from the review text.
    review_text = li.find('p').getText().translate(str.maketrans({"\n": "", "\r": "", "\t": ""}))
    score = li.find('em').getText()
    # The first two <strong> tags in 'btn_area' are read as like / dislike.
    btn_likes = li.find('div', {'class': 'btn_area'}).findAll('strong')
    like = btn_likes[0].getText()
    dislike = btn_likes[1].getText()
    # 'ico_viewer' span, or None when the item has no such marker.
    watch_movie = li.find('span', {'class':'ico_viewer'})
    
    # One single-row frame per item, appended to the running result.
    li_df = pd.DataFrame({"movie_nm": [movie_nm],
                          "nickname": [nickname],
                          "review": [review_text],
                          "score": [score],
                          "like": [like],
                          "dislike": [dislike],
                          "created at": [created_at],
                          "watch_movie": [watch_movie and True or False]})
    lis_df = pd.concat([lis_df, li_df])

In [None]:
# Re-run the page scraper with whatever movie_nm, review_url and i are
# currently set in the session.
get_movie_review_subset(movie_nm, review_url + '&page=' + str(i))

In [None]:
# Show the page URL built from the current review_url and i.
review_url + '&page=' + str(i)

In [None]:
# Combine the per-chunk review frames collected by the crawl loop.
# (The list is named `movie_reviews_list`; `movie_review_list` is not defined.)
movie_reviews = pd.concat(movie_reviews_list)

In [None]:
# Write the combined debug result to disk as CSV.
movie_reviews.to_csv(path_or_buf="~/MovieTrends/output/review_all.csv")

---

### FOR TEST

In [None]:
# Fetch and parse page 1 of the review list for a fixed movie code (136990).
review_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=136990&type=after&page=1"
resp = requests.get(review_url)
html = BeautifulSoup(resp.content, 'html.parser')

In [None]:
# Grab the <li> items inside the 'score_result' block of the parsed page.
score_result = html.find('div', {'class': 'score_result'})
lis = score_result.findAll('li')
# lis[0]

In [None]:
# Review text: text of the first <p> in the first list item.
review_text = lis[0].find('p').getText()
review_text

In [None]:
# Score: text of the first <em> in the first list item.
score = lis[0].find('em').getText()
score

In [None]:
# Like / dislike values taken from the 2nd and 4th <span> inside the
# 'btn_area' div of the first list item.
# NOTE(review): get_movie_review_subset reads these from <strong> tags
# instead -- confirm both selectors give the same values.
like = lis[0].find('div', {'class': 'btn_area'}).findAll('span')[1].getText()
dislike = lis[0].find('div', {'class': 'btn_area'}).findAll('span')[3].getText()
like, dislike

In [None]:
# Nickname: text of the <span> inside the first <a> of the first list item.
nickname = lis[0].findAll('a')[0].find('span').getText()
nickname

In [None]:
# Search Naver Movie for a title typed in by the user and keep the first
# link of the result list.
movie_nm = input("Enter movie name: ")
search_url = "https://movie.naver.com/movie/search/result.nhn"
# Pass the query through `params` so requests escapes reserved characters
# ('&', '#', '+') in the title instead of corrupting the query string.
search_resp = requests.get(search_url, params={"query": movie_nm, "section": "all", "ie": "utf8"})
search_html = BeautifulSoup(search_resp.content, 'html.parser')
search_list = search_html.find('ul', {'class': 'search_list_1'})
# a_tag is None when the search returns no result list (instead of raising
# AttributeError on the missing <ul>).
a_tag = search_list.find('a') if search_list is not None else None

In [None]:
# Display the link taken from the search results.
a_tag

In [None]:
# Extract the numeric movie code from the search-result link and build the
# URL of page 1 of that movie's review list.
# The pattern accepts any number of digits: movie codes are not always six
# digits long, and a fixed {6} fails to match the shorter ones.
re_movie = re.compile(r'code=([0-9]+)')
movie_code = re_movie.findall(str(a_tag))[0]
review_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=" + movie_code + "&type=after&page=1"
# `return` is only valid inside a function; at cell level a bare expression
# displays the value.
review_url