# Naver Movie Review Crawling

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
import math

In [None]:
def get_movie_code(movie_nm):
    """Look up the Naver Movie code of the first search hit for a title.

    Args:
        movie_nm: Movie title used as the search query.

    Returns:
        The run of digits following ``code=`` in the first link of the
        ``search_list_1`` result list, as a string.  ``None`` when the page
        has no such list, the list has no link, or the link carries no code.
    """
    # Let requests build the query string so that titles containing
    # characters such as '&', '#' or '+' are percent-encoded instead of
    # being interpreted as URL syntax.
    search_resp = requests.get(
        "https://movie.naver.com/movie/search/result.nhn",
        params={"query": movie_nm, "section": "all", "ie": "utf8"},
    )
    search_html = BeautifulSoup(search_resp.content, 'html.parser')

    result_list = search_html.find('ul', {'class': 'search_list_1'})
    if result_list is None:  # the search returned no movie results
        return None

    first_link = result_list.find('a')
    if first_link is None:
        return None

    # Capture the whole digit run so a long code is never truncated, and
    # return None (rather than raising IndexError) when no code is present.
    code_match = re.search(r'code=([0-9]+)', str(first_link))
    return code_match.group(1) if code_match else None

In [None]:
def get_review_info(movie_nm):
    """Resolve the review-list URL and the number of review pages for a title.

    Args:
        movie_nm: Movie title, passed to ``get_movie_code``.

    Returns:
        A ``(review_url, num_pages)`` tuple:

        * ``(None, 0)`` when ``get_movie_code`` finds no code;
        * ``(review_url, 0)`` when the review page has no ``score_total``
          block;
        * otherwise ``num_pages`` is the review count shown on the page
          divided by 10 and rounded up.
    """
    code = get_movie_code(movie_nm)
    if code is None:  # no search result for this movie
        return (None, 0)

    review_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=" + code
    page = BeautifulSoup(requests.get(review_url).content, 'html.parser')

    total_box = page.find('div', {'class': 'score_total'})
    if total_box is None:  # the movie has no review page
        return (review_url, 0)

    # The last <em> inside <strong> holds the count, written with thousands
    # separators (e.g. "1,234").
    count_text = total_box.find('strong').findChildren('em')[-1].getText()
    review_count = int(count_text.replace(',', ''))
    return (review_url, int(math.ceil(review_count / 10)))

In [None]:
def get_review_page(movie_nm, url):
    """Scrape one page of a Naver Movie review list into a DataFrame.

    Args:
        movie_nm: Movie title copied into every row's ``movie_nm`` column.
        url: URL of the review-list page to download.

    Returns:
        A ``pandas.DataFrame`` with one row per ``<li>`` inside the page's
        ``score_result`` block and the columns ``movie_nm``, ``nickname``,
        ``review``, ``score``, ``like``, ``dislike``, ``created_at`` and
        ``watch_movie``.  A field whose markup is missing is ``None``
        (``watch_movie`` is ``False``).  An empty DataFrame is returned when
        the page has no ``score_result`` block or the block has no items.

    Raises:
        ValueError: If a timestamp does not match ``"%Y.%m.%d %H:%M"``.
    """
    resp = requests.get(url)
    html = BeautifulSoup(resp.content, 'html.parser')
    score_result = html.find('div', {'class': 'score_result'})
    if score_result is None:
        return pd.DataFrame()

    strip_ws = str.maketrans({"\n": "", "\r": "", "\t": ""})

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for li in score_result.findAll('li'):
        anchor = li.find('a')
        name_span = anchor.find('span') if anchor is not None else None
        nickname = name_span.getText() if name_span is not None else None

        # The timestamp is the last <em> inside the item's <dt>.
        dt_tag = li.find('dt')
        date_ems = dt_tag.findAll('em') if dt_tag is not None else []
        created_at = (datetime.strptime(date_ems[-1].getText(), "%Y.%m.%d %H:%M")
                      if date_ems else None)

        text_tag = li.find('p')
        review_text = text_tag.getText().translate(strip_ws) if text_tag is not None else None

        score_tag = li.find('em')
        score = score_tag.getText() if score_tag is not None else None

        # First <strong> in the button area is the like count, second the
        # dislike count; either may be absent.
        btn_area = li.find('div', {'class': 'btn_area'})
        counts = btn_area.findAll('strong') if btn_area is not None else []
        like = counts[0].getText() if len(counts) > 0 else None
        dislike = counts[1].getText() if len(counts) > 1 else None

        watch_movie = li.find('span', {'class': 'ico_viewer'}) is not None

        rows.append({"movie_nm": movie_nm,
                     "nickname": nickname,
                     "review": review_text,
                     "score": score,
                     "like": like,
                     "dislike": dislike,
                     "created_at": created_at,
                     "watch_movie": watch_movie})
    return pd.DataFrame(rows)

In [None]:
def get_score(movie_nm):
    """Fetch the three ratings shown on a movie's basic-info page.

    Args:
        movie_nm: Movie title, passed to ``get_movie_code``.

    Returns:
        A list of three strings ``[watcher, expert, netizen]`` read from the
        page's ``main_score`` block:

        * watcher - ``<em>`` tags of the ``star_score`` div inside
          ``a.ntz_score``;
        * expert - ``<em>`` tags of the ``star_score`` div inside
          ``div.spc_score_area``;
        * netizen - ``<em>`` tags inside ``a#pointNetizenPersentWide``.

        Each string is the concatenation of the single characters (digits,
        ``.``, ``,`` or space) found as the sole content of those tags.  An
        entry is ``''`` when its markup is missing; all three are ``''`` when
        no movie code or no ``main_score`` block is found.
    """
    score = ['', '', '']

    movie_code = get_movie_code(movie_nm)
    if movie_code is None:  # no search result for this movie
        return score

    score_url = "https://movie.naver.com/movie/bi/mi/basic.nhn?code=" + movie_code
    score_resp = requests.get(score_url)
    score_html = BeautifulSoup(score_resp.content, 'html.parser')
    score_main = score_html.find('div', {'class': 'main_score'})
    if score_main is None:
        return score

    def _star_ems(container):
        """Return the <em> tags of container's star_score div, or []."""
        if container is None:
            return []
        star = container.find('div', {'class': 'star_score'})
        return star.findAll('em') if star is not None else []

    watcher_ems = _star_ems(score_main.find('a', {'class': 'ntz_score'}))
    expert_ems = _star_ems(score_main.find('div', {'class': 'spc_score_area'}))
    netizen_box = score_main.find('a', {'id': 'pointNetizenPersentWide'})
    netizen_ems = netizen_box.findAll('em') if netizen_box is not None else []

    # Each rating character sits alone between '>' and '<' in the serialized
    # tags; capture it directly and join the pieces.
    char_re = re.compile('>([0-9, .])<')
    return ["".join(char_re.findall("".join(str(em) for em in ems)))
            for ems in (watcher_ems, expert_ems, netizen_ems)]

---

#### OLD

In [None]:
def old_get_review_info(movie_nm):
    """Legacy one-shot lookup of a movie's review-list URL and page count.

    Searches Naver Movie for ``movie_nm``, takes the code from the first
    link of the ``search_list_1`` result list, and reads the review count
    from that movie's review page.

    Args:
        movie_nm: Movie title used as the search query.

    Returns:
        A ``(review_url, num_pages)`` tuple.  ``(None, 0)`` when the search
        yields no result list or no code; ``(review_url, 0)`` when the review
        page has no ``score_total`` block; otherwise ``num_pages`` is the
        displayed review count divided by 10 and rounded up.
    """
    review_url, num_pages = None, 0

    # Let requests encode the query so titles containing '&', '#' or '+'
    # do not corrupt the URL.
    search_resp = requests.get(
        "https://movie.naver.com/movie/search/result.nhn",
        params={"query": movie_nm, "section": "all", "ie": "utf8"},
    )
    search_html = BeautifulSoup(search_resp.content, 'html.parser')

    result_list = search_html.find('ul', {'class': 'search_list_1'})
    if result_list is None:  # the search returned no movie results
        return (review_url, num_pages)

    # str(None) contains no "code=", so a missing link also ends up here
    # instead of raising IndexError.
    code_match = re.search(r'code=([0-9]+)', str(result_list.find('a')))
    if code_match is None:
        return (review_url, num_pages)

    review_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=" + code_match.group(1)
    review_html = BeautifulSoup(requests.get(review_url).content, 'html.parser')
    review_score = review_html.find('div', {'class': 'score_total'})
    if review_score is not None:  # the movie has a review page
        # The last <em> inside <strong> holds the count, e.g. "1,234".
        count_text = review_score.find('strong').findChildren('em')[-1].getText()
        review_count = int(count_text.replace(',', ''))
        num_pages = int(math.ceil(review_count / 10))
    return (review_url, num_pages)

In [None]:
def old_get_movie_review_subset(movie_nm, url):
    """Legacy scraper: turn one review-list page into a DataFrame.

    Every ``<li>`` inside the page's ``score_result`` block becomes a one-row
    frame that is concatenated onto the running result, so the returned
    frame keeps each row's original index label.  Unlike a tolerant parser,
    this version assumes every item has an anchor, a ``<dt>``, a ``<p>``, an
    ``<em>`` and a ``btn_area`` with two ``<strong>`` tags, and raises if one
    is missing.

    Args:
        movie_nm: Movie title stored in the ``movie_nm`` column.
        url: URL of the review-list page to download.

    Returns:
        A ``pandas.DataFrame`` with the columns ``movie_nm``, ``nickname``,
        ``review``, ``score``, ``like``, ``dislike``, ``created at`` and
        ``watch_movie``; an empty DataFrame when the block has no items.
    """
    page = BeautifulSoup(requests.get(url).content, 'html.parser')
    items = page.find('div', {'class': 'score_result'}).findAll('li')
    whitespace_map = str.maketrans({"\n": "", "\r": "", "\t": ""})

    collected = pd.DataFrame()
    for item in items:
        name_span = item.findAll('a')[0].find('span')
        nickname = None if name_span is None else name_span.getText()

        # The timestamp is the last <em> inside the item's <dt>.
        stamp = item.find('dt').findAll('em')[-1].getText()
        created_at = datetime.strptime(stamp, "%Y.%m.%d %H:%M")

        review_text = item.find('p').getText().translate(whitespace_map)
        score = item.find('em').getText()

        vote_counts = item.find('div', {'class': 'btn_area'}).findAll('strong')
        like, dislike = vote_counts[0].getText(), vote_counts[1].getText()

        viewer_badge = item.find('span', {'class':'ico_viewer'})

        row_frame = pd.DataFrame({
            "movie_nm": [movie_nm],
            "nickname": [nickname],
            "review": [review_text],
            "score": [score],
            "like": [like],
            "dislike": [dislike],
            "created at": [created_at],
            "watch_movie": [bool(viewer_badge)],
        })
        collected = pd.concat([collected, row_frame])
    return collected

In [None]:
def old_temp_get_score(movie_nm):
    """Legacy lookup of a movie's three ratings by position on the page.

    Searches Naver Movie for ``movie_nm``, opens the basic-info page of the
    first hit, and reads the first three ``star_score`` divs inside the
    ``main_score`` block in document order.

    Args:
        movie_nm: Movie title used as the search query.

    Returns:
        A ``(watcher_score, expert_score, netizen_score)`` tuple of strings.
        Each is the concatenation of the single characters (digits, ``.``,
        ``,`` or space) found as the sole content of the ``<em>`` tags of the
        corresponding ``star_score`` div.  An entry is ``''`` when that div
        is absent; all three are ``''`` when the search, the movie code or
        the ``main_score`` block is missing.
    """
    scores = ['', '', '']

    # Let requests encode the query so titles containing '&', '#' or '+'
    # do not corrupt the URL.
    search_resp = requests.get(
        "https://movie.naver.com/movie/search/result.nhn",
        params={"query": movie_nm, "section": "all", "ie": "utf8"},
    )
    search_html = BeautifulSoup(search_resp.content, 'html.parser')

    result_list = search_html.find('ul', {'class': 'search_list_1'})
    if result_list is None:  # the search returned no movie results
        return tuple(scores)

    # str(None) contains no "code=", so a missing link also ends up here
    # instead of raising IndexError.
    code_match = re.search(r'code=([0-9]+)', str(result_list.find('a')))
    if code_match is None:
        return tuple(scores)

    score_url = "https://movie.naver.com/movie/bi/mi/basic.nhn?code=" + code_match.group(1)
    resp = requests.get(score_url)
    html = BeautifulSoup(resp.content, 'html.parser')
    main_score = html.find('div', {'class': 'main_score'})
    if main_score is None:
        return tuple(scores)

    # Position 0 is treated as the watcher score, 1 as the expert score and
    # 2 as the netizen score; positions the page does not provide stay ''.
    char_re = re.compile('>([0-9, .])<')
    star_blocks = main_score.findAll('div', {'class': 'star_score'})
    for position, block in enumerate(star_blocks[:3]):
        markup = "".join(str(em) for em in block.findAll('em'))
        scores[position] = "".join(char_re.findall(markup))
    return tuple(scores)