In [15]:
import MySQLdb

In [16]:
db = MySQLdb.connect(host="localhost",  # host      # 127.0.0.1
                     user="root",       # username  # root
                     passwd="1234",     # password  # 1234
                     db="naver_movie")         # DB NAME   

db.set_character_set('utf8')

In [17]:
db

<_mysql.connection open to 'localhost' at 7c054b8>

In [18]:
# 데이터베이스 커서(Cursor)는 일련의 데이터에 
# 순차적으로 액세스할 때 검색 및 "현재 위치"를 
# 포함하는 데이터 요소이다.
cur = db.cursor()
# https://movie.naver.com/movie/bi/mi/basic.nhn?code=161967

In [19]:
# IF(조건, 참값, 거짓값)
query = """
        CREATE TABLE IF NOT EXISTS naver_movie
            (code    INTEGER      NOT NULL      PRIMARY KEY,
             created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
             title   VARCHAR(100) NOT NULL,
             story   TEXT,
             open_date  DATE)
             ENGINE INNODB;
        """
cur.execute(query)


  # This is added back by InteractiveShellApp.init_path()


0

In [20]:
query = """
        CREATE TABLE IF NOT EXISTS movie_comment(
            score    INTEGER    NOT NULL,
            content  TEXT       NOT NULL,
            movie_code INTEGER,
            writer   VARCHAR(100),
            write_date DATETIME,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (writer, write_date),
            FOREIGN KEY(movie_code)
            REFERENCES naver_movie(code) on delete cascade)
            ENGINE INNODB;
"""
cur.execute(query)

  


0

## 영화 크롤링

In [21]:
import requests
from bs4 import BeautifulSoup
from urllib import parse

In [22]:
MOVIE_LIST_URL = 'https://movie.naver.com/movie/sdb/rank/rmovie.nhn'

# ?sel=pnt&date=20190616

In [23]:
def get_movie_list(start_page=1, end_page=1):
    URL = "https://movie.naver.com/movie/sdb/rank/rmovie.nhn"
    params = {
        'sel':'pnt',
        'date':20190616
    }
    
    movie_list = []
    for i in range(start_page, end_page+1):
        params['page'] = i
        
        resp = requests.get(URL, params)
        soup = BeautifulSoup(resp.content)
        
        list_ranking = soup.find(class_='list_ranking').find_all('tr')
        for movie_tr in list_ranking:
            try:
                code = movie_tr.find('a')['href'].split('=')[-1]
                title = movie_tr.find('a').text
            
                movie_list.append({
                    'code':code,
                    'title':title
                })
            except Exception:
                pass
            
    return movie_list

In [24]:
def get_movie_info(movie):
    code = movie.get('code')
    title = movie.get('title')
    
    base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn'
    params = {
        'code': code,
    }
    
    resp = requests.get(base_url, params)
    soup = BeautifulSoup(resp.content)
    
    open_date_tag = soup.\
    find('dl', class_='info_spec').\
    find('dd').find_all('span')[-1]
    
    # 여기부터
    open_date = ""
    for date in open_date_tag.find_all('a'):
        open_date += date.text
    
    open_date = "-".join(open_date.strip().\
                         split('.'))
    
    # 여기까지
    story = soup.find('div', class_='story_area')
    
    movie['story'] = story
    movie['open_date'] = open_date
    
    return movie

In [25]:
movie_list = get_movie_list()
movie_list[0]

{'code': '171539', 'title': '그린 북'}

In [26]:
get_movie_info(movie_list[0])

{'code': '171539', 'title': '그린 북', 'story': <div class="story_area">
 <div class="title_area">
 <h4 class="h_story"><strong class="blind">줄거리</strong></h4>
 </div>
 <h5 class="h_tx_story">언제 어디서든 바른 생활! 완벽한 천재 뮤지션 ‘돈 셜리’<br/>
 원칙보다 반칙! 다혈질 운전사 ‘토니’<br/>
 취향도, 성격도 완벽히 다른 두 남자의 특별한 우정이 시작된다!</h5>
 <p class="con_tx">1962년 미국, 입담과 주먹만 믿고 살아가던 토니 발레롱가(비고 모텐슨)는 교양과 우아함 그 자체인
 <br/> 천재 피아니스트 돈 셜리(마허샬라 알리) 박사의 운전기사 면접을 보게 된다.
 <br/> 
 <br/> 백악관에도 초청되는 등 미국 전역에서 콘서트 요청을 받으며 명성을 떨치고 있는 돈 셜리는
 <br/> 위험하기로 소문난 미국 남부 투어 공연을 떠나기로 결심하고,
 <br/> 투어 기간 동안 자신의 보디가드 겸 운전기사로 토니를 고용한다.
 <br/> 
 <br/> 거친 인생을 살아온 토니 발레롱가와 교양과 기품을 지키며 살아온 돈 셜리 박사.
 <br/> 생각, 행동, 말투, 취향까지 달라도 너무 다른 두 사람은
 <br/> 그들을 위한 여행안내서 ‘그린북’에 의존해 특별한 남부 투어를 시작하는데…</p>
 <button class="story_more" id="toggleMakingnoteButton" onclick="storyAndNote.toggleMakingnote();" type="button"><em class="blind">제작노트 보기</em></button><!-- N=a:mai.story -->
 </div>, 'open_date': '2019-01-09'}

In [27]:
# query = """
#         CREATE TABLE IF NOT EXISTS naver_movie
#             (code    INTEGER      NOT NULL      PRIMARY KEY,
#              created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
#              title   VARCHAR(100) NOT NULL,
#              story   TEXT,
#              open_date  DATE)
#         """
def insert_movie(movie):
    code = movie.get('code')
    title = movie.get('title')
    story = movie.get('story')
    
    story = str(story)
    open_date = movie.get('open_date')
    
    story = db.escape_string(story)
    query = """
    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('{0}', '{1}', %s, '{2}')
    """.format(code, title, open_date)
    
    print(query)
    cur.execute(query, [story])
    db.commit()
    
    

In [28]:
sample_movie = get_movie_info(movie_list[0])
# insert_movie(sample_movie)

In [29]:
from MySQLdb import IntegrityError
movie_list = get_movie_list()
for movie in movie_list:
    
    movie = get_movie_info(movie)
    try:
        insert_movie(movie)
    except IntegrityError:
        pass
    except Exception:
        pass
    


    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('171539', '그린 북', %s, '2019-01-09')
    

    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('174830', '가버나움', %s, '2019-01-24')
    

    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('144906', '베일리 어게인', %s, '2018-11-22')
    

    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('163788', '알라딘', %s, '2019-05-23')
    

    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('169240', '아일라', %s, '2018-06-21')
    

    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('151196', '원더', %s, '2017-12-27')
    

    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('157243', '당갈', %s, '2018-04-25')
    

    INSERT INTO naver_movie
    (code, title, story, onpen_date)
    VALUES('156464', '보헤미안 랩소디', %s, '2018-10-31')
    

    INSERT INTO naver_movie
    (code, title, story, o

In [30]:
def get_comment_list(code, last_page=1):
    url = 'https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn'
    params = {
        'code': code,
        'type': 'after',
        'order': 'newest'
    }
    result_list = []
    for page in range(1, last_page+1):
        params['page'] = page
        
        resp = requests.get(url, params)
        soup = BeautifulSoup(resp.content)
        reple_list = soup.find('div', class_='score_result').find_all('li')
        
        for reple in reple_list:
            score = reple.find(class_='star_score').text.strip()
            content = reple.find('p')
            name_n_date = content.find_next_sibling('dl').find_all('em')
            
            content = content.text
            
            nickname = name_n_date[0]
            date = name_n_date[1]
            result_list.append({
                'write_name':nickname,
                'write_date': date.text,
                'content': content,
                'score': score,
                'code':code
            })
    return result_list
        


In [31]:
get_comment_list('171539')

[{'write_name': <em>
  <a href="#" onclick="javascript:showPointListByNid(15816086, 'after');parent.clickcr(this, 'ara.uid', '', '', event); return false;" target="_top">
  <span>봉봉(lemo****)</span>
  </a>
  </em>,
  'write_date': '2019.06.17 00:07',
  'content': '따뜻한 브로맨스 로드무비.  ',
  'score': '10',
  'code': '171539'},
 {'write_name': <em>
  <a href="#" onclick="javascript:showPointListByNid(15812565, 'after');parent.clickcr(this, 'ara.uid', '', '', event); return false;" target="_top">
  <span>Windy(appl****)</span>
  </a>
  </em>,
  'write_date': '2019.06.15 21:46',
  'content': '평점 클라스 봐라 ㅋㅋ 너무한거 아니냐?  ',
  'score': '1',
  'code': '171539'},
 {'write_name': <em>
  <a href="#" onclick="javascript:showPointListByNid(15811976, 'after');parent.clickcr(this, 'ara.uid', '', '', event); return false;" target="_top">
  <span>lunasea(luna****)</span>
  </a>
  </em>,
  'write_date': '2019.06.15 17:51',
  'content': '블루레이로 감삼.두 배우의 연기가 대단하고 내용 전개 중에 심한 갈등이 나올거 같았는데 나오지 않아 편하고 재밌게 볼수 있었다.  ',


In [32]:
def insert_comment(comment):
    score = comment.get('score')
    writer = comment.get('writer')
    write_date = comment.get('write_date')
    write_date = '-'.join(write_date.split('.'))
    
    content = comment.get('content')
    movie_code = comment.get('code')
    
    query = """
    INSERT INTO movie_comment
    (score, writer, write_date, content, movie_code)
    VALUES('{}', '{}', '{}', '{}', '{}')
    """.format(score, writer, write_date, content, movie_code)
    
    cur.execute(query)
    db.commit()
    

In [33]:
comment_list = get_comment_list('171539')
for comment in comment_list:
    insert_comment(comment)

IntegrityError: (1062, "Duplicate entry 'None-2019-06-17 00:07:00' for key 'PRIMARY'")

In [34]:
for movie in movie_list:
    try:
        comment_list = get_comment_list(movie['code'])
        for comment in comment_list:
            insert_comment(comment)
    except Exception:
        pass

In [None]:
# https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=pnt&date=20190616

In [None]:
import requests
from urllib import parse
from bs4 import BeautifulSoup
import datetime
import math

In [None]:
# movie_list

# 한 화면에 50페이지
PAGE_SIZE = 50
MOVIE_LENGTH = 100
pages = math.ceil(MOVIE_LENGTH/PAGE_SIZE)
pages

In [None]:
def get_movie_list(start_page, end_page):
    today = datetime.datetime.now()
    
    movie_list = []
    for page in range(start_page, end_page+1):
        movie_list_url = 'https://movie.naver.com/movie/sdb/rank/rmovie.nhn'
        params = {
            'sel':'pnt',
            'date':today.strftime('%Y%m%d'),
            'page':page
        }
        resp = requests.get(movie_list_url)
        resp.encoding = 'utf-8'
    
        soup = BeautifulSoup(resp.content)
        # list_ranking = soup.find(class_='list_ranking').find_all('tr')
        list_ranking = soup.select('.list_ranking>tbody>tr')
    
        for movie_tr in list_ranking:
            all_td = movie_tr.find_all('td')
            try:
                rank = all_td[0].img['alt']
                link = all_td[1].find('a')['href']
                title = all_td[1].find('a').text
                
                code = parse.parse_qs(parse.urlsplit(link).query)['code'][0]
            except TypeError:
                pass

            else:
                movie_list.append({
                    'img':img,
                    'link':link,
                    'title':title,
                    'code':code
                })
                
    return movie_list
    
    

In [None]:
def get_movie_detail(code):
    base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn'
    params = {
        'code':code
    }
    resp = requests.get(base_url, params)
    resp.encoding ='utf-8'
    soup = BeautifulSoup(resp.content)
    
    opend = soup.find('dt', text='개요()').find_next_sibling('dd').find_all('span')[-1].text.strip()
    opend = ''.join(opend.split('\n'))[:10]

    story = soup.find('div', class_='story_area')
    print(story)
    return {
        'opend':opend,
        'story':story
    }
    
    
def get_movie_comment(code, last_comment='15788558'):
    base_url = '/movie/bi/mi/pointWriteFormList.nhn'
    page = 1
    comment_list = []
    
    while True:
        params={
            'code':171539,
            'type':'after',
            'order':'newest',
            'page':page
        }
        resp = requests.get(base_url, params)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.content)
        
        comments = soup.find('div', class_='score_result').find_all('li')
        for comment in comments:
            star = comment.find('div', class_='star_score').text
            reple = comment.find('div', class_='score_reple').find('p').text
            
            dd = comment.find('div', class_='score_reple').find('dl').find_all('dd')
            user = dd[0]
            created_at = dd[1]
            
            comment_id = comment.find('div', class_='btn_area').find_all('span')[-1]['class'][0].split('_')[1]
            
            if int(comment_id) <= int(last_comment):
                return comment_list
            
            # 실습용 코드
            if page ==3:
                break
            
            comment_list.append({
                'star':star,
                'reple':reple,
                'user': user,
                'created_at': created_at,
                'comment_id': comment_id
            })
            
        
        page += 1
    return comment_list
    
    
    
    

In [None]:
movie_list = get_movie_list(1, pages)

In [None]:
movie_list

In [None]:
def insert_movie(movie):
    code = movie.get('code')
    title = movie.get('title')
    open_date = movie.get('open_date')
    
    query = """
    INSERT INTO naver_movie
    (code, title, open_date)
    VALUES('{}', '{}', '{}')
    """.format(code, title, open_date)
    
    cur.execute(query)
    
def insert_comment(movie_code, comment):
    # movie_comment

    write = comment.get('writer')
    score = comment.get('score')
    wrter = comment.get('writer')
    write_date = comment.get('write_date')
    content = comment.get('content')
    
    query = """
    INSERT INTO movie_comment
    (score, content, writer, write_date, movie_code)
    VALUES('{}', '{}', '{}')
    """.format(score, content, writer, write_date, movie_code)
    
    cur.execute(query)
    

In [None]:
insert_movie({
    'code': '12',
    'title': 'ddd',
    'open_date': '2011.01.12',
})

In [None]:
for movie in movie_list:
    detail = get_movie_detail(movie['code'])
    
    open_date = detail['opend']
    story = detail['story']
    comment_list = get_movie_comment(movie['code'])

In [None]:
resp = requests.get(movie_list_url)
resp

In [None]:
# INSERT INTO table (column_list)
# VALUES (value_list)
# ON DUPLICATE KEY UPDATE
#    c1 = v1, 
#    c2 = v2,
#    ...;

In [None]:
cur.close()