#### 1. 프로젝트 생성

In [82]:
!scrapy startproject naver_movie

New Scrapy project 'naver_movie', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/python3/notebook/Scrapy/naver_movie

You can start your first spider with:
    cd naver_movie
    scrapy genspider example example.com


#### 2. Items 설정
- 제목, 관객수, 평점

In [83]:
!cat naver_movie/naver_movie/items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaverMovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


In [84]:
%%writefile naver_movie/naver_movie/items.py
import scrapy

class NaverMovieItem(scrapy.Item):
    title = scrapy.Field()
    count = scrapy.Field()
    star = scrapy.Field()

Overwriting naver_movie/naver_movie/items.py


#### 3. Xpath 확임

In [72]:
import requests
import scrapy
from scrapy.http import TextResponse

In [73]:
req = requests.get("https://movie.naver.com/movie/running/current.nhn#")
response = TextResponse(req.url, body=req.text, encoding='utf-8')

In [74]:
links = response.xpath('//*[@id="content"]/div[1]/div[1]/div[3]/ul/li/dl/dt/a/@href').extract()
len(links), links[1]

(129, '/movie/bi/mi/basic.nhn?code=189141')

In [75]:
link = response.urljoin(links[1])
link

'https://movie.naver.com/movie/bi/mi/basic.nhn?code=189141'

In [76]:
# 상세 데이터 수집

In [77]:
req = requests.get(link)
response = TextResponse(req.url, body=req.text, encoding='utf-8')

In [78]:
title = response.xpath('//*[@id="content"]/div[1]/div[2]/div[1]/h3/a[1]/text()').extract()[0]
count = response.xpath('//*[@id="content"]/div[1]/div[2]/div[1]/dl/dd[5]/div/p[2]/text()').extract()[0]
star = response.xpath('//*[@id="actualPointPersentBasic"]/div/em/text()').extract()
star = "".join(star)
title, count, star

('삼진그룹 영어토익반', '1,011,155명', '9.20')

#### 4. spider 작성

In [91]:
%%writefile naver_movie/naver_movie/spiders/spider.py
import scrapy
from naver_movie.items import NaverMovieItem


class MovieSpider(scrapy.Spider):
    name = "NaverMovie"
    allow_domain = ["https://movie.naver.com"]
    start_urls = ["https://movie.naver.com/movie/running/current.nhn"]

    def parse(self, response):
        links = response.xpath('//*[@id="content"]/div[1]/div[1]/div[3]/ul/li/dl/dt/a/@href').extract()
        for link in links:
            link = response.urljoin(link)
            yield scrapy.Request(link, callback=self.parse_page_contents)

    def parse_page_contents(self, response):
        item = NaverMovieItem()
        item["title"] = response.xpath('//*[@id="content"]/div[1]/div[2]/div[1]/h3/a[1]/text()').extract()[0]
        try:
            item["count"] = response.xpath('//*[@id="content"]/div[1]/div[2]/div[1]/dl/dd[5]/div/p[2]/text()').extract()[0]
        except:
            item["count"] = "0명"
        star = response.xpath('//*[@id="actualPointPersentBasic"]/div/em/text()').extract()
        item["star"] = "".join(star)
        yield item

Overwriting naver_movie/naver_movie/spiders/spider.py


#### 5. scrapy 실행

In [92]:
%%writefile run.sh
cd naver_movie
scrapy crawl NaverMovie -o naver_movie.csv

Overwriting run.sh


In [93]:
!./run.sh

2020-11-04 11:10:34 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: naver_movie)
2020-11-04 11:10:34 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct 16 2020, 03:09:48) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.2.1, Platform Linux-5.4.0-1029-aws-x86_64-with-debian-buster-sid
2020-11-04 11:10:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-11-04 11:10:34 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'naver_movie',
 'NEWSPIDER_MODULE': 'naver_movie.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['naver_movie.spiders']}
2020-11-04 11:10:34 [scrapy.extensions.telnet] INFO: Telnet Password: 63900e09b2fb5fc4
2020-11-04 11:10:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.Memo

In [88]:
!chmod +x run.sh

#### 6. settings.py 파일 변경
- forbidden robots.txt

In [95]:
!head -n 25 naver_movie/naver_movie/settings.py | tail -n 5


# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)


In [96]:
!sed -i 's/ROBOTSTXT_OBEY = True/ROBOTSTXT_OBEY = False/' naver_movie/naver_movie/settings.py

In [97]:
!./run.sh

2020-11-04 11:11:47 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: naver_movie)
2020-11-04 11:11:47 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct 16 2020, 03:09:48) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.2.1, Platform Linux-5.4.0-1029-aws-x86_64-with-debian-buster-sid
2020-11-04 11:11:47 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-11-04 11:11:47 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'naver_movie',
 'NEWSPIDER_MODULE': 'naver_movie.spiders',
 'SPIDER_MODULES': ['naver_movie.spiders']}
2020-11-04 11:11:47 [scrapy.extensions.telnet] INFO: Telnet Password: a3b02f9828506b56
2020-11-04 11:11:47 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extens

2020-11-04 11:11:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=189628>
{'count': '0명', 'star': '9.94', 'title': '젊은이의 양지'}
2020-11-04 11:11:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=17875> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=155123> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=189393>
{'count': '0명', 'star': '9.07', 'title': '교실 안의 야크'}
2020-11-04 11:11:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=18763>
{'count': '0명', 'star': '', 'title': '바스켓볼 다이어리'}
2020-11-04 11:11:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movi

2020-11-04 11:11:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=180851>
{'count': '0명', 'star': '8.83', 'title': '작은 빛'}
2020-11-04 11:11:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=190458>
{'count': '0명', 'star': '9.00', 'title': '여름날'}
2020-11-04 11:11:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=146560> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=27219> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=167105>
{'count': '0명', 'star': '8.58', 'title': '암수살인'}
2020-11-04 11:11:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/

2020-11-04 11:11:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=189000> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=98738> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=178687> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=167421> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=96379>
{'count': '0명', 'star': '9.13', 'title': '비긴 어게인'}
2020-11-04 11:11:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://mo

2020-11-04 11:11:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=181286> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=168012> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=192191> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:50 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=133459>
{'count': '0명', 'star': '8.25', 'title': '호크니'}
2020-11-04 11:11:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=185282> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://mov

2020-11-04 11:11:51 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=185287>
{'count': '0명', 'star': '9.57', 'title': '남과 여: 여전히 찬란한'}
2020-11-04 11:11:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=198425> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movie.naver.com/movie/bi/mi/basic.nhn?code=194334> (referer: https://movie.naver.com/movie/running/current.nhn)
2020-11-04 11:11:51 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=186114>
{'count': '0명', 'star': '9.83', 'title': '밥정'}
2020-11-04 11:11:51 [scrapy.core.scraper] DEBUG: Scraped from <200 https://movie.naver.com/movie/bi/mi/basic.nhn?code=142265>
{'count': '0명', 'star': '9.60', 'title': '알피니스트 - 어느 카메라맨의 고백'}
2020-11-04 11:11:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movi

In [98]:
import pandas as pd

In [100]:
df = pd.read_csv("naver_movie/naver_movie.csv")
df.tail(2)

Unnamed: 0,count,star,title
127,0명,9.47,노트북
128,"1,011,155명",9.19,삼진그룹 영어토익반
