In [38]:
!rm -rf crawler

In [39]:
!scrapy startproject crawler

New Scrapy project 'crawler', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/python3/notebook/Scrapy/crawler

You can start your first spider with:
    cd crawler
    scrapy genspider example example.com


In [40]:
!tree crawler

[01;34mcrawler[00m
├── [01;34mcrawler[00m
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── [01;34mspiders[00m
│       └── __init__.py
└── scrapy.cfg

2 directories, 7 files


#### scrapy의 구조
- spiders
    - 어떤 웹 서비스를 어떻게 크롤링할 것인지에 대한 코드 작성 (.py로 작성)
- items.py
    - 모델에 해당하는 코드, 저장하는 데이터의 자료구조를 설정
- pipelines.py
    - scraping한 결과물을 item 형태로 구성하고 처리하는 방법에 대한 코드
- settings.py
    - scraping할 때의 환경 설정 값을 지정
    - robots.txt 따를지 안따를지

In [41]:
import scrapy
import requests
from scrapy.http import TextResponse

#### 1. xpath 확인

In [42]:
req = requests.get('http://corners.gmarket.co.kr/Bestsellers')
response = TextResponse(req.url, body=req.text, encoding='utf-8')

In [43]:
links = response.xpath('//*[@id="gBestWrap"]/div/div[3]/div[2]/ul/li/div[1]/a/@href').extract()
len(links), links[0]

(200,
 'http://item.gmarket.co.kr/Item?goodscode=1930786012&ver=637400801055234986')

In [44]:
req = requests.get(links[0])
response = TextResponse(req.url, body=req.text, encoding='utf-8')
title = response.xpath('//*[@id="itemcase_basic"]/h1/text()')[0].extract()
s_price = response.xpath('//*[@id="itemcase_basic"]/p/span/span/text()')[0].extract().replace(",", "")
o_price = response.xpath('//*[@id="itemcase_basic"]/p/span/strong/text()')[0].extract().replace(",", "")
discount_rate = str(round(int(s_price) / int(o_price) * 100, 2)) + '%'
title, s_price, o_price, discount_rate

('20년 멋진밥상 흥양농협 햅쌀(단일품종) 20kg ', '63900', '59900', '106.68%')

#### 2. items.py 작성

In [45]:
!cat crawler/crawler/items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class CrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


In [46]:
%%writefile crawler/crawler/items.py
import scrapy


class CrawlerItem(scrapy.Item):
    title = scrapy.Field()
    s_price = scrapy.Field()
    o_price = scrapy.Field()
    discount_rate = scrapy.Field()
    link = scrapy.Field()

Overwriting crawler/crawler/items.py


#### 3. spider.py 작성

In [47]:
%%writefile crawler/crawler/spiders/spiders.py
import scrapy
from crawler.items import CrawlerItem

class Spider(scrapy.Spider):
    name = 'GmarketBestsellers'
    allow_domain = ['gmarket.co.kr']
    start_urls = ['http://corners.gmarket.co.kr/Bestsellers']
    
    def parse(self, response):
        links = response.xpath('//*[@id="gBestWrap"]/div/div[3]/div[2]/ul/li/div[1]/a/@href').extract()
        for link in links[:10]:
            yield scrapy.Request(link, callback=self.page_content)
    
    def page_content(self, response):
        item = CrawlerItem()
        item['title'] = response.xpath('//*[@id="itemcase_basic"]/h1/text()')[0].extract()
        item['o_price'] = response.xpath('//*[@id="itemcase_basic"]/p/span/span/text()')[0].extract().replace(",", "")
        try:
            item['s_price'] = response.xpath('//*[@id="itemcase_basic"]/p/span/strong/text()')[0].extract().replace(",", "")
        except:
            item['o_price'] = item['s_price']
        item['discount_rate'] = str(round((1 - int(item['s_price']) / int(item['o_price'])) * 100, 2)) + '%'
        item['link'] = response.url
        yield item

Writing crawler/crawler/spiders/spiders.py


#### 4. Scrapy 실행

In [48]:
%%writefile run.sh
cd crawler
scrapy crawl GmarketBestsellers

Writing run.sh


In [49]:
!chmod +x run.sh

In [50]:
!./run.sh

2020-11-04 09:48:38 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: crawler)
2020-11-04 09:48:38 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct 16 2020, 03:09:48) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.2.1, Platform Linux-5.4.0-1029-aws-x86_64-with-debian-buster-sid
2020-11-04 09:48:38 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-11-04 09:48:38 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'crawler',
 'NEWSPIDER_MODULE': 'crawler.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['crawler.spiders']}
2020-11-04 09:48:38 [scrapy.extensions.telnet] INFO: Telnet Password: d276b085c9a90599
2020-11-04 09:48:38 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scra

2020-11-04 09:48:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode=1926067512&ver=637400801190359810> (referer: http://corners.gmarket.co.kr/Bestsellers)
2020-11-04 09:48:41 [scrapy.core.scraper] DEBUG: Scraped from <200 http://item.gmarket.co.kr/Item?goodscode=1930786012&ver=637400801190359810>
{'discount_rate': '6.26%',
 'link': 'http://item.gmarket.co.kr/Item?goodscode=1930786012&ver=637400801190359810',
 'o_price': '63900',
 's_price': '59900',
 'title': '20년 멋진밥상 흥양농협 햅쌀(단일품종) 20kg '}
2020-11-04 09:48:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode=1840147374&ver=637400801190359810> (referer: http://corners.gmarket.co.kr/Bestsellers)
2020-11-04 09:48:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode=1836449519&ver=637400801190359810> (referer: http://corners.gmarket.co.kr/Bestsellers)
2020-11-04 09:48:41 [scrapy.core.scraper] DEBUG: Scraped from <200 http://

In [1]:
# 결과를 csv로 저장

In [51]:
%%writefile run.sh
cd crawler
scrapy crawl GmarketBestsellers -o GmarketBestsellers.csv

Overwriting run.sh


In [52]:
!./run.sh

2020-11-04 09:48:49 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: crawler)
2020-11-04 09:48:49 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct 16 2020, 03:09:48) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.2.1, Platform Linux-5.4.0-1029-aws-x86_64-with-debian-buster-sid
2020-11-04 09:48:49 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-11-04 09:48:49 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'crawler',
 'NEWSPIDER_MODULE': 'crawler.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['crawler.spiders']}
2020-11-04 09:48:49 [scrapy.extensions.telnet] INFO: Telnet Password: c7d521fa228fc7b3
2020-11-04 09:48:49 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scra

2020-11-04 09:48:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://item.gmarket.co.kr/Item?goodscode=1741016466&ver=637400801300536319>
{'discount_rate': '50.0%',
 'link': 'http://item.gmarket.co.kr/Item?goodscode=1741016466&ver=637400801300536319',
 'o_price': '55800',
 's_price': '27900',
 'title': '삼진어묵  옛날모듬어묵 1Kg x 3개 '}
2020-11-04 09:48:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://item.gmarket.co.kr/Item?goodscode=1639345413&ver=637400801300536319>
{'discount_rate': '50.0%',
 'link': 'http://item.gmarket.co.kr/Item?goodscode=1639345413&ver=637400801300536319',
 'o_price': '19800',
 's_price': '9900',
 'title': '양평해장국 600g X 3팩 /선봉식품/즉석국/탕/찌개 '}
2020-11-04 09:48:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode=1836449519&ver=637400801300536319> (referer: http://corners.gmarket.co.kr/Bestsellers)
2020-11-04 09:48:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode=1926067512&ver=6374

In [53]:
!ls crawler/

GmarketBestsellers.csv	crawler  scrapy.cfg


In [54]:
import pandas as pd

In [55]:
files = !ls crawler/
files

['GmarketBestsellers.csv', 'crawler', 'scrapy.cfg']

In [62]:
df = pd.read_csv("crawler/{}".format(files[0]))
df.tail

<bound method NDFrame.tail of   discount_rate                                               link  o_price  \
0        20.11%  http://item.gmarket.co.kr/Item?goodscode=15559...    17400   
1          3.0%  http://item.gmarket.co.kr/Item?goodscode=18364...    22000   
2         6.26%  http://item.gmarket.co.kr/Item?goodscode=19307...    63900   
3         2.38%  http://item.gmarket.co.kr/Item?goodscode=18369...    58900   
4         50.0%  http://item.gmarket.co.kr/Item?goodscode=17410...    55800   
5         50.0%  http://item.gmarket.co.kr/Item?goodscode=16393...    19800   
6         2.57%  http://item.gmarket.co.kr/Item?goodscode=18364...    29550   
7         50.0%  http://item.gmarket.co.kr/Item?goodscode=19260...    30800   

   s_price                                              title  
0    13900         [청정원] 스파게티/파스타소스 600gx3 +면500g x2 or파우치증정   
1    21340                            하남 핫푸드  하남쭈꾸미 350g X3팩   
2    59900                       20년 멋진밥상 흥양농협 햅쌀(단일품종) 20kg   
3 

#### 5. Pipeline 설정
- item을 설정하기 전에 실행되는 코드 정의

In [58]:
import requests
import json

In [59]:
def send_slack(msg):
    WEBHOOK_URL = 'https://hooks.slack.com/services/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'
    payload = {
        "channel" : "일반",
        "username" : "YJ",
        "text" : msg,
    }
    requests.post(WEBHOOK_URL, json.dumps(payload))

In [60]:
send_slack('테스트')

In [61]:
!cat crawler/crawler/pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class CrawlerPipeline:
    def process_item(self, item, spider):
        return item


In [63]:
%%writefile crawler/crawler/pipelines.py
import requests
import json

class CrawlerPipeline(object):
    
    def __send_slack(self, msg):
        WEBHOOK_URL = 'https://hooks.slack.com/services/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'
        payload = {
        "channel" : "일반",
        "username" : "YJ",
        "text" : msg,
        }
        requests.post(WEBHOOK_URL, json.dumps(payload))
    
    def process_item(self, item, spider):
        keyword = 'g'
        print("="*100)
        print(item["title"], keyword)
        print("="*100)
        if keyword in item["title"]:
            self.__send_slack("{},{},{}".format(
                item["title"], item["s_price"], item["link"]))
        return item

Overwriting crawler/crawler/pipelines.py


In [65]:
# pipeline 설정 : settings.py

# ```
# ITEM_PIPELINES = {
#     'crawler.pipelines.CrawlerPipeline' : 300,
# }
# ```

In [66]:
!echo "ITEM_PIPELINES = {" >> crawler/crawler/settings.py
!echo "    'crawler.pipelines.CrawlerPipeline' : 300," >> crawler/crawler/settings.py
!echo "}" >> crawler/crawler/settings.py

In [67]:
!tail -n 3 crawler/crawler/settings.py

ITEM_PIPELINES = {
    'crawler.pipelines.CrawlerPipeline' : 300,
}


In [68]:
!./run.sh

2020-11-04 10:15:44 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: crawler)
2020-11-04 10:15:44 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct 16 2020, 03:09:48) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.2.1, Platform Linux-5.4.0-1029-aws-x86_64-with-debian-buster-sid
2020-11-04 10:15:44 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-11-04 10:15:44 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'crawler',
 'NEWSPIDER_MODULE': 'crawler.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['crawler.spiders']}
2020-11-04 10:15:44 [scrapy.extensions.telnet] INFO: Telnet Password: 4eeaa26dda647c82
2020-11-04 10:15:44 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scra

2020-11-04 10:15:47 [urllib3.connectionpool] DEBUG: https://hooks.slack.com:443 "POST /services/T01DG7CV88Z/B01EBGWHE2V/eiAttgyYje93Q21oQPkyWesQ HTTP/1.1" 200 22
2020-11-04 10:15:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://item.gmarket.co.kr/Item?goodscode=1930786012&ver=637400817447778258>
{'discount_rate': '6.26%',
 'link': 'http://item.gmarket.co.kr/Item?goodscode=1930786012&ver=637400817447778258',
 'o_price': '63900',
 's_price': '59900',
 'title': '20년 멋진밥상 흥양농협 햅쌀(단일품종) 20kg '}
2020-11-04 10:15:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode=1926067512&ver=637400817447778258> (referer: http://corners.gmarket.co.kr/Bestsellers)
2020-11-04 10:15:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode=1639345413&ver=637400817447778258> (referer: http://corners.gmarket.co.kr/Bestsellers)
2020-11-04 10:15:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://item.gmarket.co.kr/Item?goodscode