### playnomore
- http://playnomore.co.kr/
- scrapy에서 fake-useragent 사용
- scrapy를 실행할때 아규먼트를 설정해서 실행
- pipelines에서 데이터 베이스로 데이터를 저장

In [19]:
import scrapy
import requests
from scrapy.http import TextResponse

#### 1. 프로젝트 생성

In [20]:
!rm -rf playnomore
!scrapy startproject playnomore

New Scrapy project 'playnomore', using template directory '/Users/yeonghwanchoi/opt/anaconda3/lib/python3.7/site-packages/scrapy/templates/project', created in:
    /Users/yeonghwanchoi/Documents/dev/TIL/scrapy/playnomore

You can start your first spider with:
    cd playnomore
    scrapy genspider example example.com


#### 2. items.py
- title, price, img, link

In [21]:
%%writefile playnomore/playnomore/items.py
import scrapy

class PlaynomoreItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    img = scrapy.Field()
    link = scrapy.Field()

Overwriting playnomore/playnomore/items.py


#### 3. xpath 확인
- 링크
- 링크 -> 상세페이지(제목, 이미지URL, 가격)
- fake_useragent 설치
    - pip install fake_useragent

In [22]:
from fake_useragent import UserAgent
url = "http://playnomore.co.kr/category/bag/24/"
# headers = { "User-Agent": UserAgent().chrome }
headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36" }
req = requests.get(url, headers=headers)
response = TextResponse(req.url, body=req.text, encoding="utf-8") 
response

<200 http://playnomore.co.kr/category/bag/24/>

In [23]:
# 링크
links = response.xpath(
    '//*[@id="contents"]/div[2]/div/ul/li/div/a/@href'
).extract()
links = list(map(response.urljoin, links))

In [24]:
# 상세페이지 : 제목, 가격, 이미지URL
url = links[0]
headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36" }
req = requests.get(url, headers=headers)
response = TextResponse(req.url, body=req.text, encoding="utf-8") 
response

<200 http://playnomore.co.kr/product/pre-order20off-micro-baguette-yellow-180/573/?cate_no=24&display_group=1>

In [25]:
title1 = response.xpath(
        '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()'
    ).extract()
title2 = response.xpath(
        '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()'
    ).extract()
title = "".join(title1) + "".join(title2)
price = response.xpath(
        '//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()'
    ).extract()[0]
img = "http:" + response.xpath(
        '//*[@id="contents"]/div[1]/div[1]/div[1]/div[1]/img/@src'
    ).extract()[0]
title, price, img

('[Pre-Order/20%off] MICRO BAGUETTE yellow ',
 '$ 144',
 'http://playnomore.co.kr/web/product/big/20200407/626de3d4081df6e1dff057623e9e4877.jpg')

#### 4. spider.py
- scrapy-fake-useragent 설치
    - pip install scrapy-fake-useragent

In [26]:
!pip list | grep fake

fake-useragent                     0.1.11             
scrapy-fake-useragent              1.2.0              


In [27]:
%%writefile playnomore/playnomore/spiders/spider.py
import scrapy
from playnomore.items import PlaynomoreItem

class PlaynomoreSpider(scrapy.Spider):
    name = "Playnomore"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }
    
    def start_requests(self):
        url = "http://playnomore.co.kr/category/bag/24/"
        yield scrapy.Request(url, callback=self.parse)
        
    def parse(self, response):
        links = response.xpath('//*[@id="contents"]/div[2]/div/ul/li/div/a/@href').extract()
        links = list(map(response.urljoin, links))
        for link in links:
            yield scrapy.Request(link, callback=self.page_parse)
    
    def page_parse(self, response):
        item = PlaynomoreItem()
        title1 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()').extract()
        title2 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()').extract()
        item["title"] = "".join(title1) + "".join(title2)
        item["price"] = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()').extract()[0]
        item["img"] = "http:" + response.xpath('//*[@id="contents"]/div[1]/div[1]/div[1]/div[1]/img/@src').extract()[0]
        item["link"] = response.url
        yield item

Writing playnomore/playnomore/spiders/spider.py


In [28]:
%%writefile run.sh
cd playnomore
scrapy crawl Playnomore -o playnomore.csv

Overwriting run.sh


In [29]:
!chmod +x run.sh

In [30]:
!./run.sh

2020-07-15 15:33:25 [scrapy.utils.log] INFO: Scrapy 2.2.0 started (bot: playnomore)
2020-07-15 15:33:25 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 13:42:34) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-19.5.0-x86_64-i386-64bit
2020-07-15 15:33:25 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-07-15 15:33:25 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'playnomore',
 'NEWSPIDER_MODULE': 'playnomore.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['playnomore.spiders']}
2020-07-15 15:33:25 [scrapy.extensions.telnet] INFO: Telnet Password: bc5370526129bec8
2020-07-15 15:33:25 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.M

2020-07-15 15:33:40 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-07-15 15:33:40 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlew

2020-07-15 15:33:42 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/new-open-event-10-rainbow-hobo-whtie-180/563/?cate_no=24&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/big/20200317/924aa77ebf25941ab05dbd66f656d0c2.jpg',
 'link': 'http://playnomore.co.kr/product/new-open-event-10-rainbow-hobo-whtie-180/563/?cate_no=24&display_group=1',
 'price': '$ 162',
 'title': '[New Open Event 10%] RAINBOW HOBO whtie '}
2020-07-15 15:33:42 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/new-open-event-10-rainbow-hobo-ice-grey-180/565/?cate_no=24&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/big/20200317/8b360e875520ffebcfb628d0137f3273.jpg',
 'link': 'http://playnomore.co.kr/product/new-open-event-10-rainbow-hobo-ice-grey-180/565/?cate_no=24&display_group=1',
 'price': '$ 162',
 'title': '[New Open Event 10%] RAINBOW HOBO ice grey '}
2020-07-15 15:33:42 [scrapy.spidermiddlewares.httperror] INFO: Ignoring

In [31]:
import pandas as pd
df = pd.read_csv("playnomore/playnomore.csv")
df.tail(1)

Unnamed: 0,img,link,price,title
13,http://playnomore.co.kr/web/product/big/202004...,http://playnomore.co.kr/product/pre-order20off...,$ 144,[Pre-Order/20%off] MICRO BAGUETTE white


#### 5. argument 설정

In [32]:
%%writefile playnomore/playnomore/spiders/spider.py
import scrapy
from playnomore.items import PlaynomoreItem

class PlaynomoreSpider(scrapy.Spider):
    name = "Playnomore"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }
    
    def __init__(self, category1="bag", category2=24, **kwargs):
        self.start_url = "http://playnomore.co.kr/category/{}/{}/".format(category1, category2)
        super().__init__(**kwargs)
        
    def start_requests(self):
        url = self.start_url
        yield scrapy.Request(url, callback=self.parse)
        
    def parse(self, response):
        links = response.xpath('//*[@id="contents"]/div[2]/div/ul/li/div/a/@href').extract()
        links = list(map(response.urljoin, links))
        for link in links:
            yield scrapy.Request(link, callback=self.page_parse)
    
    def page_parse(self, response):
        item = PlaynomoreItem()
        title1 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()').extract()
        title2 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()').extract()
        item["title"] = "".join(title1) + "".join(title2)
        item["price"] = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()').extract()[0]
        item["img"] = "http:" + response.xpath('//*[@id="contents"]/div[1]/div[1]/div[1]/div[1]/img/@src').extract()[0]
        item["link"] = response.url
        yield item

Overwriting playnomore/playnomore/spiders/spider.py


In [33]:
%%writefile run.sh
cd playnomore
scrapy crawl Playnomore -o playnomore.csv -a category1=shoes -a category2=25




Overwriting run.sh


In [34]:
!./run.sh

2020-07-15 15:33:43 [scrapy.utils.log] INFO: Scrapy 2.2.0 started (bot: playnomore)
2020-07-15 15:33:43 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 13:42:34) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-19.5.0-x86_64-i386-64bit
2020-07-15 15:33:43 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-07-15 15:33:43 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'playnomore',
 'NEWSPIDER_MODULE': 'playnomore.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['playnomore.spiders']}
2020-07-15 15:33:43 [scrapy.extensions.telnet] INFO: Telnet Password: 667a6f3d6091f34a
2020-07-15 15:33:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.M

2020-07-15 15:33:46 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/sold-out-shygirl-oxford-silver/232/?cate_no=25&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/big/201703/232_shop7_426702.jpg',
 'link': 'http://playnomore.co.kr/product/sold-out-shygirl-oxford-silver/232/?cate_no=25&display_group=1',
 'price': '$ 450',
 'title': '[SOLD OUT] SHYGIRL oxford silver'}
2020-07-15 15:33:46 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/sold-out-shygirl-oxford-gold/233/?cate_no=25&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/big/201703/233_shop7_194677.jpg',
 'link': 'http://playnomore.co.kr/product/sold-out-shygirl-oxford-gold/233/?cate_no=25&display_group=1',
 'price': '$ 450',
 'title': '[SOLD OUT] SHYGIRL oxford gold'}
2020-07-15 15:33:46 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/sold-out-winkygirl-color-blocks-5cm-rose-gold/18/?cate_no=25&display_group=1>
{

In [35]:
import pandas as pd
df = pd.read_csv("playnomore/playnomore2.csv")
df.tail(1)

FileNotFoundError: [Errno 2] File playnomore/playnomore2.csv does not exist: 'playnomore/playnomore2.csv'

#### 6. Mongodb에 저장
- pymongo를 pipelines.py에 적용
- pip install pymongo==2.8.1

In [43]:
!pip list | grep pymongo

pymongo                            3.10.1             


In [44]:
import pymongo

In [45]:
client = pymongo.MongoClient('mongodb://13.125.153.195:27017/')
client

MongoClient(host=['13.125.153.195:27017'], document_class=dict, tz_aware=False, connect=True)

In [46]:
db = client.playnomore
collection = db.shoes
collection

Collection(Database(MongoClient(host=['13.125.153.195:27017'], document_class=dict, tz_aware=False, connect=True), 'playnomore'), 'shoes')

In [47]:
data = {"title":"신발"}
collection.insert(data)

  


ServerSelectionTimeoutError: 13.125.153.195:27017: timed out

##### Mongodb 모듈 파일 생성

In [None]:
%%writefile playnomore/playnomore/mongodb.py
import pymongo

client = pymongo.MongoClient('mongodb://13.125.153.195:27017/')
db = client.playnomore
collection = db.shoes

In [None]:
%%writefile playnomore/playnomore/pipelines.py
from .mongodb import collection

class PlaynomorePipeline(object):
    
    def process_item(self, item, spider):
        
        data = { "title": item["title"], 
                 "price": item["price"],
                 "img": item["img"], 
                 "link": item["link"],
               }
        
        collection.insert(data)
        
        return item

In [None]:
!echo "ITEM_PIPELINES = {"  >> playnomore/playnomore/settings.py

In [None]:
!echo "   'playnomore.pipelines.PlaynomorePipeline': 300," >> playnomore/playnomore/settings.py

In [41]:
!echo "}" >> playnomore/playnomore/settings.py

In [42]:
!tail -n 5 playnomore/playnomore/settings.py

}
   'playnomore.pipelines.PlaynomorePipeline': 300,
ITEM_PIPELINES = {
   'playnomore.pipelines.PlaynomorePipeline': 300,
}


In [None]:
!cat run.sh

In [None]:
!./run.sh