### Playnomore
- 가방, 옷, 액세서리를 판매하는 쇼핑몰
- http://playnomore.co.kr/

In [1]:
import pandas as pd
import requests
import scrapy
from fake_useragent import UserAgent
from scrapy.http import TextResponse

In [2]:
# 1. 프로젝트 생성

In [3]:
# Scaffold a new Scrapy project named "playnomore" (creates a playnomore/ directory).
!scrapy startproject playnomore

New Scrapy project 'playnomore', using template directory '/Users/yeonghwanchoi/opt/anaconda3/lib/python3.7/site-packages/scrapy/templates/project', created in:
    /Users/yeonghwanchoi/Downloads/playnomore

You can start your first spider with:
    cd playnomore
    scrapy genspider example example.com


In [4]:
# 2. items.py 코드 작성

In [5]:
%%writefile playnomore/playnomore/items.py
import scrapy

class PlaynomoreItem(scrapy.Item):
    """Container for the data scraped from one product detail page."""
    # Product title text.
    title = scrapy.Field()
    # Price text as it appears on the page (e.g. "$ 144").
    price = scrapy.Field()
    # URL of the product image.
    img = scrapy.Field()
    # URL of the product detail page the item was scraped from.
    link = scrapy.Field()

Overwriting playnomore/playnomore/items.py


In [6]:
# 3. xpath 찾기 : 링크, 상품 데이터

In [7]:
# 링크

In [8]:
# Download the bag category listing with a Chrome User-Agent from fake_useragent,
# then wrap the HTML in a Scrapy TextResponse so XPath queries can be tried here.
url = "http://playnomore.co.kr/category/bag/24/"
# headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
headers = {"User-Agent": UserAgent().chrome}
req = requests.get(url=url, headers=headers)
response = TextResponse(url=req.url, body=req.text, encoding="utf-8")
response

<200 http://playnomore.co.kr/category/bag/24/>

In [9]:
# Collect the product hrefs from the listing's <li> entries and resolve each
# one against the page URL; show the first three as a sanity check.
hrefs = response.xpath('//*[@id="contents"]/div[2]/div/ul/li/div[1]/a/@href').getall()
links = [response.urljoin(href) for href in hrefs]
links[:3]

['http://playnomore.co.kr/product/detail.html?product_no=573&cate_no=24&display_group=1',
 'http://playnomore.co.kr/product/detail.html?product_no=572&cate_no=24&display_group=1',
 'http://playnomore.co.kr/product/detail.html?product_no=550&cate_no=24&display_group=1']

In [10]:
# 상세 데이터

In [11]:
# Download one product detail page and wrap it in a TextResponse for XPath work.
url = "http://playnomore.co.kr/product/pre-order20off-micro-baguette-yellow-180/573/?cate_no=24&display_group=1"
headers = {"User-Agent": UserAgent().chrome}
req = requests.get(url=url, headers=headers)
response = TextResponse(url=req.url, body=req.text, encoding="utf-8")
response

<200 http://playnomore.co.kr/product/pre-order20off-micro-baguette-yellow-180/573/?cate_no=24&display_group=1>

In [12]:
# title1: text of the <font> child of the title <div>
#         (presumably the bracketed label such as "[Pre-Order/20%off]").
# title2: the remaining direct text nodes of the same <div>.
title1 = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()'
)[0].get()
title2 = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()'
).getall()
title = "".join([title1, " ".join(title2)])

price = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()'
)[0].get()

# The scheme is prepended to the extracted src value to form a full URL.
img_src = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[1]/div[2]/ul/li[1]/img/@src'
)[0].get()
img = "http:" + img_src

title, price, img

('[Pre-Order/20%off]  MICRO BAGUETTE  yellow ',
 '$ 144',
 'http://playnomore.co.kr/web/product/small/20200407/43ec3b61d30531a05a3d727780c9b1f3.jpg')

In [13]:
# 4. spider.py 코드 작성 : fake user agent 설정

In [14]:
!pip install scrapy-fake-useragent



In [32]:
%%writefile playnomore/playnomore/spiders/spiders.py
import scrapy
from playnomore.items import PlaynomoreItem

class PlaynomoreSpider(scrapy.Spider):
    """Scrape the products linked from one Playnomore category page.

    The category is selected with spider arguments, e.g.
    ``scrapy crawl Playnomore -a category1=bag -a category2=24``.
    """

    name = "Playnomore"
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            # Turn off Scrapy's built-in User-Agent middleware so the random
            # one below is the only component setting the header. The module
            # is ``downloadermiddlewares``; with a misspelled path the built-in
            # middleware stays enabled.
            "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
            "scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
        }
    }

    def __init__(self, category1="", category2="", **kwargs):
        """Build the single start URL from the two category path segments.

        Args:
            category1: first path segment after ``/category/`` (e.g. ``bag``).
            category2: second path segment (e.g. ``24``).
            **kwargs: forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        self.start_urls = [
            "http://playnomore.co.kr/category/{}/{}/".format(category1, category2)
        ]
        super().__init__(**kwargs)

    def start_requests(self):
        """Request every start URL and route the response to ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Yield one request per product link found on the listing page."""
        hrefs = response.xpath(
            '//*[@id="contents"]/div[2]/div/ul/li/div[1]/a/@href'
        ).getall()
        for href in hrefs:
            # hrefs are resolved against the listing page URL.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_content)

    def parse_content(self, response):
        """Extract title, price, image URL and page URL from a product page.

        Yields:
            PlaynomoreItem: one item per product page.

        Raises:
            IndexError: if the price or image node is missing from the page.
        """
        item = PlaynomoreItem()
        # The <font> child holds the leading part of the title (presumably a
        # bracketed label such as "[SOLD OUT]"). Default to an empty string so
        # a page without that node still yields an item instead of raising.
        title_prefix = response.xpath(
            '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()'
        ).get(default="")
        title_rest = response.xpath(
            '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()'
        ).getall()
        item["title"] = title_prefix + " ".join(title_rest)
        item["price"] = response.xpath(
            '//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()'
        )[0].extract()
        # The scheme is prepended to the extracted src value to form a full URL.
        item["img"] = "http:" + response.xpath(
            '//*[@id="contents"]/div[1]/div[1]/div[1]/div[2]/ul/li[1]/img/@src'
        )[0].extract()
        item["link"] = response.url
        yield item

Overwriting playnomore/playnomore/spiders/spiders.py


In [27]:
# 5. 스크래피 실행시 아규먼트 설정

In [28]:
#bag/24/
#shoes/25/
#beauty/28/

In [39]:
%%writefile run.sh
# Enter the Scrapy project, crawl the shoes/25 category and export the items as CSV.
cd playnomore
scrapy crawl Playnomore -a category1=shoes -a category2=25 -o playnomore.csv

Overwriting run.sh


In [30]:
# 6. 스크래피 실행

In [36]:
%%writefile run.sh
# The spider builds its start URL from category1/category2, so both arguments
# must be passed; without them it requests http://playnomore.co.kr/category///.
# NOTE(review): on Scrapy 2.2 `-o` appends to an existing file, so remove
# playnomore.csv before re-running if duplicate rows are not wanted - confirm.
cd playnomore
scrapy crawl Playnomore -o playnomore.csv -a category1=shoes -a category2=25

Overwriting run.sh


In [40]:
# Execute run.sh: it changes into playnomore/ and runs the crawl, writing playnomore.csv there.
!source run.sh

2020-07-10 15:16:46 [scrapy.utils.log] INFO: Scrapy 2.2.0 started (bot: playnomore)
2020-07-10 15:16:46 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 13:42:34) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-19.5.0-x86_64-i386-64bit
2020-07-10 15:16:46 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-07-10 15:16:46 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'playnomore',
 'NEWSPIDER_MODULE': 'playnomore.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['playnomore.spiders']}
2020-07-10 15:16:46 [scrapy.extensions.telnet] INFO: Telnet Password: 437d0af2bff87108
2020-07-10 15:16:46 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.M

2020-07-10 15:16:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://playnomore.co.kr/product/sold-out-winkygirl-color-blocks-5cm-multi/19/?cate_no=25&display_group=1> (referer: http://playnomore.co.kr/category/shoes/25/)
2020-07-10 15:16:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/sold-out-shygirl-oxford-silver/232/?cate_no=25&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/small/201703/232_shop7_426702.jpg',
 'link': 'http://playnomore.co.kr/product/sold-out-shygirl-oxford-silver/232/?cate_no=25&display_group=1',
 'price': '$ 450',
 'title': '[SOLD OUT]  SHYGIRL oxford  silver'}
2020-07-10 15:16:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/sold-out-winkygirl-color-blocks-5cm-rose-gold/18/?cate_no=25&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/small/201702/18_shop7_183931.jpg',
 'link': 'http://playnomore.co.kr/product/sold-out-winkygirl-color-blocks-5cm-rose-gold/18/?cate_n

In [42]:
# Load the items exported by the crawl (pandas is imported as `pd` in the
# notebook's import cell) and show the last two rows.
df = pd.read_csv("playnomore/playnomore.csv")
df.tail(2)

Unnamed: 0,img,link,price,title
28,http://playnomore.co.kr/web/product/small/2017...,http://playnomore.co.kr/product/sold-out-shy-f...,$ 450,[SOLD OUT] SHY FRIENDS oxford black
29,http://playnomore.co.kr/web/product/small/2017...,http://playnomore.co.kr/product/sold-out-shygi...,$ 450,[SOLD OUT] SHYGIRL oxford gold


In [None]:
#이미지 저장


In [48]:
# Image URL stored in the row labelled 0 of the loaded CSV.
link = df["img"].loc[0]

In [50]:
# Download first and open the output file only after the request succeeded, so
# a failed or hanging request never leaves an empty test.png behind.
headers = {"User-Agent": UserAgent().chrome}
response = requests.get(link, headers=headers, timeout=10)
response.raise_for_status()
# NOTE(review): the scraped image URLs end in .jpg, so the ".png" file name
# probably does not match the actual image format - confirm.
with open("test.png", "wb") as f:
    f.write(response.content)