# Scrapy from Jupyter Notebook

In [20]:
# scrape webpage
import scrapy
from scrapy.crawler import CrawlerRunner
# text cleaning
import re
# Reactor restart workaround: a Twisted reactor cannot be restarted once it has
# stopped, which breaks re-running a crawl from notebook cells. crochet is used
# here to manage the reactor instead (NOTE(review): per crochet's docs, setup()
# starts the reactor in a background thread -- confirm against the installed version).
from crochet import setup, wait_for
setup()  # called once, before any @wait_for-decorated function is invoked

In [21]:
class QuotesToCsv(scrapy.Spider):
    """Scrape Maynard James Keenan quotes from Wikiquote into a CSV feed.

    Each ``<li>`` that is a direct child of a top-level ``<ul>`` in the
    article body is yielded as raw HTML under the ``'quote'`` key. The
    ``ExtractFirstLine`` pipeline named in ``custom_settings`` then
    post-processes every item, and the ``FEEDS`` setting exports the
    results to ``quotes.csv`` with ``overwrite`` enabled.
    """

    name = "MJKQuotesToCsv"
    start_urls = ['https://en.wikiquote.org/wiki/Maynard_James_Keenan']
    custom_settings = {
        # The pipeline is referenced by dotted path; '__main__' is the
        # namespace of the running notebook/script where it is defined.
        'ITEM_PIPELINES': {'__main__.ExtractFirstLine': 1},
        'FEEDS': {
            'quotes.csv': {'format': 'csv', 'overwrite': True},
        },
    }

    def parse(self, response):
        """Yield one ``{'quote': <raw <li> HTML>}`` item per matched entry."""
        list_items = response.css('div.mw-parser-output > ul > li')
        for list_item in list_items:
            raw_html = list_item.extract()
            yield {'quote': raw_html}


class ExtractFirstLine(object):
    """Item pipeline that keeps only the first line of a quote, without HTML tags."""

    # Compiled once when the class is created rather than on every item.
    # The non-greedy ``.*?`` makes each ``<...>`` tag match individually.
    _HTML_TAG_RE = re.compile(r'<.*?>')

    def process_item(self, item, spider):
        """Reduce ``item['quote']`` to its first line with HTML tags removed.

        Args:
            item: mapping (or anything ``dict()`` accepts) holding the raw
                quote markup as a string under the ``'quote'`` key.
            spider: the spider that produced the item (unused).

        Returns:
            A new dict ``{'quote': <first line, tags stripped>}``. An empty
            quote yields an empty string.

        Raises:
            KeyError: if the item has no ``'quote'`` field.
        """
        lines = dict(item)["quote"].splitlines()
        # ``"".splitlines()`` returns ``[]``; guard so that an empty quote
        # does not raise IndexError and abort processing of the item.
        first_line = lines[0] if lines else ''

        return {'quote': self.__remove_html_tags__(first_line)}

    def __remove_html_tags__(self, text):
        """Return ``text`` with every ``<...>`` tag removed.

        The dunder-style name is kept for backward compatibility with
        existing callers.
        """
        return self._HTML_TAG_RE.sub('', text)


@wait_for(10)
def run_spider():
    """Start a crawl of ``QuotesToCsv`` and hand back its Deferred.

    A fresh ``CrawlerRunner`` is created on every call. The value returned
    by ``crawl`` is passed to crochet's ``wait_for(10)`` decorator
    (NOTE(review): per crochet's docs this blocks the caller until the
    Deferred fires, for at most 10 seconds -- confirm).
    """
    runner = CrawlerRunner()
    return runner.crawl(QuotesToCsv)


In [22]:
run_spider()

In [1]:
!python scrape_webpage.py

2021-07-25 03:45:07 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-07-25 03:45:07 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.5 | packaged by conda-forge | (default, Sep 24 2020, 16:20:24) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1k  25 Mar 2021), cryptography 3.1.1, Platform Windows-10-10.0.19041-SP0
2021-07-25 03:45:07 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-07-25 03:45:07 [scrapy.crawler] INFO: Overridden settings:
{}
2021-07-25 03:45:07 [scrapy.extensions.telnet] INFO: Telnet Password: 3779cc9288832a55
2021-07-25 03:45:07 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2021-07-25 03:45:07 [scrapy.middleware] INFO: Enabled downloader mid

# to-do list

- change output behaviour
    - save only current parsed data
        - settings
        - custom functions (e.g. `with open(...)`)
        - or an init call (e.g. `pipeline(...)`)
- class argument assignment
- explain what crochet is doing in @wait_for()
- loop through multiple URLs
- add more pipelines
- change dump file behaviour
