In [1]:
# Add '!' only if you are running this command on a notebook 
## It tells Jupyter that the command should be interpreted as bash command
!pip install Scrapy



In [2]:
# Import os => Library used to easily manipulate operating systems
## More info => https://docs.python.org/3/library/os.html
import os 

# Import logging => Library used for logs manipulation 
## More info => https://docs.python.org/3/library/logging.html
import logging

# Import scrapy and scrapy.crawler 
import scrapy
from scrapy.crawler import CrawlerProcess

In [3]:
class bookingspider(scrapy.Spider):
    # Name of your spider
    name = "bookingspider"

    # Url to start your spider from 
    start_urls = [
        'https://www.booking.com/searchresults.fr.html?ss=paris',
    ]

    # Callback function that will be called when starting your spider
    # It will get text, author and tags of the first <div> with class="quote"
    def parse(self, response):
        print(response.body.decode("utf-8"))
        return {
            'text': response.xpath("/html/body/div[2]/div[2]/div[2]/div[2]/h3/a/text()").get()
        }

In [None]:
/html/body/div[2]/div[2]/div[2]/div[3]/h3/a

In [4]:
# Name of the file where the results will be saved
filename = "hotel.json"

# If file already exists, delete it before crawling (because Scrapy will 
# concatenate the last and new results otherwise)
if filename in os.listdir():
        os.remove(filename)

# Declare a new CrawlerProcess with some settings
## USER_AGENT => Simulates a browser on an OS
## LOG_LEVEL => Minimal Level of Log 
## FEEDS => Where the file will be stored 
## More info on built-in settings => https://docs.scrapy.org/en/latest/topics/settings.html?highlight=settings#settings
process = CrawlerProcess(settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        filename : {"format": "json"},
    }
})

# Start the crawling using the spider you defined above
process.crawl(bookingspider)
process.start()

2021-06-16 14:56:25 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrapybot)
2021-06-16 14:56:26 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.6 | packaged by conda-forge | (default, Oct  7 2020, 19:08:05) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Linux-5.4.89+-x86_64-with-glibc2.10
2021-06-16 14:56:26 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2021-06-16 14:56:26 [scrapy.extensions.telnet] INFO: Telnet Password: 8491136acf986155
2021-06-16 14:56:26 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2021-06-16 14:56:26 [scrapy.middleware] INFO: Enabl

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr">
<head>
<meta http-equiv="content-language" content="fr" />
<link rel="alternate" hreflang="en-gb" href="/searchresults.en-gb.html?label=gen173nr-1FCAQoggJCDHNlYXJjaF9wYXJpc0gNWARoFYgBAZgBDbgBGMgBDtgBAegBAfgBA4gCAagCBLgCmqKohgbAAgHSAiQ2NjY3NWYxYS03OWYxLTRhYzMtOTdlOS0wNjg1ODQzNTBkMDHYAgXgAgE;sid=69c9321a1f75f13e02ddd6f75a8b9cba;tmpl=searchresults;class_interval=1;dtdisc=0;inac=0;index_postcard=0;label_click=undef;offset=0;postcard=0;room1=A%2CA;sb_price_type=total;shw_aparth=1;slp_r_match=0;srpvid=45f8690d819a019e;ss=paris;ss_all=0;ssb=empty;sshis=0;top_ufis=1;sig=v1TxZ6w_BP" title="English (UK)" />
<link rel="alternate" hreflang="en-us" href="/searchresults.en-us.html?label=gen173nr-1FCAQoggJCDHNlYXJjaF9wYXJpc0gNWARoFYgBAZgBDbgBGMgBDtgBAegBAfgBA4gCAagCBLgCmqKohgbAAgHSAiQ2NjY3NWYxYS03OWYxLTRhYzMtOTdlOS0wNjg1ODQzNTB