## Web Scraping Program

Design a web scraping program that retrieves the contents of an e-commerce store selling different tablet and laptop models, using the following web scraping test sites: https://webscraper.io/test-sites/e-commerce/allinone/computers/tablets and https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops

Collect information for all the tablets and laptops listed on the webpages, including each item's product name, description, price and review information.

In [1]:
# importing the library

import lxml.etree

import json

import warnings
warnings.filterwarnings("ignore")

# Item pipeline that receives every item extracted by the spider and echoes it
# to the console as one JSON document per item.

class ConsoleWriterPipeline:
    """Scrapy item pipeline that prints each scraped item as JSON.

    The spider enables it through its ``ITEM_PIPELINES`` setting using the
    dotted path ``__main__.ConsoleWriterPipeline``, so the class name must
    stay unchanged.
    """

    def open_spider(self, spider):
        """Hook called when the spider opens; nothing to set up."""
        pass

    def close_spider(self, spider):
        """Hook called when the spider closes; nothing to release."""
        pass

    # The hook was originally misspelled, so Scrapy never found it. The old
    # name is kept as an alias in case anything calls it explicitly.
    close_spdier = close_spider

    def process_item(self, item, spider):
        """Print ``item`` as a JSON line and hand it on unchanged.

        Args:
            item: Mapping-like scraped item; it is converted with ``dict()``
                before serialisation.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines still receive it.
        """
        # The trailing "\n" plus print()'s own newline leaves a blank line
        # between consecutive items in the console output.
        line = json.dumps(dict(item)) + "\n"
        print(line)
        return item

In [4]:
# importing the library

import logging
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor

# Define the spider, TabletsCrawlSpider(CrawlSpider).

# Define the start Url

# 'https://webscraper.io/test-sites/e-commerce/allinone/computers/tablets'

# Set the rule for the parsing in the URL

# Rule(LinkExtractor(allow=('/test-sites/e-commerce/allinone/product/')), 'parse_tablets_page')


class TabletsCrawlSpider(CrawlSpider):
    """Crawl the tablets listing page and scrape every linked product page.

    Links whose URL matches the product path are followed and handed to
    ``parse_tablets_page``, which yields one dict per product.
    """

    name = 'tablets-crawlspider'
    allowed_domains = ['webscraper.io']
    start_urls = ['https://webscraper.io/test-sites/e-commerce/allinone/computers/tablets']

    # Log warnings and above only, and route scraped items through the
    # pipeline class referenced by its dotted path in __main__.
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1},
    }

    rules = [
        Rule(
            LinkExtractor(allow=('/test-sites/e-commerce/allinone/product/')),
            callback='parse_tablets_page',
        ),
    ]

    def parse_tablets_page(self, response):
        """Yield a dict with description, price, product and review fields.

        NOTE: the item carries no 'type' field; an earlier attempt to read it
        from the active subcategory link was left disabled.
        """
        # The first <h4> text is used as the price and the second as the
        # product name, so a page with fewer than two raises IndexError.
        headings = response.css('h4::text')
        # Keeping only alphanumeric runs drops punctuation, so a name such
        # as "10.1" comes out as "10 1".
        name_tokens = headings[1].re('[a-zA-Z0-9]+')
        rating_tokens = response.css('.ratings ::text').re('[a-zA-Z0-9]+')

        yield {
            'description': response.css('p.description::text').get(),
            'price': headings.get(),
            'product': ' '.join(name_tokens),
            'review': ' '.join(rating_tokens),
        }

In [5]:
# importing the library

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.signalmanager import dispatcher

# create a process which will start the crawler for tablets. 

def spider_tablet_results():
    """Run TabletsCrawlSpider to completion and return the scraped items.

    Returns:
        list: every item reported through the ``item_scraped`` signal, in the
        order the signal fired.
    """
    collected = []

    # Parameter names are left untouched: the dispatcher presumably matches
    # handler arguments by name -- verify before renaming any of them.
    def crawler_results(signal, sender, item, response, spider):
        collected.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    # USER_AGENT is the User-Agent string the crawler sends with its requests.
    process_settings = {
        'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }
    tablet_process = CrawlerProcess(process_settings)
    tablet_process.crawl(TabletsCrawlSpider)

    # Blocks here until the crawl has finished.
    tablet_process.start()

    return collected

# Run the tablet crawl, serialise the scraped items into an indented JSON
# string and print it. json_tablet is kept at module level because a later
# cell loads it into pandas with pd.read_json.

if __name__ == '__main__':
    json_tablet = json.dumps(spider_tablet_results(), indent=4)
    print(json_tablet)

2022-06-25 05:03:06 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-06-25 05:03:06 [scrapy.utils.log] INFO: Versions: lxml 4.2.6.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.4.0, Python 3.7.13 (default, Apr 24 2022, 01:04:09) - [GCC 7.5.0], pyOpenSSL 22.0.0 (OpenSSL 3.0.3 3 May 2022), cryptography 37.0.2, Platform Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic
2022-06-25 05:03:06 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


{"description": "7\" screen, Android", "price": "$69.99", "product": "Lenovo IdeaTab", "review": "7 reviews"}

{"description": "16GB, White", "price": "$251.99", "product": "Galaxy Tab", "review": "14 reviews"}

{"description": "Wi-Fi + Cellular, 32GB, Silver", "price": "$537.99", "product": "iPad Mini Retina", "review": "8 reviews"}

{"description": "12.2\", 32GB, WiFi, Android 4.4, White", "price": "$489.99", "product": "Galaxy Note", "review": "9 reviews"}

{"description": "10.1\", 32GB, Black", "price": "$587.99", "product": "Galaxy Note 10 1", "review": "6 reviews"}

{"description": "White, 10.1\" IPS, 1.6GHz, 2GB, 16GB, Android 4.2", "price": "$320.99", "product": "MeMo PAD FHD 10", "review": "7 reviews"}

{"description": "Wi-Fi, 64GB, Silver", "price": "$603.99", "product": "Apple iPad Air", "review": "7 reviews"}

{"description": "10.1\", 3G, Android 4.0, Garnet Red", "price": "$399.99", "product": "Galaxy Note", "review": "12 reviews"}

{"description": "LTE (SM-T235), Quad-Cor

### b) Search tablets’ information based on the review.

### Function name: SearchbyReview( int review); Argument review: int

### Return result: a list of all matching items with (type, product, description, price, reviews) whose review count is greater than or equal to the function argument review. The list must be sorted by review count in descending order.

In [6]:
# importing the libraries

import io

import pandas as pd

# Convert the JSON output into a dataframe and assign it to df_tablet.
# The JSON text is wrapped in io.StringIO because passing a literal JSON
# string to pd.read_json is deprecated in recent pandas releases, whereas a
# file-like object is accepted by old and new versions alike.

df_tablet = pd.read_json(io.StringIO(json_tablet))
df_tablet

Unnamed: 0,description,price,product,review
0,"7"" screen, Android",$69.99,Lenovo IdeaTab,7 reviews
1,"16GB, White",$251.99,Galaxy Tab,14 reviews
2,"Wi-Fi + Cellular, 32GB, Silver",$537.99,iPad Mini Retina,8 reviews
3,"12.2"", 32GB, WiFi, Android 4.4, White",$489.99,Galaxy Note,9 reviews
4,"10.1"", 32GB, Black",$587.99,Galaxy Note 10 1,6 reviews
5,"White, 10.1"" IPS, 1.6GHz, 2GB, 16GB, Android 4.2",$320.99,MeMo PAD FHD 10,7 reviews
6,"Wi-Fi, 64GB, Silver",$603.99,Apple iPad Air,7 reviews
7,"10.1"", 3G, Android 4.0, Garnet Red",$399.99,Galaxy Note,12 reviews
8,"LTE (SM-T235), Quad-Core 1.2GHz, 8GB, Black",$233.99,Galaxy Tab 4,1 reviews
9,"Silver, 7"" IPS, Quad-Core 1.2Ghz, 16GB, 3G, An...",$172.99,IdeaTab S5000,8 reviews


In [7]:
# Add a "num_reviews" column holding the numeric part of "review":
# the literal word "reviews" is blanked out of each label (e.g. "14 reviews")
# and what remains is cast to int.

df_tablet['num_reviews'] = (
    df_tablet['review']
    .map(lambda label: label.replace("reviews", " "))
    .astype('int')
)
df_tablet

Unnamed: 0,description,price,product,review,num_reviews
0,"7"" screen, Android",$69.99,Lenovo IdeaTab,7 reviews,7
1,"16GB, White",$251.99,Galaxy Tab,14 reviews,14
2,"Wi-Fi + Cellular, 32GB, Silver",$537.99,iPad Mini Retina,8 reviews,8
3,"12.2"", 32GB, WiFi, Android 4.4, White",$489.99,Galaxy Note,9 reviews,9
4,"10.1"", 32GB, Black",$587.99,Galaxy Note 10 1,6 reviews,6
5,"White, 10.1"" IPS, 1.6GHz, 2GB, 16GB, Android 4.2",$320.99,MeMo PAD FHD 10,7 reviews,7
6,"Wi-Fi, 64GB, Silver",$603.99,Apple iPad Air,7 reviews,7
7,"10.1"", 3G, Android 4.0, Garnet Red",$399.99,Galaxy Note,12 reviews,12
8,"LTE (SM-T235), Quad-Core 1.2GHz, 8GB, Black",$233.99,Galaxy Tab 4,1 reviews,1
9,"Silver, 7"" IPS, Quad-Core 1.2Ghz, 16GB, 3G, An...",$172.99,IdeaTab S5000,8 reviews,8


In [8]:
# SearchbyReview(int_review) returns the tablets whose number of reviews is
# greater than or equal to int_review, sorted by review count in descending
# order.

def SearchbyReview(int_review):
    """Return the rows of df_tablet with at least ``int_review`` reviews.

    Rows are ordered by ``num_reviews`` from highest to lowest. The helper
    ``num_reviews`` column is dropped, so each row is returned as a plain
    list of the remaining column values.
    """
    meets_threshold = df_tablet['num_reviews'] >= int_review
    ranked = df_tablet.loc[meets_threshold].sort_values(by='num_reviews', ascending=False)
    return ranked.drop(columns='num_reviews').values.tolist()

### Run the function with review=8 and review=14. Print the results for each review value.

In [9]:
# Tablets with at least 8 reviews, most-reviewed first
SearchbyReview(8)

[['16GB, White', '$251.99', 'Galaxy Tab', '14 reviews'],
 ['7" screen, Android, 8GB', '$102.99', 'Asus MeMO Pad', '14 reviews'],
 ['7", 8GB, Wi-Fi, Android 4.2, Yellow',
  '$107.99',
  'Galaxy Tab 3',
  '14 reviews'],
 ['Blue, 8" IPS, Quad-Core 1.3GHz, 16GB, Android 4.2',
  '$121.99',
  'IdeaTab A8 50',
  '13 reviews'],
 ['10.1", 3G, Android 4.0, Garnet Red',
  '$399.99',
  'Galaxy Note',
  '12 reviews'],
 ['White, 7", Atom 1.2GHz, 8GB, Android 4.4',
  '$130.99',
  'MeMO Pad 7',
  '11 reviews'],
 ['IPS, Dual-Core 1.2GHz, 8GB, Android 4.3',
  '$101.99',
  'Memo Pad HD 7',
  '10 reviews'],
 ['12.2", 32GB, WiFi, Android 4.4, White',
  '$489.99',
  'Galaxy Note',
  '9 reviews'],
 ['Blue, 7" IPS, Quad-Core 1.3GHz, 8GB, 3G, Android 4.2',
  '$148.99',
  'IdeaTab A3500 H',
  '9 reviews'],
 ['Wi-Fi + Cellular, 32GB, Silver',
  '$537.99',
  'iPad Mini Retina',
  '8 reviews'],
 ['Silver, 7" IPS, Quad-Core 1.2Ghz, 16GB, 3G, Android 4.2',
  '$172.99',
  'IdeaTab S5000',
  '8 reviews']]

In [10]:
# Tablets with at least 14 reviews, most-reviewed first
SearchbyReview(14)

[['16GB, White', '$251.99', 'Galaxy Tab', '14 reviews'],
 ['7" screen, Android, 8GB', '$102.99', 'Asus MeMO Pad', '14 reviews'],
 ['7", 8GB, Wi-Fi, Android 4.2, Yellow',
  '$107.99',
  'Galaxy Tab 3',
  '14 reviews']]

## Repeat the same for laptop scraping

In [4]:
import lxml.etree

import json

# Item pipeline for the laptop crawl. Unlike the tablet version it prints
# nothing: every item is passed straight through, and the results are printed
# once, as a single JSON document, after the crawl.

class ConsoleWriterPipeline:
    """Pass-through Scrapy item pipeline used by the laptop spider.

    The spider enables it through its ``ITEM_PIPELINES`` setting using the
    dotted path ``__main__.ConsoleWriterPipeline``, so the class name must
    stay unchanged even though nothing is written to the console here.
    """

    def open_spider(self, spider):
        """Hook called when the spider opens; nothing to set up."""
        pass

    def close_spider(self, spider):
        """Hook called when the spider closes; nothing to release."""
        pass

    # The hook was originally misspelled, so Scrapy never found it. The old
    # name is kept as an alias in case anything calls it explicitly.
    close_spdier = close_spider

    def process_item(self, item, spider):
        """Return ``item`` unchanged so later stages still receive it.

        The earlier JSON serialisation was removed because its result was
        never used.
        """
        return item

In [5]:
# importing the library

import logging
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor

# Define the spider, LaptopsCrawlSpider(CrawlSpider).

# Define the start Url

# 'https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops'

# Set the rule for the parsing in the URL

# Rule(LinkExtractor(allow=('/test-sites/e-commerce/allinone/product/')), 'parse_laptops_page')

class LaptopsCrawlSpider(CrawlSpider):
    """Crawl the laptops listing page and scrape every linked product page.

    Links whose URL matches the product path are followed and handed to
    ``parse_laptops_page``, which yields one dict per product.
    """

    name = 'Laptops-crawlspider'
    allowed_domains = ['webscraper.io']
    start_urls = ['https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops']

    # Log warnings and above only, and route scraped items through the
    # pipeline class referenced by its dotted path in __main__.
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1},
    }

    rules = [
        Rule(
            LinkExtractor(allow=('/test-sites/e-commerce/allinone/product/')),
            callback='parse_laptops_page',
        ),
    ]

    def parse_laptops_page(self, response):
        """Yield a dict with description, price, product and review fields.

        NOTE: the item carries no 'type' field; an earlier attempt to read it
        from the active subcategory link was left disabled.
        """
        # The first <h4> text is used as the price and the second as the
        # product name, so a page with fewer than two raises IndexError.
        h4_texts = response.css('h4::text')
        # Keeping only alphanumeric runs drops punctuation, so a name such
        # as "A315-51" comes out as "A315 51".
        title_parts = h4_texts[1].re('[a-zA-Z0-9]+')
        rating_parts = response.css('.ratings ::text').re('[a-zA-Z0-9]+')

        yield {
            'description': response.css('p.description::text').get(),
            'price': h4_texts.get(),
            'product': ' '.join(title_parts),
            'review': ' '.join(rating_parts),
        }

In [6]:
# importing the library

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.signalmanager import dispatcher
import sys

# create a process which will start the crawler for laptops. 

def spider_laptops_results():
    """Run LaptopsCrawlSpider to completion and return the scraped items.

    Returns:
        list: every item reported through the ``item_scraped`` signal, in the
        order the signal fired.
    """
    scraped_items = []

    # Parameter names are left untouched: the dispatcher presumably matches
    # handler arguments by name -- verify before renaming any of them.
    def crawler_results(signal, sender, item, response, spider):
        scraped_items.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    # USER_AGENT is the User-Agent string the crawler sends with its requests.
    crawl_settings = {
        'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }
    laptop_process = CrawlerProcess(crawl_settings)
    laptop_process.crawl(LaptopsCrawlSpider)

    # Blocks here until the crawl has finished.
    laptop_process.start()

    return scraped_items

# Run the laptop crawl, serialise the scraped items into an indented JSON
# string and print it. json_laptop is kept at module level because a later
# cell loads it into pandas with pd.read_json.

if __name__ == '__main__':
    json_laptop = json.dumps(spider_laptops_results(), indent=4)
    print(json_laptop)

2022-06-25 05:05:29 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-06-25 05:05:29 [scrapy.utils.log] INFO: Versions: lxml 4.2.6.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.4.0, Python 3.7.13 (default, Apr 24 2022, 01:04:09) - [GCC 7.5.0], pyOpenSSL 22.0.0 (OpenSSL 3.0.3 3 May 2022), cryptography 37.0.2, Platform Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic
2022-06-25 05:05:29 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


[
    {
        "description": "Asus VivoBook X441NA-GA190 Chocolate Black, 14\", Celeron N3450, 4GB, 128GB SSD, Endless OS, ENG kbd",
        "price": "$295.99",
        "product": "Asus VivoBook X441NA GA190",
        "review": "14 reviews"
    },
    {
        "description": "Acer Aspire A315-51-33TG, Black 15.6\" HD, Core i3-7100U, 4GB DDR4, 128GB SSD, Windows 10 Home, ENG",
        "price": "$457.38",
        "product": "Acer Aspire A315 51 33TG",
        "review": "9 reviews"
    },
    {
        "description": "Prestigio SmartBook 133S Dark Grey, 13.3\" FHD IPS, Celeron N3350 1.1GHz, 4GB, 32GB, Windows 10 Pro + Office 365 1 gadam",
        "price": "$299.00",
        "product": "Prestigio SmartBook 133S Dark Grey",
        "review": "8 reviews"
    },
    {
        "description": "15.6\", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",
        "price": "$306.99",
        "product": "Aspire E1 510",
        "review": "2 reviews"
    },
    {
        "description": "Acer Aspire 3 A315-

In [7]:
# importing the libraries

import io

import pandas as pd

# Convert the JSON output into a dataframe and assign it to df_laptop.
# The JSON text is wrapped in io.StringIO because passing a literal JSON
# string to pd.read_json is deprecated in recent pandas releases, whereas a
# file-like object is accepted by old and new versions alike.

df_laptop = pd.read_json(io.StringIO(json_laptop))
df_laptop

Unnamed: 0,description,price,product,review
0,"Asus VivoBook X441NA-GA190 Chocolate Black, 14...",$295.99,Asus VivoBook X441NA GA190,14 reviews
1,"Acer Aspire A315-51-33TG, Black 15.6"" HD, Core...",$457.38,Acer Aspire A315 51 33TG,9 reviews
2,"Prestigio SmartBook 133S Dark Grey, 13.3"" FHD ...",$299.00,Prestigio SmartBook 133S Dark Grey,8 reviews
3,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",$306.99,Aspire E1 510,2 reviews
4,"Acer Aspire 3 A315-31 Black, 15.6"" HD, Celeron...",$372.70,Acer Aspire 3 A315 31 Black,2 reviews
...,...,...,...,...
112,Asus VivoBook Max X541NA-GQ041 Black Chocolate...,$399.00,Asus VivoBook Max,4 reviews
113,"Acer Aspire 3 A315-21, 15.6"", AMD A4-9120. 4GB...",$393.88,Acer Aspire 3 A315 21,9 reviews
114,"Acer Aspire ES1-572 Black, 15.6"" HD, Core i3-6...",$379.95,Acer Aspire ES1 572 Black,9 reviews
115,"Acer Aspire A315-31-C33J Black 15.6"", HD, Cele...",$379.94,Acer Aspire A315 31 C33J,0 reviews


In [8]:
# Add a "num_reviews" column holding the numeric part of "review":
# the literal word "reviews" is blanked out of each label (e.g. "14 reviews")
# and what remains is cast to int.

df_laptop['num_reviews'] = (
    df_laptop['review']
    .map(lambda label: label.replace("reviews", " "))
    .astype('int')
)
df_laptop

Unnamed: 0,description,price,product,review,num_reviews
0,"Asus VivoBook X441NA-GA190 Chocolate Black, 14...",$295.99,Asus VivoBook X441NA GA190,14 reviews,14
1,"Acer Aspire A315-51-33TG, Black 15.6"" HD, Core...",$457.38,Acer Aspire A315 51 33TG,9 reviews,9
2,"Prestigio SmartBook 133S Dark Grey, 13.3"" FHD ...",$299.00,Prestigio SmartBook 133S Dark Grey,8 reviews,8
3,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",$306.99,Aspire E1 510,2 reviews,2
4,"Acer Aspire 3 A315-31 Black, 15.6"" HD, Celeron...",$372.70,Acer Aspire 3 A315 31 Black,2 reviews,2
...,...,...,...,...,...
112,Asus VivoBook Max X541NA-GQ041 Black Chocolate...,$399.00,Asus VivoBook Max,4 reviews,4
113,"Acer Aspire 3 A315-21, 15.6"", AMD A4-9120. 4GB...",$393.88,Acer Aspire 3 A315 21,9 reviews,9
114,"Acer Aspire ES1-572 Black, 15.6"" HD, Core i3-6...",$379.95,Acer Aspire ES1 572 Black,9 reviews,9
115,"Acer Aspire A315-31-C33J Black 15.6"", HD, Cele...",$379.94,Acer Aspire A315 31 C33J,0 reviews,0


In [9]:
# SearchbyReview(int_review) returns the laptops whose number of reviews is
# greater than or equal to int_review, sorted by review count in descending
# order.

def SearchbyReview(int_review):
    """Return the rows of df_laptop with at least ``int_review`` reviews.

    Rows are ordered by ``num_reviews`` from highest to lowest. The helper
    ``num_reviews`` column is dropped, so each row is returned as a plain
    list of the remaining column values.
    """
    enough_reviews = df_laptop['num_reviews'] >= int_review
    ordered = df_laptop.loc[enough_reviews].sort_values(by='num_reviews', ascending=False)
    return ordered.drop(columns='num_reviews').values.tolist()

In [10]:
# Laptops with at least 8 reviews, most-reviewed first
SearchbyReview(8)

[['Asus VivoBook X441NA-GA190 Chocolate Black, 14", Celeron N3450, 4GB, 128GB SSD, Endless OS, ENG kbd',
  '$295.99',
  'Asus VivoBook X441NA GA190',
  '14 reviews'],
 ['Acer Aspire ES1-732 Black, 17.3" HD+, Celeron, N3350, 4GB, 1TB, Windows 10 Home',
  '$410.46',
  'Acer Aspire ES1 732 Black',
  '14 reviews'],
 ['Dell Vostro 15 (3568) Black, 15.6" FHD, Core i5-7200U, 4GB, 128GB SSD, Radeon R5 M420 2GB, Linux',
  '$488.78',
  'Dell Vostro 15',
  '14 reviews'],
 ['Lenovo ThinkPad L460, 14" FHD IPS, Core i7-6600U, 8GB, 256GB SSD, Windows 10 Pro',
  '$1096.02',
  'Lenovo ThinkPad L460',
  '14 reviews'],
 ['Dell Latitude 5480, 14" FHD, Core i5-7300U, 8GB, 500GB, Windows 10 Pro',
  '$1133.82',
  'Dell Latitude 5480',
  '14 reviews'],
 ['Acer Nitro 5 AN515-51, 15.6" FHD IPS, Core i7-7700HQ, 8GB, 256GB SSD +1TB, GeForce GTX 1050 Ti 4GB, Windows 10 Home + Windows 10 Home',
  '$1140.62',
  'Acer Nitro 5 AN515 51',
  '14 reviews'],
 ['Acer Predator Helios 300 (PH317-51), 17.3" FHD IPS, Core i7-7

In [11]:
# Laptops with at least 14 reviews, most-reviewed first
SearchbyReview(14)

[['Asus VivoBook X441NA-GA190 Chocolate Black, 14", Celeron N3450, 4GB, 128GB SSD, Endless OS, ENG kbd',
  '$295.99',
  'Asus VivoBook X441NA GA190',
  '14 reviews'],
 ['Asus ZenBook UX530UX-FY040T Blue, 15.6" FHD, Core i7-7500U, 8GB, 512GB SSD, GeForce GTX950M 2GB, Windows 10 Home, Eng kbd',
  '$1239.20',
  'Asus ZenBook UX530UX FY040T Blue',
  '14 reviews'],
 ['Acer Predator Helios 300 (PH317-51), 17.3" FHD IPS, Core i7-7700HQ. 8GB, 128GB SSD +1TB, GeForce GTX 1050Ti 4GB, Linux + Windows 10 Home',
  '$1187.98',
  'Acer Predator Helios 300 PH317 51',
  '14 reviews'],
 ['Acer Nitro 5 AN515-51, 15.6" FHD IPS, Core i7-7700HQ, 8GB, 256GB SSD +1TB, GeForce GTX 1050 Ti 4GB, Windows 10 Home + Windows 10 Home',
  '$1140.62',
  'Acer Nitro 5 AN515 51',
  '14 reviews'],
 ['Dell Latitude 5480, 14" FHD, Core i5-7300U, 8GB, 500GB, Windows 10 Pro',
  '$1133.82',
  'Dell Latitude 5480',
  '14 reviews'],
 ['Lenovo ThinkPad L460, 14" FHD IPS, Core i7-6600U, 8GB, 256GB SSD, Windows 10 Pro',
  '$1096.02

## End of Notebook