# **Link of the colab notebook :**
https://colab.research.google.com/drive/1MH6rWWxrxbIPvwpF8zwcnZy53DTEi_oB?usp=sharing

# **Link of the website**
https://books.toscrape.com/

In [11]:
# Importing required libraries
import requests as rq
from bs4 import BeautifulSoup as bsoup
import pandas as pd_frame

In [12]:
# Fetching webpage content
web_response = rq.get("https://books.toscrape.com/")
web_response
web_response.text



In [13]:
# Parsing HTML content
soup_obj = bsoup(web_response.text, 'html.parser')
print(soup_obj.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

In [14]:
# Finding all book elements
book_items = soup_obj.find_all(class_='product_pod')
book_items

[<article class="product_pod">
 <div class="image_container">
 <a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">Â£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>,
 <article class="product_pod">
 <div class="image_container">
 <a href="catalogue/tipping-the-velvet_999/index.html"><img alt="Tipping the Velvet" class="th

In [15]:
# Iterating through book items and extracting details
for item in book_items:
    book_title = item.find('h3').find('a')['title']
    book_price = item.find('p', class_='price_color').text.strip()
    stock_status = item.find('p', class_='instock availability').text.strip()

    rating_elem = item.find('p', class_='star-rating')
    book_rating = rating_elem['class'][1]

    print(f"Title: {book_title}")
    print(f"Price: {book_price}")
    print(f"Availability: {stock_status}")
    print(f"Rating: {book_rating} stars")
    print('---')

Title: A Light in the Attic
Price: Â£51.77
Availability: In stock
Rating: Three stars
---
Title: Tipping the Velvet
Price: Â£53.74
Availability: In stock
Rating: One stars
---
Title: Soumission
Price: Â£50.10
Availability: In stock
Rating: One stars
---
Title: Sharp Objects
Price: Â£47.82
Availability: In stock
Rating: Four stars
---
Title: Sapiens: A Brief History of Humankind
Price: Â£54.23
Availability: In stock
Rating: Five stars
---
Title: The Requiem Red
Price: Â£22.65
Availability: In stock
Rating: One stars
---
Title: The Dirty Little Secrets of Getting Your Dream Job
Price: Â£33.34
Availability: In stock
Rating: Four stars
---
Title: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
Price: Â£17.93
Availability: In stock
Rating: Three stars
---
Title: The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
Price: Â£22.60
Availability: In stock
Rating: Four stars
---
Title: The Black Maria
Price: Â£52.1

In [16]:
# Importing libraries again
import requests as rqst
from bs4 import BeautifulSoup as soup_module

base_url_template = 'http://books.toscrape.com/catalogue/page-{}.html'

In [17]:
# Loop through pages
for num in range(1, 51):
    formatted_url = base_url_template.format(num)
    page_response = rqst.get(formatted_url)

    parsed_soup = soup_module(page_response.content, 'html.parser')
    book_entries = parsed_soup.find_all(class_='product_pod')

    for entry in book_entries:
        title_name = entry.find('h3').find('a')['title']
        price_tag = entry.find('p', class_='price_color').text.strip()
        stock_level = entry.find('p', class_='instock availability').text.strip()
        rating_class = entry.find('p', class_='star-rating')
        rating_value = rating_class['class'][1]

        print(f"Page {num} - Title: {title_name}")
        print(f"Price: {price_tag}")
        print(f"Availability: {stock_level}")
        print(f"Rating: {rating_value} stars")
        print('---')

    print(f"Completed scraping page {num}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Page 1 - Title: Starving Hearts (Triangular Trade Trilogy, #1)
Price: £13.99
Availability: In stock
Rating: Two stars
---
Page 1 - Title: Shakespeare's Sonnets
Price: £20.66
Availability: In stock
Rating: Four stars
---
Page 1 - Title: Set Me Free
Price: £17.46
Availability: In stock
Rating: Five stars
---
Page 1 - Title: Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
Price: £52.29
Availability: In stock
Rating: Five stars
---
Page 1 - Title: Rip it Up and Start Again
Price: £35.02
Availability: In stock
Rating: Five stars
---
Page 1 - Title: Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
Price: £57.25
Availability: In stock
Rating: Three stars
---
Page 1 - Title: Olio
Price: £23.88
Availability: In stock
Rating: One stars
---
Page 1 - Title: Mesaerion: The Best Science Fiction Stories 1800-1849
Price: £37.59
Availability: In stock
Rating: One stars
---
Page 1 - Title: Libertar

In [18]:
# Writing to CSV
import requests as req_lib
from bs4 import BeautifulSoup as parse_html
import csv as csv_lib

base_url_csv = 'http://books.toscrape.com/catalogue/page-{}.html'

In [19]:
# Open CSV file for writing
with open('scraped_books_data.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv_lib.writer(csv_file)
    csv_writer.writerow(['Title', 'Price', 'Availability', 'Rating', 'Page Number'])

    for page_id in range(1, 51):
        page_url_csv = base_url_csv.format(page_id)
        page_res = req_lib.get(page_url_csv)
        soup_page = parse_html(page_res.content, 'html.parser')

        book_elements = soup_page.find_all(class_='product_pod')

        for book_elem in book_elements:
            book_name = book_elem.find('h3').find('a')['title']
            book_cost = book_elem.find('p', class_='price_color').text.strip()
            availability_status = book_elem.find('p', class_='instock availability').text.strip()

            rating_node = book_elem.find('p', class_='star-rating')
            rating_text = rating_node['class'][1]

            csv_writer.writerow([book_name, book_cost, availability_status, rating_text, page_id])

        print(f"Finished writing data for page {page_id}")

print("Scraping complete. Data written to 'scraped_books_data.csv'.")

Finished writing data for page 1
Finished writing data for page 2
Finished writing data for page 3
Finished writing data for page 4
Finished writing data for page 5
Finished writing data for page 6
Finished writing data for page 7
Finished writing data for page 8
Finished writing data for page 9
Finished writing data for page 10
Finished writing data for page 11
Finished writing data for page 12
Finished writing data for page 13
Finished writing data for page 14
Finished writing data for page 15
Finished writing data for page 16
Finished writing data for page 17
Finished writing data for page 18
Finished writing data for page 19
Finished writing data for page 20
Finished writing data for page 21
Finished writing data for page 22
Finished writing data for page 23
Finished writing data for page 24
Finished writing data for page 25
Finished writing data for page 26
Finished writing data for page 27
Finished writing data for page 28
Finished writing data for page 29
Finished writing data f

In [20]:
# Reading and cleaning data
df_books = pd_frame.read_csv('scraped_books_data.csv')
df_books

Unnamed: 0,Title,Price,Availability,Rating,Page Number
0,A Light in the Attic,£51.77,In stock,Three,1
1,Tipping the Velvet,£53.74,In stock,One,1
2,Soumission,£50.10,In stock,One,1
3,Sharp Objects,£47.82,In stock,Four,1
4,Sapiens: A Brief History of Humankind,£54.23,In stock,Five,1
...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,£55.53,In stock,One,50
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",£57.06,In stock,Four,50
997,A Spy's Devotion (The Regency Spies of London #1),£16.97,In stock,Five,50
998,1st to Die (Women's Murder Club #1),£53.98,In stock,One,50


In [21]:
# Displaying dataframe information
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         1000 non-null   object
 1   Price         1000 non-null   object
 2   Availability  1000 non-null   object
 3   Rating        1000 non-null   object
 4   Page Number   1000 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [22]:
# Convert 'Price' column to string type
df_books['Price'] = df_books['Price'].astype(str)

In [23]:
# Function to remove currency symbol and convert to float
def convert_price(val):
    return float(val[1:])

df_books['Price'] = df_books['Price'].apply(convert_price)
df_books

Unnamed: 0,Title,Price,Availability,Rating,Page Number
0,A Light in the Attic,51.77,In stock,Three,1
1,Tipping the Velvet,53.74,In stock,One,1
2,Soumission,50.10,In stock,One,1
3,Sharp Objects,47.82,In stock,Four,1
4,Sapiens: A Brief History of Humankind,54.23,In stock,Five,1
...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,55.53,In stock,One,50
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,In stock,Four,50
997,A Spy's Devotion (The Regency Spies of London #1),16.97,In stock,Five,50
998,1st to Die (Women's Murder Club #1),53.98,In stock,One,50


In [24]:
# Convert 'Rating' from words to numbers
def map_rating(value):
    return 1 if value == 'One' else 2 if value == 'Two' else 3 if value == 'Three' else 4 if value == 'Four' else 5

df_books['Rating'] = df_books['Rating'].apply(map_rating)
df_books

Unnamed: 0,Title,Price,Availability,Rating,Page Number
0,A Light in the Attic,51.77,In stock,3,1
1,Tipping the Velvet,53.74,In stock,1,1
2,Soumission,50.10,In stock,1,1
3,Sharp Objects,47.82,In stock,4,1
4,Sapiens: A Brief History of Humankind,54.23,In stock,5,1
...,...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,55.53,In stock,1,50
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,In stock,4,50
997,A Spy's Devotion (The Regency Spies of London #1),16.97,In stock,5,50
998,1st to Die (Women's Murder Club #1),53.98,In stock,1,50


In [25]:
# Convert 'Rating' to integer type
df_books['Rating'] = df_books['Rating'].astype(int)
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         1000 non-null   object 
 1   Price         1000 non-null   float64
 2   Availability  1000 non-null   object 
 3   Rating        1000 non-null   int64  
 4   Page Number   1000 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB


In [26]:
# Save cleaned dataframe to CSV
df_books.to_csv('cleaned_books_data.csv', index=False)