These are the two packages we need to install before we can do web scraping:

In [83]:
# pip install bs4
# pip install requests

Beautiful Soup is a Python library used for web scraping purposes. It helps you parse HTML and XML documents, navigate through their elements, and extract data based on tags, attributes, and text content.


requests is a Python library that simplifies sending HTTP requests to web servers and handling the responses. It supports various HTTP methods (GET, POST, etc.) and handles common web scenarios (like redirects and cookies) automatically.


In [84]:
# Importing required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

HTTP headers are essential components of the communication between web browsers (or any HTTP client) and web servers: they carry metadata about each request and response.

In [85]:
# Amazon India search-results page for the query "Laptops"; the string is
# split per query parameter purely for readability.
URL = (
    'https://www.amazon.in/s'
    '?k=Laptops'
    '&crid=M5W2AEQVVS42'
    '&sprefix=laptops%2Caps%2C223'
    '&ref=nb_sb_noss_1'
)

In [86]:
# Request headers sent with every call to Amazon.
# The header name must be spelled 'User-Agent' (hyphen): with an underscore it
# is just an unknown extra header and requests still sends its default
# "python-requests/x.y" agent string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
    'Accept-Language': 'en-US,en;q=0.5',
}

A User-Agent is a string of text that identifies the software (and its version) making an HTTP request, such as a web browser or a script using the requests library in Python. It helps servers understand how to format content and respond appropriately to the requesting software.

In [88]:
# Fetch the search-results page. The timeout (seconds) stops the cell from
# hanging forever if the server never answers; requests has no default timeout.
webpage = requests.get(URL, headers=HEADERS, timeout=15)
webpage

<Response [200]>

In [89]:
# Raw response body, returned as bytes (see the b'...' output below).
webpage.content

b'<!doctype html><html lang="en-in" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n<!-- sp:feature:csm:head-open-part1 -->\n\n<!-- sp:end-feature:csm:head-open-part1 -->\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com" crossorigin>\n<link rel="preconnect" href="https://images-eu.ssl-images-amazon.com" crossorigin>\n<link rel="dns-prefetch" href="https://m.media-amazon.com" crossorigin>\n<link rel="preconnect" href="https://m.media-amazon.com" crossorigin>\n<link rel="dns-prefetch" href="https://completion.amazon.com" crossorigin>\n<link rel="preconnect" href="https://completion.amazon.com" crossorigin>\n<!-- sp:end-feature:cs-optimization -->\n<!-- sp:feature:csm:head-open-part2 -->\n\n<!-- sp:end-feature:csm:head-open-p

In [91]:
# Confirm the body is a plain bytes object, not yet a parsed document.
type(webpage.content)

bytes

The response body is raw bytes, so next we parse it into a BeautifulSoup object that lets us navigate and search the HTML.

In [92]:
# Parse the raw response bytes with the 'html.parser' backend into a searchable tree.
soup = BeautifulSoup(webpage.content, 'html.parser')

In [94]:
# Collect every <a> tag on the search-results page whose class attribute is
# this exact string; in the output below these anchors wrap the product titles
# and carry the relative product-page href.
Links = soup.find_all(
    'a',
    class_='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal',
)
Links

[<a class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal" href="/HP-i5-1235U-15-6-inch-graphics-speakers/dp/B0CTKHTNWL" target="_blank"><span class="a-size-medium a-color-base a-text-normal">HP Laptop 15s, 12th Gen Intel Core i5-1235U, 15.6-inch (39.6 cm), FHD, 16GB DDR4, 512GB SSD, Intel Iris Xe graphics, Backlit KB,MSO,Thin &amp; Light, Dual speakers (Win 11, Silver, 1.69 kg), fq5330TU</span> </a>,
 <a class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal" href="/HP-15-6inch-Micro-Edge-Anti-Glare-15s-Eq2143au/dp/B09R1MMMTH" target="_blank"><span class="a-size-medium a-color-base a-text-normal">HP Laptop 15s, AMD Ryzen 3 5300U, 15.6-inch (39.6 cm), FHD, 8GB DDR4, 512GB SSD, AMD Radeon graphics, Thin &amp; light, Dual speakers (Win 11, MSO 2019, Silver, 1.69 kg), eq2143AU</span> </a>,
 <a class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal" href="/Chuwi-HeroBook-Pro-Celeron-Mini-HDMI/d

In [96]:
# Take the href of the first matching anchor; it is a site-relative path
# (e.g. "/HP-.../dp/B0CTKHTNWL"), not a full URL.
link = Links[0].get('href')

In [101]:
# Prefix the relative href with the Amazon India origin to get an absolute
# product-page URL. Note: despite its name, product_list holds a single URL
# string, not a list.
product_list = 'https://www.amazon.in' + link
product_list

'https://www.amazon.in/HP-i5-1235U-15-6-inch-graphics-speakers/dp/B0CTKHTNWL'

In [102]:
# Request the product detail page with the same headers. The timeout (seconds)
# keeps the cell from blocking indefinitely; requests has no default timeout.
new_webpage = requests.get(product_list, headers=HEADERS, timeout=15)

In [103]:
# Display the response object to check the HTTP status code.
new_webpage

<Response [200]>

In [104]:
# Parse the product page into its own soup so individual elements can be
# looked up in the cells below. The bare `new_soup` displays the whole parsed
# document, which is long.
new_soup = BeautifulSoup(new_webpage.content, 'html.parser')
new_soup

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-in"><!-- sp:feature:head-start -->
<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>
<!-- sp:end-feature:head-start -->
<!-- sp:feature:csm:head-open-part1 -->
<!-- sp:end-feature:csm:head-open-part1 -->
<!-- sp:feature:cs-optimization -->
<meta content="on" http-equiv="x-dns-prefetch-control"/>
<link crossorigin="" href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://images-eu.ssl-images-amazon.com" rel="preconnect"/>
<link crossorigin="" href="https://m.media-amazon.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://m.media-amazon.com" rel="preconnect"/>
<link crossorigin="" href="https://completion.amazon.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://completion.amazon.com" rel="preconnect"/>
<!-- sp:end-feature:cs-optimization -->
<!-- sp:feature:csm:head-open-part2 -->
<!-- sp:end-feature:csm:head-open-

In [105]:
# Locate the product title: the first <span> whose id is "productTitle".
new_soup.find('span', attrs = {'id': "productTitle"})

<span class="a-size-large product-title-word-break" id="productTitle">        HP Laptop 15s, 12th Gen Intel Core i5-1235U, 15.6-inch (39.6 cm), FHD, 16GB DDR4, 512GB SSD, Intel Iris Xe graphics, Backlit KB,MSO,Thin &amp; Light, Dual speakers (Win 11, Silver, 1.69 kg), fq5330TU       </span>

In [107]:
# .text returns only the element's text content, without the <span> markup.
new_soup.find('span', attrs = {'id': "productTitle"}).text

'        HP Laptop 15s, 12th Gen Intel Core i5-1235U, 15.6-inch (39.6 cm), FHD, 16GB DDR4, 512GB SSD, Intel Iris Xe graphics, Backlit KB,MSO,Thin & Light, Dual speakers (Win 11, Silver, 1.69 kg), fq5330TU       '

In [110]:
# The raw text is padded with whitespace on both sides; strip() removes it.
new_soup.find('span', attrs = {'id': "productTitle"}).text.strip()

'HP Laptop 15s, 12th Gen Intel Core i5-1235U, 15.6-inch (39.6 cm), FHD, 16GB DDR4, 512GB SSD, Intel Iris Xe graphics, Backlit KB,MSO,Thin & Light, Dual speakers (Win 11, Silver, 1.69 kg), fq5330TU'

Here you can see that I scraped the title, and that the title contains a lot of information, such as the company name, CPU/processor, GPU, resolution, RAM, and storage (ROM).

In [116]:
# Scrape the laptop's price: text of the first <span class="a-price-whole">.
# The value keeps its thousands separator and a trailing "." (see output).
new_soup.find('span', attrs = {'class': "a-price-whole"}).text

'52,490.'

In [126]:
# Scrape the laptop's company name: on this page the first
# <span class="a-size-base po-break-word"> holds the brand.
# NOTE(review): the class name is not brand-specific — presumably other
# overview fields share it; confirm the first match is always the brand.
new_soup.find('span', attrs = {'class': "a-size-base po-break-word"}).text

'HP'

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Function to extract Product Title
def get_title(soup):
    """Return the product title found in a parsed product page.

    Looks up the ``<span id="productTitle">`` element and returns its text
    with surrounding whitespace removed.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: The stripped title, or ``"Not Available"`` when the element is
        missing or the lookup raises ``AttributeError`` (e.g. ``soup`` is None).
    """
    fallback = "Not Available"
    try:
        node = soup.find("span", attrs={"id": 'productTitle'})
        if not node:
            return fallback
        return node.text.strip()
    except AttributeError:
        return fallback

# Function to extract Product Price
def get_price(soup):
    """Return the product price found in a parsed product page.

    Several page layouts are tried in order and the first one that yields
    non-empty text wins:

    1. ``<span id="priceblock_ourprice">``  (legacy regular price)
    2. ``<span id="priceblock_dealprice">`` (legacy deal price)
    3. ``<span class="a-price-whole">``     (layout seen when exploring the
       product page earlier in this notebook)

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: The price text with surrounding whitespace removed, or
        ``"Not Available"`` when no selector matches or ``soup`` is unusable.
    """
    selectors = (
        {'id': 'priceblock_ourprice'},
        {'id': 'priceblock_dealprice'},
        {'class': 'a-price-whole'},
    )
    for attrs in selectors:
        try:
            tag = soup.find("span", attrs=attrs)
            # .text (unlike .string) also works when the span has nested children.
            text = tag.text.strip() if tag is not None else ""
        except AttributeError:
            # soup is None / not a parsed document, or the match has no text.
            continue
        if 'class' in attrs:
            # "a-price-whole" renders like "52,490." because the decimal
            # separator sits in a nested span; drop the dangling dot.
            text = text.rstrip('.')
        if text:
            return text
    return "Not Available"

if __name__ == '__main__':
    # Browser-like headers; replace the User-Agent with your own if needed.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0', 'Accept-Language': 'en-US, en;q=0.5'}

    # Search-results page to scrape
    URL = "https://www.amazon.in/s?k=Laptops&crid=2C3PN1BUCAA5W&sprefix=laptops%2Caps%2C294&ref=nb_sb_noss_1"

    # Seconds to wait for each response; requests has no default timeout and
    # would otherwise block forever on a stalled connection.
    REQUEST_TIMEOUT = 15

    # Fetch the search page and fail loudly on an error status (e.g. a 503
    # bot-block) instead of silently scraping an error page.
    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    webpage.raise_for_status()

    # Soup object containing the whole search page
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Product anchors as a list of Tag objects
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

    # Keep only anchors that actually carry an href; a missing href would be
    # None and break the URL concatenation below.
    links_list = [href for href in (link.get('href') for link in links) if href]

    # Visit each product page and collect its title and price
    product_details = []
    for link in links_list:
        try:
            new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            new_webpage.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Best effort: report the failed product and move on to the next.
            print(f"Error fetching {link}: {e}")
            continue
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")
        product_details.append({'Title': get_title(new_soup), 'Price': get_price(new_soup)})

    # Explicit columns keep the frame well-formed even when nothing was scraped
    # (an empty frame without a 'Title' column would make the filter fail).
    amazon_df = pd.DataFrame(product_details, columns=['Title', 'Price'])

    # Drop rows with a missing title. get_title reports a missing title as the
    # string "Not Available" (never NaN), so filter on that sentinel as well.
    has_title = amazon_df['Title'].notna() & (amazon_df['Title'] != "Not Available")
    amazon_df = amazon_df[has_title].reset_index(drop=True)

    # Save DataFrame to CSV
    amazon_df.to_csv("amazon_title_price.csv", index=False)

    # Display DataFrame
    print(amazon_df)
