In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [None]:
# Function to extract Product Title
def get_title(soup):
  """Return the product title found in ``soup``, or " " when it is absent.

  Looks up the ``<span id="productTitle">`` element and returns its text
  with leading/trailing whitespace removed. Any AttributeError raised along
  the way (for example because ``find`` returned None) produces the
  single-space placeholder " " instead.
  """
  try:
    # Lookup, text extraction and trimming in one chain; a missing tag
    # surfaces as AttributeError on the None returned by find().
    return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
  except AttributeError:
    return " "

# Function to extract Product Price
def get_price(soup):
  """Return the price text found in ``soup``, or " " when it is absent.

  Finds the price container ``<span>`` (matched on its full class string),
  then the nested ``<span class="a-offscreen">`` inside it, and returns that
  element's text unmodified (no stripping). An AttributeError at any step
  yields the single-space placeholder " ".
  """
  try:
    price_box = soup.find("span", attrs={"class":'a-price aok-align-center reinventPricePriceToPayMargin priceToPay'})
    # price_box is None when the container is missing; the attribute access
    # below then raises AttributeError and we fall through to the placeholder.
    offscreen_label = price_box.find("span", attrs={"class": "a-offscreen"})
    amount = offscreen_label.text
  except AttributeError:
    amount = " "

  return amount

# Function to extract product rating
def get_rating(soup):
  """Return the rating text found in ``soup``, or " " when it is absent.

  Two lookups are attempted in order:
    1. ``<i class="a-icon a-icon-star a-icon-4-5">``
    2. ``<span class="a-icon-alt">``
  The first one whose ``.string`` can be stripped wins. If both fail with
  AttributeError (tag missing, or ``.string`` is None), the single-space
  placeholder " " is returned.

  Only AttributeError is treated as "not found". The fallback previously
  used a bare ``except:``, which also swallowed unrelated errors and
  KeyboardInterrupt; those now propagate to the caller.
  """
  lookups = (
    ("i", {'class':'a-icon a-icon-star a-icon-4-5'}),
    ("span", {'class':'a-icon-alt'}),
  )

  for tag_name, tag_attrs in lookups:
    try:
      return soup.find(tag_name, attrs=tag_attrs).string.strip()
    except AttributeError:
      # Tag not present (find returned None) or it has no single string
      # child (.string is None): try the next lookup.
      continue

  return " "

# Function to extract number of user reviews
def get_review_count(soup):
  """Return the review-count text found in ``soup``, or " " when it is absent.

  Looks up ``<span class="a-size-base s-underline-text">`` and returns its
  ``.string`` with surrounding whitespace removed. An AttributeError (tag
  missing, or ``.string`` is None) yields the single-space placeholder " ".
  """
  try:
    count_tag = soup.find("span", attrs={'class':'a-size-base s-underline-text'})
    count_text = count_tag.string
    return count_text.strip()
  except AttributeError:
    return " "

# Function to extract availability status
def get_availability(soup):
  """Return the availability text found in ``soup``, or "Not Available".

  Finds ``<div id="availability">``, takes the first ``<span>`` inside it and
  returns that span's ``.string`` with surrounding whitespace removed. An
  AttributeError at any step (div or span missing, or ``.string`` is None)
  yields the fallback "Not Available".
  """
  try:
    status = soup.find("div", attrs={'id':'availability'}).find("span").string.strip()
  except AttributeError:
    status = "Not Available"

  return status




In [None]:
# Scrape one search-results page and every product page it links to
def get_amazon_data(url, HEADERS):
  """Scrape one search-results page and the product pages it links to.

  Parameters
  ----------
  url : str
      URL of the search-results page to fetch.
  HEADERS : dict
      HTTP headers sent with every request (results page and product pages).

  Returns
  -------
  pandas.DataFrame
      One row per visited product with columns ``title``, ``price``,
      ``rating``, ``reviews`` and ``availability``. Rows whose title is
      empty or whitespace-only are dropped.

  Side effects
  ------------
  Prints each visited link and, when at least one row survives, writes the
  frame to ``amazon_data.csv`` in the current working directory
  (overwriting any existing file).
  """
  # Local import: keeps the block self-contained without touching the
  # notebook's import cell.
  from urllib.parse import urljoin

  # HTTP Request
  webpage = requests.get(url, headers=HEADERS)

  # Soup object containing all data
  soup = BeautifulSoup(webpage.content, "html.parser")

  # Fetch links as List of Tag Objects
  links = soup.find_all("a", attrs={'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

  # Keep only anchors that actually carry an href; Tag.get returns None
  # otherwise, which would break building the product URL below.
  links_list = [href for href in (link.get('href') for link in links) if href]

  d = {"title":[], "price":[], "rating":[], "reviews":[], "availability":[]}

  # Loop for extracting product details from each link
  for link in links_list:
    print(f'=> visiting {link}')
    # urljoin gives the same result as plain concatenation for the usual
    # "/path" hrefs, and additionally copes with absolute hrefs.
    new_webpage = requests.get(urljoin("https://www.amazon.in", link), headers=HEADERS)
    new_soup = BeautifulSoup(new_webpage.content, "html.parser")

    # Function calls to collect all necessary product information
    d['title'].append(get_title(new_soup))
    d['price'].append(get_price(new_soup))
    d['rating'].append(get_rating(new_soup))
    d['reviews'].append(get_review_count(new_soup))
    d['availability'].append(get_availability(new_soup))

  amazon_df = pd.DataFrame.from_dict(d)

  # get_title returns " " (a single space) when no title is found, so an
  # exact match on '' never removed anything. Treat any empty or
  # whitespace-only title as missing. Assigning the result back also avoids
  # `inplace=True` on a column selection, which does not reliably modify the
  # parent frame.
  amazon_df['title'] = amazon_df['title'].replace(r'^\s*$', np.nan, regex=True)
  amazon_df = amazon_df.dropna(subset=['title'])

  # Do not clobber a previously written CSV with a header-only file when this
  # page produced no usable rows.
  if not amazon_df.empty:
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)
  return amazon_df

def collect_all(q='laptop', pos=1, max_pages=None):
  """Scrape consecutive search-result pages until one yields no rows.

  Parameters
  ----------
  q : str, default 'laptop'
      Search query. It is URL-encoded before being placed in the request
      URL, so queries containing spaces, ``&`` or ``#`` are sent intact.
  pos : int, default 1
      Page number to start from; incremented after every non-empty page.
  max_pages : int or None, default None
      Optional upper bound on the number of pages fetched in this call.
      ``None`` keeps the original behaviour of running until an empty page
      is returned.

  Returns
  -------
  list of pandas.DataFrame
      One frame per non-empty page, in the order the pages were fetched.
  """
  # Local import: keeps the block self-contained without touching the
  # notebook's import cell.
  from urllib.parse import quote_plus

  # Headers for request
  HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})
  results = []
  pages_fetched = 0

  # Without max_pages the only exit is an empty page, as before.
  while max_pages is None or pages_fetched < max_pages:
    print(f'Collecting data from amazon for {q} from page {pos}')
    amazon_url = f"https://www.amazon.in/s?k={quote_plus(str(q))}&page={pos}"
    out = get_amazon_data(amazon_url, HEADERS)
    pages_fetched += 1
    print(out.shape)
    if len(out) > 0:
      pos+=1
      results.append(out)
      print('results updated')
    else:
      print(len(out), 'items found')
      break
  return results


In [None]:
# Run the scraper with its defaults (query 'laptop', starting at page 1).
# `data` is the list of per-page DataFrames returned by collect_all.
data = collect_all()

Collecting data from amazon for laptop from page 1
=> visiting /HP-15-6-inches-Graphics-Speakers-15s-du3614TU/dp/B0B6F6PM6C/ref=sr_1_1?keywords=laptop&qid=1681134692&sr=8-1
=> visiting /HP-Laptop-Graphics-Anti-Glare-Warranty/dp/B0BVQJ5R2H/ref=sr_1_2?keywords=laptop&qid=1681134692&sr=8-2
=> visiting /Dell-Inspiron-Laptop-i5-1235U-D560874WIN9B/dp/B0BQJ8KL7Q/ref=sr_1_3?keywords=laptop&qid=1681134692&sr=8-3
=> visiting /ASUS-Vivobook-i5-1235U-X1405ZA-KM511WS-Additional/dp/B0BHGVW38M/ref=sr_1_4?keywords=laptop&qid=1681134692&sr=8-4
=> visiting /ASUS-15-6-inch-GeForce-Windows-FA506IHRZ-HN111W/dp/B0B5DZTNZQ/ref=sr_1_5?keywords=laptop&qid=1681134692&sr=8-5
=> visiting /Dell-Inspiron-Windows-i3-1115G4-39-62Cms/dp/B0B468SB8G/ref=sr_1_6?keywords=laptop&qid=1681134692&sr=8-6
=> visiting /Lenovo-Anti-Glare-Warranty-Platinum-81WQ00NXIN/dp/B0B7R7HFKL/ref=sr_1_7?keywords=laptop&qid=1681134692&sr=8-7
=> visiting /HP-3S7L2PA-Notebook-Business-Warranty/dp/B0BM45QW4L/ref=sr_1_8?keywords=laptop&qid=1681134