In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [40]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id":'productTitle'})
        
        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        # First, try to get the regular price
        price = soup.find("span", class_='a-price a-text-price a-size-medium apexPriceToPay')
        if price:
            price = price.find('span', class_='a-offscreen').text.strip()
        else:
            # If that price isn't found, check for the deal price
            price = soup.find("span", attrs={'id':'priceblock_dealprice'})
            if price:
                price = price.string.strip()
            else:
                # If no standard price, look for a custom price in a class
                price = soup.find("span", attrs={'class':'a-price-whole'})
                if price:
                    price = price.text.strip()
                else:
                    price = ""
    except AttributeError:
        price = ""  # Return empty string if nothing is found

    return price


# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"	

    return available

In [50]:
if __name__ == '__main__':

    # Headers for request (Make sure these are up-to-date and valid)
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    # The webpage URL
    URL = "https://www.amazon.com/s?k=playstation+4&crid=FP5WE1F2V4XQ&sprefix=playstation+4%2Caps%2C605&ref=nb_sb_noss_1"

    # HTTP Request
    response = requests.get(URL, headers=HEADERS)
    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
    else:
        # Soup Object containing all data
        soup = BeautifulSoup(response.content, "html.parser")

        # Fetch links as List of Tag Objects
        links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})

        # Store the links
        links_list = [link.get('href') for link in links if link.get('href')]

        # Initialize dictionary to store product details
        d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}
        
        # Loop for extracting product details from each link 
        for link in links_list:
            # Check if the link starts with "http" or "https" to avoid concatenation errors
            if link.startswith("http"):
                product_url = link  # Use the full URL directly
            else:
                product_url = "https://amazon.com" + link
                
            new_webpage = requests.get(product_url, headers=HEADERS)
            if new_webpage.status_code == 200:
                new_soup = BeautifulSoup(new_webpage.content, "html.parser")
                # Function calls to extract data
                d['title'].append(get_title(new_soup))
                d['price'].append(get_price(new_soup))
                d['rating'].append(get_rating(new_soup))
                d['reviews'].append(get_review_count(new_soup))
                d['availability'].append(get_availability(new_soup))
            else:
                print(f"Failed to fetch product page: {product_url}")

        # Create a DataFrame from the dictionary
        amazon_df = pd.DataFrame.from_dict(d)
        # Handle missing or empty values
        amazon_df = amazon_df.replace({'title': {'': np.nan}})
        amazon_df = amazon_df.dropna(subset=['title'])


In [54]:
amazon_df.to_csv("D:/Information_reterival_project/Web Scraping for Amazon/amazon_data.csv", header=True, index=False)

In [56]:
d

{'title': ['PlayStation®4 Console – Call of Duty® Modern Warfare II Bundle',
  '$250 PlayStation Store Gift Card [Digital Code]',
  'PS4 Controller Charger Dock Station, OIVO 1.8Hrs PS4 Controller Charging Dock, Charging Station Replacement for PlayStation 4 Dualshock 4 Charger',
  'Wuthur PS4 Controller Wireless 2 Pack, Compatible with PlayStation 4 - Black & White',
  'FASIGO PS4 Controller 2 Pack, Wireless PS4 Controller for PS4 / Pro/Slim & PC, with 2 USB C Cable - Black',
  'Seagate (STGD2000100) Game Drive for PS4 Systems 2TB External Hard Drive Portable HDD â€“ USB 3.0, Officially Licensed Product',
  'Wireless Controller Dual Vibration Game Joystick Controller for Ps4 Controller/Slim/Pro,Compatible with PS4 Console',
  'Sony DualShock 4 Wireless Controller - Midnight Blue - PlayStation 4',
  'Wireless Controller for PS4, Custom Design V2 Gamepad Joystick for PS4 with Non-Slip Grip of Both Sides and 3.5mm Audio Jack! Thumb Caps Included! (Galaxy)',
  'PlayStation 4 Slim 1TB Limi

In [58]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability
0,PlayStation®4 Console – Call of Duty® Modern W...,$379.94,4.5 out of 5 stars,116 ratings,In Stock
1,$250 PlayStation Store Gift Card [Digital Code],250.,4.6 out of 5 stars,"253,044 ratings",Available now
2,"PS4 Controller Charger Dock Station, OIVO 1.8H...",$14.98,4.7 out of 5 stars,"39,483 ratings",In Stock
3,"Wuthur PS4 Controller Wireless 2 Pack, Compati...",$31.98,4.3 out of 5 stars,185 ratings,In Stock
4,"FASIGO PS4 Controller 2 Pack, Wireless PS4 Con...",$31.99,4.2 out of 5 stars,460 ratings,In Stock
5,Seagate (STGD2000100) Game Drive for PS4 Syste...,$108.00,4.7 out of 5 stars,"29,920 ratings",
6,Wireless Controller Dual Vibration Game Joysti...,$19.99,4.0 out of 5 stars,"2,607 ratings",In Stock
7,Sony DualShock 4 Wireless Controller - Midnigh...,$79.99,4.6 out of 5 stars,"3,486 ratings",In Stock
8,"Wireless Controller for PS4, Custom Design V2 ...",$19.99,4.2 out of 5 stars,"3,113 ratings",In Stock
9,PlayStation 4 Slim 1TB Limited Edition Console...,12.,5.0 out of 5 stars,2 ratings,Not Available
