In [1]:
import re
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [2]:
# Location of the local ChromeDriver executable that Selenium launches.
DRIVER_PATH = 'C:/Users/yurek/Downloads/chromedriver-win64/chromedriver-win64/chromedriver.exe'

# Listing page whose product tiles are scraped in the following cells.
URL = 'https://www.sony.co.in/headphones/water-resistant'

# Chrome flags: headless mode and a fixed window size.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless=new', '--window-size=1920,1200'):
    options.add_argument(chrome_flag)

# Start the browser session; `driver` is reused by the later scraping cells.
service = Service(executable_path=DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)

# Open the listing page and keep its HTML for parsing.
driver.get(URL)
webpage = driver.page_source

In [3]:
# Parse the listing page and read the href of every product-tile anchor.
tile_class = 'GalleryListItem__Button js-datalayer-action-event'
soup = BeautifulSoup(webpage, 'html.parser')
all_links = soup.find_all('a', class_=tile_class)
product_links = [anchor['href'] for anchor in all_links]

def correct_links(link, base_url='https://www.sony.co.in'):
    """Return ``link`` as an absolute URL.

    Relative hrefs are resolved against ``base_url``; hrefs that already
    carry a scheme (``http://`` or ``https://``) are returned unchanged, and
    protocol-relative hrefs (``//host/path``) inherit the scheme of
    ``base_url``.

    Parameters
    ----------
    link : str
        The href value to normalise.
    base_url : str, optional
        Site root used to resolve relative hrefs.

    Returns
    -------
    str
        The absolute URL.
    """
    # urljoin decides from the URL structure instead of searching for the
    # substring 'https', which mis-handled 'http://...' links and relative
    # paths that merely contain 'https' (for example inside a query string).
    return urljoin(base_url, link)

# Normalise every scraped href, then show the resulting URLs and their count.
final_links = list(map(correct_links, product_links))
print(final_links, len(final_links), sep='\n')

['https://www.sony.co.in/content/sony/in/in/en-in/general/audio/headphones/products/wf-l910.html', 'https://www.sony.co.in/headphones/products/wf-c510', 'https://www.sony.co.in/headphones/products/wf-c700n', 'https://www.sony.co.in/headphones/products/wf-1000xm5', 'https://www.sony.co.in/electronics/truly-wireless/linkbuds-s', 'https://www.sony.co.in/electronics/truly-wireless/wf-xb700', 'https://www.sony.co.in/electronics/truly-wireless/wf-c500']
7


In [4]:
# Function to extract product title

def get_title(soup):
    """Extract the product title from a parsed product page.

    Looks up the ``div`` with class ``CategoryNav__PdpHeaderModel`` via
    ``soup.find`` and returns its text with surrounding whitespace removed.

    ``soup`` is presumably a BeautifulSoup document; only ``.find`` is used.
    Returns an empty string whenever the lookup raises ``AttributeError``
    (for example when ``find`` yields ``None`` because the element is absent).
    """
    try:
        return soup.find('div', class_='CategoryNav__PdpHeaderModel').text.strip()
    except AttributeError:
        return ""

# Function to extract Product Price

def get_price(soup):
    """Extract the displayed price from a parsed product page.

    Looks up the ``span`` whose class attribute is
    ``ProductSummaryModels__ModelPriceNumber js-product-price
    js-datalayer-price-details`` and returns its text, stripped. The text is
    returned as it appears on the page; no numeric clean-up is applied.

    ``soup`` is presumably a BeautifulSoup document; only ``.find`` is used.
    Returns an empty string whenever the lookup raises ``AttributeError``
    (for example when the element is absent).
    """
    try:
        price_node = soup.find(
            "span",
            class_='ProductSummaryModels__ModelPriceNumber js-product-price js-datalayer-price-details',
        )
        return price_node.text.strip()
    except AttributeError:
        return ""

# Function to extract the product's average star rating

def get_rating(soup):
    """Extract the average rating text from a parsed product page.

    Looks up the ``span`` whose class attribute is
    ``StarRatings__Count js-datalayer-averageRating`` and returns its text,
    stripped. The value is returned as a string, not converted to a number.

    ``soup`` is presumably a BeautifulSoup document; only ``.find`` is used.
    Returns an empty string whenever the lookup raises ``AttributeError``
    (for example when the element is absent).
    """
    try:
        rating_node = soup.find('span', class_='StarRatings__Count js-datalayer-averageRating')
        return rating_node.text.strip()
    except AttributeError:
        return ""

# Function to extract the product's review count

def get_review_count(soup):
    """Extract the number of reviews from a parsed product page.

    Looks up the ``span`` whose class attribute is
    ``ProductSummary__TotalRate js-datalayer-reviewCount`` and returns the
    stripped value of its ``data-review-count`` attribute (as a string).

    ``soup`` is presumably a BeautifulSoup document; only ``.find`` is used.
    Returns an empty string whenever the lookup raises ``AttributeError``,
    which covers both a missing element and a missing attribute (``get``
    returning ``None``).
    """
    try:
        total_node = soup.find('span', class_='ProductSummary__TotalRate js-datalayer-reviewCount')
        return total_node.get('data-review-count').strip()
    except AttributeError:
        return ""

def get_product_images(soup):
    """Collect the hi-res image URLs of a product's thumbnails.

    Finds every ``div`` whose class attribute is
    ``ProductIntroPlate__ThumbImage -LargeScreen`` and reads its
    ``data-background-image-hires`` attribute.

    Parameters
    ----------
    soup
        Parsed product page; presumably a BeautifulSoup document. Only
        ``.find_all`` is used.

    Returns
    -------
    list
        The attribute values in document order. Thumbnails that lack the
        attribute, or carry an empty value, are skipped so the list never
        contains ``None`` or ``''``. An empty list is returned when nothing
        matches or when the lookup raises ``AttributeError``.
    """
    try:
        thumbnails = soup.find_all('div', class_='ProductIntroPlate__ThumbImage -LargeScreen')
        candidates = [thumb.get('data-background-image-hires') for thumb in thumbnails]
    except AttributeError:
        return []

    # `.get` returns None for a missing attribute; keep only real URLs.
    return [url for url in candidates if url]

def get_product_features(soup):
    """Collect the highlight texts listed on a parsed product page.

    Finds every ``p`` with class ``Highlights__SpecValue`` and returns the
    ``.text`` of each, in document order and without any stripping.

    ``soup`` is presumably a BeautifulSoup document; only ``.find_all`` is
    used. Returns an empty list when nothing matches or when the lookup
    raises ``AttributeError``.
    """
    try:
        highlight_nodes = soup.find_all('p', class_='Highlights__SpecValue')
        return [node.text for node in highlight_nodes]
    except AttributeError:
        return []

In [5]:
# Field name -> extractor. Insertion order fixes both the key order of `data`
# and the order in which the extractors run on each page.
extractors = {
    'name': get_title,
    'price': get_price,
    'rating': get_rating,
    'review_count': get_review_count,
    'images': get_product_images,
    'features': get_product_features,
}

# One list per scraped field, followed by two constant scalar labels.
data = {field: [] for field in extractors}
data['sub_category'] = 'Water Resistant'
data['category'] = 'Headphones'

for link in final_links:
    # Open the product page in the browser and parse its current HTML.
    driver.get(link)
    new_webpage = driver.page_source
    new_soup = BeautifulSoup(new_webpage, 'html.parser')

    # Append one value per field so all lists stay the same length.
    for field, extract in extractors.items():
        data[field].append(extract(new_soup))

In [6]:
# Build the product table and drop pages for which no title was scraped
# (the title extractor yields '' in that case).
#
# The blank-to-NaN step assigns a new `name` column instead of calling
# `replace(..., inplace=True)` on `sony_df['name']`. That chained in-place call
# mutates an intermediate Series: pandas 2.x warns about it, and with
# Copy-on-Write enabled it leaves the frame unchanged, so blank rows would
# silently survive the `dropna`.
sony_df = (
    pd.DataFrame.from_dict(data)
    .assign(name=lambda frame: frame['name'].replace('', np.nan))
    .dropna(subset=['name'])
)
sony_df

Unnamed: 0,name,price,rating,review_count,images,features,sub_category,category
0,LinkBuds series | LinkBuds Open Wireless Headp...,"Rs.26,990",4.4,28,[],[],Water Resistant,Headphones
1,WF-C510 Truly Wireless Headphones,"Rs.8,990",4.6,51,[https://sony.scene7.com/is/image/sonyglobalso...,[Compact and lightweight for comfort all day l...,Water Resistant,Headphones
2,WF-C700N Wireless Noise Cancelling Headphones,"Rs.12,990",4.1,199,[https://sony.scene7.com/is/image/sonyglobalso...,[Digital noise cancelling and Ambient Sound Mo...,Water Resistant,Headphones
3,WF-1000XM5 Wireless Noise Cancelling Headphones,"Rs.29,990",4.0,418,[https://sony.scene7.com/is/image/sonyglobalso...,[The best noise cancelling1 with two high-perf...,Water Resistant,Headphones


In [7]:
# Write the cleaned table to CSV with a header row and without the index column.
output_csv = "C:/Users/yurek/Downloads/sony_headphones1.csv"
sony_df.to_csv(output_csv, index=False, header=True)

In [None]:
# Spot-check: re-scrape the first product page and print each extracted field.
try:
    driver.get(final_links[0])
    response = driver.page_source
    soup1 = BeautifulSoup(response, 'html.parser')
    title = get_title(soup1)
    print(title)
    price = get_price(soup1)
    print(price)
    rating = get_rating(soup1)
    print(rating)
    count = get_review_count(soup1)
    print(count)
    image_links = get_product_images(soup1)
    print(image_links)
    feature_list = get_product_features(soup1)
    print(feature_list)
finally:
    # This is the notebook's final cell and nothing else closes the browser.
    # Quit here (even if the spot-check fails) so the headless Chrome and
    # chromedriver processes do not linger. Re-run the driver setup cell to
    # start a new session afterwards.
    driver.quit()