In [1]:
import re
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Launch a headless Chrome session, open the Sony vlog-camera listing page and
# keep its rendered HTML in `webpage` for the parsing cell that follows.

# Set the path to the Chromedriver
# NOTE(review): absolute, machine-specific path; this cell only runs where
# chromedriver exists at exactly this location.
DRIVER_PATH = 'C:/Users/yurek/Downloads/chromedriver-win64/chromedriver-win64/chromedriver.exe'

service = Service(executable_path=DRIVER_PATH)
options = webdriver.ChromeOptions()

# Enable headless mode
options.add_argument('--headless=new')
options.add_argument("--window-size=1920,1200")  # Set the window size

# Initialize the Chrome driver
# NOTE(review): no visible cell calls driver.quit(); the browser session stays
# open until that is done manually or the kernel exits.
driver = webdriver.Chrome(service=service, options=options)

URL = 'https://www.sony.co.in/compact-cameras/vlog-cameras'
driver.get(URL)

# Wait for content to load (adjust selector to match the page's content)
# Blocks for up to 100 seconds until elements with the gallery button-container
# class are present in the DOM.
try:
    WebDriverWait(driver, 100).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "GalleryListItem__ButtonContainer"))
    )
except Exception as e:
    # Best-effort: any failure while waiting is only printed, and the cell
    # carries on with whatever HTML is available.
    print(f"Error waiting for content: {e}")

# Get the dynamically loaded HTML
time.sleep(2)  # Add extra wait time if needed
webpage = driver.page_source

In [3]:
# Build a BeautifulSoup tree from the rendered listing page and collect the
# href of every product-button anchor.
soup = BeautifulSoup(webpage, 'html.parser')
all_links = soup.find_all(
    'a',
    class_='GalleryListItem__Button js-datalayer-action-event',
)
product_links = []
for anchor in all_links:
    # Indexing (rather than .get) raises KeyError if an anchor has no href.
    product_links.append(anchor['href'])

def correct_links(link, base_url='https://www.sony.co.in'):
    """Return ``link`` as an absolute URL, resolving it against ``base_url``.

    A plain substring test for ``'https'`` misclassifies several inputs:
    ``http://`` links and protocol-relative ``//host/path`` links would get
    the site prefix glued in front of them, and a relative link that merely
    contains "https" (for example in its query string) would be left
    relative. ``urljoin`` applies standard URL resolution instead.

    Args:
        link: The href value taken from the page (absolute or relative).
        base_url: Site root used to resolve relative links.

    Returns:
        str: ``link`` unchanged when it is already absolute, otherwise
        ``link`` resolved against ``base_url``.
    """
    return urljoin(base_url, link)

# Turn every scraped href into an absolute URL, then show the list and its size.
final_links = list(map(correct_links, product_links))
print(final_links)
print(len(final_links))

['https://www.sony.co.in/electronics/cyber-shot-compact-cameras/zv-1m2', 'https://www.sony.co.in/compact-cameras/products/zv-1f', 'https://www.sony.co.in/compact-cameras/products/zv-1']
3


In [4]:
# Extra product pages to scrape in addition to the links collected from the
# listing page.
ADDITIONAL_PRODUCT_LINKS = [
    'https://www.sony.co.in/interchangeable-lens-cameras/products/ilme-fx3',
    'https://www.sony.co.in/interchangeable-lens-cameras/products/ilce-1',
    'https://www.sony.co.in/interchangeable-lens-cameras/products/ilce-7c',
    'https://www.sony.co.in/interchangeable-lens-cameras/products/ilce-7sm3',
    'https://www.sony.co.in/electronics/interchangeable-lens-cameras/ilce-9m2',
    'https://www.sony.co.in/electronics/interchangeable-lens-cameras/ilce-6600',
    'https://www.sony.co.in/electronics/interchangeable-lens-cameras/ilce-7rm2',
    'https://www.sony.co.in/electronics/interchangeable-lens-cameras/ilce-7rm4a',
    'https://www.sony.co.in/interchangeable-lens-cameras/products/ilce-6400',
    'https://www.sony.co.in/interchangeable-lens-cameras/products/ilce-7m3-body-kit',
    'https://www.sony.co.in/electronics/interchangeable-lens-cameras/ilce-7sm2',
    'https://www.sony.co.in/electronics/interchangeable-lens-cameras/ilce-7rm4',
]

# Append only URLs that are not already queued, so re-running this cell does
# not make the scraper visit the same product twice. The list is still
# mutated in place, as before.
for extra_link in ADDITIONAL_PRODUCT_LINKS:
    if extra_link not in final_links:
        final_links.append(extra_link)

In [4]:
# Extract the product title from a parsed product page.

def get_title(soup):
    """Return the stripped text of the product-title ``<h1>``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: Title text without surrounding whitespace, or ``""`` when the
        lookup raises ``AttributeError`` (for example, no matching heading).
    """
    try:
        return soup.find('h1', class_='CategoryNav__PdpHeaderTitleName').text.strip()
    except AttributeError:
        return ""

In [6]:
# Extract the product model codes from a parsed product page.

def get_model(soup):
    """Return the stripped text of every model-code ``<h4>`` on the page.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[str]: One entry per matching element, in document order; ``[]``
        when nothing matches or when an ``AttributeError`` is raised.
    """
    try:
        return [
            tag.text.strip()
            for tag in soup.find_all('h4', class_='ProductSummaryModels__ModelCode')
        ]
    except AttributeError:
        return []

In [7]:
def get_description(soup):
    """Return the product headline and body copy joined by ``' : '``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: ``"<headline> : <body copy>"`` using the raw (unstripped) text
        of both elements, or ``''`` if either lookup raises
        ``AttributeError`` (for example, an element is missing).
    """
    try:
        headline = soup.find('p', class_='ProductSummary__Headline').text
        body_copy = soup.find('div', class_='ProductSummary__BodyCopy').text
        return headline + ' : ' + body_copy
    except AttributeError:
        return ''

In [8]:
def get_card_highlights(soup):
    """Return the stripped text of the page-header model ``<div>``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: Text of the ``CategoryNav__PdpHeaderModel`` element without
        surrounding whitespace, or ``""`` when the lookup raises
        ``AttributeError`` (for example, no matching element).
    """
    try:
        header_model = soup.find('div', class_='CategoryNav__PdpHeaderModel')
        return header_model.text.strip()
    except AttributeError:
        return ""

In [9]:
def get_combo(soup):
    """Return the stripped text of every model-differentiator ``<div>``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[str]: One entry per matching element, in document order; ``[]``
        when nothing matches or when an ``AttributeError`` is raised.
    """
    combos = []
    try:
        for node in soup.find_all('div', class_='ProductSummaryModels__Differenciator'):
            combos.append(node.text.strip())
    except AttributeError:
        # Discard any partial result, as a failure yields an empty list.
        return []
    return combos

In [10]:
def get_price(soup):
    """Return the stripped text of every model-price ``<span>`` on the page.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[str]: Price strings as displayed on the page, in document
        order; ``[]`` when nothing matches or when an ``AttributeError`` is
        raised.
    """
    price_class = 'ProductSummaryModels__ModelPriceNumber js-product-price js-datalayer-price-details'
    try:
        return [span.text.strip() for span in soup.find_all("span", class_=price_class)]
    except AttributeError:
        return []

In [11]:
# Extract the average star rating from a parsed product page.

def get_rating(soup):
    """Return the stripped text of the average-rating ``<span>``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: Rating text as displayed on the page, or ``""`` when the lookup
        raises ``AttributeError`` (for example, no matching element).
    """
    try:
        rating_tag = soup.find('span', class_='StarRatings__Count js-datalayer-averageRating')
        return rating_tag.text.strip()
    except AttributeError:
        return ""

In [12]:
# Extract the number of reviews from a parsed product page.

def get_review_count(soup):
    """Return the ``data-review-count`` attribute of the total-rate ``<span>``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        str: The attribute value without surrounding whitespace, or ``""``
        when an ``AttributeError`` is raised. That covers both a missing
        element and a missing attribute, since ``.get`` then yields ``None``
        and ``None`` has no ``strip``.
    """
    try:
        total_rate = soup.find('span', class_='ProductSummary__TotalRate js-datalayer-reviewCount')
        return total_rate.get('data-review-count').strip()
    except AttributeError:
        return ""

In [13]:
def get_product_images(soup):
    """Return the hi-res image URL of every large-screen thumbnail ``<div>``.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list: The ``data-background-image-hires`` attribute of each matching
        element, in document order (``None`` for an element without that
        attribute); ``[]`` when nothing matches or when an
        ``AttributeError`` is raised.
    """
    hires_urls = []
    try:
        for thumb in soup.find_all('div', class_='ProductIntroPlate__ThumbImage -LargeScreen'):
            hires_urls.append(thumb.get('data-background-image-hires'))
    except AttributeError:
        # Discard any partial result, as a failure yields an empty list.
        return []
    return hires_urls

In [14]:
def get_product_features(soup):
    """Return the raw text of every highlight spec-value ``<p>`` on the page.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[str]: Unstripped text of each matching element, in document
        order; ``[]`` when nothing matches or when an ``AttributeError`` is
        raised.
    """
    try:
        highlight_tags = soup.find_all('p', class_='Highlights__SpecValue')
        return [tag.text for tag in highlight_tags]
    except AttributeError:
        return []

In [15]:
def get_product_specifications(soup):
    """Return the page's specification table as a ``{label: value}`` dict.

    Labels come from ``<dt class="l3">`` elements and values from
    ``<dd class="p3">`` elements. The two sequences are paired by position,
    so surplus items in the longer one are ignored and a repeated label
    keeps the last value paired with it.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        dict[str, str]: Raw (unstripped) label text mapped to raw value
        text; ``{}`` when nothing matches or when an ``AttributeError`` is
        raised.
    """
    try:
        labels = [term.text for term in soup.find_all('dt', class_='l3')]
        values = [detail.text for detail in soup.find_all('dd', class_='p3')]
        return dict(zip(labels, values))
    except AttributeError:
        return {}

In [24]:
# Visit every URL in `final_links` and collect one entry per product in `data`
# (column name -> list of per-product values).

# Seconds to wait for a product page to show its title before scraping
# whatever has loaded so far.
PAGE_LOAD_TIMEOUT = 20

# Column name -> function that extracts that column from a parsed page.
FIELD_EXTRACTORS = {
    'name': get_title,
    'model': get_model,
    'combo': get_combo,
    'price': get_price,
    'rating': get_rating,
    'review_count': get_review_count,
    'images': get_product_images,
    'highlights': get_card_highlights,
    'description': get_description,
    'specification': get_product_specifications,
    'features': get_product_features,
}

# 'sub_category' and 'category' are plain strings rather than lists, so the
# same value is attached to every scraped product when the frame is built.
data = {'name': [], 'model': [], 'combo': [], 'price': [], 'rating': [], 'review_count': [], 'images': [], 'highlights': [], 'description': [], 'specification': [], 'features': [], 'sub_category': 'APS-C E-mount Mirrorless', 'category': 'Interchangeable-lens Cameras'}

for link in final_links:
    driver.get(link)

    # The listing page already needs an explicit wait before its content is
    # in the DOM. Reading page_source straight after get() risks parsing a
    # page whose product markup has not been inserted yet, which yields
    # empty fields. Wait for the title element, and on timeout fall back to
    # scraping whatever is present.
    try:
        WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'CategoryNav__PdpHeaderTitleName'))
        )
    except TimeoutException:
        print(f"Timed out waiting for product content: {link}")

    new_webpage = driver.page_source
    new_soup = BeautifulSoup(new_webpage, 'html.parser')

    # Run every extractor against the parsed page; each appends exactly one
    # value, so all column lists stay the same length.
    for field, extract in FIELD_EXTRACTORS.items():
        data[field].append(extract(new_soup))

In [25]:
# Build the product table and drop rows whose title could not be scraped.
sony_df = pd.DataFrame.from_dict(data)

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column. That chained in-place call acts on an intermediate object:
# it raises a FutureWarning on recent pandas and leaves the frame unchanged
# under Copy-on-Write, so the empty names would never become NaN and the
# dropna below would keep them.
sony_df['name'] = sony_df['name'].replace('', np.nan)
sony_df = sony_df.dropna(subset=['name'])
sony_df

Unnamed: 0,name,model,combo,price,rating,review_count,images,highlights,description,specification,features,sub_category,category
2,ZV-E10,"[ZV-E10, ZV-E10L]","[Body Only, Body + 16–50 mm Power Zoom Lens]","[Rs.59,490, Rs.69,990]",4.5,61,[https://sony.scene7.com/is/image/sonyglobalso...,Interchangeable-lens vlog camera,Capture more of your world : \n*Simulated scre...,{},"[Interchangeable-lens camera for vlogging, Lar...",APS-C E-mount Mirrorless,Interchangeable-lens Cameras
4,ILCE-6400/ILCE-6400L/ILCE-6400M,"[ILCE-6400, ILCE-6400L, ILCE-6400M]","[Body Only, Body + 16–50 mm Power Zoom Lens, B...","[Rs.80,490, Rs.90,990, Rs.116,490]",4.5,198,[https://sony.scene7.com/is/image/sonyglobalso...,Alpha 6400 E-mount camera with APS-C sensor,Grab the best of life. : \nDespite its compact...,{},"[Fast 0.02 sec.4 AF, with 425 phase-detection ...",APS-C E-mount Mirrorless,Interchangeable-lens Cameras


In [26]:
# Write the cleaned product table to CSV with a header row and without the
# index column.
sony_df.to_csv(
    "C:/Users/yurek/Downloads/sony_cameras_mirrorless1.csv",
    index=False,
    header=True,
)