In [16]:
import requests
import time
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [17]:
# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run headless for efficiency
driver = webdriver.Chrome(options=options)

# Open the target URL
driver.get("https://www.sony.co.in/compact-cameras/vlog-cameras")

# Wait for the page to load
time.sleep(3)  # Adjust this as needed for the webpage's initial loading time

# Incremental scroll simulation
SCROLL_PAUSE_TIME = 1  # Time to pause between scrolls (adjust as needed)
SCROLL_INCREMENT = 300  # Scroll increment in pixels (adjust for optimal results)

# Get the initial scroll height
current_height = 0
while True:
    # Incrementally scroll down
    driver.execute_script(f"window.scrollTo(0, {current_height});")
    time.sleep(SCROLL_PAUSE_TIME)  # Pause to allow content to load

    # Update the current height
    current_height += SCROLL_INCREMENT

    # Check if no new content has loaded
    new_height = driver.execute_script("return document.body.scrollHeight")
    if current_height >= new_height:
        break  # Break if end of content is reached

# Parse the loaded content with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Extract product links
all_links = soup.find_all('a', class_='GalleryListItem__Button js-datalayer-action-event') 
product_links = [link.get('href') for link in all_links if link.get('href')]  # Safely get href

# Correct and complete links
def correct_links(link):
    if not link.startswith('https'):  # Check for relative URLs
        return 'https://www.sony.co.in' + link
    return link

# Build final list of links
final_links = list(set(correct_links(link) for link in product_links))  # Remove duplicates

# Close the browser
driver.quit()

# Output results
print(final_links)
print(f"Total unique links: {len(final_links)}")


['https://www.sony.co.in/electronics/cyber-shot-compact-cameras/zv-1m2', 'https://www.sony.co.in/compact-cameras/products/zv-1f', 'https://www.sony.co.in/compact-cameras/products/zv-1']
Total unique links: 3


In [18]:
# Function to extract product title

def get_title(soup):
    
    try:
        # Outer tag object 
        title = soup.find('div', class_='product-model p5')

        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

In [19]:
def get_card_highlights(soup):
    
    try:
        # Outer tag object 
        card_highlights = soup.find('h1', class_='t6 product-title')

        # Inner NavigatableString Object
        card_highlights = card_highlights.text

        # Title as a string value
        card_highlights = card_highlights.strip()

    except AttributeError:
        card_highlights = ""

    return card_highlights

In [20]:
def get_price(soup):

    try:
        price = soup.find("strong", class_='text-dark')
        price_value = price.text
        price_string = price_value.strip()
        # price_string = re.sub(r'[^0-9]', '', price_string)

    except AttributeError:
        price_string = ""

    return price_string

In [21]:
def get_rating(soup):
    
    try:
        # Outer tag object 
        review = soup.find('div', class_='product-rating')

        # Inner NavigatableString Object
        review_value = review.get('data-stars')

        # Title as a string value
        review_string = review_value.strip()

    except AttributeError:
        review_string = ""

    return review_string

In [22]:
def get_review_count(soup):
    
    try:
        # Outer tag object 
        count = soup.find('span', class_='review-count').text
        # Inner NavigatableString Object
        count_value = count #.get('data-review-count')

        # Title as a string value
        count_string = count_value.strip()

    except AttributeError:
        count_string = ""
    
    return count_string

In [23]:
def get_product_images(soup):

    try:
        # Outer tag object
        buy_link = soup.find('a', 'btn btn-large btn-block buy buy-button retailer_btn-align').get('href')
        if not buy_link.startswith('https'):  # Check for relative URLs
            buy_link = 'https:' + buy_link
        print(buy_link)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # Run headless for efficiency
        driver = webdriver.Chrome(options=options)

        # Open the target URL
        driver.get(buy_link)

        buy_page = driver.page_source
        image_soup = BeautifulSoup(buy_page, 'html.parser')

        images = image_soup.find_all('img', class_='0 iq-img')
        image_links = [image.get('src') for image in images]

        # Close the driver after all batches are processed
        driver.quit()
    
    except AttributeError:
        image_links = []

    return image_links

In [24]:
def get_product_features(soup):

    try:
        # Outer tag object
        features = soup.find_all('p', class_='copy ghost-center with-icon')
        feature_list = [feature.text for feature in features]

    except AttributeError:
        feature_list = []

    return feature_list

In [25]:
def get_product_specifications(soup):
    
    try:
        spec_key = soup.find_all('dt', class_='l3')
        spec_key_list = [key.text for key in spec_key]

        spec_value = soup.find_all('dd', class_='p3')
        spec_value_list = [value.text for value in spec_value]

        spec = dict(zip(spec_key_list, spec_value_list))

    except AttributeError:
        spec = {}

    return spec    

In [26]:
def get_description(soup):
    
    try:
        buy_link = soup.find('a', 'btn btn-large btn-block buy buy-button retailer_btn-align').get('href')
        if not buy_link.startswith('https'):  # Check for relative URLs
            return 'https:' + buy_link
        driver.get(buy_link)
        buy_page = driver.page_source
        desc_soup = BeautifulSoup(buy_page, 'html.parser')

        title = desc_soup.find('p', class_='p3 elevator-pitch').text
        description = desc_soup.find('p', class_='p3').text
        product_description = title + ' : ' + description

    except AttributeError:
        product_description = ''
    
    return product_description

In [27]:
# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run headless for efficiency
driver = webdriver.Chrome(options=options)
driver.get(final_links[1])
response = driver.page_source
soup1 = BeautifulSoup(response, 'html.parser')
title = get_title(soup1)
print(title)
price = get_price(soup1)
print(price)
rating = get_rating(soup1)
print(rating)
count = get_review_count(soup1)
print(count)
image_links = get_product_images(soup1)
print(image_links)
card_highlights = get_card_highlights(soup1)
print(card_highlights)
product_description = get_description(soup1)
print(product_description)
feature_list = get_product_features(soup1)
print(feature_list)
specification = get_product_specifications(soup1)
print(specification)





[]


[]
{}


In [28]:
from time import sleep

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run headless for efficiency
driver = webdriver.Chrome(options=options)

# Example data structure (ensure to initialize correctly)
data = {'name': [], 'price': [], 'rating': [], 'review_count': [], 'images': [], 'highlights': [], 
        'description': [], 'specification': [], 'features': [], 'sub_category': 'Vlog Camera', 'category': 'Compact Cameras'}

BATCH_SIZE = 5  # Number of links to process in each batch

# Iterate through the links in batches
for start in range(0, len(final_links), BATCH_SIZE):
    batch_links = final_links[start:start + BATCH_SIZE]  # Slice the batch

    for link in batch_links:
        driver.get(link)
        new_webpage = driver.page_source
        new_soup = BeautifulSoup(new_webpage, 'html.parser')

        # The function call to display all the product information
        data['name'].append(get_title(new_soup))
        data['price'].append(get_price(new_soup))
        data['rating'].append(get_rating(new_soup))
        data['review_count'].append(get_review_count(new_soup))
        data['images'].append(get_product_images(new_soup))
        data['highlights'].append(get_card_highlights(new_soup))
        data['description'].append(get_description(new_soup))
        data['specification'].append(get_product_specifications(new_soup))
        data['features'].append(get_product_features(new_soup))

    # Optional: Add a delay between batches to avoid overwhelming the server
    print(f"Completed batch {start // BATCH_SIZE + 1}/{-(-len(final_links) // BATCH_SIZE)}")  # Track progress
    sleep(3)  # Pause between batches (adjust as needed)

# Close the driver after all batches are processed
driver.quit()

# Verify the collected data
print(f"Total products processed: {len(data['name'])}")

https://www.sony.co.in/electronics/cyber-shot-compact-cameras/zv-1m2/buy
Completed batch 1/1
Total products processed: 3


In [29]:
sony_df = pd.DataFrame.from_dict(data)
sony_df['name'].replace('', np.nan, inplace=True)
sony_df = sony_df.dropna(subset=['name'])
sony_df

Unnamed: 0,name,price,rating,review_count,images,highlights,description,specification,features,sub_category,category
0,ZV-1M2,"Rs. 86,9901",4.0,8,[https://www.sony.co.in/image/07eb08cb6a21a370...,Vlog Camera ZV-1 II,https://www.sony.co.in/electronics/cyber-shot-...,{},[Versatile 18–50 mm2 wide-angle zoom lens idea...,Vlog Camera,Compact Cameras


In [30]:
sony_df.to_csv("C:/Users/yurek/Downloads/sony_cameras_compact-camera1.csv", header=True, index=False)