In [1]:
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Load the Sony headband-headphone listing in headless Chrome, scroll through
# it step by step so content that appears on scroll has time to load, then
# snapshot the resulting DOM into `soup`.
PAGE_URL = "https://www.sony.co.in/headphones/headband"
INITIAL_LOAD_WAIT = 3  # Seconds to wait for the first render (adjust as needed)
SCROLL_PAUSE_TIME = 1  # Time to pause between scrolls (adjust as needed)
SCROLL_INCREMENT = 300  # Scroll increment in pixels (adjust for optimal results)

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run headless for efficiency
driver = webdriver.Chrome(options=options)

try:
    # Open the target URL and give the page time for its initial load
    driver.get(PAGE_URL)
    time.sleep(INITIAL_LOAD_WAIT)

    # Incremental scroll simulation, starting from the top of the page
    current_height = 0
    while True:
        driver.execute_script(f"window.scrollTo(0, {current_height});")
        time.sleep(SCROLL_PAUSE_TIME)  # Pause to allow content to load

        current_height += SCROLL_INCREMENT

        # The page height is re-read on every pass because it can grow while
        # scrolling; stop once the next target is at or past the bottom.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if current_height >= new_height:
            break

    # Parse the loaded content with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even when loading or scrolling fails;
    # otherwise every run leaves a headless Chrome process behind.
    driver.quit()

In [3]:
def correct_links(link):
    """Return ``link`` as an absolute URL, resolved against the Sony India site.

    Args:
        link (str): An absolute URL, a protocol-relative URL (``//host/...``)
            or a site-relative path such as ``/image/<id>``.

    Returns:
        str: ``link`` unchanged when it already carries a scheme; otherwise
        the link resolved against ``https://www.sony.co.in``.
    """
    # urljoin decides by URL structure rather than by searching for the text
    # "https", so "http://" links are not prefixed with the host and a path
    # that merely contains "https" in its query string is still resolved.
    return urljoin('https://www.sony.co.in', link)

In [4]:
# Function to extract product titles

def get_title(soup):
    """Collect the product names shown on the listing page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[str]: The whitespace-stripped text of every
        ``div.GalleryListItem__ProductCode`` element, in the order
        ``find_all`` returns them. An empty list is returned when nothing
        matches or when ``soup`` cannot be searched.
    """
    try:
        title_tags = soup.find_all('div', class_='GalleryListItem__ProductCode')
        return [tag.text.strip() for tag in title_tags]
    except AttributeError:
        # Fall back to an empty list (not an empty string) so callers always
        # receive the same type regardless of which path was taken.
        return []

In [5]:
def get_product_images(soup):
    """Collect the ``src`` values of the loaded lazy images on the page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        list[str]: The non-empty ``src`` attribute of every ``<img>`` found
        with the class string ``'v-lazy-image v-lazy-image-loaded'``.
        Images without a ``src`` are skipped. An empty list is returned when
        ``soup`` cannot be searched.
    """
    try:
        # NOTE(review): a multi-class string is presumably matched against the
        # exact class attribute value, so class order matters -- confirm
        # against the live markup.
        image_tags = soup.find_all('img', class_='v-lazy-image v-lazy-image-loaded')
        sources = [tag.get('src') for tag in image_tags]
    except AttributeError:
        return []

    # `.get('src')` yields None when the attribute is absent; dropping those
    # here keeps later string checks (e.g. `"https" in image`) from raising.
    return [src for src in sources if src]

In [17]:
names = get_title(soup)
# Drop duplicate URLs while keeping page order. A plain set() would make the
# "first matching image" below depend on hash order and vary between runs.
images = list(dict.fromkeys(get_product_images(soup)))

# Step 1: Filter the images list
filtered_images = []
for image in images:
    if not image:
        continue  # Skip <img> entries that carried no usable src
    if image.startswith("https"):
        filtered_images.append(image)  # Already an absolute https URL
    elif image.startswith("/image"):
        filtered_images.append(correct_links(image))  # Site-relative path

# Step 2: Associate names with filtered images
result = []
processed_names = set()  # Lower-cased names that already have an image

for name in names:
    key = name.lower()
    # An empty name would be a substring of every URL, so skip it; also skip
    # names that were already matched (tracked under the same lower-cased key
    # that is looked up here).
    if not key or key in processed_names:
        continue

    # Case-insensitive substring match; take the first image that fits.
    match = next((image for image in filtered_images if key in image.lower()), None)
    if match is not None:
        result.append({"name": name, "image": match})
        processed_names.add(key)

In [18]:
# Show the scraped product names, then the usable image URLs, one list per line
print(names, filtered_images, sep="\n")

['MDR-M1', 'MDR-7506', 'MDR-MV1', 'WH-1000XM5', 'WH-1000XM4', 'ULT WEAR', 'WH-CH720N', 'WH-CH520', 'MDR-ZX110A/MDR-ZX110AP/null']
['https://sony.scene7.com/is/image/sonyglobalsolutions/wh-ch520_Primary_image_beige?$mediaCarouselSmall$&fmt=png-alpha', 'https://www.sony.co.in/image/8e2364ea770052eb890f78ce3ad936f5?fmt=png-alpha&wid=660&hei=660', 'https://sony.scene7.com/is/image/sonyglobalsolutions/wh-ch720_Primary_image_white?$mediaCarouselSmall$&fmt=png-alpha', 'https://sony.scene7.com/is/image/sonyglobalsolutions/OMH_Primary_image_1200?$mediaCarouselSmall$&fmt=png-alpha', 'https://www.sony.co.in/image/38ea5815d12ab90a45b9b1a35520b794?fmt=png-alpha&wid=660&hei=660', 'https://sony.scene7.com/is/image/sonyglobalsolutions/wh-ch720_Primary_image_blue?$mediaCarouselSmall$&fmt=png-alpha', 'https://sony.scene7.com/is/image/sonyglobalsolutions/Primary_image_1-1?$mediaCarouselSmall$&fmt=png-alpha', 'https://www.sony.co.in/image/5d02da5df552836db894cead8a68f5f3?fmt=png-alpha&wid=660&hei=660', 'h

In [19]:
# Number of image URLs that survived filtering
print(f"{len(filtered_images)}")

12


In [21]:
# Step 3: Create a DataFrame
# Explicit columns keep the CSV header in place even when no name matched.
df = pd.DataFrame(result, columns=["name", "image"])

# Save the DataFrame to a CSV file in the current user's Downloads folder,
# resolved at run time instead of hard-coding one account's absolute path.
output_csv = Path.home() / "Downloads" / "Headband_Headphone.csv"
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)

print("CSV file created successfully!")

CSV file created successfully!
