In [1]:
import requests
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [2]:
# Set the path to the Chromedriver
DRIVER_PATH = 'C:/Users/yurek/Downloads/chromedriver-win64/chromedriver-win64/chromedriver.exe'

service = Service(executable_path=DRIVER_PATH)
options = webdriver.ChromeOptions()

# Enable headless mode
options.add_argument('--headless=new')
options.add_argument("--window-size=1920,1200")  # Set the window size

# Initialize the Chrome driver
driver = webdriver.Chrome(service=service, options=options)

URL = 'https://www.sony.co.in/bravia/google-tv'

driver.get(URL)
webpage = driver.page_source

In [3]:
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(webpage, 'html.parser')
all_links = soup.find_all('a', class_='GalleryListItem__Button js-datalayer-action-event')
product_links = [link['href'] for link in all_links]

def correct_links(link):
    if 'https' not in link:
        return 'https://www.sony.co.in' + link
    return link

final_links = [correct_links(link) for link in product_links]
print(final_links)
print(len(final_links))

['https://www.sony.co.in/bravia/products/bravia-9', 'https://www.sony.co.in/bravia/products/bravia-8', 'https://www.sony.co.in/bravia/products/bravia-7', 'https://www.sony.co.in/bravia/products/x64l-series', 'https://www.sony.co.in/bravia/products/x74l-series', 'https://www.sony.co.in/bravia/products/x75l-series', 'https://www.sony.co.in/bravia/products/a95l-series', 'https://www.sony.co.in/bravia/products/a80l-a83l-a84l-series']
8


In [4]:
# Function to extract product title

def get_title(soup):
    
    try:
        # Outer tag object 
        title = soup.find('h1', class_='CategoryNav__PdpHeaderTitleName')

        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

In [5]:
# Function to extract Product Price

def get_model(soup):
    
    try:
        models = soup.find_all('h4', class_='ProductSummaryModels__ModelCode')
        models_value = [model.text for model in models]
        model_list = [model_value.strip() for model_value in models_value]

    except AttributeError:
        model_list = []
    
    return model_list

In [6]:
def get_description(soup):
    
    try:
        title = soup.find('p', class_='ProductSummary__Headline').text
        description = soup.find('div', class_='ProductSummary__BodyCopy').text
        product_description = title + ' : ' + description

    except AttributeError:
        product_description = ''
    
    return product_description

In [7]:
def get_card_highlights(soup):
    
    try:
        # Outer tag object 
        card_highlights = soup.find('div', class_='CategoryNav__PdpHeaderModel')

        # Inner NavigatableString Object
        card_highlights = card_highlights.text

        # Title as a string value
        card_highlights = card_highlights.strip()

    except AttributeError:
        card_highlights = ""

    return card_highlights

In [8]:
def get_size(soup):
    
    try:
        sizes = soup.find_all('div', class_='ProductSummaryModels__Differenciator')
        sizes_value = [size.text for size in sizes]
        size_list = [size_value.strip() for size_value in sizes_value]

    except AttributeError:
        size_list = []
    
    return size_list

In [9]:
def get_price(soup):

    try:
        prices = soup.find_all("span", class_='ProductSummaryModels__ModelPriceNumber js-product-price js-datalayer-price-details')
        prices_value = [price.text for price in prices]
        price_list = [price_value.strip() for price_value in prices_value]

    except AttributeError:
        price_list = []

    return price_list

In [10]:
# Function to extract product Reviews

def get_rating(soup):
    
    try:
        # Outer tag object 
        review = soup.find('span', class_='StarRatings__Count js-datalayer-averageRating')

        # Inner NavigatableString Object
        review_value = review.text

        # Title as a string value
        review_string = review_value.strip()

    except AttributeError:
        review_string = ""

    return review_string

In [11]:
# Function to extract product Reviews

def get_review_count(soup):
    
    try:
        # Outer tag object 
        count = soup.find('span', class_='ProductSummary__TotalRate js-datalayer-reviewCount')

        # Inner NavigatableString Object
        count_value = count.get('data-review-count')

        # Title as a string value
        count_string = count_value.strip()

    except AttributeError:
        count_string = ""
    
    return count_string

In [12]:
def get_product_images(soup):

    try:
        # Outer tag object
        images = soup.find_all('div', class_='ProductIntroPlate__ThumbImage -LargeScreen')
        image_links = [image.get('data-background-image-hires') for image in images]
    
    except AttributeError:
        image_links = []

    return image_links

In [13]:

def extract_key_value_pairs(input_array):
    result = []

    for string in input_array:
        # Match uppercase phrases ending with a colon, followed by their values
        matches = re.findall(r'([A-Z\s\.\-]+)\s*:\s*([^:]+)', string)
        for match in matches:
            key = match[0].strip()  # Convert key to title case
            value = match[1].strip()       # Trim whitespace around the value
            result.append(f"{key} = {value}")

    return result

def get_product_features(soup):

    try:
        # Find all <ul> elements with the class "GalleryListItem__TopFeatures"
        ul_tags = soup.find_all('ul', class_='GalleryListItem__TopFeatures')

        # Extract all <li> tags from these <ul> elements
        feature_list = []
        for ul in ul_tags:
            li_tags = ul.find_all('li')
            feature_list.extend([li.get_text(strip=True) for li in li_tags])

    except AttributeError:
        feature_list = []

    return feature_list

In [14]:
driver.get(final_links[0])
response = driver.page_source
soup1 = BeautifulSoup(response, 'html.parser')
title = get_title(soup1)
print(title)
model = get_model(soup1)
print(model)
size = get_size(soup1)
print(size)
price = get_price(soup1)
print(price)
rating = get_rating(soup1)
print(rating)
count = get_review_count(soup1)
print(count)
image_links = get_product_images(soup1)
print(image_links)
card_highlights = get_card_highlights(soup1)
print(card_highlights)
product_description = get_description(soup1)
print(product_description)
feature_list = get_product_features(soup)
print(feature_list)

BRAVIA 9
['K-75XR90', 'K-85XR90']
['189 cm (75)', '215 cm (85)']
['Rs.649,900', 'Rs.899,900']
4.8
5
[]
BRAVIA 9 | XR Processor | Mini LED | 4K Ultra HD | High Dynamic Range (HDR) | Smart TV (Google TV)
Our brightest 4K ever : 
Our flagship Mini LED TV and brightest 4K ever delivers pictures full of unparalleled brightness and incomparably beautiful colours. See all the action come vividly to life, perfectly matched by sound including the world's first Beam Tweeter in a TV. A truly unforgettable cinematic experience.

['ULTIMATE CONTRAST: High Peak Luminance | XR Backlight Master Drive™', "WORLD'S FIRST TV WITH BEAM TWEETER: Acoustic Multi Audio+™", 'FLAWLESS PICTURE, LESS ENERGY: Eco Dashboard 2', "EVERYTHING YOU STREAM. ALL ON ONE SCREEN.: Google TV™* (*Google TV is the name of this device's software experience and a trademark of Google LLC.)", 'DESIGN BLENDS IN WITH YOUR LIVING SPACE: Harmonic Presence', 'LEVEL UP YOUR GAMING: Perfect for PlayStation 5', 'SLIM AND CHIC DESIGN: Harmon

In [15]:
data = {'name': [], 'model': [], 'size': [], 'price': [], 'rating': [], 'review_count': [], 'images': [], 'highlights': [], 'description': [], 'features': [], 'sub_category': 'Google TV', 'category': 'TVs'}

for link in final_links:
    driver.get(link)
    new_webpage = driver.page_source
    new_soup = BeautifulSoup(new_webpage, 'html.parser')

    # The function call to display all the product information
    data['name'].append(get_title(new_soup))
    data['model'].append(get_model(new_soup))
    data['size'].append(get_size(new_soup))
    data['price'].append(get_price(new_soup))
    data['rating'].append(get_rating(new_soup))
    data['review_count'].append(get_review_count(new_soup))
    data['images'].append(get_product_images(new_soup))
    data['highlights'].append(get_card_highlights(new_soup))
    data['description'].append(get_description(new_soup))
    data['features'].append(get_product_features(soup))

In [16]:
sony_df = pd.DataFrame.from_dict(data)
sony_df

Unnamed: 0,name,model,size,price,rating,review_count,images,highlights,description,features,sub_category,category
0,BRAVIA 9,"[K-75XR90, K-85XR90]","[189 cm (75), 215 cm (85)]","[Rs.649,900, Rs.899,900]",4.8,5.0,[],BRAVIA 9 | XR Processor | Mini LED | 4K Ultra ...,Our brightest 4K ever : \nOur flagship Mini LE...,[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs
1,BRAVIA 8,"[K-55XR80, K-65XR80]","[139 cm (55), 164 cm (65)]","[Rs.299,900, Rs.429,900]",4.4,20.0,[],BRAVIA 8 | XR Processor | OLED | 4K Ultra HD |...,"Aesthetic slimness, blends in beautifully : \n...",[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs
2,BRAVIA 7,"[K-55XR70, K-65XR70, K-75XR70]","[139 cm (55), 164 cm (65), 189 cm (75)]","[Rs.249,990, Rs.299,990, Rs.449,990]",4.1,15.0,[],BRAVIA 7 | XR Processor | Mini LED | 4K Ultra ...,"Perfectly balanced, entertains everyone brilli...",[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs
3,X64L Series,"[KD-43X64L, KD-50X64L]","[108 cm (43), 126 cm (50)]","[Rs.59,900, Rs.74,900]",4.2,,[https://sony.scene7.com/is/image/sonyglobalso...,X64L | 4K Ultra HD | High Dynamic Range (HDR) ...,Thrilling entertainment.4K HDR. : \nOur smart ...,[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs
4,X74L Series,"[KD-55X74L, KD-65X74L]","[139 cm (55), 164 cm (65)]","[Rs.99,900, Rs.139,900]",4.2,,[https://sony.scene7.com/is/image/sonyglobalso...,X74L | 4K Ultra HD | High Dynamic Range (HDR) ...,Thrilling entertainment.4K HDR. : \nOur smart ...,[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs
5,X75L Series,"[KD-43X75L, KD-50X75L, KD-55X75L, KD-65X75L]","[108 cm (43), 126 cm (50), 139 cm (55), 164 cm...","[Rs.69,900, Rs.85,900, Rs.99,900, Rs.139,900]",4.2,,[https://sony.scene7.com/is/image/sonyglobalso...,X75L | 4K Ultra HD | High Dynamic Range (HDR) ...,Thrilling entertainment.4K HDR. : \nOur smart ...,[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs
6,A95L Series,"[XR-55A95L, XR-65A95L]","[139 cm (55), 164 cm (65)]","[Rs.399,900, Rs.479,900]",4.5,247.0,[https://sony.scene7.com/is/image/sonyglobalso...,A95L | BRAVIA XR | MASTER Series | OLED | 4K U...,Infinite colours.Definitive contrast. : \nOur ...,[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs
7,A80L Series,"[XR-55A80L, XR-65A80L, XR-77A80L, XR-83A80L]","[139 cm (55), 164 cm (65), 195 cm (77), 210 cm...","[Rs.249,900, Rs.349,900, Rs.699,900, Rs.849,900]",4.6,793.0,[https://sony.scene7.com/is/image/sonyglobalso...,A80L | BRAVIA XR | OLED | 4K Ultra HD | High D...,Pitch black.Rich sound. : \nOur Pure Black OLE...,[ULTIMATE CONTRAST: High Peak Luminance | XR B...,Google TV,TVs


In [17]:
sony_df.to_csv("C:/Users/yurek/Downloads/sony_TV_Google-TV.csv", header=True, index=False)