In [1]:
import os
import requests
import re
import json
import urllib
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import ssl

# SECURITY NOTE: this replaces the standard library's default HTTPS context
# factory with the unverified one, so certificate and hostname checks are
# skipped for stdlib HTTPS clients that rely on it (e.g. the
# urllib.request.urlretrieve call further down in this file).
# NOTE(review): presumably added to work around a local certificate problem;
# confirm it is still needed, since it leaves those downloads open to
# man-in-the-middle tampering.
ssl._create_default_https_context = ssl._create_unverified_context

def save_to_json(data, website, category):
    """Write ``data`` as JSON to ``<cwd>/<website>/<category>/<category>.json``.

    Both ``website`` and ``category`` are used as directory names (and
    ``category`` also as the file stem), so they are sanitised first:
    characters that are path separators or invalid in Windows file names are
    replaced with ``_``. Names that contain none of those characters are used
    unchanged.

    Args:
        data: JSON-serialisable object to store.
        website: Name of the top-level output folder.
        category: Name of the sub-folder and of the JSON file inside it.

    Returns:
        The absolute path of the JSON file that was written.
    """
    def _safe_component(name):
        # Scraped titles can contain "/", ":", "?" and similar; left as-is they
        # either create unintended nested paths or make open() fail.
        cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', str(name))
        # Trailing dots/spaces are rejected by Windows, and stripping dots
        # also neutralises ".." so the output stays under the website folder.
        cleaned = cleaned.strip().rstrip('.')
        return cleaned or '_'

    website_name = _safe_component(website)
    category_name = _safe_component(category)

    # makedirs creates the website folder and the category folder in one go.
    category_dir = os.path.join(os.getcwd(), website_name, category_name)
    os.makedirs(category_dir, exist_ok=True)

    filename = os.path.join(category_dir, f"{category_name}.json")
    with open(filename, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

    return filename

def scrape_zara_product(url):
    """Scrape category, colour, sizes and title from a Zara product page.

    The category and colour are derived from the browser page title, which is
    split on ``-`` and ``|``. Sizes are read from the size-selector list and
    the product title from the page's ``<h1>``. The result is printed and
    stored through ``save_to_json`` under the ``Zara`` folder, using the
    product title as the sub-folder / file name.

    Args:
        url: Address of the Zara product page.
    """
    website = 'Zara'
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # The page title is split on dashes: the second-to-last segment is
        # treated as the category text and the last one as the colour text.
        title_parts = driver.title.split('-')

        category = "Category Not Found"
        if len(title_parts) >= 2:
            category_words = title_parts[-2].split()
            # Guard against an empty segment (e.g. a title with "--").
            if category_words:
                category = category_words[-1]

        # Anything after a pipe in the last segment is dropped.
        color = title_parts[-1].split('|')[0].strip()

        # Default keeps the JSON payload well-defined when no sizes are shown.
        sizes_text = ""
        try:
            sizes_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'size-selector-list__wrapper--grid-gap'))
            )
        except TimeoutException:
            print("Sizes are not available for this product.")
        else:
            size_items = sizes_element.find_elements(By.XPATH, './/li[@role="option"]')
            # Read each label once; every .text access is a WebDriver call.
            size_labels = [size_item.text for size_item in size_items]
            sizes = [label for label in size_labels if "VIEW SIMILAR" not in label]
            sizes_text = ', '.join(sizes)
            print("Sizes: " + sizes_text)

        print("Category: " + category)
        print("Color: " + color)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        h1_element = soup.find(
            'h1',
            class_='product-detail-info__header-name',
            attrs={'data-qa-qualifier': 'product-detail-info-name'},
        )
        if h1_element:
            product_title = h1_element.get_text(strip=True)
            print("Product Title:", product_title)
        else:
            # A placeholder is still needed because the title names the
            # output folder and file.
            product_title = "Product Title Not Found"
            print("Product Title Not Found")

        data = {
            "Sizes": sizes_text,
            "Category": category,
            "Color": color,
        }

        save_to_json(data, website, product_title)
    finally:
        # Always close the browser, even if scraping failed part-way.
        driver.quit()
    
def scrape_pryka_product_with_images(url):
    """Scrape a Pryka product page and download its gallery thumbnails.

    Categories, colours, sizes and the product title are read from the page
    and printed. Thumbnails are saved under
    ``<cwd>/Pryka/<product title>/Product Images`` and the collected data is
    stored through ``save_to_json`` using the product title as the
    sub-folder / file name.

    Args:
        url: Address of the Pryka product page.
    """
    from urllib.parse import urljoin, urlparse

    def scrape_and_save_images(driver, website_name, category):
        """Download every thumbnail listed in the gallery's ``<ol>`` element.

        Args:
            driver: WebDriver whose current page is parsed.
            website_name: Top-level output folder name.
            category: Sub-folder name (the caller passes the product title).
        """
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        image_ol = soup.find('ol', class_='flex-control-nav flex-control-thumbs')
        if not image_ol:
            print("No images found on the page.")
            return

        # Resolve against the page URL so relative or protocol-relative
        # "src" values are still downloadable.
        image_urls = [
            urljoin(driver.current_url, img['src'])
            for img in image_ol.find_all('img', src=True)
        ]
        num_images = len(image_urls)

        product_dir = os.path.join(os.getcwd(), website_name, category, "Product Images")
        os.makedirs(product_dir, exist_ok=True)

        saved_count = 0
        for i, image_url in enumerate(image_urls, start=1):
            try:
                # Without a timeout a stalled connection would hang forever.
                response = requests.get(image_url, timeout=30)
            except requests.RequestException as error:
                print(f"Failed to download image {i}: {image_url} ({error})")
                continue

            if response.status_code != 200:
                print(f"Failed to download image {i}: {image_url}")
                continue

            # Take the extension from the URL path only, so a query string
            # such as "?v=2" does not end up in the file name.
            image_extension = os.path.splitext(urlparse(image_url).path)[1] or '.jpg'
            filename = f"{i}-{num_images}{image_extension}"
            filepath = os.path.join(product_dir, filename)

            with open(filepath, 'wb') as f:
                f.write(response.content)

            saved_count += 1
            print(f"Image {i}/{num_images} saved: {filename}")

        if saved_count == num_images:
            print(f"All {num_images} images saved to {product_dir}")
        else:
            print(f"{saved_count} of {num_images} images saved to {product_dir}")

    website_name = "Pryka"
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Wait for the element with class "posted_in" that holds the tags.
        categories_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'posted_in'))
        )
        categories = categories_element.find_elements(By.XPATH, './/a[@rel="tag"]')
        category_names = [category.text for category in categories]
        print(f"Website: {website_name}")

        if category_names:
            # The last tag is reported as the main category.
            last_category = category_names[-1]
            print("Category: " + last_category)
            print("Subcategory: ")
            for name in category_names:
                print(name)
        else:
            # Placeholder keeps the JSON payload well-defined.
            last_category = "Category Not Found"
            print("Category Not Found")
            print("Categories Not Found")

        # NOTE(review): the first matching span is skipped - presumably it
        # holds non-colour text; confirm against the page markup.
        color_spans = driver.find_elements(By.XPATH, '//span[@data-sheets-value]')
        colors = [
            text
            for text in (span.text.strip() for span in color_spans[1:])
            if text
        ]
        if colors:
            print("Colors:")
            for color in colors:
                print(color)

        # The size table lives in the "Additional Information" tab.
        additional_info_tab = driver.find_element(By.XPATH, '//a[@href="#tab-additional_information"]')
        additional_info_tab.click()

        sizes_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//th[text()="Size"]/following-sibling::td/p'))
        )
        sizes_text = sizes_element.text.strip()
        print("Sizes:", sizes_text)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        h1_element = soup.find('h1', class_='product_title entry-title')
        if h1_element:
            product_title = h1_element.get_text(strip=True)
            print("Product Title:", product_title)
        else:
            # A placeholder is still needed because the title names the
            # output folders and the JSON file.
            product_title = "Product Title Not Found"
            print("Product Title Not Found")

        scrape_and_save_images(driver, website_name, product_title)

        data = {
            "Website": website_name,
            "Category": last_category,
            "Subcategories": category_names,
            "Sizes": sizes_text,
        }

        save_to_json(data, website_name, product_title)
    finally:
        # Always close the browser, even if scraping failed part-way.
        driver.quit()
    
def scrape_papa_product(url):
    """Scrape category, breadcrumbs, colour and composition from a Papa Don't Preach page.

    The category is the text after the first ``-`` in the product title, the
    subcategories are the breadcrumb links after the first one, and colour /
    composition are pulled out of the "Product Information" accordion panel
    with regular expressions. The result is printed and stored through
    ``save_to_json`` using the category as the sub-folder / file name.

    Args:
        url: Address of the product page.
    """
    website = "Papa Don't Preach"
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        product_title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="product__title"]/h1'))
        )
        full_title = product_title_element.text.strip()

        # Everything after the first dash is treated as the category.
        category = full_title.partition('-')[2].strip()
        if category:
            print("Category:", category)
        else:
            # Placeholder is required: the category names the output file.
            category = "Category Not Found"
            print("Category Not Found")

        breadcrumbs_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="page-width breadcrumbs"]'))
        )
        breadcrumbs = breadcrumbs_element.find_elements(By.XPATH, './/a')

        # The first breadcrumb link is skipped; the rest are subcategories.
        subcategories = [breadcrumb.text for breadcrumb in breadcrumbs[1:]]
        if breadcrumbs:
            print("Subcategories:")
            for subcategory in subcategories:
                print(subcategory)
        else:
            print("Subcategories Not Found")

        # Open the "Product Information" accordion item before reading it.
        product_info_tab_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//li[@class="accordion-item"]/h3[text()="Product Information"]'))
        )
        product_info_tab_element.click()

        product_info_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//p[@class="accordion-panel"]'))
        )

        # Read the panel text once; each .text access is a WebDriver call.
        panel_text = product_info_element.text
        color_match = re.search(r'Color:\s*(\w+)', panel_text)
        composition_match = re.search(r'Composition:\s*([\w\s,]+)', panel_text)

        color = color_match.group(1) if color_match else "Not Found"
        composition = composition_match.group(1) if composition_match else "Not Found"

        print("Color:", color)
        print("Composition:", composition)

        data = {
            "Website": website,
            "Category": category,
            "Subcategories": subcategories,
            "Color": color,
            "Composition": composition
        }

        save_to_json(data, website, category)
    finally:
        # Always close the browser, even if scraping failed part-way.
        driver.quit()
    
    
def scrape_kshitijjalori_product(url):
    """Scrape title, description and main image from a Kshitij Jalori product page.

    A leading ``<digits>/<digits>`` prefix is removed from the product title
    and the cleaned title is used as category, folder name and image file
    name. The description paragraphs after the first one are collected, the
    main image is downloaded into ``Kshitij Jalori/<title>/<title>.jpg`` and
    the data is stored through ``save_to_json``.

    Args:
        url: Address of the product page.
    """
    # "import urllib" alone does not load the request submodule, so import it
    # explicitly instead of relying on another library having done so.
    import urllib.request
    from urllib.parse import urljoin

    website = "Kshitij Jalori"
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        product_title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//h1[@class="h2 product-single__title"]'))
        )
        product_title = product_title_element.text.strip()

        # Drop a leading "<digits>/<digits> " prefix from the title.
        product_title_cleaned = re.sub(r'^\d+/\d+\s+', '', product_title)
        print("Category:", product_title_cleaned)

        folder_structure = os.path.join(website, product_title_cleaned)
        os.makedirs(folder_structure, exist_ok=True)

        product_description_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="product-single__description rte"]'))
        )

        # The first <p> of the description is skipped.
        product_details_elements = product_description_element.find_elements(By.XPATH, './/p')[1:]
        product_details = '\n'.join(element.text.strip() for element in product_details_elements)
        print("Product Details:")
        print(product_details)

        image_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="image-wrap"]/img[contains(@class, "photoswipe__image")]'))
        )
        image_source = image_element.get_attribute('data-photoswipe-src')

        if image_source:
            # urljoin copes with protocol-relative ("//host/..."), relative
            # and already-absolute values alike.
            image_absolute_url = urljoin(driver.current_url, image_source)
            image_filename = f"{product_title_cleaned}.jpg"
            image_filepath = os.path.join(folder_structure, image_filename)
            try:
                urllib.request.urlretrieve(image_absolute_url, image_filepath)
            except OSError as error:
                # Network failures (URLError is an OSError) should not stop
                # the textual data from being saved below.
                print(f"Failed to download image: {image_absolute_url} ({error})")
            else:
                print(f"Image downloaded and saved: {image_filepath}")
        else:
            print("Image URL not found; skipping image download.")

        data = {
            "Website": website,
            "Category": product_title_cleaned,
            "ProductDetails": product_details
        }

        save_to_json(data, website, product_title_cleaned)
    finally:
        # Always close the browser, even if scraping failed part-way.
        driver.quit()

    
def scrape_gucci_website(url):
    """Scrape category, colour and filtered detail lines from a Gucci product page.

    The ``<meta name="title">`` content is cut at the first ``|`` and then
    split at its last standalone `` in `` into category and colour. Detail
    list items are read from the ``product-detail`` element, skipping lines
    that contain any of the excluded keywords. The result is printed and
    stored through ``save_to_json`` using the category as the sub-folder /
    file name.

    Args:
        url: Address of the Gucci product page.
    """
    website = "Gucci"
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        title_meta_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'meta[name="title"]'))
        )

        # get_attribute returns None when the attribute is absent.
        category_full = title_meta_element.get_attribute("content") or ""
        category_unsplit = category_full.split('|')[0].strip()

        # Split on the word "in" only; a plain split('in') would also cut
        # inside words such as "Printed" or "lining".
        name_part, separator, color_part = category_unsplit.rpartition(' in ')
        if separator:
            category = name_part.strip()
            color = color_part.strip()
        else:
            category = category_unsplit
            color = ''

        if not category:
            # Placeholder is required: the category names the output file.
            category = "Category Not Found"

        print("Category:", category)
        print("Color:", color)

        # Scroll the details drawer into view before waiting for its content.
        accordion_drawer = driver.find_element(By.ID, 'product-details')
        driver.execute_script("arguments[0].scrollIntoView();", accordion_drawer)

        details_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'product-detail'))
        )

        soup = BeautifulSoup(details_element.get_attribute("outerHTML"), 'html.parser')
        details_items = soup.select('.product-detail li')

        # Lines containing any of these (case-insensitively) are left out.
        exclude_keywords = ['made in', 'the model is', 'shoulder', 'back length', 'guarantees', 'image']

        kept_lines = []
        for item in details_items:
            line = item.get_text(strip=True)
            lowered = line.lower()
            if not any(keyword in lowered for keyword in exclude_keywords):
                kept_lines.append(line)
        details_text = '\n'.join(kept_lines)

        print("Details:")
        print(details_text)

        data = {
            "Website": website,
            "Category": category,
            "Color": color,
            "ProductDetails": details_text
        }

        save_to_json(data, website, category)
    finally:
        # Always close the browser, even if scraping failed part-way.
        driver.quit()
    
    
# Example usage: hand the URL to the scraper of the first site whose domain
# marker occurs in it; the table order is the matching priority.
url = 'https://pryka.in/product/off-white-bloom-maxi-dress/'

site_scrapers = (
    ('zara.com', scrape_zara_product),
    ('pryka.in', scrape_pryka_product_with_images),
    ('papadontpreach.com', scrape_papa_product),
    ('kshitijjalori.com', scrape_kshitijjalori_product),
    ('gucci.com', scrape_gucci_website),
)

for site_marker, scraper in site_scrapers:
    if site_marker in url:
        scraper(url)
        break
else:
    # No marker matched the URL.
    print("Unsupported website.")


Website: Pryka
Category: Dress & Maxis
Subcategory: 
Bali
Clothes
Collections
Dress & Maxis
Sizes: L, M, S, XL, XS, XXL
Product Title: Margarita by the Beach Maxi Dress
Image 1/6 saved: 1-6.jpg
Image 2/6 saved: 2-6.jpg
Image 3/6 saved: 3-6.jpg
Image 4/6 saved: 4-6.jpg
Image 5/6 saved: 5-6.jpg
Image 6/6 saved: 6-6.jpg
All 6 images saved to D:\VS Code Projects\Pryka Intern\Pryka\Margarita by the Beach Maxi Dress\Product Images
