In [25]:
##Right One
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv

def get_image_links(product_url, base_url, timeout=10, image_prefix="http://202.5.28.66/img/"):
    """Return the image URLs found on a product detail page.

    Args:
        product_url: Product page URL; may be relative to ``base_url``.
        base_url: Base URL used to resolve a relative ``product_url``.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single stalled product page cannot block the whole scrape.
        image_prefix: Only ``<img>`` tags whose ``src`` contains this string
            are returned.

    Returns:
        A list of ``src`` values in document order. The list is empty when the
        request fails, times out, or answers with a non-200 status.
    """
    print(f"Fetching images from product page: {product_url}")

    full_product_url = urljoin(base_url, product_url)

    try:
        response = requests.get(full_product_url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are treated like a bad status code:
        # report and carry on with the remaining products.
        print(f"Failed to retrieve the product page: {full_product_url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the product page: {full_product_url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    image_links = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if src and image_prefix in src:
            image_links.append(src)

    return image_links

def scrape_page(content, base_url, debug=True):
    """Parse one gallery listing page and collect its products.

    Args:
        content: HTML source of the listing page.
        base_url: Base URL used to resolve relative thumbnail links and passed
            on to ``get_image_links`` for the product detail pages.
        debug: When true (the default), print the prettified HTML of the page
            before extracting products.

    Returns:
        A list with one
        ``[title, main_page_images, additional_images, price, product_link]``
        entry per ``div.gallary_item.album`` block that yielded a title, a
        price or a product link. ``additional_images`` is filled from the
        product's detail page when a product link exists, otherwise it is an
        empty list.
    """
    soup = BeautifulSoup(content, 'html.parser')

    if debug:
        # Dump the whole page to help diagnose selector mismatches.
        print("Page Content:")
        print(soup.prettify())

    products = []
    for div in soup.find_all('div', class_='gallary_item album'):
        main_page_images = []
        for img in div.find_all('img', style='max-width: 180px;max-height: 180px;'):
            link = img.get('src', '')
            if not link:
                # An <img> without a src carries no usable URL; skipping it
                # keeps empty strings out of the image columns.
                continue
            if not link.startswith('http'):
                # NOTE(review): only the last path segment of a relative src
                # is kept before joining with base_url - confirm that the
                # thumbnails really live directly under base_url.
                link = urljoin(base_url, link.split('/')[-1])
            main_page_images.append(link)

        span = div.find('span', class_='name')
        title = span.text.strip() if span is not None else ''

        price = ''
        p_tag = div.find('p')
        strong = p_tag.find('strong') if p_tag is not None else None
        if strong is not None:
            price = strong.text.strip()

        a_tag = div.find('a', target='_blank')
        product_link = a_tag.get('href', '') if a_tag is not None else ''

        if title or price or product_link:  # Only add products with at least some data
            products.append([title, main_page_images, [], price, product_link])

    # Second pass: visit each product's detail page for its additional images.
    for product in products:
        product_link = product[4]
        if product_link:
            product[2] = get_image_links(product_link, base_url)

    print(f"Scraped {len(products)} products from the current page.")
    return products

def scrape_index_file(url):
    """Scrape the gallery starting at ``url`` and write the results to ``right.csv``.

    Opens ``url`` in Chrome, scrapes the landing page, then clicks the
    "next page" control up to ``max_pages`` times, scraping each page reached
    (so at most ``max_pages + 1`` pages in total). Pagination stops early on
    the first error, e.g. when no clickable next-page element appears within
    the wait timeout.

    The CSV has one row per product: the title, one column per main-page
    image (padded with blanks up to the widest product), the additional
    images joined with ``", "``, and the price.

    Args:
        url: Address of the gallery landing page.
    """
    base_url = 'http://www.jcluxuryshop.store/'
    max_pages = 2  # Number of follow-up pages to visit after the landing page
    output_path = 'right.csv'

    all_products = []
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        # Scrape the landing page
        print("Scraping the landing page.")
        all_products.extend(scrape_page(driver.page_source, base_url))

        # Handle pagination
        page_counter = 0
        while page_counter < max_pages:
            try:
                # Click the "Next" button
                next_page_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@name='whj_nextPage']")))
                next_page_button.click()
                print(f"Navigating to page {page_counter + 2}.")
                time.sleep(5)  # Wait for the page to load

                # Scrape the next page
                all_products.extend(scrape_page(driver.page_source, base_url))

                page_counter += 1
            except Exception as e:
                print(f"Error while navigating to the next page: {e}")
                break
    finally:
        # Always close the browser, including on Ctrl-C or an unexpected
        # error, so no Chrome/chromedriver process is left behind.
        driver.quit()

    # Widest main-image list decides how many image columns the CSV gets;
    # default=0 keeps this from raising when nothing was scraped.
    max_main_images = max((len(product[1]) for product in all_products), default=0)

    # Write the data to the CSV file
    with open(output_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        header = ['Title'] + [f'Main Image {i+1}' for i in range(max_main_images)] + ['Additional Images'] + ['Price']
        writer.writerow(header)
        for product in all_products:
            padding = [''] * (max_main_images - len(product[1]))
            row = [product[0]] + product[1] + padding + [', '.join(product[2])] + [product[3]]
            writer.writerow(row)

    print(f"Data has been written to {output_path}")

# Gallery landing page to start from. scrape_index_file opens it in Chrome,
# scrapes it together with the pages that follow, and writes the collected
# products to right.csv.
url = 'http://www.jcluxuryshop.store/show/main.html?u=jc'
scrape_index_file(url)



Scraping the landing page.
Page Content:
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <base href="http://www.jcluxuryshop.store:80/"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Luxury Bags, Shoes, Clothes, Belts Hot Sale, Worldwild Shipping
  </title>
  <meta content="Louboutin, GZ, MCM, Yeezy" name="keywords"/>
  <meta content="Luxury Bags, Shoes, Clothes Wholesale, Worldwild Shipping" name="description"/>
  <style>
   #copy_notice {
	position: absolute;
	z-index: 1103;
	height: 15px;
	width: 80px;
	padding: 5px;
	border: 1px solid #eee;
	background: #FFFFEE;
}
  </style>
  <meta content="MeiuPic 2.2.0" name="generator"/>
  <link href="http://www.jcluxuryshop.store:80/source/img/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="http://www.jcluxuryshop.store:80/source/css/main.css" rel="stylesheet" type="text/css"/>
  <link href="http://www.jcluxuryshop.store:80/source/css/style.css" rel="stylesheet" type="text/css"/

Fetching images from product page: show/detail.html?u=jc&id=137438
Fetching images from product page: show/detail.html?u=jc&id=137437
Fetching images from product page: show/detail.html?u=jc&id=137436
Fetching images from product page: show/detail.html?u=jc&id=137435
Fetching images from product page: show/detail.html?u=jc&id=137434
Fetching images from product page: show/detail.html?u=jc&id=137433
Fetching images from product page: show/detail.html?u=jc&id=137432
Fetching images from product page: show/detail.html?u=jc&id=137431
Fetching images from product page: show/detail.html?u=jc&id=137430
Fetching images from product page: show/detail.html?u=jc&id=137429
Fetching images from product page: show/detail.html?u=jc&id=137428


KeyboardInterrupt: 

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

# Path to your chromedriver executable (machine-specific; adjust before running elsewhere)
chromedriver_path = 'C:\\Users\\anask\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe'

# Setup Chrome WebDriver with ChromeDriver
options = Options()
# options.add_argument("--headless")  # Uncomment to run Chrome in headless mode
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

# URL of the webpage
url = "https://www.daraz.pk/#?"

# Open the URL
driver.get(url)

# Wait (up to 10 seconds) for the search box element to be present in the DOM.
# NOTE(review): presence_of_element_located does not check that the element is
# visible or enabled; the search button below uses element_to_be_clickable -
# confirm whether the same condition is wanted here.
wait = WebDriverWait(driver, 10)
search = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "search-box__input--O34g")))

# Search for 'Ear Buds'
search.send_keys('Ear Buds')

# Click the search button once it is clickable
search_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "search-box__button--1oH7")))
search_button.click()

# Fixed pause to give the results page time to load
time.sleep(5)

# Lists to store the scraped data. Every product appends exactly one entry to
# each list, so the lists stay index-aligned for the DataFrame built later.
Names = []
Prices = []
Discounts = []
Solds = []
Links = []

# Page counter
page_counter = 0
max_pages = 2  # Set the maximum number of pages to scrape

# Loop through the result pages until max_pages is reached or navigation fails
while True:
    try:
        # Wait until the data elements are loaded
        wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "buTCk")))

        # Get page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all the data elements on the current page using BeautifulSoup
        data = soup.find_all('div', class_='buTCk')

        for item in data:
            # Extract the product name and link. The link is read defensively
            # so that a missing <a> or href does not discard a name that was
            # extracted successfully.
            Name = ""
            Link = ""
            try:
                name_element = item.find('div', class_='RfADt')
                if name_element is not None:
                    Name = name_element.get_text(strip=True)
                    anchor = name_element.find('a')
                    if anchor is not None:
                        # NOTE(review): the href is stored exactly as found in
                        # the page; relative or protocol-relative URLs are not
                        # resolved here.
                        Link = anchor.get('href', '')
            except Exception as e:
                print(f"Error extracting name: {e}")
            Names.append(Name)
            Links.append(Link)

            # Extract and append the product price
            try:
                price_element = item.find('span', class_='ooOxS')
                Price = price_element.get_text(strip=True) if price_element else ""
            except Exception as e:
                print(f"Error extracting price: {e}")
                Price = ""
            Prices.append(Price)

            # Extract and append the product discount
            try:
                discount_element = item.find('span', class_='IcOsH')
                Discount = discount_element.get_text(strip=True) if discount_element else ""
            except Exception as e:
                print(f"Error extracting discount: {e}")
                Discount = ""
            Discounts.append(Discount)

            # Extract and append the number of items sold
            try:
                sold_element = item.find('span', class_='_1cEkb')
                Sold = sold_element.get_text(strip=True) if sold_element else ""
            except Exception as e:
                print(f"Error extracting sold items: {e}")
                Sold = ""
            Solds.append(Sold)

        # Increment the page counter
        page_counter += 1
        print(f"Page {page_counter} scraped.")

        # Check if the maximum number of pages has been reached
        if page_counter >= max_pages:
            print("Maximum number of pages reached.")
            break

        # Try to click the next button to go to the next page
        try:
            # Wait for the "Next" button with the specific class to be clickable
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".ant-pagination-next button"))
            )
            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(5)  # Wait for the next page to load
        except Exception as e:
            print(f"Error navigating to next page: {e}")
            print("No more pages or unable to click the next button.")
            break

    except Exception as e:
        # Any failure while loading or parsing a page ends the scrape; the
        # rows collected so far are kept.
        print(f"Error on page {page_counter}: {e}")
        break

# Save the scraped data to a CSV file. The browser is closed in a finally
# clause so that a failed write (for example, the CSV being open in another
# program) does not leave the Chrome session running.
try:
    pd.DataFrame({
        'Name': Names,
        'Price': Prices,
        'Discount': Discounts,
        'Sold': Solds,
        'Links': Links
    }).to_csv('Daraz_new2.csv', index=False)
finally:
    # Close the WebDriver
    driver.quit()


Page 1 scraped.
Page 2 scraped.
Maximum number of pages reached.
