In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import os


In [13]:
# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# URL of the search pagesol
url = "https://www.cdiscount.com/search/10/fifa+25.html#_his_"  # Page containing multiple products
driver.get(url)

In [14]:
# List to store product data
product_data = []

try:
    # Find all products on the page
    products = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "abLabel"))  # Class of product container
    )

    for product in products:
        product_info = {}

        # Scrape the title
        try:
            title_element = product.find_element(By.CLASS_NAME, "prdtTit")
            product_info["Title"] = title_element.text.strip()
        except Exception as e:
            product_info["Title"] = "Error (Title)"
            print(f"Error (title): {e}")

        # Scrape the price
        try:
            price_element = product.find_element(By.CLASS_NAME, "c-price")
            # Remove unwanted characters (in case there's any unwanted encoding)
            price = price_element.text.strip().replace('â‚¬', '€')  # Fix encoding issues for euro sign
            product_info["Price"] = price
        except Exception as e:
            product_info["Price"] = "Error (Price)"
            print(f"Error (price): {e}")

        # Scrape the rating and number of reviews
        try:
            review_element = product.find_element(By.CLASS_NAME, "c-stars-rating")
            
            # Get rating from data-score
            rating_element = review_element.find_element(By.CLASS_NAME, "c-stars-result")
            rating = rating_element.get_attribute("data-score")
            rating = float(rating) / 20  # Convert score out of 100 to a 5-star rating
            product_info["Rating"] = f"{rating} stars"

            # Get the number of reviews
            reviews_count_element = review_element.find_element(By.CLASS_NAME, "c-stars-rating__label")
            reviews_count = reviews_count_element.text.strip()
            product_info["Reviews"] = reviews_count
        except Exception as e:
            product_info["Rating"] = "Error (Rating)"
            product_info["Reviews"] = "Error (Reviews)"
            print(f"Error (reviews and rating): {e}")

        # Add the product data to the list
        product_data.append(product_info)

except Exception as e:
    print(f"Error retrieving products: {e}")

Error (reviews and rating): Message: no such element: Unable to locate element: {"method":"css selector","selector":".c-stars-rating"}
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00EAFD53+23747]
	(No symbol) [0x00E37D54]
	(No symbol) [0x00D0BE53]
	(No symbol) [0x00D4FCA6]
	(No symbol) [0x00D4FEEB]
	(No symbol) [0x00D45A81]
	(No symbol) [0x00D71E44]
	(No symbol) [0x00D459A4]
	(No symbol) [0x00D72094]
	(No symbol) [0x00D8B41E]
	(No symbol) [0x00D71B96]
	(No symbol) [0x00D43F3C]
	(No symbol) [0x00D44EBD]
	GetHandleVerifier [0x0118AC73+3017699]
	GetHandleVerifier [0x0119B93B+3086507]
	GetHandleVerifier [0x011940F2+3055714]
	GetHandleVerifier [0x00F45AF0+637536]
	(No symbol) [0x00E40A5D]
	(No symbol) [0x00E3DA28]
	(No symbol) [0x00E3DBC5]
	(No symbol) [0x00E307F0]
	BaseThreadInitThunk [0x75BDFCC9+25]
	RtlGetAppContain

Error (reviews and rating): Message: no such element: Unable to locate element: {"method":"css selector","selector":".c-stars-rating"}
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00EAFD53+23747]
	(No symbol) [0x00E37D54]
	(No symbol) [0x00D0BE53]
	(No symbol) [0x00D4FCA6]
	(No symbol) [0x00D4FEEB]
	(No symbol) [0x00D45A81]
	(No symbol) [0x00D71E44]
	(No symbol) [0x00D459A4]
	(No symbol) [0x00D72094]
	(No symbol) [0x00D8B41E]
	(No symbol) [0x00D71B96]
	(No symbol) [0x00D43F3C]
	(No symbol) [0x00D44EBD]
	GetHandleVerifier [0x0118AC73+3017699]
	GetHandleVerifier [0x0119B93B+3086507]
	GetHandleVerifier [0x011940F2+3055714]
	GetHandleVerifier [0x00F45AF0+637536]
	(No symbol) [0x00E40A5D]
	(No symbol) [0x00E3DA28]
	(No symbol) [0x00E3DBC5]
	(No symbol) [0x00E307F0]
	BaseThreadInitThunk [0x75BDFCC9+25]
	RtlGetAppContain

In [15]:
# Convert product data into a DataFrame
df = pd.DataFrame(product_data)

# Define the Excel file path
excel_file = "cdiscount_data.xlsx"

# Check if the Excel file already exists
if os.path.exists(excel_file):
    # If the file exists, load it and append the new data
    existing_df = pd.read_excel(excel_file, engine='openpyxl')
    updated_df = pd.concat([existing_df, df], ignore_index=True)
    updated_df.to_excel(excel_file, index=False, engine='openpyxl')  # Save the updated data to Excel
else:
    # If the file does not exist, create a new one with headers
    df.to_excel(excel_file, index=False, engine='openpyxl')

In [32]:
# Close the browser
driver.quit()

In [3]:
# Function to clean Cdiscount data
def clean_cdiscount_data(file_path):
    try:
        # Read the Excel file
        df = pd.read_excel(file_path, engine='openpyxl')

        # Check the columns in the DataFrame
        print(f"Columns in the file: {df.columns.tolist()}")

        # Remove products with missing or "Error (Title)" titles
        df = df[df['Title'] != "Error (Title)"]
        df = df.dropna(subset=['Title'])  # Drop rows with empty or NaN titles

        # Remove products with "Error (Price)" or missing prices
        df = df[df['Price'] != "Error (Price)"]
        df['Price'] = df['Price'].str.replace("€", "").str.replace(",", ".").astype(float)  # Convert prices to float
        df = df.dropna(subset=['Price'])  # Drop rows with invalid or NaN prices

        # Remove products with "Error (Rating)" or missing ratings
        df = df[df['Rating'] != "Error (Rating)"]
        df['Rating'] = df['Rating'].str.extract(r"([0-9.]+)").astype(float)  # Extract numerical values
        df = df.dropna(subset=['Rating'])  # Drop rows with invalid or NaN ratings

        # Remove products with "Error (Reviews)" or missing reviews
        df = df[df['Reviews'] != "Error (Reviews)"]
        df['Reviews'] = df['Reviews'].str.replace(",", "").str.extract(r"(\d+)").astype(int)  # Extract numerical values
        df = df.dropna(subset=['Reviews'])  # Drop rows with invalid or NaN reviews

        # Add a column to identify the platform (Cdiscount PlatformID=2)
        df['PlatformID'] = 2  # PlatformID=2 for Cdiscount

        # Save the cleaned file
        cleaned_file_path = "cdiscount_data_cleaned.xlsx"
        df.to_excel(cleaned_file_path, index=False, engine='openpyxl')

        print(f"Cleaned file saved: {cleaned_file_path}")
        print(df.head())  # Display the first rows for validation

    except Exception as e:
        print(f"Error while cleaning data: {e}")


# Main function call
if __name__ == "__main__":
    # Path to the raw Cdiscount file
    file_path = "cdiscount_data.xlsx"

    # Check if the file exists
    if os.path.exists(file_path):
        clean_cdiscount_data(file_path)
    else:
        print(f"The file {file_path} does not exist.")


Colonnes du fichier : ['Title', 'Price', 'Rating', 'Reviews']
Fichier nettoyé sauvegardé : cdiscount_data_cleaned.xlsx
                                               Title   Price  Rating  Reviews  \
0  Pack PS5 Digital : Console PS5 Digitale (Modèl...  459.99     4.5     1831   
1  Pack PS5 Standard : Console PS5 (Modèle Slim) ...  599.99     4.5     1985   
2  Pack PS5 Digital : Console PS5 Digitale (Modèl...  499.99     4.5     1831   
3  Console PlayStation 5 - Edition Standard (Modè...  549.00     4.5     1985   
4  Pack PS5 Digital : Console PlayStation 5 (modè...  493.00     4.5     1831   

   PlatformID  
0           2  
1           2  
2           2  
3           2  
4           2  
