# Freeglisse Stock Scraping

## Setup and Configuration

In [1]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm

## Data Collection Process

### Test link response

In [3]:
# Smoke test: check that the listing endpoint answers before scraping it.
# The timeout keeps the cell from blocking forever if the site never responds.
r = requests.get("https://freeglisse.com/fr/12-ski-occasion?page=1", timeout=10)
print(r.status_code)

200


### Function to get all product URLs

In [33]:
def scrape_product_links(base_url, timeout=10):
    """Collect product links from every page of a paginated listing.

    Pages are requested as ``{base_url}?page=N`` starting at N=1. Crawling stops when
    a page contains no ``<h2 class="h3 product-title">`` element, when a page yields
    no link that was not already collected (guards against a site that keeps serving
    the same page for out-of-range page numbers), or when a response status is not 200.

    Args:
        base_url: Listing URL without the ``page`` query parameter.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        list: The ``href`` values of the product title anchors, in page order.

    Raises:
        requests.RequestException: If a request fails at the network level
            (including a timeout).
    """
    all_links = []
    seen = set()
    page_num = 1
    while True:
        page_url = f"{base_url}?page={page_num}"  # Change the URL scheme according to your site

        # Get the HTML content of the page
        response = requests.get(page_url, timeout=timeout)
        if response.status_code != 200:
            # Stop the loop if an error occurs during the query
            print(f"Failed to retrieve page {page_num}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all h2 tags with class "h3 product-title"
        product_titles = soup.find_all('h2', class_='h3 product-title')

        # If no tags are found, it means we have reached the end of the pages
        if not product_titles:
            break

        # Extract the links of this page; titles without an anchor or href are skipped
        # instead of raising a TypeError on ``None['href']``.
        page_links = []
        for title in tqdm(product_titles, desc="scrape_product_links", unit='title'):
            anchor = title.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                page_links.append(href)

        # A page that brings nothing new means the listing is exhausted.
        if all(link in seen for link in page_links):
            break

        all_links.extend(page_links)
        seen.update(page_links)

        # Go to the next page
        page_num += 1

    return all_links

In [None]:
base_url = 'https://freeglisse.com/fr/12-ski-occasion'
product_links = scrape_product_links(base_url)

# Show all retrieved links, one per line. A progress bar is not used here because
# its redraws would be interleaved with the printed links.
for link in product_links:
    print(link)

### Function to get all links on a single page

In [4]:
def scrape_product_link(base_url, timeout=10):
    """Collect the product links found on a single listing page.

    Args:
        base_url: Full URL of the listing page to fetch (requested as-is).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        list: The ``href`` values of the anchors inside
        ``<h2 class="h3 product-title">`` elements; empty if the response status
        is not 200 or no such element is present.

    Raises:
        requests.RequestException: If the request fails at the network level
            (including a timeout).
    """
    all_link = []

    # Get the HTML content of the page
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        # Report the failure instead of silently returning an empty list
        print(f"Failed to retrieve {base_url}. Status code: {response.status_code}")
        return all_link

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all h2 tags with class "h3 product-title"
    for title in soup.find_all('h2', class_='h3 product-title'):
        # Titles without an anchor or href are skipped rather than crashing
        anchor = title.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            all_link.append(href)

    return all_link

In [None]:
# Single-page check: collect the product links of listing page 8 only.
base_url = 'https://freeglisse.com/fr/12-ski-occasion?page=8' 
product_link = scrape_product_link(base_url)
# Last expression of the cell, so the list is displayed as the cell output.
product_link

### Function to extract all the data we need from a single product page

In [15]:
def _count_available_units(driver, button_xpath, availability_xpath, pause=1, max_clicks=500):
    """Click the element at ``button_xpath`` while the availability text contains "Disponible".

    Returns the number of clicks performed. Stops after ``max_clicks`` clicks so that a
    page whose availability text never changes cannot trap the scraper in an endless loop.
    """
    count = 0  # number of clicks so far; the caller records it as the stock figure
    while count < max_clicks:
        availability_message = driver.find_element(By.XPATH, availability_xpath).text
        if "Disponible" not in availability_message:
            return count
        driver.find_element(By.XPATH, button_xpath).click()  # request one more unit
        count += 1
        time.sleep(pause)  # Wait for the page to react to the click
    print(f"Stopped after {max_clicks} clicks: the availability message never changed")
    return count


def products(url, quality_xpath, base_xpath, button_xpath, availability_xpath,
             pause=1, max_clicks=500):
    """Scrape the stock of one product page, per quality and per in-stock size.

    Opens a new Chrome session, loads ``url``, clicks every ``<li>`` under
    ``quality_xpath`` in turn, and for each one clicks every ``<li>`` under
    ``base_xpath`` whose ``class`` attribute contains ``instock``. For each such size
    the element at ``button_xpath`` is clicked repeatedly while the text at
    ``availability_xpath`` contains "Disponible"; the number of clicks is recorded as
    the quantity.

    Args:
        url: Product page to load.
        quality_xpath: XPath of the list whose ``li`` children are the quality choices.
        base_xpath: XPath of the list whose ``li`` children are the size choices.
        button_xpath: XPath of the element clicked to raise the quantity.
        availability_xpath: XPath of the element holding the availability text.
        pause: Seconds to sleep after each click so the page can update.
        max_clicks: Upper bound on quantity clicks per size (safety net against a
            message that never changes).

    Returns:
        dict: ``{url: {quality_text: {size_text: count}}}``. If an error occurs
        part-way, it is printed and the qualities completed so far are returned.
    """
    driver = webdriver.Chrome()  # setting up the Webdriver; always closed in `finally`

    all_products_availability = {}

    try:
        driver.get(url)

        # Wait for quality items to load
        qualities = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, f"{quality_xpath}/li"))
        )

        for quality_index in range(1, len(qualities) + 1):
            # Elements are looked up again by index on every iteration instead of reusing
            # the handles returned by the wait (presumably because the page re-renders
            # after a click -- TODO confirm).
            quality_li_element = driver.find_element(By.XPATH, f"{quality_xpath}/li[{quality_index}]")
            quality_text = quality_li_element.find_element(By.XPATH, "./label/span").text
            quality_li_element.click()

            # Allow time for the quality selection to update
            time.sleep(pause)
            product_availability = {}

            # Wait for the size items to be present for the selected quality
            sizes = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, f"{base_xpath}/li"))
            )

            for index in range(1, len(sizes) + 1):
                li_element = driver.find_element(By.XPATH, f"{base_xpath}/li[{index}]")

                # get_attribute returns None when the attribute is absent; treat that as
                # "not in stock" instead of raising TypeError and aborting the whole page.
                if 'instock' not in (li_element.get_attribute('class') or ''):
                    continue

                size_text = li_element.find_element(By.XPATH, "./label/span").text
                li_element.click()

                # Wait for the page to react to size selection
                time.sleep(pause)

                product_availability[size_text] = _count_available_units(
                    driver, button_xpath, availability_xpath,
                    pause=pause, max_clicks=max_clicks,
                )
                time.sleep(pause)  # Pause before moving to the next size

            all_products_availability[quality_text] = product_availability

    except Exception as e:
        # Deliberate best effort: report the problem and return what was collected.
        print(f"Une erreur s'est produite : {e}")

    finally:
        driver.quit()
    return {url: all_products_availability}

### Scraping urls from all pages

In [None]:
# Crawl every listing page. `product_link` is rebound here to the links of all pages
# and is what the stock-scraping loop iterates over.
base_url = 'https://freeglisse.com/fr/12-ski-occasion'  
product_link = scrape_product_links(base_url)
# Last expression of the cell, so the list is displayed as the cell output.
product_link

### Stock scraping according to quality and size of all pages on the site

In [None]:
# XPaths handed to products():
#   quality_xpath      -> list whose <li> children are the quality choices
#   sizes_xpath        -> list whose <li> children are the size choices (the `base_xpath` argument)
#   button_xpath       -> element clicked repeatedly to raise the quantity
#   availability_xpath -> element whose text is checked for "Disponible"
quality_xpath = '/html/body/main/section/div/div/div/section/div[1]/div[2]/div[3]/div[2]/form/div[1]/div[1]/ul'
sizes_xpath = '/html/body/main/section/div/div/div/section/div[1]/div[2]/div[3]/div[2]/form/div[1]/div[2]/ul'
button_xpath = '//*[@id="add-to-cart-or-refresh"]/div[2]/div[1]/div[1]/div/span[3]/button[1]/i'
availability_xpath = '/html/body/main/section/div/div/div/section/div[1]/div[2]/div[3]/div[2]/form/div[2]/span/span'
# One {url: {quality: {size: count}}} dict per product link, appended as each page
# finishes so that results gathered so far remain available if the loop is interrupted.
stock_total = []

# products() opens and closes a Chrome session for every link.
for link in tqdm(product_link, desc='products', unit='link'):
    stock = products(link, quality_xpath, sizes_xpath, button_xpath, availability_xpath)
    stock_total.append(stock)

# Last expression of the cell, so the collected list is displayed as the cell output.
stock_total

### Creation of the CSV with the results of the inventory

In [None]:
import csv

csv_file = 'stocks.csv'

# Open CSV file in write mode. The encoding is explicit because the header contains
# non-ASCII characters and pandas reads CSV files as UTF-8 by default.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write headers
    writer.writerow(['URL', 'Qualité', 'Taille', 'Quantité'])

    # `stock_total` is the list built by the scraping loop: one
    # {url: {quality: {size: quantity}}} dict per product. Flatten it to one row
    # per (url, quality, size).
    for item in stock_total:
        for url, quality_info in item.items():
            for quality, size_info in quality_info.items():
                for size, quantity in size_info.items():
                    writer.writerow([url, quality, size, quantity])

print(f"Datas have been written in {csv_file}")


### Dataframe Display

In [47]:
# Load the exported CSV back into a DataFrame to inspect the result.
# NOTE(review): the saved output below shows an "Unnamed: 0" column, which the
# csv.writer cell in this notebook does not produce -- the displayed file was
# presumably written some other way; confirm before relying on it.
df= pd.read_csv('stocks.csv')
# Last expression of the cell, so the DataFrame is rendered as the cell output.
df

Unnamed: 0,URL,Qualité,Taille,Quantité
0,https://freeglisse.com/fr/ski-occasion-adulte-...,Qualité A,156 cm,16
1,https://freeglisse.com/fr/ski-occasion-adulte-...,Qualité A,164 cm,14
2,https://freeglisse.com/fr/ski-occasion-adulte-...,Qualité A,172 cm,9
3,https://freeglisse.com/fr/ski-de-fond-occasion...,Qualité C,150 cm,11
4,https://freeglisse.com/fr/ski-de-fond-occasion...,Qualité C,160 cm,12
...,...,...,...,...
219,https://freeglisse.com/fr/ski-occasion-junior-...,Qualité A,120 cm,3
220,https://freeglisse.com/fr/ski-occasion-junior-...,Qualité A,110 cm,3
221,https://freeglisse.com/fr/ski-occasion-junior-...,Qualité B,130 cm,3
222,https://freeglisse.com/fr/ski-de-rando-occasio...,Qualité B,178 cm,3
