# Scraping Freeglisse.com

In [87]:
### SCRAPING
import requests as rq
from bs4 import BeautifulSoup

### DATABASES
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

### VISUALISATION
#import plotly.express as px

### DIVERS
from tqdm import tqdm

### FORMAT
from datetime import datetime, timezone
import time
import json

## Définition des variables

In [88]:
# Identify the target site: `website` and `tld` are combined into the base URL,
# `abbr` is the short label used in the output file names and report title.
website = 'freeglisse'
abbr = 'freeglisse'
tld = '.com'

In [89]:
# Listing page for second-hand skis, and the URL of its first results page.
page_number = 1
top_url = 'https://www.' + website + tld + '/fr/12-ski-occasion'
top_search_url = top_url + '?page=' + str(page_number)

## Définition des fonctions

### Obtenir le nombre de pages de résultats de recherche

In [90]:
def get_max_page_results(top_search_url, timeout=30):
    """Return the number of result pages advertised by a search-results page.

    The page is downloaded and its ``<a class="js-search-link">`` links are
    collected; the second-to-last one is read as the last page number
    (presumably the very last link is the "next" arrow -- TODO confirm
    against the site markup).

    Args:
        top_search_url: URL of a search-results page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The last page number as an ``int``, or ``1`` when the page exposes
        fewer than two such links (no pagination to read).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the second-to-last link does not contain an integer.
    """
    r = rq.get(top_search_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    # Explicit parser: same one get_article_info uses, and no bs4 warning.
    soup = BeautifulSoup(r.content, 'lxml')
    page_links = soup.find_all('a', class_='js-search-link')

    # Without at least two links there is no [-2] entry to read.
    if len(page_links) < 2:
        return 1
    return int(page_links[-2].text)


get_max_page_results(top_search_url)

36

### Obtenir la liste des pages de résultats de recherche

In [91]:
def get_all_results_pages(page_final):
    """Build the URL of every results page, from page 1 up to ``page_final``.

    The URLs are derived from the notebook-level ``top_url`` by appending
    ``?page=<n>``.

    Args:
        page_final: Number of the last results page (inclusive).

    Returns:
        List of results-page URLs in ascending page order.
    """
    return [f'{top_url}?page={page}' for page in range(1, page_final + 1)]


get_all_results_pages(get_max_page_results(top_search_url))

['https://www.freeglisse.com/fr/12-ski-occasion?page=1',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=2',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=3',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=4',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=5',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=6',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=7',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=8',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=9',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=10',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=11',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=12',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=13',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=14',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=15',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=16',
 'https://www.freeglisse.com/fr/12-ski-occasion?page=17',
 'https://www.freegliss

### Obtenir tous les liens des articles sur les pages de résultats de recherche

In [92]:
def get_articles_on_page(search_results, timeout=30):
    """Collect the article URLs listed on each search-results page.

    Args:
        search_results: Iterable of search-results page URLs.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        List of ``href`` values, in page order, taken from the first link
        found inside every ``<h2 class="h3 product-title">`` element.
    """
    urls_articles = []
    # The context manager releases the pooled connections when scraping ends.
    with rq.Session() as s:
        for p in tqdm(search_results):
            r = s.get(p, timeout=timeout)
            # Explicit parser: same one get_article_info uses.
            soup = BeautifulSoup(r.content, 'lxml')
            for article in soup.find_all('h2', class_='h3 product-title'):
                link = article.find('a', href=True)
                # A title without a link would raise TypeError on ['href'].
                if link is not None:
                    urls_articles.append(link['href'])
    return urls_articles

In [93]:
# Chain the three helpers: page count -> results-page URLs -> article URLs.
# NOTE(review): the name `all` shadows Python's built-in all(); it is kept
# here because a later cell reads it.
all = get_articles_on_page(
    get_all_results_pages(
        get_max_page_results(top_search_url)
    )
)

100%|██████████| 36/36 [00:40<00:00,  1.12s/it]


In [94]:
# Number of article URLs collected across all results pages.
print(len(all))

844


### Récupérer les infos qu'on souhaite sur chaque article

In [95]:
import pandas as pd
from bs4 import BeautifulSoup

# Errors raised when an expected tag or attribute is missing, or when a price
# or weight is not numeric. Anything else (e.g. KeyboardInterrupt) propagates.
_EXTRACTION_ERRORS = (AttributeError, KeyError, TypeError, ValueError)


def _meta_content(soup, prop):
    """Return the ``content`` attribute of the ``<meta property=prop>`` tag."""
    return soup.find("meta", {"property": prop})["content"]


def get_article_info(r, url_article):
    """Extract the fields of one article page into a ``pd.Series``.

    Every field starts as ``None``. Fields for which a selector is defined
    are then read from the parsed page; if one of them cannot be read (tag or
    attribute missing, non-numeric price/weight) it stays ``None`` and
    ``page_has_issues`` is set to ``True``. The remaining fields
    (``price_valid_date``, ``reg_price``, ``color``, ``quality``, ``size``,
    ``type``, ``user``, ``level``, ``co2``, ``product_type``, ``review``,
    ``rating``, ``nb_rating``, ``nb_1`` .. ``nb_5``) have no selector yet and
    are always ``None``.

    Args:
        r: HTTP response of the article page; only ``r.content`` is read.
        url_article: URL of the article, stored in the ``url`` field.

    Returns:
        ``pd.Series`` indexed by field name, in the order of the defaults
        below.
    """
    soup = BeautifulSoup(r.content, 'lxml')

    # Default values; the key order defines the order of the Series index.
    data = {
        "url": url_article,
        "name": None,
        "price_valid_date": None,
        "available": None,
        "reg_price": None,
        "current_price": None,
        "cats": None,
        "ref": None,
        "color": None,
        "brand": None,
        "desc": None,
        "quality": None,
        "size": None,
        "weight": None,
        "weight_type": None,
        "img": None,
        "pretax_price": None,
        "type": None,
        "user": None,
        "level": None,
        "co2": None,
        "product_type": None,
        "page_has_issues": False,
        "review": None,
        "rating": None,
        "nb_rating": None,
        "nb_1": None,
        "nb_2": None,
        "nb_3": None,
        "nb_4": None,
        "nb_5": None
    }

    # One zero-argument callable per field that has a selector. Each one is
    # evaluated independently so a missing tag only affects its own field.
    extractors = {
        "name": lambda: soup.find("title").text,
        "available": lambda: soup.find("span", {"id": "availability_message"}).text.strip(),
        "current_price": lambda: float(soup.find("span", {"class": "current-price-value"})["content"]),
        # find_all returns an empty list when nothing matches, giving "".
        "cats": lambda: ", ".join(cat.text.strip() for cat in soup.find_all("span", {"class": "rb-items"})),
        "ref": lambda: _meta_content(soup, "product:sku"),
        "brand": lambda: _meta_content(soup, "product:brand"),
        "desc": lambda: _meta_content(soup, "og:description"),
        "weight": lambda: float(_meta_content(soup, "product:weight:value")),
        "weight_type": lambda: _meta_content(soup, "product:weight:units"),
        "img": lambda: _meta_content(soup, "og:image"),
        "pretax_price": lambda: float(_meta_content(soup, "product:pretax_price:amount")),
    }

    for field, extract in extractors.items():
        try:
            data[field] = extract()
        except _EXTRACTION_ERRORS:
            data["page_has_issues"] = True

    return pd.Series(data)


## Fonction finale

In [96]:
# Single sample article URL used to try the scraper on one article.
article_list = ['https://freeglisse.com/fr/ski-de-fond-occasion-alternatif-sns-profil/18974-434106-ski-de-fond-occasion-rossignol-lts-junior-fixation-sns-profil.html#/892-taille_skis-150_cm/1768-etat_du_materiel-qualite_c']

### Fonction finale

In [97]:
# Package imports
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Record the start time; the elapsed time is printed after the scraping call below
start_time = time.time()

def scrape_all_articles(urls, batch_size=250, max_workers=20, timeout=30):
    """Scrape every article URL in parallel, save the table as CSV, return it.

    URLs are submitted to a thread pool in batches of ``batch_size``; each
    page is downloaded through one shared ``requests`` session and parsed
    with ``get_article_info``.

    Args:
        urls: List of article URLs.
        batch_size: Number of URLs submitted to the pool at a time.
        max_workers: Number of worker threads.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        DataFrame with one row per URL, holding the article fields followed
        by ``website`` and ``scraping_date``. A URL whose download or parsing
        raised keeps its ``url``, has ``page_has_issues`` set to ``True`` and
        every other article field missing.

    Side effects:
        Writes ``scraping_{abbr}.csv`` ('|'-separated, index included) to the
        working directory and prints a progress bar, per-URL errors and a
        final summary.
    """
    # Ceiling division: an exact multiple of batch_size is not over-counted.
    n_batches = -(-len(urls) // batch_size)

    results = []
    # The session is closed only after the pool has finished all its tasks.
    with rq.Session() as session:

        def process_url(url):
            """Download and parse one article; never raises."""
            try:
                r = session.get(url, timeout=timeout)
                return get_article_info(r, url)
            except Exception as e:
                print(f"Erreur pour {url}: {str(e)}")
                # Keep the failed URL in the output, flagged, rather than an
                # empty row that cannot be traced back through its columns.
                return pd.Series({"url": url, "page_has_issues": True})

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Walk through the URL list one batch at a time.
            for i in range(0, len(urls), batch_size):
                batch = urls[i:i+batch_size]
                futures = [executor.submit(process_url, url) for url in batch]

                # Collect results as soon as each task finishes.
                for future in tqdm(as_completed(futures), total=len(batch), desc=f"Batch {i//batch_size + 1} / {n_batches}"):
                    results.append(future.result())

    # Build the final DataFrame.
    df_final = pd.DataFrame(results)

    # Add the computed columns.
    df_final["website"] = "https://freeglisse.com"
    df_final["scraping_date"] = pd.Timestamp.now(tz="UTC")

    # Explicit column order. The computed columns must be listed here too,
    # otherwise reindex() drops them.
    column_names = [
        "url", "name", "price_valid_date", "available", "reg_price", "current_price",  # Generic information
        "cats", "ref", "color", "brand", "desc",  # Category and description
        "quality", "size",  # Condition and size
        "weight", "weight_type", "img", "pretax_price",  # Meta information
        "type", "user", "level", "co2", "product_type",  # Product features
        "page_has_issues",  # Page problems
        "review", "rating", "nb_rating",  # Ratings
        "nb_1", "nb_2", "nb_3", "nb_4", "nb_5",  # Rating breakdown
        "website", "scraping_date"  # Computed columns
    ]
    df_final = df_final.reindex(columns=column_names, fill_value=pd.NA)

    # Save as CSV.
    df_final.to_csv(f'scraping_{abbr}.csv', sep='|', index=True)
    print(f"Scraping terminé. Total d'articles : {len(df_final)}")
    return df_final

# Run the scraper on the sample list and keep the resulting DataFrame.
df = scrape_all_articles(article_list)

# Print the elapsed wall-clock time, in seconds, since start_time was recorded.
end_time = time.time()
print(end_time - start_time)

Batch 1 / 1:   0%|          | 0/1 [00:00<?, ?it/s]

Batch 1 / 1: 100%|██████████| 1/1 [00:00<00:00,  1.74it/s]

Scraping terminé. Total d'articles : 1
0.5899395942687988





In [98]:
# Build a profiling report of the scraped DataFrame with ydata_profiling
profile = ProfileReport(df, title=f"{abbr.title()} Scraping Report", explorative=True)

# Write the report to an HTML file
profile.to_file(f"scraping_report_{abbr}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)
  series = series.fillna(np.nan)


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]