# Freeglisse Products Reviews Scrapping

## Setup and Configuration

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import re

## Data collection process

### Function to get all URLs products

In [2]:
def get_all_product_urls(base_url):
    all_product_urls = []
    current_page = 1
    while True:
        # Build the current page URL by adding the pagination parameter
        url = f"{base_url}?page={current_page}"
        response = requests.get(url)
        # Break the loop if the page request was unsuccessful
        if response.status_code != 200:
            break 

        soup = BeautifulSoup(response.content, 'html.parser')
        products = soup.find_all('div', class_='product')

        # Find the URL for each product and add it to the list
        page_product_urls = [p.find('a')['href'] for p in products if p.find('a')]
        if not page_product_urls:
            break  # Stop if no product URLs are found on the page

        all_product_urls.extend(page_product_urls)

        # Check if there is a next page
        next_button = soup.find('a', rel='next')
        if not next_button or 'disabled' in next_button.get('class', []):
            break  # Stop if there is no 'next' button or if it's disabled
        
        current_page += 1  # Increment the page number
        
    return all_product_urls


# URL of the first page of products
base_url = 'https://freeglisse.com/fr/12-ski-occasion'
product_urls = get_all_product_urls(base_url)



### Function to extract all datas we need of a single page

In [4]:
def scrape_product_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the canonical link; use conditional to check for the tag's existence
    canonical_link = soup.find('link', rel='canonical')['href'] if soup.find('link', rel='canonical') else None
    
    # Extract the client's name
    client_name_tag = soup.find('p', class_='netreviews_customer_name')
    # Remove surrounding quotes if present and check for tag's existence
    client_name = client_name_tag.get_text(strip=True).strip('"') if client_name_tag else None
    
    # Extract the comment's publication date
    # Use find_next_sibling to find the date related to the client name tag
    comment_date_tag = client_name_tag.find_next_sibling('span') if client_name_tag else None
    # Split the text to get the date after 'le' and strip any whitespace
    comment_date = comment_date_tag.get_text(strip=True).split('le')[-1].strip() if comment_date_tag else None


    # Extract the order date
    order_date = None
    order_date_tag = soup.find('span', class_='order_date')
    if order_date_tag:
        # Split the text to get the date after 'du' and strip any whitespace
        order_date = order_date_tag.get_text(strip=True).split('du')[-1].strip()


    # Extract the rating
    rating = None
    rating_tag = soup.find('div', class_='netreviews_reviews_rate')
    if rating_tag:
        # Extract the rating text directly
        rating = rating_tag.get_text(strip=True)

    # Extract the comment
    comment = None
    comment_tag = soup.find('p', class_='netreviews_customer_review')
    if comment_tag:
        # Extract the comment text directly
        comment = comment_tag.get_text(strip=True)

    # Construct and return a dictionary of the product's information
    return {
        'link': canonical_link,
        'client_name': client_name,
        'order_date': order_date,
        'comment_date': comment_date,
        'rating': rating,
        'comment': comment
    }


### Extract datas of all pages

In [5]:
# URL of the first page of the products
base_url = 'https://freeglisse.com/fr/12-ski-occasion'
product_urls = get_all_product_urls(base_url)

# List to store details of all products
all_product_details = []

# Browse each product URL and retrieve details
for url in product_urls:
    details = scrape_product_page(url)
    all_product_details.append(details)

all_product_details

[{'link': 'https://freeglisse.com/fr/ski-occasion-adulte-freeride-et-freestyle/18916-ski-occasion-rossignol-sender-104-ti-2023-fixations.html',
  'client_name': 'FERNAND L.publié le 17/04/2024suite à une commande du 09/04/2024',
  'order_date': '09/04/2024',
  'comment_date': None,
  'rating': '5/5',
  'comment': "Hormis le problème de fixations après avoir pris le soins de les lustré préparé les cares et fartage la notation qualité A n'ai pas mis en  défaut."},
 {'link': 'https://freeglisse.com/fr/ski-occasion-adulte-freeride-et-freestyle/18915-ski-occasion-rossignol-sender-94-ti-2023-fixations.html',
  'client_name': 'Peio A.publié le 11/11/2023suite à une commande du 01/11/2023',
  'order_date': '01/11/2023',
  'comment_date': None,
  'rating': '4/5',
  'comment': 'good cuality'},
 {'link': 'https://freeglisse.com/fr/ski-de-fond-occasion-alternatif-norme-sns/18974-ski-de-fond-occasion-rossignol-lts-junior-fixation-sns-profil.html',
  'client_name': None,
  'order_date': None,
  'com

### Convert result into a Dataframe

In [6]:
df = pd.DataFrame(all_product_details)

In [7]:
df.columns

Index(['link', 'client_name', 'order_date', 'comment_date', 'rating',
       'comment'],
      dtype='object')

## Data cleaning and preparation

In [8]:
# Apply extraction on 'client_name' column
df[['client_name_clean', 'published_date']] = df['client_name'].str.extract(r'^(.*?)publié le (\d{2}/\d{2}/\d{4})')

# Convert extracted date strings to datetime
df['published_date'] = pd.to_datetime(df['published_date'], format='%d/%m/%Y', errors='coerce')

# Show results to check
print(df[['client_name_clean', 'published_date']].head())
df

  client_name_clean published_date
0        FERNAND L.     2024-04-17
1           Peio A.     2023-11-11
2               NaN            NaT
3        Etienne H.     2023-01-30
4          Marie D.     2024-02-25


Unnamed: 0,link,client_name,order_date,comment_date,rating,comment,client_name_clean,published_date
0,https://freeglisse.com/fr/ski-occasion-adulte-...,FERNAND L.publié le 17/04/2024suite à une comm...,09/04/2024,,5/5,Hormis le problème de fixations après avoir pr...,FERNAND L.,2024-04-17
1,https://freeglisse.com/fr/ski-occasion-adulte-...,Peio A.publié le 11/11/2023suite à une command...,01/11/2023,,4/5,good cuality,Peio A.,2023-11-11
2,https://freeglisse.com/fr/ski-de-fond-occasion...,,,,,,,NaT
3,https://freeglisse.com/fr/ski-de-fond-occasion...,Etienne H.publié le 30/01/2023suite à une comm...,16/01/2023,,4/5,Je n'ai pas encore pu les tester car pas de ch...,Etienne H.,2023-01-30
4,https://freeglisse.com/fr/ski-occasion-femme-l...,Marie D.publié le 25/02/2024suite à une comman...,04/02/2024,,5/5,Etat du produit conforme à mes attentes!,Marie D.,2024-02-25
...,...,...,...,...,...,...,...,...
164,https://freeglisse.com/fr/ski-de-fond-occasion...,,,,,,,NaT
165,https://freeglisse.com/fr/ski-occasion-adulte-...,,,,,,,NaT
166,https://freeglisse.com/fr/ski-occasion-adulte-...,Nadir B.publié le 16/11/2023suite à une comman...,08/11/2023,,4/5,Correct,Nadir B.,2023-11-16
167,https://freeglisse.com/fr/ski-occasion-junior-...,Client anonymepublié le 02/11/2022suite à une ...,24/10/2022,,5/5,Ensemble raisonnable,Client anonyme,2022-11-02


In [9]:
df = df.drop(columns='client_name')

In [10]:
df = df.rename(columns={'client_name_clean': 'client_name'})

In [11]:
df = df.drop(columns='comment_date')

In [12]:
# Set the columns you want to check for empty values
columns_to_check = ['order_date', 'rating', 'comment', 'client_name', 'published_date']

# Use dropna method with 'how' argument set to 'all' to remove rows
# where all columns listed are NaN
df = df.dropna(subset=columns_to_check, how='all')

In [15]:
# The function to shorten URLs
def shorten_url(url):
    # Find the part of the URL to keep (up to the second '-')
    match = re.match(r"(https://freeglisse.com/fr/.+?/\d+)-", url)
    if match:
        # Return the part of the URL that precedes the unnecessary sequence
        return match.group(1)
    else:
        # If the pattern is not found, return the original URL
        return url

# Apply the function to the 'link' column of the dataframe
df['link'] = df['link'].apply(shorten_url)



In [16]:
df['order_date'] = pd.to_datetime(df['order_date'], format='%d/%m/%Y', errors='coerce')

In [17]:
columns = ['link', 'client_name', 'comment', 'rating', 'order_date', 'published_date']

In [18]:
df = df[columns]

In [19]:
df['days_difference'] = (df['published_date'] - df['order_date']).dt.days

In [20]:
df.head()

Unnamed: 0,link,client_name,comment,rating,order_date,published_date,days_difference
0,https://freeglisse.com/fr/ski-occasion-adulte-...,FERNAND L.,Hormis le problème de fixations après avoir pr...,5/5,2024-04-09,2024-04-17,8
1,https://freeglisse.com/fr/ski-occasion-adulte-...,Peio A.,good cuality,4/5,2023-11-01,2023-11-11,10
2,https://freeglisse.com/fr/ski-de-fond-occasion...,Etienne H.,Je n'ai pas encore pu les tester car pas de ch...,4/5,2023-01-16,2023-01-30,14
3,https://freeglisse.com/fr/ski-occasion-femme-l...,Marie D.,Etat du produit conforme à mes attentes!,5/5,2024-02-04,2024-02-25,21
4,https://freeglisse.com/fr/ski-occasion-junior-...,Delphine D.,"skis en bon état, peu utilisés, assez rapides",4/5,2024-03-29,2024-04-09,11


In [21]:
df.dtypes

link                       object
client_name                object
comment                    object
rating                     object
order_date         datetime64[ns]
published_date     datetime64[ns]
days_difference             int64
dtype: object

In [22]:
df['rating'] = df['rating'].str[0].astype(int)

In [23]:
df.head()

Unnamed: 0,link,client_name,comment,rating,order_date,published_date,days_difference
0,https://freeglisse.com/fr/ski-occasion-adulte-...,FERNAND L.,Hormis le problème de fixations après avoir pr...,5,2024-04-09,2024-04-17,8
1,https://freeglisse.com/fr/ski-occasion-adulte-...,Peio A.,good cuality,4,2023-11-01,2023-11-11,10
2,https://freeglisse.com/fr/ski-de-fond-occasion...,Etienne H.,Je n'ai pas encore pu les tester car pas de ch...,4,2023-01-16,2023-01-30,14
3,https://freeglisse.com/fr/ski-occasion-femme-l...,Marie D.,Etat du produit conforme à mes attentes!,5,2024-02-04,2024-02-25,21
4,https://freeglisse.com/fr/ski-occasion-junior-...,Delphine D.,"skis en bon état, peu utilisés, assez rapides",4,2024-03-29,2024-04-09,11


## Export datas in CSV file

In [24]:
df.to_csv('reviews.csv')