In [None]:
"https://fr.trustpilot.com/categories/food_beverages_tobacco?location=paris&numberofreviews=0&page=2&status=all&timeperiod=0"
"https://fr.trustpilot.com/categories/food_beverages_tobacco?location=paris&numberofreviews=0&status=all&timeperiod=0"
"https://fr.trustpilot.com/categories/food_beverages_tobacco?page=2&location=paris&timeperiode=0&status=all&numberofreviews=0"

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import random
import time

In [255]:
def scrap_companies_from_search_category(category, location=False, numberofreviews=0, status="all",
                                         timeperiode=0, page=1, verbose=0, scrap_best_score=True,
                                        max_companies=-1, max_reviews=-1):
    
    """
    func :  scrap all results from a category search on trustpilor and return dict
            on their site, first results are companies_with_best_score and then they are all results
            in facts I don't see any difference between those results, so I add an arg scrap_best_score, if it sets True we scrap only firsts results,
            else we scrap only seconds results (if we scrap both of them we get only doublons)
            on the other way, the site shows 20+ results if we are not on the last page, that's why I count them with companies_count
            if companies_count < 20 then we are on the last page
            by the way, the site himself provides doublons, so if the link is already in our dic, we skip it
    
    params:
        category : str -> category_link to search
        location : str -> code postal or ville
        numberofreviews : int -> must be in [0, 25, 50, 100, 250, 500]
        status : str -> must be in ["all", "unclaimed", "claimed", ""]
        timeperiode : int -> must be in [0, 6, 12, 18]
        page : int -> load this page and higher
        verboses : bool -> print steps
        scrap_best_score : bool -> scrap best_score_companies or ALL (no difference between them, set this param to True it's faster)
        max_companies : int -> exit func when nb_companies >= max_companies
        max_reviews : int -> exit func when nb_reviews >= max_reviews
    """
    
    url = "https://fr.trustpilot.com/categories/" + category + "?"
    if location:
        url += "location=" + location + "&"
    url += "numberofreviews=" + str(numberofreviews) + "&"
    if status != "":
        url += "status=" + status + "&"
    url += "timeperiode=" + str(timeperiode) + "&"
    url += "page="
    
    companies = {"name": [], "link": [], "stars": [], "review_count": [], "location": [], "category": []}
    total_reviews = 0
    done = False
    while not done:
        
        req = requests.get(url + str(page))
        if verbose > 0:
            print(url + str(page))
        try:
            soup = BeautifulSoup(req.text, "lxml")
        except:
            soup = BeautifulSoup(req.text, "html.parser")
        
        company_count_on_page = 0
        
        # possibly 2 blocks of companies (most rated and all)
        div_containers = soup.find_all("div", class_="styles_businessUnitCardsContainer__1ggaO")

        for div_container in div_containers:
            # for each block
            div = div_container.find("div", class_="styles_categoryBusinessListWrapper__2H2X5")
            
            companies_with_best_score = True
            if div == None:
                # if we find div with 2H2X5 class it's companies_with_best_score so we need to enter in this div
                # else it's all we don't have to enter anywhere
                div = div_container
                companies_with_best_score = False
                
            a_balises = div.find_all("a")
            for a_balise in a_balises:
                
                # if not companies_with_best_score, we don't want to scrap, only to count them to know if we are on last page
                if scrap_best_score:
                    if not companies_with_best_score:
                        continue
                else:
                    if companies_with_best_score:
                        company_count_on_page += 1
                        continue
                
                # for each entreprise, find link, title, stars
                link = a_balise["href"].split("/")[-1]
                title = a_balise.find("div", class_="styles_businessTitle__1IANo").text
                
                # try to get more info if they have
                try:
                    infos_div = a_balise.find("div", class_="styles_textRating__19_fv").text
                    infos = infos_div.split("\xa0·\xa0")
                    infos[0] = int(infos[0].split()[0])
                    infos[1] = float(infos[1].split()[-1])
                except:
                    if verbose == 2:
                        print("No stars and nb_reviews for", title, link)
                    infos = [np.nan, np.nan]
                # try to get location  if they have
                try:
                    div_locations = a_balise.find("div", class_="styles_location__3JATO")
                    spans_location = div_locations.find_all("span", class_=None)
                    location = ""
                    if spans_location != None:
                        i = 0
                        while i < len(spans_location):
                            location += spans_location[i].text
                            if i == 0:
                                location += " . "
                            i += 1
                except:
                    if verbose == 2:
                        print("No location for", title, link)
                    location = np.nan

                # if this is a doublon we skip it
                if link in companies["link"]:
                    company_count_on_page += 1
                    continue
                    
                # add infos in dic
                companies["name"].append(title)
                companies["link"].append(link)
                companies["stars"].append(infos[1])
                companies["review_count"].append(infos[0])
                companies["location"].append(location)
                companies["category"].append(category)
                
                total_reviews += infos[0]
                

                company_count_on_page += 1

        if max_reviews > 0 and total_reviews >= max_reviews:
            return companies
        if max_companies > 0 and len(companies["name"]) >= max_companies:
            return companies
        if verbose > 0:   
            print(f"nb company on page {page} : {company_count_on_page}")
        if company_count_on_page >= 20:
            page += 1
            time.sleep(random.uniform(0.5, 1.5))
        else:
            done = True
    return companies

In [256]:
companies = scrap_companies_from_search_category("food_beverages_tobacco", verbose=1, max_reviews=-1)

https://fr.trustpilot.com/categories/food_beverages_tobacco?numberofreviews=0&status=all&timeperiode=0&page=1
nb company on page 1 : 20
https://fr.trustpilot.com/categories/food_beverages_tobacco?numberofreviews=0&status=all&timeperiode=0&page=2
nb company on page 2 : 20
https://fr.trustpilot.com/categories/food_beverages_tobacco?numberofreviews=0&status=all&timeperiode=0&page=3
nb company on page 3 : 20
https://fr.trustpilot.com/categories/food_beverages_tobacco?numberofreviews=0&status=all&timeperiode=0&page=4
nb company on page 4 : 20
https://fr.trustpilot.com/categories/food_beverages_tobacco?numberofreviews=0&status=all&timeperiode=0&page=5
nb company on page 5 : 20
https://fr.trustpilot.com/categories/food_beverages_tobacco?numberofreviews=0&status=all&timeperiode=0&page=6
nb company on page 6 : 20
https://fr.trustpilot.com/categories/food_beverages_tobacco?numberofreviews=0&status=all&timeperiode=0&page=7
nb company on page 7 : 20
https://fr.trustpilot.com/categories/food_bevera

In [253]:
df = pd.DataFrame(data=companies)
display(df.head())
print(df.shape)
print(len(df["link"].unique()))
nb_dbl = df.shape[0] - len(df["link"].unique())
print(df["link"].value_counts().index[0:0+nb_dbl], df["link"].value_counts()[0:0+nb_dbl])

Unnamed: 0,name,link,stars,review_count,location,category
0,Le Petit Vapoteur,www.lepetitvapoteur.com,4.9,3149.0,,food_beverages_tobacco
1,Comptoir des Vignes,www.comptoirdesvignes.fr,4.9,2208.0,,food_beverages_tobacco
2,MonWhisky.fr,monwhisky.fr,4.9,849.0,4 rue Alfred Dreyfus . 87350PANAZOL,food_beverages_tobacco
3,Belleville Brulerie - Paris,cafesbelleville.com,4.9,819.0,14 Bis Rue Lally-Tollendal . 75019Paris,food_beverages_tobacco
4,Spiruline des îles d'or,spiruline-des-iles-dor.com,4.9,400.0,1143 Chemin de la Garde . 83400Hyères,food_beverages_tobacco


(462, 6)
462
Index([], dtype='object') Series([], Name: link, dtype: int64)


In [254]:
df[df["link"] == "fredbox.fr"]

Unnamed: 0,name,link,stars,review_count,location,category
159,Fredbox,fredbox.fr,4.4,16.0,Rue de l'escampadou . 13520Maussane-les-Alpilles,food_beverages_tobacco
