In [45]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import random
import time

In [46]:
def scrap_companies_from_search_category(category, location=False, numberofreviews=0, status="all",
                                         timeperiode=0, page=1, verbose=0, scrap_best_score=True,
                                         max_companies=-1, max_reviews=-1):
    """
    Scrape the companies listed for a category search on fr.trustpilot.com.

    A result page can hold two blocks of company cards: a "best score" block
    (cards wrapped in an extra list-wrapper div) and an "all results" block.
    According to the original author both blocks show the same companies, so
    only one of them is scraped, selected with `scrap_best_score`.

    Paging: the cards handled on a page are counted; the next page is requested
    only while that count is >= 20, otherwise the current page is treated as the
    last one. A company whose link was already collected is counted but not
    stored again (the site itself returns duplicates).

    params:
        category : str -> category slug appended to the search URL
        location : str -> postal code or city (omitted from the query when falsy)
        numberofreviews : int -> expected to be one of [0, 25, 50, 100, 250, 500]
        status : str -> expected to be one of ["all", "unclaimed", "claimed", ""]
                        ("" omits the filter from the query)
        timeperiode : int -> expected to be one of [0, 6, 12, 18]
        page : int -> first page to load; following pages are loaded after it
        verbose : int -> 0 is silent, > 0 prints each URL and per-page count,
                         exactly 2 also prints companies with missing details
        scrap_best_score : bool -> True scrapes the "best score" block,
                                   False scrapes the "all results" block
        max_companies : int -> if > 0, return after the page on which the number
                               of collected companies reaches this value
        max_reviews : int -> if > 0, return after the page on which the summed
                             review counts reach this value

    returns:
        dict of equally long lists with keys "name", "link", "stars",
        "review_count", "location" and "category"; missing stars, review counts
        and locations are stored as np.nan.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import urlencode

    # Build the query with proper escaping so values containing spaces, accents
    # or "&" cannot corrupt the URL. Key order matches the site links.
    # NOTE(review): the query key is spelled "timeperiode" - confirm that this
    # is the spelling the site expects.
    query = {}
    if location:
        query["location"] = location
    query["numberofreviews"] = numberofreviews
    if status != "":
        query["status"] = status
    query["timeperiode"] = timeperiode
    url = "https://fr.trustpilot.com/categories/" + category + "?" + urlencode(query) + "&page="

    companies = {"name": [], "link": [], "stars": [], "review_count": [], "location": [], "category": []}
    seen_links = set()  # mirrors companies["link"] for constant-time duplicate checks
    total_reviews = 0
    done = False
    while not done:
        page_url = url + str(page)
        req = requests.get(page_url)
        if verbose > 0:
            print(page_url)
        if not req.ok:
            # An error page holds no company cards; stop and keep what was collected.
            if verbose > 0:
                print(f"HTTP {req.status_code} for {page_url}, stopping")
            break

        # Prefer lxml, fall back to the built-in parser when it is unavailable.
        try:
            soup = BeautifulSoup(req.text, "lxml")
        except Exception:
            soup = BeautifulSoup(req.text, "html.parser")

        company_count_on_page = 0

        # possibly 2 blocks of companies (best score and all)
        div_containers = soup.find_all("div", class_="styles_businessUnitCardsContainer__1ggaO")

        for div_container in div_containers:
            # The "best score" block nests its cards in a wrapper div;
            # the "all results" block has the cards directly in the container.
            div = div_container.find("div", class_="styles_categoryBusinessListWrapper__2H2X5")
            companies_with_best_score = div is not None
            if div is None:
                div = div_container

            for a_balise in div.find_all("a"):

                # Only one kind of block is scraped. When scraping "all results",
                # the cards of the "best score" block are still counted.
                if scrap_best_score:
                    if not companies_with_best_score:
                        continue
                elif companies_with_best_score:
                    company_count_on_page += 1
                    continue

                # Skip anchors that are not company cards instead of crashing on them.
                href = a_balise.get("href")
                title_div = a_balise.find("div", class_="styles_businessTitle__1IANo")
                if href is None or title_div is None:
                    continue
                link = href.split("/")[-1]
                title = title_div.text

                # Review count and stars are optional on a card.
                try:
                    rating_text = a_balise.find("div", class_="styles_textRating__19_fv").text
                    rating_parts = rating_text.split("\xa0·\xa0")
                    review_count = int(rating_parts[0].split()[0])
                    stars = float(rating_parts[1].split()[-1])
                except Exception:
                    if verbose == 2:
                        print("No stars and nb_reviews for", title, link)
                    review_count, stars = np.nan, np.nan

                # The location is optional too. It gets its own local name so the
                # `location` parameter is not overwritten.
                try:
                    div_locations = a_balise.find("div", class_="styles_location__3JATO")
                    spans_location = div_locations.find_all("span", class_=None)
                    company_location = ""
                    for i, span in enumerate(spans_location):
                        company_location += span.text
                        if i == 0:
                            company_location += " . "
                except Exception:
                    if verbose == 2:
                        print("No location for", title, link)
                    company_location = np.nan

                # Duplicates are counted for paging but not stored twice.
                if link in seen_links:
                    company_count_on_page += 1
                    continue
                seen_links.add(link)

                companies["name"].append(title)
                companies["link"].append(link)
                companies["stars"].append(stars)
                companies["review_count"].append(review_count)
                companies["location"].append(company_location)
                companies["category"].append(category)

                # Adding NaN would turn the running total into NaN for good and
                # disable the max_reviews stop condition, so only add real counts.
                if isinstance(review_count, int):
                    total_reviews += review_count

                company_count_on_page += 1

        if max_reviews > 0 and total_reviews >= max_reviews:
            return companies
        if max_companies > 0 and len(companies["name"]) >= max_companies:
            return companies
        if verbose > 0:
            print(f"nb company on page {page} : {company_count_on_page}")
        if company_count_on_page >= 20:
            page += 1
            # Random pause between page requests.
            time.sleep(random.uniform(0.5, 1.5))
        else:
            done = True
    return companies

In [47]:
def scrap_reviews_from_company(company, page=1, max_reviews=-1, verbose=0):
    """
    Scrape the reviews of one company from fr.trustpilot.com.

    Pages are loaded one after another starting at `page`. A page holding fewer
    than 20 review cards (including none at all) is treated as the last page.

    params:
        company : str -> company identifier appended to the review URL
        page : int -> first page to load; following pages are loaded after it
        max_reviews : int -> if > 0, return after the page on which the number of
                             collected reviews reaches this value (whole pages are
                             kept, so the result can hold more than max_reviews)
        verbose : int -> > 0 prints each URL and the number of reviews per page

    returns:
        dict of equally long lists with keys "consumer_name",
        "consumer_nb_review_writed", "stars", "title_review", "content_review",
        "date_experience" and "company"; a missing review text or experience
        date is stored as np.nan.
    """
    url = "https://fr.trustpilot.com/review/" + company + "?page="

    reviews = {"consumer_name": [], "consumer_nb_review_writed": [], "stars": [],
               "title_review": [], "content_review": [], "date_experience": [], "company": []}

    done = False
    while not done:
        req = requests.get(url + str(page))
        if verbose > 0:
            print(url + str(page))

        # Prefer lxml, fall back to the built-in parser when it is unavailable.
        try:
            soup = BeautifulSoup(req.text, "lxml")
        except Exception:
            soup = BeautifulSoup(req.text, "html.parser")

        nb_review_on_page = 0

        # A page without a review list (company without reviews, error page, ...)
        # is handled as an empty page instead of raising AttributeError and
        # losing everything collected so far.
        review_list = soup.find("div", class_="review-list")
        if review_list is None:
            review_cards = []
        else:
            review_cards = review_list.find_all("div", class_="review-card")

        for review_card in review_cards:

            # consumer_name (whitespace collapsed)
            consumer_name = " ".join(review_card.find("div", class_="consumer-information__name").text.split())

            # consumer_nb_review_writed: first token of the review-count text
            consumer_nb_review_writed = review_card.find("div", class_="consumer-information__review-count").find("span").text.split()[0]

            # stars: first token of the rating image's alt text
            stars = review_card.find("div", class_="star-rating star-rating--medium").find("img")["alt"].split()[0]

            # title_review
            title_review = " ".join(review_card.find("h2", class_="review-content__title").find("a").text.split())

            # content_review is optional
            try:
                content_review = " ".join(review_card.find("p", class_="review-content__text").text.split())
            except Exception:
                content_review = np.nan

            # date_experience is optional; keep only the text after the last ":"
            try:
                date_experience = " ".join(review_card.find("p", class_="review-content__dateOfExperience").text.split(":")[-1].split())
            except Exception:
                date_experience = np.nan

            # add to dic
            reviews["consumer_name"].append(consumer_name)
            reviews["consumer_nb_review_writed"].append(int(consumer_nb_review_writed))
            reviews["stars"].append(int(stars))
            reviews["title_review"].append(title_review)
            reviews["content_review"].append(content_review)
            reviews["date_experience"].append(date_experience)
            reviews["company"].append(company)

            # count nb of reviews
            nb_review_on_page += 1

        if max_reviews > 0 and len(reviews["consumer_name"]) >= max_reviews:
            return reviews
        if verbose > 0:
            print(f"nb_review_on_page {page} : {nb_review_on_page}")

        if nb_review_on_page < 20:
            done = True
        else:
            page += 1
            # Random pause between page requests.
            time.sleep(random.uniform(0.5, 1.5))
    return reviews

In [56]:
def scrap_companies_and_reviews_from_search_category(category, location=False, numberofreviews=0, status="all",
                                         timeperiode=0, max_companies=-1, max_reviews_by_companies=-1, verbose=0):
    """
    Scrape the companies of a category search, then the reviews of each company.

    params:
        category, location, numberofreviews, status, timeperiode, max_companies :
            forwarded unchanged to scrap_companies_from_search_category
        max_reviews_by_companies : int -> forwarded as `max_reviews` to
            scrap_reviews_from_company for every company
        verbose : int -> > 0 prints a progress line after the company search and
            after each company; it is not forwarded to the two scraping functions

    returns:
        (companies, reviews) : the dict returned by the company search and a dict
        with the same keys as scrap_reviews_from_company, holding the reviews of
        all companies in the order the companies were found.
    """
    # scrap companies
    companies = scrap_companies_from_search_category(category, location=location, numberofreviews=numberofreviews, status=status,
                                                    timeperiode=timeperiode, max_companies=max_companies)
    if verbose > 0:
        print("scrap companies DONE")

    # scrap reviews
    reviews = {"consumer_name": [], "consumer_nb_review_writed": [], "stars": [],
               "title_review": [], "content_review": [], "date_experience": [], "company": []}
    for company_link in companies["link"]:
        # get reviews of each company
        new_reviews = scrap_reviews_from_company(company_link, max_reviews=max_reviews_by_companies)
        if verbose > 0:
            print(f"scrap reviews of {company_link} DONE")

        # Append in place: rebuilding each list with "+" would copy all the
        # reviews gathered so far once per company.
        for key in reviews:
            reviews[key].extend(new_reviews[key])

    return companies, reviews

In [57]:
# Load the category table; its "link" column is used as the category argument of the scraper.
df_categories = pd.read_csv(filepath_or_buffer="csv/categories.csv")

In [61]:
# Scrape the first category of the table: print its link, then collect its
# companies and their reviews with progress output.
first_category_link = df_categories["link"][0]
print(first_category_link)
companies, reviews = scrap_companies_and_reviews_from_search_category(
    first_category_link,
    max_companies=60,
    max_reviews_by_companies=1000,
    verbose=1,
)

food_beverages_tobacco
scrap companies DONE
scrap reviews of www.lepetitvapoteur.com DONE
scrap reviews of www.comptoirdesvignes.fr DONE
scrap reviews of monwhisky.fr DONE
scrap reviews of cafesbelleville.com DONE
scrap reviews of spiruline-des-iles-dor.com DONE
scrap reviews of demainlesvins.com DONE
scrap reviews of au-droit-de-bouchon.com DONE
scrap reviews of stephconti.fr DONE
scrap reviews of alcoool.fr DONE
scrap reviews of entrepotitalien.fr DONE
scrap reviews of www.rhumattitude.com DONE
scrap reviews of vert-tiges.com DONE
scrap reviews of lafourche.fr DONE
scrap reviews of www.oliquide.com DONE
scrap reviews of nutri-naturel.com DONE
scrap reviews of lepetitballon.com DONE
scrap reviews of cafe-en-grain.com DONE
scrap reviews of vintageandco.com DONE
scrap reviews of boutique.lushan.fr DONE
scrap reviews of www.peche-maison.fr DONE
scrap reviews of www.vincentdanslesvapes.fr DONE
scrap reviews of cbdtoulouse.fr DONE
scrap reviews of oeforgood.com DONE
scrap reviews of lomi.c

In [62]:
# Turn the scraped company columns into a DataFrame and preview it.
df_companies = pd.DataFrame(data=companies)
print(df_companies.shape)
display(df_companies.head(n=5))

(40, 6)


Unnamed: 0,name,link,stars,review_count,location,category
0,Le Petit Vapoteur,www.lepetitvapoteur.com,4.9,3149,,food_beverages_tobacco
1,Comptoir des Vignes,www.comptoirdesvignes.fr,4.9,2208,,food_beverages_tobacco
2,MonWhisky.fr,monwhisky.fr,4.9,849,4 rue Alfred Dreyfus . 87350PANAZOL,food_beverages_tobacco
3,Belleville Brulerie - Paris,cafesbelleville.com,4.9,819,14 Bis Rue Lally-Tollendal . 75019Paris,food_beverages_tobacco
4,Spiruline des îles d'or,spiruline-des-iles-dor.com,4.9,400,1143 Chemin de la Garde . 83400Hyères,food_beverages_tobacco


In [63]:
# Turn the scraped review columns into a DataFrame and preview both ends of it.
df_reviews = pd.DataFrame(data=reviews)
print(df_reviews.shape)
display(df_reviews.head(n=5))
display(df_reviews.tail(n=5))

(19408, 7)


Unnamed: 0,consumer_name,consumer_nb_review_writed,stars,title_review,content_review,date_experience,company
0,Helrat,4,5,Toujours parfait,"Toujours parfait. Depuis 4 ans, je commande to...",14 mars 2021,www.lepetitvapoteur.com
1,Laura Perrault,1,5,Site aussi bon que d'habitude ou l'on…,Site aussi bon que d'habitude ou l'on trouve p...,8 mars 2021,www.lepetitvapoteur.com
2,HENRY KARINE,2,5,Super!,"Présentation agréable, conseils rapides et ada...",11 mars 2021,www.lepetitvapoteur.com
3,le chineur,2,5,Bonne presentation et explication des…,Bonne presentation et explication des produits...,10 mars 2021,www.lepetitvapoteur.com
4,Antoine Thomas,1,5,Site génial pour commander son matériel…,Site génial pour commander son matériel et ses...,10 mars 2021,www.lepetitvapoteur.com


Unnamed: 0,consumer_name,consumer_nb_review_writed,stars,title_review,content_review,date_experience,company
19403,M.B,1,5,Cigarette électronique zlide,Excellent rapport qualité/prix. Livraison rapi...,,ecigplanete.com
19404,Patricia GAUJOUX,1,5,Je reçois toujours mes colis dans des…,Je reçois toujours mes colis dans des délais r...,,ecigplanete.com
19405,Marlène,1,5,Société compétitive et très réactive …,Société compétitive et très réactive dans les ...,,ecigplanete.com
19406,Florence Sevault,1,5,Livraison rapide conforme à la commande…,Livraison rapide conforme à la commande emball...,,ecigplanete.com
19407,Mademoiselle Virginie Pernet,2,1,"Erreur et problème, en continu","Deux commandes, deux problèmesPrix attractifs ...",,ecigplanete.com


In [64]:
# Persist both tables as CSV files without the DataFrame index column.
df_companies.to_csv(path_or_buf="scraped_companies.csv", index=False)
df_reviews.to_csv(path_or_buf="scraped_reviews.csv", index=False)