In [99]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import random
import time

In [96]:
def scrap_reviews_from_company(company, page=1, max_reviews=-1, verbose=0):
    """Scrape the reviews of one company from fr.trustpilot.com.

    Pages are fetched one after another, starting at ``page``, from
    ``https://fr.trustpilot.com/review/<company>?page=<n>``. Every
    ``div.review-card`` found inside ``div.review-list`` yields one review.

    Parameters
    ----------
    company : str
        Company identifier appended to the review URL
        (e.g. ``"www.comptoirdesvignes.fr"``).
    page : int, default 1
        Number of the first page to fetch.
    max_reviews : int, default -1
        Maximum number of reviews to return. Any value <= 0 means no limit.
    verbose : int, default 0
        When > 0, print each fetched URL and the number of reviews per page.

    Returns
    -------
    dict of str -> list
        Column-oriented data with the keys ``consumer_name``,
        ``consumer_nb_review_writed`` (int), ``stars`` (int),
        ``title_review``, ``content_review``, ``date_experience`` and
        ``company``. All lists have the same length, so the dict can be
        passed directly to ``pandas.DataFrame``. ``content_review`` and
        ``date_experience`` hold ``np.nan`` when the card has no such element.

    Raises
    ------
    AttributeError, TypeError
        If a review card lacks one of the mandatory elements (name, review
        count, star rating, title).
    ValueError
        If the review count or the star rating does not start with an integer.
    """
    url = "https://fr.trustpilot.com/review/" + company + "?page="

    reviews = {"consumer_name": [], "consumer_nb_review_writed": [], "stars": [],
               "title_review": [], "content_review": [], "date_experience": [], "company": []}

    while True:
        page_url = url + str(page)
        # A timeout keeps an unresponsive server from blocking the notebook forever.
        req = requests.get(page_url, timeout=30)
        if verbose > 0:
            print(page_url)

        # Prefer the lxml parser; fall back to the built-in one when it is unusable.
        try:
            soup = BeautifulSoup(req.text, "lxml")
        except Exception:
            soup = BeautifulSoup(req.text, "html.parser")

        review_list = soup.find("div", class_="review-list")
        if review_list is None:
            # No review container on this page (error page, page past the end,
            # changed markup...): keep what was collected instead of crashing.
            if verbose > 0:
                print(f"no review list found on page {page}")
            break

        nb_review_on_page = 0
        for review_card in review_list.find_all("div", class_="review-card"):
            for field, value in _parse_review_card(review_card).items():
                reviews[field].append(value)
            reviews["company"].append(company)
            nb_review_on_page += 1

            # Enforce the limit per review so that never more than
            # `max_reviews` entries are returned.
            if max_reviews > 0 and len(reviews["consumer_name"]) >= max_reviews:
                return reviews

        if verbose > 0:
            print(f"nb_review_on_page {page} : {nb_review_on_page}")

        # A page holding fewer than 20 cards is treated as the last one
        # (20 is presumably the site's page size -- TODO confirm).
        if nb_review_on_page < 20:
            break

        page += 1
        # Random pause between two requests to avoid hammering the site.
        time.sleep(random.uniform(0.5, 1.5))
    return reviews


def _clean_text(node):
    """Return the text of a tag with every whitespace run collapsed to one space."""
    return " ".join(node.text.split())


def _parse_review_card(review_card):
    """Extract the per-review fields (everything except ``company``) from one card."""
    consumer_name = _clean_text(review_card.find("div", class_="consumer-information__name"))

    # First whitespace-separated token of the review-count label is the number.
    review_count_block = review_card.find("div", class_="consumer-information__review-count")
    consumer_nb_review_writed = int(review_count_block.find("span").text.split()[0])

    # The rating is the first token of the star image's alt text.
    star_block = review_card.find("div", class_="star-rating star-rating--medium")
    stars = int(star_block.find("img")["alt"].split()[0])

    title_review = _clean_text(review_card.find("h2", class_="review-content__title").find("a"))

    # The review body is optional: some cards only carry a title.
    content_node = review_card.find("p", class_="review-content__text")
    content_review = _clean_text(content_node) if content_node is not None else np.nan

    # Keep only what follows the last ':' of the date-of-experience label.
    date_node = review_card.find("p", class_="review-content__dateOfExperience")
    if date_node is not None:
        date_experience = " ".join(date_node.text.split(":")[-1].split())
    else:
        date_experience = np.nan

    return {
        "consumer_name": consumer_name,
        "consumer_nb_review_writed": consumer_nb_review_writed,
        "stars": stars,
        "title_review": title_review,
        "content_review": content_review,
        "date_experience": date_experience,
    }

In [97]:
# Try the scraper on one company, asking for at most two reviews,
# then load the column-oriented result into a DataFrame.
reviews = scrap_reviews_from_company(company="www.comptoirdesvignes.fr", max_reviews=2)
df = pd.DataFrame(reviews)
print(df.shape)  # (number of reviews, number of columns)
df.head()

(2, 7)


Unnamed: 0,consumer_name,consumer_nb_review_writed,stars,title_review,content_review,date_experience,company
0,Frédo,5,5,Très bon choix de vins et de…,Très bon choix de vins et de whiskys..bon acce...,14 mars 2021,www.comptoirdesvignes.fr
1,COUTIER Tamara,1,5,Très bon accueil,"Très bon accueil, bon conseil et choix important",13 mars 2021,www.comptoirdesvignes.fr


In [98]:
# Display the last rows of the scraped frame (5 is the default for tail()).
df.tail(5)

Unnamed: 0,consumer_name,consumer_nb_review_writed,stars,title_review,content_review,date_experience,company
0,Frédo,5,5,Très bon choix de vins et de…,Très bon choix de vins et de whiskys..bon acce...,14 mars 2021,www.comptoirdesvignes.fr
1,COUTIER Tamara,1,5,Très bon accueil,"Très bon accueil, bon conseil et choix important",13 mars 2021,www.comptoirdesvignes.fr
