# Imports

In [1]:
import requests # http requests
import re # regular expressions
from bs4 import BeautifulSoup # xml parsing
import pandas as pd
import regex as re
import os

# Functions for web-scraping

In [2]:
def check_response(url):
    response = requests.get(url)
    status_code = response.status_code
    if status_code != 200:
        raise Exception(f'Error {status_code} ({requests.status_codes._codes[status_code]})')
    else:
        html_text = response.content
        print(f'Request successful')
        return html_text

def make_soup(html_text):
    return BeautifulSoup(html_text, 'html.parser')

def get_contents(soup):
    entries = []
    keys = ['datetime', 'name', 'rating', 'title', 'review', 'event_time']
    reviews = soup.find_all('div', class_ = 'styles_reviewCardInner__EwDq2')
    
    for review in reviews:
        try:
            text = review.find('p', class_ = 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn').get_text()
        except AttributeError:
            continue
        
        date = (review.find('time')).attrs['datetime']
        name = review.find('span', class_ = 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17').get_text()
        rating = re.search('(\d+)(?=\s*.svg)', str(review.find('div', class_ =  'star-rating_starRating__4rrcf star-rating_medium__iN6Ty')))[0]
        title = review.find('h2', class_ = 'typography_heading-s__f7029 typography_appearance-default__AAY17').get_text()
        event = review.find('p', class_ = 'typography_body-m__xgxZ_ typography_appearance-default__AAY17').get_text()[20:]
        
        values = [date, name, rating, title, text, event]
        d = dict(zip(keys, values))
        entries.append(d)

    return entries

def make_df(df, d):
    df_new = pd.DataFrame.from_dict(d)
    return pd.concat([df, df_new], ignore_index=True, axis = 0) 


# Execute scrape. Change *n* to set desired number of reviews.

In [3]:
n = 100

df = pd.DataFrame()

for i in range(1,n):
    if len(df)>=n:
        print(f'Scrape complete! \n {i} review pages have been scanned, total reviews is {len(df)}.')
        break
    url = 'https://dk.trustpilot.com/review/puregym.dk?page={}&sort=recency'.format(i)
    html = check_response(url)
    soup = make_soup(html)
    d = get_contents(soup)
    df = make_df(df, d)


Request successful
Request successful
Request successful
Request successful
Request successful
Request successful
Scrape complete! 
 7 review pages have been scanned, total reviews is 109.


# Preview df

In [11]:
df.head()

Unnamed: 0,datetime,name,rating,title,review,event_time
0,2023-10-09T14:20:36.000Z,Arne Ziebell Olsen,5,Dette gælder KUN for Gentofte/Lyngbyvej…,Dette gælder KUN for Gentofte/Lyngbyvej og Osl...,09. oktober 2023
1,2023-10-09T12:00:03.000Z,Anette,5,Stort udvalg af maskiner og…,Stort udvalg af maskiner og holdtræning. Rene ...,09. oktober 2023
2,2023-10-07T21:45:24.000Z,Laura Lacziko,4,"Stemningen er god, men læg vægten tilbage!","Stemningen er god, og stedet er stort nok og h...",07. oktober 2023
3,2023-10-10T12:07:43.000Z,Hanne Andersen,5,Føler mig velkommen,Føler mig velkommen. Dygtige instruktører.Men ...,10. oktober 2023
4,2023-10-10T11:27:37.000Z,Henriette Ini Bing,3,Jeg elsker mit Zumba hold hos Ditte på…,Jeg elsker mit Zumba hold hos Ditte på Bibliot...,02. oktober 2023


# Save the df as CSV in current working directory

In [5]:
cwd = os.getcwd()
path = cwd + "/TrustPilot_reviews.csv"
df.to_csv(path, encoding='utf-16')