# Scrape reviews

In [2]:
import numpy as np
from bs4 import BeautifulSoup
import urllib.request
import datetime
import os
import json
import time

## Scrape home page

In [3]:
# Address of the Rebtel review listing, and the prefix to which a page number is appended
url_rebtel = 'https://www.trustpilot.com/review/www.rebtel.com'
url_rebtel_page = '{}?page='.format(url_rebtel)

In [4]:
# Download the raw HTML of the first review page.
# The timeout (in seconds) keeps a stalled connection from blocking the kernel forever;
# urlopen() waits indefinitely by default unless a global socket timeout is set.
with urllib.request.urlopen(url_rebtel, timeout=30) as response:
    homepage = response.read()

In [5]:
# Turn the downloaded bytes into a navigable parse tree
soup = BeautifulSoup(markup=homepage, features='html.parser')

In [6]:
# Peek at the beginning of the pretty-printed document (first 200 characters only)
pretty_html = soup.prettify()
print(pretty_html[:200])

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Rebtel Reviews | Customer Service Reviews of Rebtel | www.rebtel.com
  </title>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   win


In [8]:
# Collect the review elements and the reviews container(s) found on the home page,
# then show how many reviews one page holds
reviews = soup.find_all('div', attrs={'class': 'review-stack'})
container = soup.find_all('div', attrs={'class': 'reviews-container'})
len(reviews)

20

In [9]:
# Parse total number of review pages
pagination = soup.find_all('a', class_='pagination-page')

# Keep only purely numeric page markers. This drops 'next-page' as before, and also
# skips links with a missing or otherwise non-numeric marker instead of letting
# the lookup or int() raise.
page_markers = (link.get('data-page-number', '').strip() for link in pagination)
page_num = [int(marker) for marker in page_markers if marker.isdecimal()]

# With no numbered pagination links there is only a single page of reviews;
# max() on an empty list would raise ValueError without the default.
page_max = max(page_num, default=1)
page_max

112

## Scrape all pages

In [10]:
data = []  # data is a list of dictionaries, one per scraped review

# Seconds to wait on each request. Without a timeout a single stalled connection
# would block the whole multi-page crawl indefinitely.
request_timeout = 30

# Iterate over webpages
for page in range(1, page_max + 1):

    # If first page, take url of homepage; otherwise, url + page
    url_complete = url_rebtel if page == 1 else url_rebtel_page + str(page)

    # Get content of webpage
    with urllib.request.urlopen(url_complete, timeout=request_timeout) as response:
        page_content = response.read()
    soup = BeautifulSoup(page_content, 'html.parser')
    reviews = soup.find_all('div', class_='review-stack')

    # Iterate over reviews in page
    for review in reviews:
        # The same link carries the user id (in its href) and the user name (as its text),
        # so it is looked up only once
        user_link = review.find('a', class_='user-review-name-link')
        user_id = user_link['href'].strip().replace('/users/', '')
        user_name = user_link.get_text().strip()

        # Parse review title
        review_title = review.find('a', class_='review-title-link').get_text().strip()

        # Parse review content
        review_text = review.find('div', class_='review-body').get_text().strip()

        # Parse review answer; a review without a 'comment' element is stored with None
        review_answer = review.find('div', class_='comment')
        if review_answer is not None:
            review_answer = review_answer.get_text().strip()

        # Parse review stars: the rating is read as the fifth-from-last word of the
        # 'data-status' text on the share element, so this depends on the exact
        # wording of that text (presumably the pre-filled tweet -- verify if it changes)
        share_element = review.find(
            'div', class_='social-share-network social-share-network--twitter')
        review_stars = int(share_element['data-status'].split()[-5])

        # Parse review date: drop the '.000+00:00' suffix and turn the 'T' separator
        # into a space, e.g. '2018-01-31T12:00:00.000+00:00' -> '2018-01-31 12:00:00'
        review_date = review.find('time', class_='ndate')['datetime'].strip()
        review_date = review_date.replace('.000+00:00', '').replace('T', ' ')

        # Parse whether review was verified by Rebtel; no 'review-verified' element
        # means not verified
        review_verify_text = review.find('div', class_='review-verified')
        if review_verify_text is not None:
            review_verify = 'Verified order' in review_verify_text.get_text()
        else:
            review_verify = False

        # Store data
        data.append({
            'user_id': user_id,
            'user': user_name,
            'title': review_title,
            'text': review_text,
            'answer': review_answer,
            'stars': review_stars,
            'date': review_date,
            'verify': review_verify
        })
    

## Save data

In [12]:
# Stamp the output file with the current local date (YYYYMMDD)
today = time.strftime('%Y%m%d')
file_name = '../data/rawdata_{}.json'.format(today)

In [13]:
# Make sure the target directory exists before writing: open() does not create
# missing folders, and failing here would throw away the whole scrape.
out_dir = os.path.dirname(file_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write all collected reviews as pretty-printed JSON; the explicit encoding keeps
# the file independent of the platform's default text encoding.
with open(file_name, 'w', encoding='utf-8') as outfile:
    json.dump(data, outfile, indent=2)