## TripAdvisor Recommender System

#### Data Collection from TripAdvisor

All the data were scraped from the TripAdvisor website using Selenium and BeautifulSoup between late April and early May 2019. The hotels are located in 16 popular European tourist destination cities, and their TripAdvisor ratings range from 3.0 to 5.0. The collected data are separated into 4 files. These files are:

1) tripadvisor_data.csv (hotel information from the hotel listings from the city section)

2) tripadvisor_hotel.txt (hotel information from the hotel details webpage in json format)

3) tripadvisor_reviewer.csv (brief reviewer information from the hotel details webpage)

4) tripadvisor_hotel_review.csv (reviews written by reviewer from the TripAdvisor reviewer webpage)

In [None]:
# import libraries
import argparse
import csv
import json
import os
import re
import sys
import time as t
import urllib
from datetime import datetime
from time import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree, html
from selenium import webdriver

### Scraping through TripAdvisor City Section for Hotel Listings

In [None]:

def process_hotel_list(locality, checkin_date, checkout_date, hotel_data, sort="popularity"):
    
    """
    
    Look up the TripAdvisor listings page of a city and scrape its hotels.
    
    locality is the city location of the hotel search
    
    checkin_date and checkout_date are the start and end date of hotel stay.
    It should be year/month/day string format, ie. '2019/07/01'
    
    sort is the method of how search results are displayed either by order of 'popularity', 'value', 'price' or 'distance'
    
    hotel_data is a list used to collect scraped hotel information
    
    Returns hotel_data. It is returned unchanged when the locality cannot be
    resolved or when the listings page does not answer with status 200.
    
    """
    
    from urllib.parse import quote
    
    # the locality is user supplied text ('London, England'), so it has to be
    # percent-encoded before being appended to the query string
    geo_url = 'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime='+str(int(time()))+'&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true&query='+quote(locality)
    # NOTE(review): verify=False switches off TLS certificate validation for
    # both requests below - confirm this is really required
    api_response = requests.get(geo_url, verify=False).json()
    results = api_response.get('results')
    if not results:
        # an unknown locality yields no suggestion; there is nothing to scrape
        print('No TripAdvisor location found for, '+ locality +'.')
        return hotel_data
    url_from_autocomplete = "http://www.tripadvisor.com"+results[0]['url']
    
    form_data = {
                'changeSet': 'TRAVEL_INFO',
                'showSnippets': 'false',
                'staydates': checkin_date+"_"+checkout_date,
                'uguests': '2',
                'sortOrder': sort
                }
    
    headers = {
                'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
                'Accept-Encoding': 'gzip,deflate',
                'Accept-Language': 'en-US,en;q=0.5',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
                'Host': 'www.tripadvisor.com',
                'Pragma': 'no-cache',
                'Referer': url_from_autocomplete,
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
                'X-Requested-With': 'XMLHttpRequest'
                }
    
    cookies = {"SetCurrency":"USD"}
    page_response = requests.post(url=url_from_autocomplete, data=form_data, headers=headers, cookies=cookies, verify=False)
    if page_response.status_code != 200:
        print('Page error for, '+ url_from_autocomplete +', is found.')
        return hotel_data
    
    parser = html.fromstring(page_response.text)
    return iterate_hotels(parser, locality, hotel_data)
    
    
    
def iterate_hotels(parser, locality, hotel_data):
    
    """
    
    this function is called by process_hotel_list() in order to parse and scrape hotel information
    
    parser is the parsed html tree of a hotel listings page
    
    locality is the city name stored with every scraped hotel
    
    hotel_data is the list that receives one dictionary per hotel
    
    Every hotel of the page is scraped first; only then is the 'next' link
    followed, until there is no further result page. Returns hotel_data.
    
    """
    
    BASE_URL = 'http://www.tripadvisor.com'
    XPATH_NEXT_PAGE = '//a[contains(@class,"nav next taLnk ui_button primary")]/@href'
    XPATH_LISTING_COLLAPSED = '//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]'
    XPATH_LISTING = '//div[contains(@class,"listItem")]//div[@class="listing "]'
    XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
    XPATH_REVIEWS  = './/a[@class="review_count"]//text()'
    XPATH_RATING = './/a[contains(@class,"ui_bubble_rating")]//@alt'
    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
    XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
    XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
    XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()' 
    XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()'

    while True:
        next_page = parser.xpath(XPATH_NEXT_PAGE)
        next_page = BASE_URL + next_page[0] if next_page else None

        hotel_lists = parser.xpath(XPATH_LISTING_COLLAPSED)
        if not hotel_lists:
            hotel_lists = parser.xpath(XPATH_LISTING)

        for hotel in hotel_lists: # grab the element of each feature from html
            raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
            raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
            raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
            raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
            raw_rating = hotel.xpath(XPATH_RATING)
            raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
            raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
            raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)

            url = BASE_URL + raw_hotel_link[0] if raw_hotel_link else None
            reviews = ''.join(raw_no_of_reviews).replace("reviews","").replace(",","") if raw_no_of_reviews else 0
            rating = ''.join(raw_rating).replace('of 5 bubbles','').strip() if raw_rating else None
            name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
            hotel_features = ','.join(raw_hotel_features)
            # join the text nodes directly: str(*nodes) fails as soon as the
            # price is split over more than one text node
            price_per_night = ''.join(raw_hotel_price_per_night).replace('\n','') if raw_hotel_price_per_night else None
            no_of_deals = re.findall(r"all\s+?(\d+)\s+?", ''.join(raw_no_of_deals))
            no_of_deals = no_of_deals[0] if no_of_deals else 0
            booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

            hotel_data.append({
                    'hotel_name':name,
                    'url':url,
                    'locality':locality,
                    'reviews':reviews,
                    'tripadvisor_rating':rating,
                    'hotel_features':hotel_features,
                    'price_per_night':price_per_night,
                    'no_of_deals':no_of_deals,
                    'booking_provider':booking_provider
                    })

        # pagination happens once per page, after all of its hotels are stored
        if not next_page:
            return hotel_data

        t.sleep(1)
        try:
            page_response = requests.get(next_page)
        except requests.exceptions.ConnectionError:
            t.sleep(60) # Retrying same url after 1 minute
            try:
                page_response = requests.get(next_page)
            except requests.exceptions.RequestException:
                # still unreachable: keep what has been collected so far
                print('Page error for, '+ next_page +', is found.')
                return hotel_data
        if page_response.status_code != 200:
            print('Page error for, '+ next_page +', is found.')
            return hotel_data
        parser = html.fromstring(page_response.text)


In [None]:
# list that collects one dictionary per scraped hotel; it is passed to process_hotel_list()
hotel_data = []

In [None]:
# scrape the London listings for a two week stay; the hotels are appended to hotel_data
process_hotel_list(
    locality='London, England',
    checkin_date='2019/07/01',
    checkout_date='2019/07/15',
    hotel_data=hotel_data,
)

In [None]:
# write hotel_data to a csv file called tripadvisor_data.csv
# NOTE: running this cell overwrites an existing tripadvisor_data.csv;
# substitute 'w' for 'a' (and skip writeheader()) to append hotel_data to an existing file
fieldnames = ['hotel_name','url','locality','reviews','tripadvisor_rating','price_per_night','booking_provider','no_of_deals','hotel_features']
# the csv module needs a text-mode file opened with newline='' on Python 3
with open('tripadvisor_data.csv', 'w', newline='', encoding='utf8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(hotel_data)

In [None]:
# load the scraped hotel listings back from tripadvisor_data.csv and print a summary of the frame
tadata = pd.read_csv('tripadvisor_data.csv')
tadata.info()

In [None]:
tadata = tadata.drop_duplicates() # drop duplicate hotels from list
# uncomment the next line to save the de-duplicated listings back to tripadvisor_data.csv
# tadata.to_csv("tripadvisor_data.csv", index=False, encoding='utf8')

### Scraping through Individual TripAdvisor Hotel Webpage for Hotel Details

In [None]:

def process_hotel_details(hotel_url, retry=0):
    
    """
    
    Scrape name, rank, address, ratings and amenities of one hotel.
    
    hotel_url is hotel webpage at TripAdvisor
    retry is the number of attempts already made for this url; the page is
    requested again, up to MAX_RETRY times, when it looks incompletely loaded
    
    Returns a dictionary with the keys 'address', 'ratings', 'amenities',
    'rating', 'review_count', 'name', 'rank' and 'hotel_url'. A page answering
    404 yields the same keys with empty values.
    
    Raises whatever json.loads raises when the embedded ld+json address block
    is not valid JSON.
    
    """
    
    MAX_RETRY = 10

    headers = {
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
                "cache-control": "max-age=0",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36",
                }

    response = requests.get(hotel_url, headers=headers)
    if response.status_code == 404:
        print('error: Page not found, status_code: 404')
        # nothing to parse: return an empty record of the usual shape
        return {
                'address': {},
                'ratings': {},
                'amenities': [],
                'rating': 0.0,
                'review_count': 0,
                'name': None,
                'rank': None,
                'hotel_url': hotel_url
                }

    parser = html.fromstring(response.text, hotel_url)

    # the page manifest is only used as a sign that the page was served completely
    script_text = ' '.join(''.join(parser.xpath('//script//text()')).split())
    raw_json = re.findall(r"define\('@ta/page-manifest',\[\],function\(\)\{return\s+(\{.*?\});\}\);", script_text)
    try:
        json.loads(raw_json[0])
        manifest_found = True
    except (IndexError, ValueError):
        manifest_found = False

    XPATH_NAME = '//h1[@id="HEADING"]//text()'
    XPATH_RANK = '//span[contains(@class,"popularity")]//text()'
    XPATH_FULL_ADDRESS_JSON = '//script[@type="application/ld+json"]//text()'

    name = clean(parser.xpath(XPATH_NAME))
    rank = clean(parser.xpath(XPATH_RANK))
    raw_address_json = parser.xpath(XPATH_FULL_ADDRESS_JSON)

    if (not manifest_found or not name) and retry < MAX_RETRY:
        # Retrying the same URL; the attempt counter travels through `retry`
        # so the recursion stops after MAX_RETRY attempts
        return process_hotel_details(hotel_url, retry + 1)

    hotel_rating = 0
    review_count = 0
    address = {}
    if raw_address_json:
        parsed_address_info = json.loads(raw_address_json[0])
        rating = parsed_address_info.get('aggregateRating', {})
        raw_address = parsed_address_info.get("address", {})

        hotel_rating = rating.get('ratingValue')
        review_count = rating.get('reviewCount')

        # the country is read from a nested object; tolerate a plain value too
        country = raw_address.get("addressCountry", {})
        if isinstance(country, dict):
            country = country.get("name")

        address = {
                    'street_address': raw_address.get('streetAddress'),
                    'region': raw_address.get('addressRegion'),
                    'locality': raw_address.get('addressLocality'),
                    'country': country,
                    'zipcode': raw_address.get("postalCode")
                }

    ratings = {}
    elems = parser.find_class("hotels-review-list-parts-ReviewRatingFilter__row_num--gIW_f")
    if len(elems) >= 5:
        # the first five rows are read in the order Excellent ... Terrible
        labels = ('Excellent', 'Good', 'Average', 'Poor', 'Terrible')
        ratings = {label: elem.text for label, elem in zip(labels, elems)}

    amenities = parser.find_class("hotels-hotel-review-about-with-photos-Amenity__name--2IUMR")
    amenity_list = [a.text for a in amenities]

    data = {
            'address': address,
            'ratings': ratings,
            'amenities': amenity_list,
            'rating': float(hotel_rating) if hotel_rating else 0.0,
            'review_count': int(review_count) if review_count else 0,
            'name': name,
            'rank': rank,
            'hotel_url': hotel_url
            }

    return data


def clean(text):
    
    """
    
    Collapse scraped text into one tidy string before it is stored.
    
    text is a string or a list of strings; the pieces are glued together and
    every run of whitespace (\\n, \\r, \\t, spaces) becomes a single space.
    Returns None when text is empty or None.
    
    """
    
    if not text:
        return None
    merged = ''.join(text)
    words = merged.split()
    return ' '.join(words)


In [None]:
# scrape the detail page of the first 1000 listed hotels, one request per second
hotel_details = []
for index, hotel in enumerate(tadata['url'][0:1000]):
    result = process_hotel_details(hotel)
    hotel_details.append(result)
    print(index)  # keeping track of progress in case of error
    t.sleep(1)

In [None]:
# independent copy of the scraped records, used when saving to disk
details = list(hotel_details)

In [None]:
# saving details list as tripadvisor_hotel.txt in json format
# NOTE: running this cell overwrites an existing tripadvisor_hotel.txt
with open('tripadvisor_hotel.txt', 'w') as out_file:
    json.dump(details, out_file)

In [None]:
# converting tripadvisor_hotel.txt to pandas DataFrame
tahotel = pd.read_json('tripadvisor_hotel.txt')
# keep hotels with at least 100 reviews and renumber the rows from 0
has_enough_reviews = tahotel['review_count'] >= 100
tahotel = tahotel.loc[has_enough_reviews].reset_index(drop=True)

### Scraping through Individual TripAdvisor Hotel Webpage for Guest Review Information

In [None]:

def process_reviews(hotel_url, reviewer_list):
    
    """
    
    Collect reviewers from the review pages of one hotel.
    
    hotel_url is hotel webpage at TripAdvisor
    
    reviewer_list is the DataFrame of reviewers collected so far
    
    Returns reviewer_list extended by the reviewers that iterate_review_page()
    selects (at most 10 per hotel), or unchanged when the page does not answer
    with status 200.
    
    """
    
    response = requests.get(hotel_url)
    if response.status_code != 200:
        print('Error is found on page, '+ hotel_url)
        return reviewer_list

    # name the parser explicitly so the result does not depend on what
    # BeautifulSoup guesses (lxml is already a dependency of this notebook)
    soup = BeautifulSoup(response.text, 'lxml')
    reviews = soup.findAll(class_ = 'hotels-review-list-parts-SingleReview__reviewContainer--d54T4')
    review_count = 10
    return iterate_review_page(soup, reviews, review_count, reviewer_list)
            
def iterate_review_page(soup, reviews, review_count, reviewer_list):           

    """
    
    this function is called by process_reviews() in order to pick reviewers from a hotel's review pages
    
    soup is the parsed review page and reviews are its review containers
    
    review_count is the number of reviewers still wanted for this hotel
    
    reviewer_list is the DataFrame of reviewers collected so far
    
    Only reviewers with more than 25 contributions that are not yet in
    reviewer_list are added. Further pages are followed until review_count
    reaches 0 or there is no next page. Returns the extended reviewer_list.
    
    """

    REVIEW_CLASS = 'hotels-review-list-parts-SingleReview__reviewContainer--d54T4'

    while True:
        # the last page has no 'next' link, so soup.find() returns None there
        next_link = soup.find('a', class_="ui_button nav next primary ", href=True)
        next_page = 'http://www.tripadvisor.com'+next_link['href'] if next_link else None

        # profile links already stored, kept as a set to avoid duplicate reviewer
        known_links = set(reviewer_list['user_link'])

        for r in reviews:
            if review_count <= 0: # limit of 10 reviewers for each hotel is reached
                break
            try:
                contribute = r.find('span', class_ = 'social-member-MemberHeaderStats__bold--3z3qh').get_text()
                contribute = int(contribute.replace(',', ''))
            except (AttributeError, ValueError):
                contribute = 0
            if contribute <= 25: # filter reviewer with more than 25 reviews
                continue

            profile_link = r.find('a', href=True)
            if profile_link is None:
                continue
            reviewer_website = 'http://www.tripadvisor.com'+profile_link['href']
            if reviewer_website in known_links:
                continue

            try:
                reviewer = r.find('a', class_ = 'ui_header_link social-member-event-MemberEventOnObjectBlock__member--35-jC').get_text()
            except AttributeError:
                reviewer = "NONE PROVIDED"

            # last character left once the closing markup is stripped -
            # presumably the leading digit of the bubble_NN class; TODO confirm
            rating = r.find('span', class_="ui_bubble_rating")
            rating = str(rating).strip('"0></span>')[-1:]
            try:
                rating = int(rating)
            except ValueError:
                continue # no usable rating on this review
            try:
                date = r.find(class_ = 'hotels-review-list-parts-EventDate__event_date--CRXs4').find('span').get_text()
            except AttributeError:
                date = ''

            # date[14:] drops the first 14 characters (presumably the 'Date of stay: ' label)
            df = pd.DataFrame({'uid':reviewer, 'user_link':reviewer_website, 'rating':rating, 'date_of_stay':date[14:], 'num_of_reviews':int(contribute)}, 
                              columns=['uid', 'user_link', 'rating', 'date_of_stay', 'num_of_reviews'], index=[0])
            reviewer_list = pd.concat([reviewer_list, df], axis =0)
            known_links.add(reviewer_website)
            review_count -= 1

        if not next_page or review_count <= 0:
            return reviewer_list

        t.sleep(1)
        try:
            response = requests.get(next_page)
        except requests.exceptions.ConnectionError:
            t.sleep(60) # Retrying same url after 1 minute
            try:
                response = requests.get(next_page)
            except requests.exceptions.RequestException:
                # still unreachable: keep the reviewers collected so far
                return reviewer_list
        soup = BeautifulSoup(response.text, 'lxml')
        reviews = soup.findAll(class_ = REVIEW_CLASS)



In [None]:
# reviewer_list holds one row per selected reviewer; it is created on the first
# run only, so running this cell again for another slice of hotels keeps the
# reviewers that were already collected
if 'reviewer_list' not in globals():
    reviewer_list = pd.DataFrame(columns=['uid', 'user_link', 'rating', 'date_of_stay', 'num_of_reviews'])

for index, hotel_url in enumerate(tahotel[0:1000]['hotel_url']):
    print(index) # keeping track of progress in case of error
    # process_reviews() returns the frame it was given plus the new reviewers
    reviewer_list = process_reviews(hotel_url, reviewer_list)
    t.sleep(1)

In [None]:
# save the collected reviewers as tripadvisor_reviewer.csv (without the index column)
reviewer_list.to_csv('tripadvisor_reviewer.csv', index=False)

### Scraping through TripAdvisor Guest Reviewer Webpage for Hotel Reviews using Selenium

In [None]:

def process_other_reviews(user_url, hotel_review_list):
    
    """
    
    Open a reviewer's TripAdvisor page with Selenium and scrape the hotel reviews on it.
    
    user_url is the reviewer webpage at TripAdvisor
    
    hotel_review_list is the DataFrame of hotel reviews collected so far
    
    The global Selenium `driver` is used. The page is loaded a second time if
    the first attempt fails. Returns hotel_review_list, extended by the reviews
    that were found.
    
    """
    
    driver.get(user_url)
    t.sleep(1.5)
    try: # check if driver get user_url page
        response = driver.page_source
        hotel_review_list = try_button(driver, response, user_url, hotel_review_list)   
    except Exception: # retry getting user_url with Selenium (Ctrl-C still interrupts the run)
        driver.get(user_url)
        t.sleep(1.5)
        try:
            response = driver.page_source
        except Exception as e: # print reason for why it fails
            print(e)
        else: # check if 'show more' button is clickable
            hotel_review_list = try_button(driver, response, user_url, hotel_review_list)
            
    return hotel_review_list
 

def try_button(driver, response, user_url, hotel_review_list):   # check if 'show more' button is clickable
    
    """
    
    Click the 'show more' button when there is one, then parse the reviewer page.
    
    driver is the Selenium driver showing the reviewer page
    
    response is the page source captured before the click; it is parsed as is
    when the button is missing or cannot be clicked
    
    user_url and hotel_review_list are handed on to iterate_reviewer_page()
    
    Returns the hotel_review_list produced by iterate_reviewer_page().
    
    """
    
    from selenium.common.exceptions import WebDriverException
    
    try:
        # find_element("class name", ...) is available in Selenium 3 and 4
        python_button = driver.find_element("class name", "social-show-more-ShowMore__button_contents--1djai")
        python_button.click()
    except WebDriverException:
        # no button or not clickable: keep the source that was passed in
        pass
    else:
        # read the source again, otherwise the content revealed by the click
        # never reaches the parser
        t.sleep(1.5)
        response = driver.page_source
        
    soup = BeautifulSoup(response, 'lxml')
    return iterate_reviewer_page(soup, user_url, hotel_review_list)

    
def iterate_reviewer_page(soup, user_url, hotel_review_list):    # filter out reviews that are not of hotels from the 16 european cities 
    
    """
    
    Scrape every review card of a reviewer page that links to a hotel in one of the 16 cities.
    
    soup is the parsed reviewer page
    
    user_url is the reviewer webpage at TripAdvisor
    
    hotel_review_list is the DataFrame of hotel reviews collected so far
    
    Cards without a link, and links that are not hotel reviews from the 16
    cities, are skipped. Returns the extended hotel_review_list.
    
    """
    
    # endings of the hotel review links of the 16 cities
    CITY_PAGES = (
        'London_England.html',
        'Paris_Ile_de_France.html',
        'Rome_Lazio.html',
        'Barcelona_Catalonia.html',
        'Berlin.html',
        'Vienna.html',
        'Prague_Bohemia.html',
        'Budapest_Central_Hungary.html',
        'Athens_Attica.html',
        'Florence_Tuscany.html',
        'Milan_Lombardy.html',
        'Madrid.html',
        'Lisbon_District_Central_Portugal.html',
        'Edinburgh_Scotland.html',
        'Amsterdam_North_Holland_Province.html',
        'Brussels.html',
    )
    
    reviews = soup.findAll('div', class_ = 'social-sections-CardSection__card_section--3Hc9Y ui_card section')
    for r in reviews:
        container = r.find('div', class_="social-sections-POICarousel__container--297jy social-sections-POICarousel__carousel--1vz03")
        if container is None: # card without a carousel, nothing to follow
            continue
        link = container.find('a', href=True)
        if link is None:
            continue
        hotel = link['href']    # grab website link info
        
        # check if it is a hotel review and from the 16 cities
        if "Hotel_Review" in hotel and any(city in hotel for city in CITY_PAGES):
            hotel_review_list = scrape_reviews(r, user_url, hotel_review_list, hotel)
            
    return hotel_review_list


def scrape_reviews(r, user_url, hotel_review_list, hotel):
    
    """
    
    Turn one review card of a reviewer page into a row of hotel_review_list.
    
    r is the review card
    
    user_url is the reviewer webpage at TripAdvisor
    
    hotel_review_list is the DataFrame of hotel reviews collected so far
    
    hotel is the relative link of the reviewed hotel
    
    Missing reviewer name, title, preview or date fall back to a placeholder
    or an empty string. Returns hotel_review_list with the new row appended
    and the index renumbered.
    
    """
    
    try:
        reviewer = r.find('a', class_ = 'ui_link social-member-event-MemberEventOnObjectBlock__member--35-jC').get_text()
    except AttributeError:
        reviewer = "NONE PROVIDED" # same placeholder as in iterate_review_page()
    try:
        title = r.find('div', class_="social-sections-ReviewSection__title--35ISZ social-sections-ReviewSection__linked--2rTun").get_text()
    except AttributeError:
        title = ''
    try:
        preview = r.find('q', class_="social-sections-ReviewSection__quote--3gE7d").get_text()
    except AttributeError:
        preview = ''
    # last character left once the closing markup is stripped -
    # presumably the leading digit of the bubble_NN class; TODO confirm
    rating = r.find('span', class_="ui_bubble_rating")
    rating = str(rating).strip('"0></span>')[-1:]
    hotel = 'http://www.tripadvisor.com' + hotel
    try:
        date = r.find('div', class_ = 'social-review-info-EventDate__event_date--2d3vn').find('span').get_text()
    except AttributeError:
        date = ''
    # date[14:] drops the first 14 characters (presumably the 'Date of stay: ' label)
    df = pd.DataFrame({'uid':reviewer, 'user_link':user_url, 'rating':float(rating), 'hotel_link':hotel, 'date_of_stay':date[14:], 'title':title, 'review_preview':preview}, 
                                  columns=['uid', 'user_link', 'rating', 'hotel_link', 'date_of_stay', 'title', 'review_preview'], index=[0])
    return pd.concat([hotel_review_list, df], ignore_index=True)


In [None]:
# empty frame with the columns that scrape_reviews() fills, one row per hotel review
hotel_review_list = pd.DataFrame(columns=['uid', 'user_link', 'rating', 'hotel_link', 'date_of_stay', 'title', 'review_preview'])    

In [None]:
# start the Chrome browser that process_other_reviews() drives
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument("--test-type")
# `options=` replaces the deprecated `chrome_options=` keyword, which newer
# Selenium releases no longer accept
driver = webdriver.Chrome(options=options)

In [None]:
# visit up to the first 10000 reviewer pages and collect their hotel reviews
for index, user_url in enumerate(reviewer_list.user_link[0:10000]):
    print(index)  # keeping track of progress in case of error
    hotel_review_list = process_other_reviews(user_url, hotel_review_list)

### Preliminary Data Evaluation