# Data Sources

**Contents**
1. TripAdvisor Web Scraping
2. Yelp API
3. Google Reviews

## 1. TripAdvisor Web Scraping

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import time
from pprint import pprint

In [None]:
# Start the Chrome session that the TripAdvisor scraping cells drive.
# NOTE(review): the chromedriver location is specific to one machine, and newer
# Selenium releases may expect it through a Service object — confirm the
# installed version before reusing this cell.
chromedriver_path = "/Users/alliewu/Desktop/DataScience_Projects/SF_Top_Attractions/chromedriver"
driver = webdriver.Chrome(chromedriver_path)

In [None]:
def _split_location_contributions(text):
    """Split a profile line such as "San Francisco, CA 123 contributions".

    Returns a ``(city, country, contribution)`` tuple of strings. The string
    ``'None'`` stands in for a city or country that is not present.
    """
    # The region group is lazy so it cannot absorb the leading digits of the
    # contribution count (a greedy [\w\s]+ would leave only the last digit).
    located = re.match(
        r'([\w\s]+),\s*([\w\s]*?)\s*(\d[\d,]*)\s*contributions?', text)
    if located:
        city = located.group(1).strip() or 'None'
        country = located.group(2).strip() or 'None'
        return city, country, located.group(3)

    # No "city, region" prefix: keep just the count when one can be found.
    count_only = re.search(r'(\d[\d,]*)\s*contributions?', text)
    if count_only:
        return 'None', 'None', count_only.group(1)
    return 'None', 'None', text.replace('contributions', '').strip()


def _parse_review_chunk(chunk):
    """Turn the text of one review into a dict of column values.

    Returns ``None`` when the text has fewer than the six lines that are
    read, so an incomplete review is dropped as a whole instead of leaving
    some columns filled and others not.
    """
    lines = chunk.split('\n')
    if len(lines) < 6:
        return None

    city, country, contribution = _split_location_contributions(lines[1])

    # The date is the part of the fifth line before ' • ', read as
    # "<month> <year>".
    date_tokens = lines[4].split(' • ')[0].split()
    if len(date_tokens) >= 2:
        month, year = date_tokens[0], date_tokens[1]
    else:
        month = year = 'None'

    return {
        'username': lines[0].strip(),
        'city': city,
        'country': country,
        'contribution': contribution,
        'title': lines[3].strip(),
        'month': month,
        'year': year,
        'review': lines[5].strip(),
    }


def scrape_review(name, max_pages=25):
    """Scrape the reviews of the attraction page currently open in ``driver``.

    Relies on the module-level Selenium ``driver`` already showing the
    attraction's review tab.

    Args:
        name: attraction name written to the ``attraction`` column.
        max_pages: upper bound on the number of review pages visited.

    Returns:
        A DataFrame with the columns attraction, username, city, country,
        contribution, title, month, year, review and rating, one row per
        review that could be parsed.
    """
    # Imported here so the cell stays self-contained.
    from selenium.common.exceptions import NoSuchElementException

    disclaimer = '\nThis review is the subjective opinion of a Tripadvisor member and not of Tripadvisor LLC. Tripadvisor performs checks on reviews.\n'
    next_page_xpath = '//*[@id="tab-data-qa-reviews-0"]/div/div[5]/div/div[11]/div[1]/div/div[1]/div[2]/div/a'
    columns = ['attraction', 'username', 'city', 'country', 'contribution',
               'title', 'month', 'year', 'review', 'rating']

    rows = []
    for _ in range(max_pages):
        container = driver.find_element(By.CLASS_NAME, 'LbPSX')

        # Every review ends with the disclaimer; whatever follows the last
        # disclaimer is not a review, so it is discarded.
        chunks = container.text.split(disclaimer)
        chunks.pop(-1)

        rating_labels = [
            element.get_attribute("aria-label")
            for element in container.find_elements(By.CLASS_NAME, 'UctUV.d.H0')
        ]

        for position, chunk in enumerate(chunks):
            row = _parse_review_chunk(chunk)
            if row is None:
                continue
            # NOTE(review): assumes the n-th rating element on the page
            # belongs to the n-th review — confirm against the page markup.
            label = rating_labels[position] if position < len(rating_labels) else None
            label_tokens = label.split() if label else []
            row['rating'] = label_tokens[0] if label_tokens else 'None'
            row['attraction'] = name
            rows.append(row)

        # Without a "next" link there are no further pages; stop and keep
        # what has been collected instead of failing the whole attraction.
        try:
            driver.find_element(By.XPATH, next_page_xpath).click()
        except NoSuchElementException:
            break
        time.sleep(3)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Visit each attraction listed in SF_places.csv, open the language filter,
# record the page's headline rating text and collect that attraction's reviews.
place = pd.read_csv('SF_places.csv')
place_rating = []
review_columns = ['attraction', 'username', 'city', 'country',
                  'contribution', 'title', 'month', 'year', 'review', 'rating']
all_reviews = pd.DataFrame(columns=review_columns)

# XPaths clicked in order: the language filter button, then its
# "menu-item-all" entry.
language_button = '//*[@id="tab-data-qa-reviews-0"]/div/div[1]/div/div/div[2]/div/div/div[2]/div/div/div/button'
all_language_button = '//*[@id="menu-item-all"]'

for i in range(len(place)):
    try:
        url = place['url'][i]
        driver.get(url)
        driver.implicitly_wait(5)
        # Scroll down and pause before interacting with the review section.
        driver.execute_script("window.scrollBy(0, 2000);")
        time.sleep(10)

        for xpath in (language_button, all_language_button):
            driver.find_element(By.XPATH, xpath).click()
            time.sleep(5)

        rating_element = driver.find_element(By.CLASS_NAME, 'biGQs._P.fiohW.hzzSG.uuBRH')
        place_rating.append(rating_element.text)

        name = place['name'][i]
        reviews = scrape_review(name=name)

        all_reviews = pd.concat([all_reviews, reviews])

    except Exception as e:
        # Best effort: report the failing row and move on to the next one.
        print(f"An error occurred while processing row {i}: {e}")


In [None]:
# Preview the first rows of the combined TripAdvisor review table.
all_reviews.head()

## 2. Yelp API
- API documentation: https://docs.developer.yelp.com/docs/fusion-intro
- This step produces two DataFrames: the businesses around SF, and the reviews for those businesses

In [None]:
import pandas as pd
import time
import requests
import json

In [None]:
# API limits: n_review <= 3 and n_business <= 50 per request; only part of each review's text is returned

def get_yelp(key, loc = "San Francisco, CA", t_sleep = 0.1, n_business = 50, n_review = 3, timeout = 30):
    """Fetch businesses near ``loc`` and their reviews from the Yelp Fusion API.

    Args:
        key: API key, sent as a bearer token.
        loc: location string passed to the business search.
        t_sleep: seconds to pause before each per-business review request.
        n_business: ``limit`` sent to the business search.
        n_review: ``limit`` sent to each review request.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        ``[businesses_df, reviews_df]``. Every review row carries the
        ``business_id`` it belongs to; ``reviews_df`` is empty when the
        search returns no businesses.

    Raises:
        requests.HTTPError: when the API answers with an error status, so a
            bad key or a rate limit is reported as such rather than as a
            missing "businesses"/"reviews" key.
    """
    headers = {"Authorization": f"Bearer {key}"}

    # Per the original note: the API does not return businesses without any reviews.
    search = requests.get(
        "https://api.yelp.com/v3/businesses/search",
        headers=headers,
        params={"limit": n_business, "location": loc},
        timeout=timeout,
    )
    search.raise_for_status()
    businesses = search.json()["businesses"]
    businesses_pd = pd.DataFrame(businesses)

    review_frames = []
    for business in businesses:
        time.sleep(t_sleep)
        review_response = requests.get(
            f"https://api.yelp.com/v3/businesses/{business['id']}/reviews",
            headers=headers,
            params={"limit": n_review, "sort_by": "newest"},
            timeout=timeout,
        )
        review_response.raise_for_status()
        reviews_pd = pd.DataFrame(review_response.json()["reviews"])
        reviews_pd['business_id'] = business['id']
        review_frames.append(reviews_pd)

    # pd.concat cannot combine an empty list, so return an empty table instead.
    if not review_frames:
        return [businesses_pd, pd.DataFrame(columns=['business_id'])]
    return [businesses_pd, pd.concat(review_frames, ignore_index=True)]

In [None]:
# The Yelp API key is read from the environment so the credential is not
# stored in the notebook. A key that was previously committed here should be
# treated as exposed and revoked.
import os

api_key = os.environ.get("YELP_API_KEY")
if not api_key:
    raise RuntimeError("Set the YELP_API_KEY environment variable to your Yelp Fusion API key.")
businesses, reviews = get_yelp(key = api_key)

In [None]:
# Preview the first five businesses returned by get_yelp.
businesses.head(5)

In [None]:
# Preview the first five review rows returned by get_yelp.
reviews.head(5)

## 3. Google Reviews 

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [None]:
# Reload the attraction list and list the files in new_google_review/.
# The `!ls` shell escape only works inside IPython/Jupyter; the cell that
# combines the files starts at index 1, so it relies on this listing's order.
place = pd.read_csv('SF_places.csv')
csv_name = !ls new_google_review
csv_name

In [None]:
def load_data(filename, data_dir='/Users/alliewu/Desktop/DataScience_Projects/SF_Top_Attractions/new_google_review'):
    """Read one review CSV and tag every row with its attraction name.

    The attraction name is derived from the file name: the prefix up to the
    first underscore and the extension are removed, and the remaining
    underscores become spaces, e.g. ``google_Adventure_Playground.csv``
    becomes ``Adventure Playground``. A file name without an underscore
    keeps its whole stem.

    Args:
        filename: name of the CSV file inside ``data_dir``.
        data_dir: directory that holds the CSV files.

    Returns:
        The file's contents as a DataFrame with an added ``attraction`` column.
    """
    from pathlib import Path

    data = pd.read_csv(Path(data_dir) / filename)

    # Drop the prefix before the first underscore, if there is one.
    _, separator, remainder = filename.partition('_')
    label = remainder if separator else filename

    # Drop the extension, then turn the remaining underscores into spaces.
    label = label.rsplit('.', 1)[0].replace('_', ' ')

    data['attraction'] = label
    return data

In [None]:
# Load every listed file and stack the tables in one pass. Concatenating all
# frames at once does not depend on which file happens to be first in
# csv_name, and avoids re-merging the growing table once per file.
data = pd.concat(
    [load_data(filename=filename) for filename in csv_name],
    ignore_index=True,
)

In [None]:
# Row and column counts of the combined Google review table.
data.shape

In [None]:
# Preview the first five rows of the combined Google review table.
data.head(5)

In [None]:
# Replace the raw scraped column labels with readable names.
google_column_names = {
    'd4r55': 'username',
    'RfnDt 2': 'contributions',
    'rsqaWe': 'time',
    'wiI7pd': 'review',
    'kyuRq 2': 'language',
}
data = data.rename(columns=google_column_names)

In [None]:
# Column dtypes and non-null counts after renaming.
data.info()

In [None]:
# The five star-icon columns; each ends up holding 1 (filled) or 0 (empty).
star_columns = ['hCCjke src', 'hCCjke src 2', 'hCCjke src 3', 'hCCjke src 4', 'hCCjke src 5']
icon_to_flag = {
    'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_14.png': 1,
    'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_empty_14.png': 0,
}

# The first star is set to 1 for every row; the other four are translated
# from their icon URL.
data[star_columns[0]] = [1]*data.shape[0]
for star_column in star_columns[1:]:
    data[star_column] = data[star_column].replace(icon_to_flag)

# The rating is the row-wise total of the five star flags.
data['rating'] = data.apply(lambda row: sum(row[column] for column in star_columns), axis=1)

# A missing language is filled in as English.
data['language'] = data['language'].replace(np.nan, 'English')

# Strip the ' reviews' suffix and the '· ' marker from the contribution text.
data['contributions'] = [
    str(raw).replace(' reviews', '').strip().replace('· ', '').strip()
    for raw in data['contributions']
]

In [None]:
def convert_date(date_str, today=None):
    """Convert a relative age such as "3 months ago" into a "Mon YYYY" label.

    Args:
        date_str: relative date text; anything that is not a string ending
            in "ago" yields NaN.
        today: reference date to count back from; defaults to
            ``datetime.today()``. Passing it makes the result reproducible.

    Returns:
        The month and year as formatted by ``strftime('%b %Y')``, or
        ``np.nan`` when the text cannot be interpreted.
    """
    if not isinstance(date_str, str):
        return np.nan
    text = date_str.strip()
    if not text.endswith('ago'):
        return np.nan

    reference = datetime.today() if today is None else today

    # Use the first number anywhere in the text, so a leading word such as
    # "Edited" does not hide it; "a"/"an" (no number at all) counts as 1.
    num = next((int(token) for token in text.split() if token.isdecimal()), 1)

    if 'day' in text:
        stamp = reference - timedelta(days=num)
    elif 'week' in text:
        stamp = reference - timedelta(weeks=num)
    elif 'month' in text or 'year' in text:
        # Step back whole calendar months: 30- or 365-day approximations can
        # land in the wrong month (e.g. one month before 31 March).
        months_back = num if 'month' in text else num * 12
        month_index = reference.year * 12 + (reference.month - 1) - months_back
        if month_index < 12:
            # Would fall before year 1, which datetime cannot represent.
            return np.nan
        stamp = datetime(month_index // 12, month_index % 12 + 1, 1)
    elif any(unit in text for unit in ('hour', 'minute', 'second')):
        # Less than a day old: same month as the reference date.
        stamp = reference
    else:
        return np.nan
    return stamp.strftime('%b %Y')

# Turn each relative review age into a "Mon YYYY" label.
data['time'] = list(map(convert_date, data['time']))

In [None]:
# Columns carried into the final Google review table ('language' is left out).
keep_columns = [
    'attraction',
    'username',
    'contributions',
    'time',
    'review',
    'rating',
]

# Select just those columns, in that order.
google_reviews = data.loc[:, keep_columns]

In [None]:
# Column dtypes and non-null counts of the trimmed table.
google_reviews.info()

In [None]:
# Discard rows that have no review text (modifies google_reviews in place),
# then preview the result.
google_reviews.dropna(subset=['review'], inplace=True)
google_reviews.head(5)