<h2>Web Scraping no TripAdvisor</h2>

    Na mesma pasta, deve haver um arquivo txt com os links de todos os restaurantes dos quais se quer extrair os reviews. Se o txt tiver n restaurantes, o script vai gerar n+1 arquivos csv, sendo o (n+1)-ésimo csv o "CSV_FINAL.csv", onde se tem todos os reviews equalizados e binários. Por "binários" entenda-se que há uma coluna que terá o valor 1 se a nota for maior ou igual a 4 e o valor 0 se a nota for estritamente menor que 4. Por "equalizado" entenda-se que 50% dos reviews têm a label 1 e 50% têm a label 0.

In [None]:
import csv
import io
import os
import re
import webbrowser

import numpy as np
import pandas as pd
import requests
import textblob
from bs4 import BeautifulSoup

In [3]:
def display(content, filename='output.html'):
    """Write ``content`` to ``filename`` and open that file in the web browser.

    Args:
        content: Bytes to write; the file is opened in binary mode.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'wb') as f:
        f.write(content)

    # Open the page only after the ``with`` block has flushed and closed the
    # file, so the browser never reads a partially written document.
    webbrowser.open(filename)
        
def get_soup(session, url, show=False):
    """Fetch ``url`` with a GET request and parse the answer as HTML.

    Args:
        session: Object exposing ``get(url)`` (a ``requests.Session`` here).
        url: Address to download.
        show: When true, dump the raw response to ``temp.html`` via ``display``.

    Returns:
        A ``BeautifulSoup`` tree for a 200 response, otherwise ``None``.
    """
    response = session.get(url)

    if show:
        display(response.content, 'temp.html')

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    # Any other status is reported and the caller receives None.
    print('[get_soup] status code:', response.status_code)
    return None
    
    
def post_soup(session, url, params, show=False):
    """Send ``params`` to ``url`` with a POST request and parse the answer as HTML.

    Args:
        session: Object exposing ``post(url, data=...)`` (a ``requests.Session`` here).
        url: Address that receives the form data.
        params: Form fields sent as the request body.
        show: When true, dump the raw response to ``temp.html`` via ``display``.

    Returns:
        A ``BeautifulSoup`` tree for a 200 response, otherwise ``None``.
    """
    response = session.post(url, data=params)

    if show:
        display(response.content, 'temp.html')

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    # Any other status is reported and the caller receives None.
    print('[post_soup] status code:', response.status_code)
    return None


def scrape(url, lang='ALL'):
    """Scrape every review of one page for the requested language.

    Args:
        url: Address of the page to scrape.
        lang: Value appended to the URL as the ``filterLang`` query parameter.

    Returns:
        Whatever ``parse`` returns for the language-filtered URL.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    }

    # One session for the whole run so cookies persist between requests.
    session = requests.Session()
    session.headers.update(browser_headers)

    filtered_url = ''.join((url, '?filterLang=', lang))
    return parse(session, filtered_url)

def parse(session, url):
    """Collect reviews by walking the paginated review pages derived from ``url``.

    The page at ``url`` is fetched once to log the total review count. After
    that, sub-pages are requested through ``parse_reviews`` until one comes
    back empty or holds fewer than five reviews.

    Args:
        session: Session object handed to ``get_soup`` and ``parse_reviews``.
        url: Page address; every ``.html`` in it is rewritten to
            ``-or<offset>.html`` to build the sub-page addresses.

    Returns:
        A list with the review dicts of all visited sub-pages, or ``None``
        when the page at ``url`` could not be fetched.
    """
    print('[parse] url:', url)

    soup = get_soup(session, url)

    if not soup:
        print('[parse] no soup:', url)
        return None

    # The counter is only logged, so a missing or unparsable value must not
    # abort the scrape; ``None`` is printed in that case.
    num_reviews = None
    counter = soup.find('span', class_='reviews_header_count')
    if counter is not None:
        # Drop the first and last character (presumably the surrounding
        # parentheses - TODO confirm) and the thousands commas.
        raw_count = counter.text[1:-1].replace(',', '')
        try:
            num_reviews = float(raw_count)
        except ValueError:
            num_reviews = None
    print('[parse] num_reviews ALL:', num_reviews)

    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)

    items = []

    # NOTE(review): pagination starts at offset 10, so the page at ``url``
    # itself is never passed to ``parse_reviews`` - confirm this is intended.
    offset = 10

    while True:
        subpage_url = url_template.format(offset)

        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break

        items += subpage_items

        # A short page is treated as the last one.
        if len(subpage_items) < 5:
            break

        offset += 5

    return items

def get_reviews_ids(soup):
    """Return the review ids found in ``soup``, or ``None`` when there are none.

    Every ``div`` carrying a ``data-reviewid`` attribute is located and only
    every second match (positions 0, 2, 4, ...) is kept.
    """
    tagged_divs = soup.find_all('div', attrs={'data-reviewid': True})
    if not tagged_divs:
        return None

    reviews_ids = [div.attrs['data-reviewid'] for div in tagged_divs[::2]]
    print('[get_reviews_ids] data-reviewid:', reviews_ids)
    return reviews_ids
    
def get_more(session, reviews_ids):
    """Request the expanded-review widget for the given review ids.

    Args:
        session: Session object handed to ``post_soup``.
        reviews_ids: Iterable of id strings, sent comma-joined in the
            ``reviews`` form field.

    Returns:
        Whatever ``post_soup`` returns for the widget request.
    """
    endpoint = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'

    form_data = dict(
        reviews=','.join(reviews_ids),
        widgetChoice='EXPANDED_HOTEL_REVIEW_HSX',
        haveJses='earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        haveCsses='apg-Hotel_Review-in',
        Action='install',
    )
    return post_soup(session, endpoint, form_data)


def parse_reviews(session, url):
    """Return the reviews of one listing page as a list of dicts.

    The page is fetched, its review ids are collected and the expanded review
    markup is requested through ``get_more``. Each ``div.reviewSelector`` in
    that markup becomes a dict with the keys ``review_rate`` and
    ``review_body``. The review date is not collected.

    Returns:
        The list of review dicts, or ``None`` when a request fails or the
        page exposes no review ids.
    """
    print('[parse_reviews] url:', url)

    page = get_soup(session, url)
    if not page:
        print('[parse_reviews] no soup:', url)
        return

    reviews_ids = get_reviews_ids(page)
    if not reviews_ids:
        return

    expanded = get_more(session, reviews_ids)
    if not expanded:
        print('[parse_reviews] no soup:', url)
        return

    collected = []

    for review in expanded.find_all('div', class_='reviewSelector'):
        rating_classes = review.select_one('span.ui_bubble_rating')['class']
        # The score is the text after the last underscore of the second class.
        score = rating_classes[1].split('_')[-1]
        body = review.find('p', class_='partial_entry').text

        record = {'review_rate': score, 'review_body': body}
        collected.append(record)

        print('\n--- review ---\n')
        for field, value in record.items():
            print(' ', field, ':', value)

    print()

    return collected


def write_in_csv(items, filename='results.csv',
                 headers=('hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name', 'user location', 'rating'),
                 mode='w'):
    """Write a sequence of dicts to a UTF-8 CSV file.

    Args:
        items: Iterable of dicts; every key must be listed in ``headers``,
            otherwise ``csv.DictWriter`` raises ``ValueError``.
        filename: Path of the CSV file.
        headers: Column names, in output order. The default names differ from
            the ``review_rate`` / ``review_body`` keys built by
            ``parse_reviews``, so callers writing those dicts must pass
            their own headers.
        mode: File mode; the header row is written only for ``'w'``.
    """
    print('--- CSV ---')

    # newline='' lets the csv module control line endings itself; without it
    # Windows output gains a blank line after every row.
    with open(filename, mode, encoding="utf-8", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, headers)

        if mode == 'w':
            writer.writeheader()

        writer.writerows(items)

def read(file):
    """Append the URLs listed in ``file`` (one per line) to ``start_urls``.

    Surrounding whitespace, including the trailing newline, is removed from
    every line. Blank lines and URLs already present in the module-level
    ``start_urls`` list are skipped.

    Args:
        file: Path of the text file holding the URLs.
    """
    # The context manager closes the file even if reading fails.
    with open(file, "r") as handle:
        for line in handle:
            link = line.strip()
            if link and link not in start_urls:
                start_urls.append(link)


In [4]:
# Column names of the per-restaurant CSV files; they match the keys of the
# dicts built by parse_reviews.
DB_COLUMN = 'review_rate'
DB_COLUMN1 = 'review_body'

# Filled by read() with the URLs listed in the text file.
start_urls = []
read('restaurantes_get.txt')

lang = 'pt-br'

headers = [
    DB_COLUMN,
    DB_COLUMN1,
]

for url in start_urls:
    # A line taken from the text file may still carry its trailing newline,
    # which would end up inside the request URL and the output file name.
    url = url.strip()
    if not url:
        continue

    # get all reviews for 'url' and 'lang'
    items = scrape(url, lang)

    if not items:
        print('No reviews')
        continue

    # Name the CSV after the part of the URL that follows 'Reviews-', without
    # the '.html' suffix when there is one.
    filename = url.split('Reviews-')[1]
    if filename.endswith('.html'):
        filename = filename[:-len('.html')]
    filename = filename + '__' + lang
    print('filename:', filename)
    write_in_csv(items, filename + '.csv', headers, mode='w')


In [None]:
def ler_csv(file, PASTA):
    """Load one CSV file from the folder ``PASTA`` into a DataFrame.

    Args:
        file: File name inside ``PASTA``.
        PASTA: Folder that contains the file.

    Returns:
        A DataFrame with the file contents, or ``None`` when ``file`` does
        not end with ``.csv``.
    """
    print("Lendo ", file)

    if not file.endswith(".csv"):
        return None

    caminho = os.path.join(PASTA, file)
    return pd.DataFrame(pd.read_csv(caminho))


In [None]:
# Merge every CSV found in PASTA into the single DataFrame PRIMARY_DF.
# The empty frame keeps PRIMARY_DF defined when the folder has no CSV file.
PRIMARY_DF = pd.DataFrame().rename_axis("Id", index=True)
PASTA = "mix_dataset"
frames = []
i = 0
for file in os.listdir(PASTA):
    NEW_DF = ler_csv(file, PASTA)
    if NEW_DF is None:
        # ler_csv returns None for entries that are not .csv files; they are
        # neither merged nor counted as restaurants.
        continue
    frames.append(NEW_DF)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    PRIMARY_DF = pd.concat(frames, ignore_index=True)
    print(PRIMARY_DF.shape)
    i += 1
print("O algoritmo tem ", i, "restaurantes." )


In [None]:
def DATA_BINARIZADOR_RESULTADOS(data):
    """Add the binary label column ``RESULTADO_BINARIO`` to ``data``.

    The label is 1 where ``review_rate`` is at least 40 and 0 where it is
    below 40 (presumably ratings use a 10-50 scale - TODO confirm).

    Args:
        data: DataFrame with a numeric ``review_rate`` column. It is modified
            in place.

    Returns:
        The same DataFrame object, now holding the extra column.

    Raises:
        ValueError: If any ``review_rate`` value is missing (NaN), because
            such a row can be labelled neither 0 nor 1.
    """
    rates = data['review_rate']

    # Fail with a clear message instead of silently producing fewer labels
    # than rows, which made the column assignment fail with a length error.
    if rates.isna().any():
        raise ValueError("review_rate contains missing values; cannot binarize")

    data['RESULTADO_BINARIO'] = (rates >= 40).astype(int)
    return data

In [None]:
# Add the RESULTADO_BINARIO label column to the merged reviews, then preview
# the first rows.
PRIMARY_DF = DATA_BINARIZADOR_RESULTADOS(PRIMARY_DF)
PRIMARY_DF.head()

In [None]:
# Save the labelled reviews; to_csv also writes the row index by default.
PRIMARY_DF.to_csv('CSV_FINAL.csv')

In [None]:
# Reload the labelled reviews from disk into a fresh DataFrame.
data = pd.read_csv('CSV_FINAL.csv')

In [None]:
# Remove the "Unnamed: 0" column (presumably the row index that to_csv saved
# without a header - confirm), then preview the first rows.
data = data.drop("Unnamed: 0", axis=1)
data.head()


In [None]:

# Balance the two classes by random undersampling: rows are removed at random
# from whichever label (0 or 1) is more frequent until both labels have the
# same number of rows. If one label is absent the other is removed entirely.
labels = data['RESULTADO_BINARIO']
target = min(int((labels == 0).sum()), int((labels == 1).sum()))

# Index labels of the rows removed to balance the classes.
numeros = []
for label in (0, 1):
    candidates = labels[labels == label].index
    surplus = len(candidates) - target
    if surplus > 0:
        # replace=False guarantees that no row is picked twice.
        numeros.extend(
            np.random.choice(candidates, size=surplus, replace=False).tolist()
        )

# Drop all surplus rows at once and renumber the remaining ones from 0.
data = data.drop(index=numeros).reset_index(drop=True)


In [None]:
# Save the class-balanced reviews; to_csv also writes the row index by default.
data.to_csv('CSV_FINAL_EQUALIZADO.csv')