In [10]:
import requests
import pandas as pd
import time
from IPython.display import clear_output

In [11]:
#Epoch (Unix) timestamp conversion to GMT+0 time
#Because the Traveloka API returns epoch (Unix) timestamps, we need to convert them to GMT+0 time
def epoch_to_date(epoch):
    """Format an epoch timestamp as a 'YYYY-MM-DD HH:MM:SS' string in GMT+0.

    `epoch` is passed straight to `time.gmtime`, so it is interpreted as
    seconds since 1970-01-01 00:00:00 UTC.
    """
    utc_struct = time.gmtime(epoch)
    date_format = '%Y-%m-%d %H:%M:%S'
    return time.strftime(date_format, utc_struct)

#If you want to convert it to local time, use this function
# def epoch_to_date(epoch):
#     return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))

#--------------------------------------------
#get ID from Traveloka hotel URL
def get_id(url):
    """Extract the trailing ID from a hotel URL and echo it for manual checking.

    The ID is the text after the last '-' of the last path segment, e.g.
    '.../dalat-palace-heritage-hotel-1000000119097' -> '1000000119097'.

    Args:
        url: Hotel page URL (str).

    Returns:
        The extracted ID as a string (not validated to be numeric).
    """
    # Strip query string, fragment and trailing slashes first: otherwise a URL
    # copied from the browser ('...-123?spec=a-b' or '...-123/') would yield
    # a piece of the query string or an empty ID.
    path = url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
    # Named hotel_id so the builtin `id` is not shadowed.
    hotel_id = path.rsplit('/', 1)[-1].rsplit('-', 1)[-1]
    print('ID: ', hotel_id)
    print('URL: ', url)
    return hotel_id

#--------------------------------------------
#Load DataFrame
def load_df(fileName='Traveloka_comments.csv'):
    """Load the saved comments CSV, or start an empty DataFrame if there is none.

    Args:
        fileName: Path of the CSV file to read.

    Returns:
        The DataFrame read from `fileName`, or an empty DataFrame with the
        review columns when the file is missing or completely empty.

    Raises:
        Any other error from `pd.read_csv` (malformed CSV, permission denied,
        ...) is propagated. Silently replacing an unreadable file with an
        empty frame would let a later save overwrite the existing data.
    """
    #check if file exist
    try:
        df = pd.read_csv(fileName)
        print('File exist')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        #Create DataFrame to store comments
        df = pd.DataFrame(columns=['profileId', 'reviewerName', 'reviewText', 'timestamp', 'overallScore', 'referer', 'hotelId', 'location'])
        print('File not exist. Will create new file to save DataFrame')
    df.info()
    #---------------------------------
    #profileId: ID of reviewer
    #reviewerName: Name of reviewer
    #reviewText: Review text
    #timestamp: Timestamp of review
    #overallScore: Rating of review
    #referer: URL of hotel
    #hotelId: ID of hotel
    #location: Location of hotel
    return df

In [12]:
#Because the Traveloka API only returns {top} comments per request, we need to loop to get all comments
#set skip - offset of the first row to read
#set top - number of rows to read - maximum 50 rows per request

#Get comments from the Traveloka API
def get_comments(referer, location, skip=0, top=50):
    """Download the reviews of one hotel and add them to the saved DataFrame.

    The saved DataFrame is loaded with `load_df()`. If `referer` is already
    present in its 'referer' column the hotel is skipped and the frame is
    returned unchanged. Otherwise the user is asked to confirm, and the review
    endpoint is requested page by page (`top` reviews per request, starting at
    offset `skip`) until a page comes back empty or a request/parsing error
    occurs. Reviews gathered before an error are kept.

    Args:
        referer: Hotel page URL; also sent as the 'referer' request header and
            stored in the 'referer' column.
        location: Value stored in the 'location' column of every review row.
        skip: Offset of the first review to request.
        top: Number of reviews per request.

    Returns:
        The loaded DataFrame with the newly downloaded reviews appended.
    """
    clear_output(wait=True)
    #get hotel id (named hotel_id so the builtin `id` is not shadowed)
    hotel_id = get_id(referer)
    print('Please check if the URL is correct. If not, please re-run the cell')

    #load DataFrame
    df = load_df()

    #check if the hotel has been crawled
    if referer in df['referer'].values:
        clear_output(wait=True)
        print('This video has been crawled before')
        time.sleep(3)
        return df

    time.sleep(3)
    input('Press Enter to continue...')

    # NOTE(review): these look like cookies captured from a logged-in browser
    # session. Hardcoding them leaks session data and they will expire; they
    # should be loaded from an environment variable / untracked config file.
    cookies = {
        'tv-repeat-visit': 'true',
        '_gcl_au': '1.1.1017910713.1661435395',
        'isPriceFinderActive': 'null',
        'dateIndicator': 'null',
        'bannerMessage': 'null',
        'displayPrice': 'null',
        'G_ENABLED_IDPS': 'google',
        'tvs': 'qgdHX7GvehrD9XH5a3S4PWL3Nd74xArIuT+JzcRMbKddQHovERAJ9HWRLrAaZ0jPhWj5HSxm0ZKiRbldET1ham2PeYg1sQr2h/wIBjIyPQ1JQfOnq9PrXiJXCb7pG+GuuP17PrNVFFjuwKMBeIKyuK+0493pXsMgoPKzowqh5FHmfLa6OXRzK2k6wXi6oejz3jW7f6f85zK7XA1xLrLbn3wpMY91AYFzJ6h8za/vSrng40uUoDT+qJIv0oQGNB1A',
        'accomSuccessLoginConfirmation': '0',
        '_gac_UA-29776811-12': '1.1661935095.Cj0KCQjwjbyYBhCdARIsAArC6LKzmyRRvXAMs3lLTy2PFEFZHdQ4n0b5k1dCA4dOXbdkN2BvAG6s-C0aAiRxEALw_wcB',
        'hotelSearchLoginModalLastShown': '1661935102026',
        'AWSALB': 'Yn8Nyumwyk6usTiVIO4JNLwZA718+IBkWJWrDuaT4biZxbFhNU+VGtCektX4usv4Qkg803D0DHCq78fHuwTIl8GuC0l/a0eht4jCx9nej9SwsXArLyzHZPyFuuoN',
        'AWSALBCORS': 'Yn8Nyumwyk6usTiVIO4JNLwZA718+IBkWJWrDuaT4biZxbFhNU+VGtCektX4usv4Qkg803D0DHCq78fHuwTIl8GuC0l/a0eht4jCx9nej9SwsXArLyzHZPyFuuoN',
        'g_state': '{"i_p":1662643153466,"i_l":3}',
        '_gid': 'GA1.2.1415715167.1662516990',
        '_ga_RSRSMMBH0X': 'GS1.1.1662516990.18.1.1662517018.32.0.0',
        '_ga': 'GA1.2.1204082391.1661435395',
        'amp_1a5adb': '70q-mk3SbWBtM5kQ2GjtSF...1gcaro3vq.1gcarp0lb.4o.0.4o',
        'cto_bundle': '96btg19GZ3lGRUw3NElrQ1NNT0I4RUhidkZPUTU2alphdjJWTmVvV2ZVVUsxWHJ6MUNVb3lCc2NsblJ6S0QlMkJUS24lMkZubkhCck9CR2hNVld0RkJNVHJEOFhNa2U4RW5hNlg0Z0prUUpmdVVvcWxteTFqT2dZTjlJRGJtdnZKN2o5UE9Oc242MCUyRlR1OEJLOU4xVVBRc3NpTEpQR0ElM0QlM0Q',
        'tvl': 'qgdHX7GvehrD9XH5a3S4PdE8AYpuF3hYPaT5bxhY7ZZYNvDSDpXBMUsp5GdrScLsDPg+mEcMWJMs178u4tDsH7rjhAUzrfJvdipehLzltUaDoSLSsuckjZeqKsWweENOE9hK03kFgdqMgioOjtezhPmz/Peism2eMCojsihyNOrvlyHfFnPptZUxAgMVwRNSCMYWUJplNNMY2P4/83O9X+8GNrPf8Ng75ZieUaJama8=',
    }

    # Headers do not change between pages, so they are built once.
    headers = {
        'authority': 'www.traveloka.com',
        'accept': '*/*',
        'accept-language': 'en,vi;q=0.9',
        'origin': 'https://www.traveloka.com',
        'referer': referer,
        'sec-ch-ua': '"Microsoft Edge";v="105", " Not;A Brand";v="99", "Chromium";v="105"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.27',
        'x-domain': 'accomContent',
        'x-route-prefix': 'vi-vn',
    }

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
    # row by row is quadratic.
    rows = []
    request_count = 0

    while True:
        request_count += 1
        try:
            json_data = {
                'fields': [],
                'data': {
                    'filterSortSpec': {
                        'travelTheme': None,
                        'travelType': None,
                        'sortType': 'LANGUAGE',
                        'tagIds': [],
                    },
                    'ascending': True,
                    'reviewLanguage': 'VIETNAMESE',
                    'hotelId': hotel_id,
                    'skip': skip, #same with cursor, the number of reviews to skip
                    'top': top, #max 50 reviews per request
                },
                'clientInterface': 'desktop',
            }
            # timeout: without it a stalled connection blocks the cell forever
            response = requests.post('https://www.traveloka.com/api/v2/hotel/getHotelReviews', cookies=cookies, headers=headers, json=json_data, timeout=30)
            response.raise_for_status()
            review_list = response.json()['data']['reviewList']

            # An empty page means every review has been read. Without this
            # check the loop could only ever end through an exception.
            if not review_list:
                break
            skip += top

            for data in review_list:
                rows.append({
                    'profileId': data['profileId'],         #ID of reviewer
                    'reviewerName': data['reviewerName'],   #Name of reviewer
                    'reviewText': data['reviewText'],       #Review text
                    'timestamp': epoch_to_date(data['timestamp']),  #Timestamp of review
                    'overallScore': data['overallScore'],   #Rating of review
                    'referer': referer,                     #URL of hotel
                    'hotelId': data['hotelId'],             #ID of hotel
                    'location': location,                   #Location of hotel
                })

            #print progress (number of reviews collected so far)
            clear_output(wait=True)
            print(f'Progress: {len(rows)}')
        #print error and stop paging; reviews collected so far are kept
        except Exception as e:
            print(e)
            print(f"Error at {request_count}th request")
            break

    if rows:
        new_rows = pd.DataFrame(rows)
        # Skip concat when nothing was loaded, so an empty frame cannot
        # influence the resulting dtypes.
        df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
    return df

In [14]:
if __name__ == '__main__':
    # Crawl the reviews of every hotel listed in 'Traveloka_AllRegion.csv',
    # asking after each hotel whether to save and whether to go on.
    #
    # Fix: the previous inner `while not stop` loop re-crawled the SAME hotel
    # when the user answered 'y' to "continue" and left the whole `for` loop on
    # any other answer, so the second hotel was never reached. Now 'y' moves on
    # to the next hotel and anything else stops.

    # Local import: only `clear_output` is imported at the top of the notebook.
    # A bare `df.head()` inside a loop displays nothing, hence display().
    from IPython.display import display

    #load Hotel region Dataframe
    dfHotel = pd.read_csv('Traveloka_AllRegion.csv')
    for record in dfHotel.itertuples():
        #get hotel url
        referer = 'https://www.traveloka.com' + record.link
        #get hotel location
        location = record.city

        #Dataframe description
        df = get_comments(referer, location)
        df.info()
        display(df.head())
        time.sleep(1)

        #want to save
        save = input('Do you want to save the DataFrame? (y/n): ')
        clear_output(wait=True)
        if save.lower() == 'y':
            df.to_csv('Traveloka_comments.csv', index=False)
            print('DataFrame saved')
        else:
            print('DataFrame not saved')

        #anything other than 'y' stops the crawl
        if input('Do you want to continue? (y/n): ').lower() != 'y':
            break
    print('Done')

DataFrame not saved
Done
