In [20]:
import requests
import pandas as pd
import time
from IPython.display import clear_output

In [21]:
#Epoch (Unix) timestamp conversion to GMT+0 time
#Because the TikTok API returns epoch (Unix) timestamps, we need to convert them to GMT+0 time
def epoch_to_date(epoch):
    """Format an epoch (Unix) timestamp as 'YYYY-MM-DD HH:MM:SS' in GMT+0.

    Parameters
    ----------
    epoch : int or float
        Seconds since 1970-01-01 00:00:00 GMT+0.

    Returns
    -------
    str
        The timestamp rendered with the pattern '%Y-%m-%d %H:%M:%S'.
    """
    # gmtime() interprets the value in GMT+0 regardless of the machine's time zone.
    utc_fields = time.gmtime(epoch)
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    return time.strftime(timestamp_format, utc_fields)

#If you want to convert it to local time, use this function
# def epoch_to_date(epoch):
#     return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))

#--------------------------------------------
#Check URL
def check_url(url):
    """Return True when ``url`` contains both 'tiktok.com' and 'video', else False.

    This is a plain substring check; it does not validate the URL structure.
    """
    required_parts = ('tiktok.com', 'video')
    return all(part in url for part in required_parts)

#--------------------------------------------
#get ID from Tiktok URL
def get_id(url):
    """Extract the canonical video URL and the video ID from a TikTok URL.

    The ID is taken to be the path segment that follows the 'video' segment,
    and the canonical URL is everything up to and including that segment.
    Query strings ('?...') and fragments ('#...') are dropped. This works
    whether or not the URL starts with a scheme such as 'https://'.

    If the URL has no 'video' segment followed by another segment, the first
    six '/'-separated parts are kept (the layout of
    'https://www.tiktok.com/@user/video/<id>').

    Parameters
    ----------
    url : str
        The video URL as entered by the user.

    Returns
    -------
    tuple of (str, str)
        ``(clean_url, video_id)``. Both values are also printed so the user
        can verify them.
    """
    # Turn the query/fragment delimiters into path separators so that anything
    # after them ends up in its own segment and can be cut off.
    segments = url.replace('?', '/').replace('#', '/').split('/')

    if 'video' in segments[:-1]:
        # Keep everything up to and including the segment right after 'video'.
        id_position = segments.index('video') + 1
        segments = segments[:id_position + 1]
    else:
        segments = segments[:6]

    url = '/'.join(segments)
    video_id = segments[-1]
    print('ID: ', video_id)
    print('URL: ', url)
    return (url, video_id)

#--------------------------------------------
#Load DataFrame
def load_df(fileName='Tiktok_comments.csv'):
    """Load previously saved comments from a CSV file, or start an empty table.

    Parameters
    ----------
    fileName : str
        Path of the CSV file to read. Defaults to 'Tiktok_comments.csv'.

    Returns
    -------
    pandas.DataFrame
        The contents of ``fileName`` when it exists and has data. When the
        file is missing or completely empty, an empty DataFrame with these
        columns is returned instead:

        - cid: comment id
        - text: comment
        - time: comment time
        - digg_count: number of likes
        - reply_comment_total: number of replies
        - user_id: user id
        - user_nickname: user nickname
        - user_unique_id: user unique id
        - referer: video url

    Notes
    -----
    Any other read failure (for example a malformed file) is raised rather
    than treated as "file not exist", so an existing file is not silently
    replaced by an empty table on the next save.
    """
    try:
        df = pd.read_csv(fileName)
        print('File exist')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Create DataFrame to store comments
        df = pd.DataFrame(columns=['cid', 'text', 'time', 'digg_count', 'reply_comment_total', 'user_id', 'user_nickname', 'user_unique_id', 'referer'])
        print('File not exist. Will create new file to save DataFrame')
    df.info()
    return df

In [26]:
#Because the TikTok API only returns {count} comments per request, we need to loop to get all comments
#cursor - position of the first row to read
#count - number of rows to read per request (maximum 50)

#Get comments from Tiktok API function
def _comment_to_row(data, referer):
    """Flatten one comment object from the API response into a row dict."""
    user = data['user']
    return {
        'cid': data['cid'],
        'text': data['text'],
        'time': epoch_to_date(data['create_time']),
        'digg_count': data['digg_count'],
        'reply_comment_total': data['reply_comment_total'],
        'user_id': user['uid'],
        'user_nickname': user['nickname'],
        'user_unique_id': user['unique_id'],
        'referer': referer,
    }


def get_comments(referer, cursor=0, count=50):
    """Download the comments of one TikTok video and add them to the saved table.

    The function is interactive: it prints the parsed URL/ID and waits for the
    user to press Enter before sending any request.

    Parameters
    ----------
    referer : str
        URL of the video. It is normalised with ``get_id`` before use.
    cursor : int
        Position of the first comment to request. Defaults to 0.
    count : int
        Number of comments to request per call. Defaults to 50.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``load_df()`` with one extra row per downloaded
        comment. If the video URL is already present in the 'referer' column,
        the loaded table is returned unchanged and nothing is downloaded.

    Notes
    -----
    Paging stops when a response contains no comments or when any error
    occurs. Errors are printed, not raised, and the comments collected up to
    that point are still returned.
    """
    clear_output(wait=True)
    #get video id
    referer, video_id = get_id(referer)
    print('Please check if the URL is correct. If not, please re-run the cell')

    #load DataFrame
    df = load_df()

    #check if the video has been crawled
    if referer in df['referer'].values:
        clear_output(wait=True)
        print('This video has been crawled before')
        time.sleep(3)
        return df

    time.sleep(3)
    input('Press Enter to continue...')

    headers = {
        'authority': 'www.tiktok.com',
        'accept': '*/*',
        'accept-language': 'en,vi;q=0.9',
        'referer': referer,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.27',
    }

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; DataFrame.append no longer exists in pandas 2.x and growing a
    # frame row by row is quadratic.
    rows = []
    request_no = 0

    while True:
        request_no += 1
        try:
            response = requests.get(
                f'https://www.tiktok.com/api/comment/list/?aweme_id={video_id}&count={count}&cursor={cursor}',
                headers=headers,
                timeout=30,  # do not hang forever on a stalled connection
            )
            response.raise_for_status()
            json_data = response.json()
            cursor = json_data['cursor']
            comments = json_data['comments']
            # An empty or missing comment list means there is nothing left to fetch.
            if not comments:
                break
            for data in comments:
                rows.append(_comment_to_row(data, referer))
                #print progress
                clear_output(wait=True)
                print(f'Progress: {len(rows)}/{json_data["total"]}')
        #print error
        except Exception as e:
            print(e)
            print(f"Error at {request_no}th request")
            break

    if not rows:
        return df

    new_rows = pd.DataFrame(rows)
    if df.empty:
        # Nothing stored yet: the new rows already carry the column layout
        # that load_df() creates for an empty table.
        return new_rows
    return pd.concat([df, new_rows], ignore_index=True)

In [28]:
if __name__ == '__main__':
    # Interactive driver: crawl one video per iteration until the user stops.
    stop = False
    while not stop:
        #input video url (repeat until it passes the basic check)
        referer = input("Enter video url: ")
        while not check_url(referer):
            print('Invalid url. Please try again')
            referer = input("Enter video url: ")

        #test url
        #referer = 'https://www.tiktok.com/@toanchodien23/video/7121230509027478810'

        #Dataframe description
        df = get_comments(referer)
        df.info()
        # A bare `df.head()` inside a loop is never displayed, so print it explicitly.
        print(df.head())
        time.sleep(1)

        #want to save
        save = input('Do you want to save the DataFrame? (y/n): ')
        clear_output(wait=True)
        # Surrounding whitespace is ignored so that e.g. 'y ' still counts as yes.
        if save.strip().lower() == 'y':
            df.to_csv('Tiktok_comments.csv', index=False)
            print('DataFrame saved')
        else:
            print('DataFrame not saved')

        answer = input('Do you want to continue? (y/n): ')
        stop = answer.strip().lower() != 'y'
    print('Done')
    

DataFrame not saved
Done


In [13]:
#save to csv
#Use the same file name that load_df() reads by default ('Tiktok_comments.csv');
#a differently-cased name is a separate file on case-sensitive filesystems and would never be reloaded.
df.to_csv('Tiktok_comments.csv', index=False)