In [11]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

In [None]:
# Notebook configuration: fill in every value before running the cells below.
URLS_PATH = 'YOUR FILE WITH TIKTOK URLS HERE'  # Excel workbook, read with pd.read_excel
URL_COLUMN = 'COLUMN THAT CONTAINS URLS'  # column of that workbook holding the video URLs
SAVE_PATH = 'YOUR OUTPUT PARQUET FILE HERE'  # where the collected comments are written with to_parquet

In [174]:
# Load the spreadsheet that lists the TikTok videos to scrape.
urls = pd.read_excel(io=URLS_PATH)

In [175]:
# Keep only the URL column as a plain list. Blank spreadsheet cells are read as NaN
# and would otherwise be passed to drv.get() by the scraping loop, so drop them here.
urls = urls[URL_COLUMN].dropna().to_list()

# Service functions and parameters

In [30]:
# Selectors for the TikTok video page. Every XPath matches on a fragment of the
# element's class attribute via contains(@class, ...).
# load_comments uses allCommentsXPath, commentSkeleton, viewMoreDivXPath and loadingMore;
# the remaining selectors are not referenced by the cells shown in this notebook.
commentContainerXPath = '//div[contains(@class, "DivCommentContainer")]'
commentsDivXPath = '//div[contains(@class, "DivCommentListContainer")]'
allCommentsXPath = '//div[contains(@class, "DivCommentContentContainer")]'
level2CommentsXPath = '//div[contains(@class, "DivReplyContainer")]'
# placeholder element that load_comments waits on (appear, then disappear) while more comments load
commentSkeleton = '//div[contains(@class, "DivCommentItemSkeletonContainer")]'

publisherProfileUrlXPath = '//span[contains(@class, "SpanUniqueId")]'
nicknameAndTimePublishedAgoXPath = '//span[contains(@class, "SpanOtherInfos")]'

# we will filter these later because we have to handle them differently depending on what layout we have
likesCommentsSharesXPath = "//strong[contains(@class, 'StrongText')]"

postUrlXPath = '//div[contains(@class, "CopyLinkText")]'
descriptionXPath = '//h4[contains(@class, "H4Link")]/preceding-sibling::div'

# we need "View" or else this catches "Hide" too
# ('Смотреть' and 'Подробнее' are the corresponding labels in the Russian interface)
viewMoreDivXPath = '//p[contains(@class, "PReplyAction") and (contains(., "Смотреть") or contains(., "View") or contains(., "Подробнее"))]'
# CSS selector (not XPath): load_comments waits for this to go away after clicking the "view more" buttons
loadingMore = 'svg[class*="SvgContainer"]'

In [150]:
def get_stats(drv):
    """
    Read the like, comment and share counters of the video currently open in `drv`.

    Returns a dict with the keys post_like_count, post_comment_count and
    post_share_count, each holding the element's text exactly as shown on the page.
    """
    counters = (
        ('post_like_count', "//strong[@data-e2e='like-count']"),
        ("post_comment_count", "//strong[@data-e2e='comment-count']"),
        ("post_share_count", "//strong[@data-e2e='share-count']"),
    )
    return {key: drv.find_element(By.XPATH, xpath).text for key, xpath in counters}

def get_all_comments(drv, allCommentsXPath):
    """Return every element of the current page that matches the given comment XPath."""
    matching_elements = drv.find_elements(By.XPATH, allCommentsXPath)
    return matching_elements

def get_comment_data(comment):
    '''
    Collect the fields of one comment element and of the user who wrote it.

    `comment` is a Selenium element for a single comment; load_comments obtains
    these elements with allCommentsXPath.

    Returns a dictionary with the keys comment_nickname, comment_username,
    comment_date, comment_text, comment_like_count, comment_pic, comment_id
    and reply_to. comment_pic is None when the comment has no avatar image;
    reply_to is the id of the comment being answered, or None when the
    comment is not a reply.

    Any lookup other than the avatar raises NoSuchElementException when the
    expected element is missing.
    '''
    comment_id = comment.get_attribute('id')
    nickname = comment.find_element(By.XPATH, "./div[1]/a").text
    # Strip the query string from the profile link and take element 3 of the '/'-split,
    # i.e. the first path segment of an absolute "scheme://host/..." URL.
    username = comment.find_element(By.XPATH, "./a").get_attribute('href').split('?')[0].split('/')[3]
    comment_text = comment.find_element(By.XPATH, "./div[1]/p").text
    comment_date = comment.find_element(By.XPATH, "./div[1]/p[2]/span").text
    comment_like_count = comment.find_element(By.XPATH, "./div[1]/p[2]/div/span").text
    try:
        comment_pic = comment.find_element(By.XPATH, "./a/span/img").get_attribute('src')
    except NoSuchElementException:
        # Only a missing avatar is tolerated; anything else (including Ctrl-C) propagates.
        comment_pic = None

    # Checking if a comment is a reply: replies sit inside a "DivReplyContainer" parent,
    # and the comment they answer is a sibling of that container.
    parent = comment.find_element(By.XPATH, './..')
    parent_class = parent.get_attribute('class')
    is_reply = 'DivReplyContainer' in parent_class
    reply_to = parent.find_element(
        By.XPATH, './../div[contains(@class, "DivCommentContentContainer")]').get_attribute('id') if is_reply else None

    return {
        "comment_nickname": nickname,
        "comment_username": username,
        "comment_date": comment_date,
        "comment_text": comment_text,
        "comment_like_count": comment_like_count,
        "comment_pic": comment_pic,
        "comment_id": comment_id,
        "reply_to": reply_to
    }

def load_comments(drv, url):
    """
    Open a TikTok video page, expand its comment thread and return the comments as a DataFrame.

    The function
      1. navigates `drv` to `url` and waits up to 10 s for a first comment,
      2. keeps scrolling the last comment into view until the number of comments stops growing,
      3. clicks every "view more" button until none is left, so that replies are loaded,
      4. runs the JavaScript in `get_comment_data_script` on each comment element.

    `get_comment_data_script` (read from commentData.js in another cell) and the
    selector constants must be defined before this function is called.

    Parameters
    ----------
    drv : Selenium WebDriver controlling the browser.
    url : address of the video page.

    Returns
    -------
    pandas.DataFrame
        One row per comment, holding whatever the JavaScript returns for it, the
        post-level counters from get_stats, and a 'url' column.
    """
    drv.get(url)
    try:
        WebDriverWait(drv, 10).until(EC.presence_of_element_located((By.XPATH, allCommentsXPath)))
    except TimeoutException:
        # No comment showed up in time; carry on with whatever the page contains.
        pass

    # Loading top-level comments
    comments_initial = get_all_comments(drv, allCommentsXPath)
    while len(comments_initial) > 0:
        drv.execute_script("arguments[0].scrollIntoView(true);", comments_initial[-1])
        try:
            # Waiting for comment skeleton to appear and disappear as a stable marker of comments loading
            WebDriverWait(drv, 2).until(EC.presence_of_element_located((By.XPATH, commentSkeleton)))
            WebDriverWait(drv, 10).until_not(EC.presence_of_element_located((By.XPATH, commentSkeleton)))
        except TimeoutException:
            # The skeleton never appeared (or never went away): treat the list as fully loaded.
            break
        comments = get_all_comments(drv, allCommentsXPath)
        if len(comments_initial) == len(comments):
            break
        comments_initial = comments

    # Loading replies
    # NOTE(review): this loop ends only once no "view more" button matches any more;
    # if a button stays on the page after being clicked, it will not terminate.
    while len(comments_initial) > 0:
        view_more_buttons = drv.find_elements(By.XPATH, viewMoreDivXPath)
        if not view_more_buttons:
            break

        for btn in view_more_buttons:
            drv.execute_script("arguments[0].scrollIntoView(true);arguments[0].click()", btn)
        try:
            WebDriverWait(drv, 10).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, loadingMore)))
        except TimeoutException:
            # Loading indicator still present after 10 s; look for buttons again anyway.
            pass

    stats = get_stats(drv)
    comments = get_all_comments(drv, allCommentsXPath)
    comments_data = []
    # `tqdm` is the progress-bar class itself (the notebook does `from tqdm import tqdm`).
    for comment in tqdm(comments):
        # The per-comment fields are collected by the JavaScript loaded from commentData.js;
        # get_comment_data(comment) is the pure-Python alternative defined in this notebook.
        comment_data = drv.execute_script(get_comment_data_script, comment)
        comment_data.update(stats)
        comments_data.append(comment_data)

    comments_df = pd.DataFrame(comments_data)
    comments_df['url'] = [url] * len(comments_df)
    return comments_df

In [5]:
# Read the JS code that collects descriptive data of comments and commenting users;
# load_comments hands it to drv.execute_script once per comment element.
# The encoding is explicit so the result does not depend on the platform default
# (assumes commentData.js is saved as UTF-8 — TODO confirm).
with open('commentData.js', 'rt', encoding='utf-8') as file:
    get_comment_data_script = file.read()

# Loading comments

In [247]:
# Start a Chrome session and open the TikTok home page; `drv` is reused for every URL below.
drv = Chrome()
drv.get('https://tiktok.com')

At this point it is best to log in to TikTok in the browser window, so that you run into CAPTCHAs less often.

In [255]:
# Empty accumulator for the scraped comments; it is filled by the scraping loop below.
# (Nothing is appended here: `comments` does not exist yet on a fresh kernel, and
# DataFrame.append is no longer available in pandas 2.x.)
results = pd.DataFrame()

In [248]:
# Scrape every URL, then combine the per-video frames with `results` in a single concat
# (DataFrame.append was removed in pandas 2.0, and growing a frame per iteration is wasteful).
frames = [results]
try:
    for url in urls:
        frames.append(load_comments(drv, url))
finally:
    # Runs even when a URL fails part-way, so the comments scraped so far are kept.
    non_empty_frames = [frame for frame in frames if not frame.empty]
    if non_empty_frames:
        # Columns are sorted by name, matching what append(..., sort=True) produced.
        results = pd.concat(non_empty_frames, ignore_index=True).sort_index(axis=1)

0it [00:00, ?it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 271/271 [00:00<00:00, 386.93it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 388.98it/s]
0it [00:00, ?it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 193.05it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:00<00:00, 316.09it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 267.37it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 413.06it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:00<00:00, 492.01it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 367.15it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 312.16it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 206.02it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 284.40it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 285.73it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 346.38it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00

In [266]:
# Drop rows that are identical in every column, then persist the result as Parquet.
# Rebinding instead of inplace=True avoids mutating the existing frame in place.
# SAVE_PATH has to be set in the configuration cell before this cell runs.
results = results.drop_duplicates()
results.to_parquet(SAVE_PATH)

Unnamed: 0,comment_date,comment_id,comment_like_count,comment_nickname,comment_pic,comment_text,comment_username,post_comment_count,post_like_count,post_share_count,reply_to,url
0,2021-8-13,6995880398484161282,4,,,"–ü—Ä–µ–∫—Ä–∞—Å–Ω–∞—è –æ–ø–µ—Ä–∞—Ç–æ—Ä—Å–∫–∞—è —Ä–∞–±–æ—Ç–∞! –°—Ä–∞–∑—É –≤–∏–¥–Ω–æ, —á...",,2,40,7,,https://www.tiktok.com/@genthik/video/69958115...
1,2021-8-13,6995897860026090241,3,¬∑ –ê–≤—Ç–æ—Ä,,–•–∞–∞—Öü•∞,,2,40,7,6.995897860026089e+18,https://www.tiktok.com/@genthik/video/69958115...
2,2021-8-13,6995880398484161282,4,MaksonO4KA,https://p16-sign-sg.tiktokcdn.com/aweme/100x10...,"–ü—Ä–µ–∫—Ä–∞—Å–Ω–∞—è –æ–ø–µ—Ä–∞—Ç–æ—Ä—Å–∫–∞—è —Ä–∞–±–æ—Ç–∞! –°—Ä–∞–∑—É –≤–∏–¥–Ω–æ, —á...",maksono4ka,2,40,7,,https://www.tiktok.com/@genthik/video/69958115...
3,2021-8-13,6995897860026090241,3,gent4hik ¬∑ –ê–≤—Ç–æ—Ä,https://p16-sign-sg.tiktokcdn.com/aweme/100x10...,–•–∞–∞—Öü•∞,genthik,2,40,7,6.995897860026089e+18,https://www.tiktok.com/@genthik/video/69958115...
4,2022-1-29,7058559841375191810,1,GermanInGame ¬∑ –ê–≤—Ç–æ—Ä,https://p16-sign-va.tiktokcdn.com/musically-ma...,"–§—É–ª–ª –≤–∏–¥–µ–æ –Ω–∞ —é—Ç—É–± –∫–∞–Ω–∞–ª–µ, –Ω–∞–∑–≤–∞–Ω–∏–µ –≤–∏–¥–µ–æ GTA ...",germaningame,4,510,10,,https://www.tiktok.com/@germaningame/video/683...


In [398]:
# Preview the first rows of the collected comments.
results.head()

Unnamed: 0,index,comment_date,comment_id,comment_like_count,comment_nickname,text,comment_username,post_comments,post_likes,post_shares,reply_to,url
0,0,2021-8-13,6995880398484161282,4,,"–ü—Ä–µ–∫—Ä–∞—Å–Ω–∞—è –æ–ø–µ—Ä–∞—Ç–æ—Ä—Å–∫–∞—è —Ä–∞–±–æ—Ç–∞! –°—Ä–∞–∑—É –≤–∏–¥–Ω–æ, —á...",,2,40,7,,https://www.tiktok.com/@genthik/video/69958115...
1,1,2021-8-13,6995897860026090241,3,¬∑ –ê–≤—Ç–æ—Ä,–•–∞–∞—Öü•∞,,2,40,7,6.995897860026089e+18,https://www.tiktok.com/@genthik/video/69958115...
2,2,2021-8-13,6995880398484161282,4,MaksonO4KA,"–ü—Ä–µ–∫—Ä–∞—Å–Ω–∞—è –æ–ø–µ—Ä–∞—Ç–æ—Ä—Å–∫–∞—è —Ä–∞–±–æ—Ç–∞! –°—Ä–∞–∑—É –≤–∏–¥–Ω–æ, —á...",maksono4ka,2,40,7,,https://www.tiktok.com/@genthik/video/69958115...
3,3,2021-8-13,6995897860026090241,3,gent4hik ¬∑ –ê–≤—Ç–æ—Ä,–•–∞–∞—Öü•∞,genthik,2,40,7,6.995897860026089e+18,https://www.tiktok.com/@genthik/video/69958115...
4,4,2022-1-29,7058559841375191810,1,GermanInGame ¬∑ –ê–≤—Ç–æ—Ä,"–§—É–ª–ª –≤–∏–¥–µ–æ –Ω–∞ —é—Ç—É–± –∫–∞–Ω–∞–ª–µ, –Ω–∞–∑–≤–∞–Ω–∏–µ –≤–∏–¥–µ–æ GTA ...",germaningame,4,510,10,,https://www.tiktok.com/@germaningame/video/683...
