# TikTok Crawler

based on https://github.com/AndersGiovanni/tiktok-research-client

original scripts are here: C:\Users\b1084631\Anaconda3\envs\tiktok\Lib\site-packages\tiktok_research_client

In [None]:
import pandas as pd

In [21]:
import datetime
import logging
import os
from typing import Any
from typing import Dict
from typing import List
from typing import Union

import requests
from dotenv import load_dotenv
from requests.models import Response
from tenacity import before_log
from tenacity import retry
from tenacity import stop_after_attempt
from tenacity import wait_fixed

from tiktok_research_client.utils import generate_date_ranges


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()


class TiktokClient:
    """TikTok API client."""

    def __init__(self) -> None:
        """Initialize TikTok API client."""
        self.access_token: Union[str, None] = None
        self.path_credentials = 'tiktok_credentials_sebastian_schmidt.txt'

    def read_credentials(self):
        credentials = {}
        with open(self.path_credentials, 'r') as file:
            for line in file:
                key, value = line.strip().split(': ')
                credentials[key] = value
        
        # Accessing the credentials
        self.CLIENT_KEY = credentials['CLIENT_KEY']
        self.CLIENT_SECRET = credentials['CLIENT_SECRET']
        self.GRANT_TYPE = credentials['GRANT_TYPE']

    def _get_headers(self) -> Dict[str, str]:
        """Get headers for TikTok API."""
        access_token: Dict[str, str] = self.get_access_token()

        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {access_token['access_token']}",
        }

    def get_access_token(self) -> Dict[str, str]:
        """Get access token from TikTok API.

        The access token is valid for 7200 seconds.

        Returns:
            Dict[str, str]: Access token and its expiration time.
        """
        
        self.read_credentials()

        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Cache-Control": "no-cache",
        }

        payload = {
            "client_key": self.CLIENT_KEY,
            "client_secret": self.CLIENT_SECRET,
            "grant_type": self.GRANT_TYPE,
        }

        response: Response = requests.post(
            "https://open.tiktokapis.com/v2/oauth/token/",
            headers=headers,
            data=payload,
            timeout=30,
        )


        # Check if the response is successful
        response.raise_for_status()

        return response.json()  # type: ignore

    def query(self, query: Dict[Any, Any], url: str) -> Union[Dict[str, Any], None]:
        """Query TikTok API.

        Args:
            query (Dict[Any, Any]): Custom query. Follow the documentation from https://developers.tiktok.com/doc/research-api-specs-query-videos/ # noqa
            url (str): TikTok API url.

        Returns:
            Response: Response from TikTok API.
        """

        return self.fetch_data(url, query)

    def search(
        self, keywords: List[str], start_date: str, max_size: int
    ) -> List[Dict[str, str]]:
        """Search for videos from TikTok API.

        Args:
            keywords (List[str]): Keywords to search for.
            start_date (str): Start date.
            max_size (int): Max number of videos to collect.

        Returns:
            List[Dict[str, str]]: List of videos.
        """
        url: str = "https://open.tiktokapis.com/v2/research/video/query/?fields=id,region_code,like_count,username,video_description,music_id,comment_count,share_count,view_count,effect_ids,hashtag_names,playlist_id,voice_to_text,create_time"  # noqa

        date_ranges: List[tuple[str, str]] = generate_date_ranges(start_date, 100)

        query: Dict[str, Any] = {
            "query": {
                "or": [
                    {
                        "operation": "IN",
                        "field_name": "keyword",
                        "field_values": keywords,
                    },
                    {
                        "operation": "IN",
                        "field_name": "hashtag_name",
                        "field_values": keywords,
                    },
                ]
            },
            "max_count": 100,
        }

        videos: List[Dict[str, str]] = list()

        for date_range in date_ranges:
            query["start_date"] = date_range[0]

            query["end_date"] = date_range[1]

            # Check if we have reached the max size
            if len(videos) >= max_size:
                break

            # Keep querying until there is no more data
            output = self._cursor_iterator(url, query, max_size=max_size)

            if output is not None:
                videos.extend(self._cursor_iterator(url, query, max_size=max_size))

        logging.info(f"Collected {len(videos)} videos.")

        logging.debug("Decoding timestamps...")
        for idx, video in enumerate(videos):
            videos[idx]["create_time"] = datetime.datetime.utcfromtimestamp(
                video["create_time"]  # type: ignore
            ).strftime("%Y-%m-%d")

        return videos

    def get_user(self, username: str) -> Union[Dict[str, Any], None]:
        """Get user data and videos from TikTok API.

        Args:
            username (str): TikTok username.

        Returns:
            Dict[str, str]: User info.
        """
        url: str = "https://open.tiktokapis.com/v2/research/user/info/?fields=display_name,bio_description,avatar_url,is_verified,follower_count,following_count,likes_count,video_count"  # noqa

        query: Dict[str, str] = {
            "username": username,
        }

        data: Union[Dict[str, Any], None] = self.fetch_data(url, query)

        data["videos"] = self._get_user_videos(username)  # type: ignore

        return data

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_fixed(2),
        before=before_log(logger, logging.DEBUG),
    )
    def fetch_data(
        self, url: str, query: Dict[str, Any]
    ) -> Union[Dict[str, Any], None]:
        """Query TikTok API.

        Args:
            url (str): TikTok API url.
            query (Dict[Any, Any]): Custom query. Follow the documentation from https://developers.tiktok.com/doc/research-api-specs-query-videos/

        Returns:
            response (Dict[str, str]): Response from TikTok API.
        """
        headers: Dict[str, str] = self._get_headers()

        try:
            response: Response = requests.post(
                url,
                headers=headers,
                json=query,
                timeout=30,
            )
            response.raise_for_status()  # This will raise a HTTPError for bad responses (4xx and 5xx)
        except requests.RequestException as e:
            logging.error("An error occurred: %s", e)
            return None  # or however you want to handle failures

        return response.json()  # type: ignore

    def get_comments(self, video_id: str) -> List[Dict[str, str]]:
        """Get comments from TikTok API.

        Args:
            video_id (str): TikTok video id.

        Returns:
            List[Dict[str, str]]: List of comments.
        """
        url: str = "https://open.tiktokapis.com/v2/research/video/comment/list/?fields=id,like_count,create_time,text,video_id,parent_comment_id,reply_count"

        query: Dict[str, Any] = {
            "video_id": video_id,
            "max_count": 100,
        }

        comments: List[Dict[str, str]] = list()

        has_more_data: bool = True

        while (
            has_more_data and len(comments) < 1000
        ):  # 1000 is the max number of comments we can get
            response: Union[Dict[str, Any], None] = self.fetch_data(url, query)

            comments.extend(response["data"]["comments"])  # type: ignore

            has_more_data = response["data"]["has_more"]  # type: ignore

            query["cursor"] = response["data"]["cursor"]  # type: ignore

            # Check if we have reached the max size or there is no more data
            if not has_more_data:
                del query["cursor"]
                break

        logging.debug("Decoding timestamps...")
        for idx, comment in enumerate(comments):
            comments[idx]["create_time"] = datetime.datetime.utcfromtimestamp(
                comment["create_time"]  # type: ignore
            ).strftime("%Y-%m-%d")

        return comments

    def _get_user_videos(
        self,
        username: str,
        start_date: str = "2023-01-01",
        max_size: int = 1000,
    ) -> List[Dict[str, Any]]:
        """Get user videos from TikTok API.

        Args:
            username (str): TikTok username.
            start_date (str, optional): Start date. Defaults to "2023-01-01".
            max_size (int, optional): Max number of videos to collect. Defaults to 1000.

        Returns:
            List[Dict[str, str]]: User videos.
        """
        url: str = "https://open.tiktokapis.com/v2/research/video/query/?fields=id,region_code,like_count,username,video_description,music_id,comment_count,share_count,view_count,effect_ids,hashtag_names,playlist_id,voice_to_text,create_time"  # noqa

        date_ranges: List[tuple[str, str]] = generate_date_ranges(start_date, 1000)

        query: Dict[str, Any] = {
            "query": {
                "and": [
                    {
                        "operation": "EQ",
                        "field_name": "username",
                        "field_values": [username],
                    },
                ]
            },
            "max_count": 100,
        }

        videos: List[Dict[str, str]] = list()

        for start_date, end_date in date_ranges:
            query["start_date"] = start_date

            query["end_date"] = end_date

            videos.extend(self._cursor_iterator(url, query, max_size))

        logging.debug("Decoding timestamps...")
        for idx, video in enumerate(videos):
            videos[idx]["create_time"] = datetime.datetime.utcfromtimestamp(
                video["create_time"]  # type: ignore
            ).strftime("%Y-%m-%d")

        return videos

    def _cursor_iterator(
        self, url: str, query: Dict[str, Any], max_size: int = 1000
    ) -> List[Dict[str, str]]:
        """Cursor iterator.

        Args:
            url (str): TikTok API url.
            query (Dict[str, Any]): Custom query. Follow the documentation from https://developers.tiktok.com/doc/research-api-specs-query-videos/
            max_size (int, optional): Max number of videos to collect. Defaults to 1000.

        Returns:
            List[Dict[str, str]]: List of videos.
        """
        has_more_data: bool = True

        data: List[Dict[str, Any]] = list()

        while has_more_data and len(data) < max_size:
            response: Union[Dict[str, Any], None] = self.fetch_data(url, query)

            if response is None:
                return data

            data.extend(response["data"]["videos"])

            has_more_data = response["data"]["has_more"]

            # Check if we have reached the max size or there is no more data
            if not has_more_data:
                return data

            query["cursor"] = response["data"]["cursor"]

            query["search_id"] = response["data"]["search_id"]

        return data

client = TiktokClient()

In [29]:
# query = {
#     "query": {
#         "and": [
#             {
#                 "operation": "IN",
#                 "field_name": "region_code",
#                 "field_values": ["Austria"],
#             },
#             {
#                 "operation": "EQ",
#                 "field_name": "hashtag_name",
#                 "field_values": ["taylorswift"],
#             },
#         ],
#         "not": [
#             {"operation": "EQ", "field_name": "video_length", "field_values": ["SHORT"]}
#         ],
#     },
#     "max_count": 100,
#     "start_date": "20240101",
#     "end_date": "20240222",
# }

# # url = "https://open.tiktokapis.com/v2/research/video/query/?fields=id,region_code,like_count,username,video_description,music_id,comment_count,share_count,view_count"
# url = "https://open.tiktokapis.com/v2/research/video/query/?fields=id,video_description"

# data = client.query(query=query, url=url)

{'client_key': 'awqs61f0xooywrw9', 'client_secret': '6Ur6CVOMZhA1UUU8rhtRNIx5MoTUlRyQ', 'grant_type': 'client_credentials'}


In [32]:
data = client.search(keywords=["taylorswift"], start_date="2024-01-01", max_size=10)

{'client_key': 'awqs61f0xooywrw9', 'client_secret': '6Ur6CVOMZhA1UUU8rhtRNIx5MoTUlRyQ', 'grant_type': 'client_credentials'}
<bound method Response.json of <Response [200]>>
{'client_key': 'awqs61f0xooywrw9', 'client_secret': '6Ur6CVOMZhA1UUU8rhtRNIx5MoTUlRyQ', 'grant_type': 'client_credentials'}
<bound method Response.json of <Response [200]>>


INFO:root:Collected 85 videos.


In [33]:
data

[{'create_time': '2024-01-31',
  'hashtag_names': ['taylorswift',
   'twerk',
   'free',
   'dance',
   'cute',
   'pretty',
   'superbowl',
   'speed',
   'plane',
   'trending',
   'viral',
   'swifties',
   'tiktok',
   'twitter',
   'foryou',
   '4u',
   'trend',
   'discovery',
   'bigg',
   'trendy',
   'id',
   'dancechallenge',
   'albania',
   'truck',
   '21savage',
   'fy',
   'fyp',
   'leak',
   'typ',
   'planecrash',
   'planeproblems',
   'gnb',
   'asmr',
   'temu',
   'britneyarmy',
   'blowthisup',
   'xvideos',
   'surroundsound',
   'reels',
   'planechallenge',
   'ceilingchallenge',
   'leaks',
   'traviskelce',
   'asmrsounds',
   'toryou',
   'planelovers',
   'foryoupage',
   'foryourpage',
   'fypage',
   'lenatheplug',
   'fypp',
   'planeedit',
   'yourrage',
   'kaicenat',
   'gnbofficial',
   'typpppppppppp',
   'ishowspeed',
   'ofleaks',
   'freeleaks',
   'ishowspeedclipz',
   'gnboffical',
   'gnbcollection',
   'britneyisnotfree',
   'gnbcouple',
   

In [38]:
data_df = pd.DataFrame(data)
data_df

Unnamed: 0,create_time,hashtag_names,id,like_count,music_id,video_description,comment_count,region_code,share_count,username,view_count,voice_to_text,effect_ids,playlist_id
0,2024-01-31,"[taylorswift, twerk, free, dance, cute, pretty...",7330415306508258606,0.0,7330415422976641834,Its code 138390143 to see her #gnb #gnbofficia...,0,US,0,loki3727,0,,,
1,2024-01-31,"[taylorswift, taylorswiftsongs, taylorsversion...",7330415285964410144,49.0,7330415331829205792,the willow porfomence on top #taylorsversion #...,3,fr,0,comic_useful,28,,,
2,2024-01-31,"[taylorswift, collaboration, dontletthisflop, ...",7330415271724862762,18.0,7330415390612294443,I love this edit and this girl so much @addie ...,3,us,0,thatswiftie131313,6,,,
3,2024-01-31,"[taylorswift, nicki, fyp, overwatch, irishtikt...",7330415265999621409,133.0,7330415315624053536,#fyp #overwatch #floptok #irishtiktok #nicki #...,17,fr,0,astiryah,5,,,
4,2024-01-31,"[taylorswift, oneday, taylorswiftedit, taylors...",7330415255614491910,20.0,6921404355770632194,“you weren’t mine to lose” ☁️ (im so ready to ...,1,co,3,willestears,171,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,2024-01-31,"[taylorswift, swifties, taylorswiftchallenge, ...",7330414249044479274,1012.0,7242381184442338074,EVERYBODY THINKS OUR LOVE IS FOR SHOW BUT I WO...,77,us,0,giselelizbth,271,,,
81,2024-01-31,"[taylorswift, foryou, conspiracy, fyp, satire,...",7330414207193599262,141.0,7330414357345585950,Idaho Joe speaks the truth about Taylor Swift!...,45,us,1,jadejesser,62,,,7.211529e+18
82,2024-01-31,"[taylorswift, oc, cosplay, foryou, bipride, ai...",7330414194585554206,57.0,7252896191606835201,Whoaaa 😂#fyp #foryou #morph #taylorswift #ai #...,37,us,0,skyelight84,15,,,
83,2024-01-31,"[taylorswift, kendricklamar, relatable, badblo...",7330414184561134890,77.0,7067382546014980911,I would take like a hr on it then like 2 views...,0,us,0,swifties.editz_,5,,,


In [42]:
for i, row in data_df.iterrows():
    print(row["video_description"])

Its code 138390143 to see her #gnb #gnbofficial #britneyarmy #britneyisnotfree #britneygno #gnbcouple #gnboffical #gnbcollection #plane #planeedit #planecrash #planechallenge #planelovers #planeproblems planemode #fyp #toryou#ofleaks #temuleaks #temu #free #freeleaks#ishowspeed #asmrsounds #asmr #fyp #viral #speed#trendinguser511655123781 #albania #foryou #ishowspeedclipz #leaks #leak #fy #fypage #viral #typpppppppppp #trending #trendy  #trend #temu #id #surroundsound #ceilingchallenge #dancechallenge #reels #tiktok #dance #Superbowl #swifties #taylorswift #traviskelce #speed##truck" #twitter #xvideos #21savage #kaicenat #ishowspeed #yourrage#lenatheplug #typ #foryoupage #foryourpage #viral #discovery #fypp #trending #twerk #bigg #fyp #4u #blowthisup #cute #leaks #pretty hhhgf
the willow porfomence on top #taylorsversion #taylorswifterastour #taylorswiftsongs #taylorswift #theerastour
I love this edit and this girl so much @addie #CapCut #taylorswift #dontletthisflop #collaboration 
#f