# TikTok Comment Scraper

This notebook implements a TikTok comment scraper that can fetch and analyze comments from TikTok videos. The scraper saves the comments in JSON, CSV, or Excel format and provides functionality to analyze the data.

> Note: Make sure you have a valid TikTok video ID before running this code.

In [None]:
# !pip install requests pandas matplotlib openpyxl

# choose the appropriate command based on your environment

%pip install requests pandas matplotlib openpyxl

In [None]:
import json
import logging
import os
from datetime import datetime
from typing import Optional

import matplotlib.pyplot as plt
import openpyxl
import pandas as pd
import requests

# Configure the root logger: emit INFO and above, and prefix every record
# with a 'YYYY-MM-DDTHH:MM:SS' timestamp followed by the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [ %(levelname)s ]\t:: %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S'
)

In [None]:
class Comment:
    """Fetch the comments of a TikTok video and persist them to disk.

    ``execute`` downloads one page of comments (plus the replies of each
    comment), normalises every entry to a small dict and hands the result to
    ``save_to_drive``, which writes a per-range file and maintains a
    cumulative ``full.*`` file under ``./TikTok_Comments/<videoid>/``.
    """

    # Step used for the reply cursor and for naming the per-range files
    # (``<size>-<size + PAGE_SIZE>``).
    PAGE_SIZE = 50

    # Seconds to wait for an HTTP response before giving up, so a stalled
    # connection cannot hang the notebook forever.
    REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        # Result of the most recent successful ``execute`` call.
        self.__result = {
            'caption': None,
            'date_now': None,
            'video_url': None,
            'comments': []
        }

    def __format_date(self, timestamp: int) -> str:
        """Format a Unix timestamp as ``YYYY-MM-DDTHH:MM:SS`` (local time).

        The value is first interpreted as seconds; when that is out of range
        for ``datetime`` it is re-interpreted as milliseconds.
        """
        try:
            moment = datetime.fromtimestamp(timestamp)
        except (ValueError, OverflowError, OSError):
            # Out-of-range value: treat it as milliseconds instead.
            moment = datetime.fromtimestamp(timestamp / 1000)
        return moment.strftime('%Y-%m-%dT%H:%M:%S')

    def __get_replies(self, commentid: str) -> list:
        """Download every reply to ``commentid`` and return them filtered.

        Pages are requested with a cursor that advances by ``PAGE_SIZE`` until
        a response carries no comments.
        """
        replies = []
        page = 0

        while True:
            res = requests.get(
                f'https://www.tiktok.com/api/comment/list/reply/?aid=1988&comment_id={commentid}&count=9999999&cursor={page * self.PAGE_SIZE}',
                timeout=self.REQUEST_TIMEOUT
            ).json()

            # A missing key, ``None`` or an empty list all mean "no more replies".
            batch = res.get('comments')
            if not batch:
                break

            replies += batch
            page += 1

        return self.__filter_comments(replies)

    def __filter_comments(self, comments: list) -> list:
        """Reduce raw API comment dicts to the fields this scraper keeps.

        Each returned dict has ``username``, ``nickname``, ``comment``,
        ``create_time`` and ``avatar``. When the raw comment carries
        ``reply_comment_total`` and its replies could be fetched, the dict
        also has ``total_reply`` and ``replies``.
        """
        new_comments: list = []

        for comment in comments:
            # ``share_info`` is only logged, so its absence must not abort the run.
            share_info = comment.get('share_info') or {}
            if share_info.get('desc'):
                logging.info(share_info['desc'])

            user = comment['user']
            new_comment = {
                'username': user['unique_id'],
                'nickname': user['nickname'],
                'comment': comment['text'],
                'create_time': self.__format_date(comment['create_time']),
                'avatar': user['avatar_thumb']['url_list'][0]
            }

            reply_total = comment.get('reply_comment_total')
            if reply_total is not None:
                try:
                    replies = self.__get_replies(comment['cid']) if reply_total > 0 else []
                except Exception as exc:
                    # Best effort: keep the comment itself, but report the
                    # failure instead of swallowing it silently.
                    logging.warning(f'Could not fetch replies for comment {comment.get("cid")}: {exc}')
                else:
                    new_comment.update({
                        'total_reply': reply_total,
                        'replies': replies
                    })

            new_comments.append(new_comment)

        return new_comments

    def execute(
        self,
        videoid: str,
        size: int,
        save_formats: Optional[list] = None
    ) -> Optional[dict]:
        """Scrape one page of comments for ``videoid`` and save it.

        Args:
            videoid: TikTok video id (the ``aweme_id`` query parameter).
            size: Cursor sent to the API; also used to name the range files.
            save_formats: Formats passed to ``save_to_drive``
                (``'json'``, ``'csv'``, ``'excel'``). Defaults to ``['json']``.

        Returns:
            A dict with ``caption``, ``date_now``, ``video_url`` and
            ``comments``, or ``None`` when the API reports an error, there are
            no comments, or processing/saving failed (the reason is logged).
        """
        if save_formats is None:
            save_formats = ['json']

        logging.info(f'Starting Scrapping for video with id {videoid}[{size} - {size + self.PAGE_SIZE}]...')

        res = requests.get(
            f'https://www.tiktok.com/api/comment/list/?aid=1988&aweme_id={videoid}&count=9999999&cursor={size}',
            timeout=self.REQUEST_TIMEOUT
        ).json()

        if res['status_code'] > 0:
            logging.error('invalid id video')
            return None

        raw_comments = res.get('comments')
        if not raw_comments:
            logging.error('comments are over')
            return None

        try:
            first_share = raw_comments[0]['share_info']
            # Build a fresh dict per call so results returned by earlier
            # calls are not overwritten by later ones.
            result = {
                'caption': first_share['title'],
                'date_now': self.__format_date(res['extra']['now']),
                'video_url': first_share['url'],
                'comments': self.__filter_comments(raw_comments)
            }

            # Save to selected formats
            self.save_to_drive(videoid, size, result, save_formats)
        except Exception as e:
            # Public entry point: report the real failure (with traceback)
            # rather than mislabelling it as "comments are over".
            logging.exception(f'Error: {e}')
            return None

        self.__result = result
        return result

    def __export_table(self, df, range_file: str, full_file: str, fmt: str) -> None:
        """Write ``df`` to ``range_file`` and append it to ``full_file``.

        ``fmt`` is ``'csv'`` or ``'excel'`` and selects the pandas reader and
        writer used for both files.
        """
        if fmt == 'csv':
            read = pd.read_csv

            def write(frame, path):
                frame.to_csv(path, index=False, encoding='utf-8')
        else:
            read = pd.read_excel

            def write(frame, path):
                frame.to_excel(path, index=False)

        write(df, range_file)
        logging.info(f'Saved comments to {range_file}')

        # Update or create the cumulative file
        if os.path.exists(full_file):
            full_df = pd.concat([read(full_file), df], ignore_index=True)
        else:
            full_df = df
        write(full_df, full_file)
        logging.info(f'Updated {full_file}')

    def save_to_drive(
        self,
        videoid: str,
        size: int,
        data: dict,
        save_formats: Optional[list] = None
    ) -> None:
        """Save the comments data in multiple formats.

        For every requested format a ``<size>-<size + PAGE_SIZE>`` file is
        written and the matching ``full`` file is created or extended, all
        inside ``./TikTok_Comments/<videoid>/``.

        Args:
            videoid: Video id; names the output directory.
            size: Start of the range used in the per-range file name.
            data: Result dict; ``data['comments']`` is the list exported to
                CSV/Excel, the whole dict is written to JSON.
            save_formats: Any of ``'json'``, ``'csv'``, ``'excel'``
                (case-insensitive). Defaults to ``['json']``; unknown entries
                are logged and skipped.
        """
        if save_formats is None:
            save_formats = ['json']

        # Create directory for this video if it doesn't exist
        directory = f'./TikTok_Comments/{videoid}'
        os.makedirs(directory, exist_ok=True)

        # Convert comments to DataFrame for CSV and Excel export
        df = pd.DataFrame(data['comments'])
        range_stem = f'{directory}/{size}-{size + self.PAGE_SIZE}'

        for fmt in save_formats:
            fmt = fmt.lower()

            if fmt == 'json':
                # Save the range file (e.g., 0-50.json)
                range_file = f'{range_stem}.json'
                with open(range_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=4)
                logging.info(f'Saved comments to {range_file}')

                # Update or create full.json
                full_file = f'{directory}/full.json'
                if os.path.exists(full_file):
                    with open(full_file, 'r', encoding='utf-8') as f:
                        full_data = json.load(f)
                    # Keep the stored metadata, append the new comments
                    full_data['comments'].extend(data['comments'])
                else:
                    full_data = data

                with open(full_file, 'w', encoding='utf-8') as f:
                    json.dump(full_data, f, ensure_ascii=False, indent=4)
                logging.info(f'Updated {full_file}')

            elif fmt == 'csv':
                self.__export_table(df, f'{range_stem}.csv', f'{directory}/full.csv', fmt)

            elif fmt == 'excel':
                self.__export_table(df, f'{range_stem}.xlsx', f'{directory}/full.xlsx', fmt)

            else:
                logging.warning(f'Unsupported format: {fmt}')

## 2. Test the Comment Scraper

Now let's test the comment scraper with a sample TikTok video ID. Make sure to replace the video ID with a valid one.

In [None]:
# Create an instance of the Comment class
comment = Comment()

# Replace with your TikTok video ID
video_id = "7483921926029643013"  # Example video ID
size = 0  # Start from the beginning

# Execute the scraper with multiple save formats
result = comment.execute(video_id, size, save_formats=['json', 'csv', 'excel'])

# execute() returns None on failure, so only report success for a real result
if result:
    print(f"\nSuccessfully scraped comments from video {video_id}")
    print(f"Check the TikTok_Comments/{video_id}/ folder for the following files:")
    # f-strings so the actual range (e.g. 0-50) is shown instead of the
    # literal "{size}-{size+50}" placeholder text
    print(f"- JSON files: full.json and {size}-{size + 50}.json")
    print(f"- CSV files: full.csv and {size}-{size + 50}.csv")
    print(f"- Excel files: full.xlsx and {size}-{size + 50}.xlsx")

## 3. Analyze the Comments

Let's analyze the scraped comments using pandas and create some visualizations.

The analysis now includes:
1. Support for reading from JSON, CSV, or Excel files
2. Basic statistics including total comments and unique users
3. Time analysis showing the first and last comments
4. Top 10 most active commenters visualization
5. Word frequency analysis showing most common words in comments

You can analyze data from any of the saved formats by specifying the format parameter:

In [None]:
def analyze_comments(video_id: str, format: str = 'json'):
    """Analyze comments from the specified format (json, csv, or excel).

    Loads ``./TikTok_Comments/<video_id>/full.<ext>``, prints basic
    statistics and the time range of the comments, shows a bar chart of the
    ten most frequent usernames and prints the ten most common words.

    Args:
        video_id: Video id naming the directory that holds the saved files.
        format: ``'json'``, ``'csv'`` or ``'excel'`` (case-insensitive).

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    directory = f'./TikTok_Comments/{video_id}'
    format = format.lower()

    # Read the full file based on format
    if format == 'json':
        full_file = f'{directory}/full.json'
        with open(full_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        comments_df = pd.DataFrame(data['comments'])
    elif format == 'csv':
        full_file = f'{directory}/full.csv'
        comments_df = pd.read_csv(full_file)
    elif format == 'excel':
        full_file = f'{directory}/full.xlsx'
        comments_df = pd.read_excel(full_file)
    else:
        raise ValueError(f'Unsupported format: {format}')

    # Basic statistics
    print("\nBasic Statistics:")
    print(f"Total comments: {len(comments_df)}")
    if comments_df.empty:
        # Nothing to count or plot; an empty frame has no 'username' column.
        print("No comments to analyze.")
        return
    print(f"Unique users: {comments_df['username'].nunique()}")
    if 'total_reply' in comments_df.columns:
        print(f"Average replies per comment: {comments_df['total_reply'].mean():.2f}")

    # Time analysis if create_time is available
    if 'create_time' in comments_df.columns:
        comments_df['create_time'] = pd.to_datetime(comments_df['create_time'])
        print("\nTime Range:")
        print(f"First comment: {comments_df['create_time'].min()}")
        print(f"Last comment: {comments_df['create_time'].max()}")

    # Plot top commenters
    top_commenters = comments_df['username'].value_counts().head(10)
    plt.figure(figsize=(12, 6))
    top_commenters.plot(kind='bar')
    plt.title('Top 10 Commenters')
    plt.xlabel('Username')
    plt.ylabel('Number of Comments')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Word frequency in comments (if text column is available)
    if 'comment' in comments_df.columns:
        print("\nMost Common Words in Comments:")
        # Empty comments come back from CSV/Excel as NaN (a float); drop
        # them and coerce the rest to str so the join cannot raise TypeError.
        texts = comments_df['comment'].dropna().astype(str)
        words = ' '.join(texts).lower().split()
        word_freq = pd.Series(words, dtype='object').value_counts().head(10)
        print(word_freq)

# Only run the analysis when the scraping cell above has been executed.
if 'result' in locals():
    # Try to analyze in all available formats. The loop variable is named
    # `fmt` so the builtin `format` is not overwritten at notebook scope.
    for fmt in ['json', 'csv', 'excel']:
        try:
            print(f"\nAnalyzing data from {fmt.upper()} format:")
            analyze_comments(video_id, fmt)
        except Exception as e:
            # A missing or unreadable file for one format must not stop the others.
            print(f"Could not analyze {fmt} format: {str(e)}")

## 4. Kesimpulan

Notebook ini menunjukkan cara:  
1. Mengambil (scrape) komentar dari video TikTok  
2. Menyimpan data ke direktori  
3. Menganalisis komentar menggunakan pandas  
4. Memvisualisasikan hasil analisis  

Anda dapat memodifikasi bagian analisis untuk mendapatkan wawasan lebih lanjut dari data komentar sesuai kebutuhan.
<div align="center">

&copy; 2024 Sleeper Build. All rights reserved.

</div>