# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Đọc dữ liệu từ file Parquet

In [2]:
# Load the preprocessed video table and print a summary of its columns.
videos_path = "./preprocessed_videos.parquet"
df = pd.read_parquet(videos_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70996 entries, 0 to 70995
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   CategoryType                     70996 non-null  object        
 1   author.downloadSetting           70996 non-null  object        
 2   author.duetSetting               70996 non-null  object        
 3   author.id                        70996 non-null  object        
 4   author.nickname                  70996 non-null  object        
 5   author.openFavorite              70996 non-null  bool          
 6   author.secUid                    70996 non-null  object        
 7   author.signature                 70996 non-null  object        
 8   author.stitchSetting             70996 non-null  object        
 9   author.uniqueId                  70996 non-null  object        
 10  author.verified                  70996 non-null  bool     

# Đọc danh sách filter cho author

In [3]:
# Load the author uniqueIds to keep, one id per line of the filter file.
# Each line is stripped and blank lines are dropped, so a stray empty line or
# trailing spaces in the file cannot yield an empty or non-matching id.
# The encoding is given explicitly so the result does not depend on the
# platform's default encoding.
with open("./data/filters/author_unique_ids.txt", "r", encoding="utf-8") as f:
    filtered_authors = [
        line.strip() for line in f.read().splitlines() if line.strip()
    ]

print(f"Number of filtered authors: {len(filtered_authors)}")

Number of filtered authors: 6


In [4]:
# Report, for every author in the filter list, how many rows of df carry that
# author's uniqueId, along with the author's TikTok profile URL.
author_column = df["author.uniqueId"]
for author_id in filtered_authors:
    # Summing the boolean mask counts the matching rows.
    video_count = (author_column == author_id).sum()
    print(f"Author: {author_id}")
    print(f"Number of videos: {video_count}")
    print(f"URL: https://www.tiktok.com/@{author_id}")
    print()

Author: spicykim9386
Number of videos: 298
URL: https://www.tiktok.com/@spicykim9386

Author: haidangrevieww
Number of videos: 255
URL: https://www.tiktok.com/@haidangrevieww

Author: khaikhampha
Number of videos: 104
URL: https://www.tiktok.com/@khaikhampha

Author: putaangi
Number of videos: 258
URL: https://www.tiktok.com/@putaangi

Author: trangtam2607
Number of videos: 169
URL: https://www.tiktok.com/@trangtam2607

Author: huynhanhtuan_dienvien
Number of videos: 251
URL: https://www.tiktok.com/@huynhanhtuan_dienvien



# Lọc dữ liệu theo danh sách filter


In [5]:
# Expected number of rows after filtering: the sum of the per-author video
# counts. It is computed from df instead of being copied by hand from printed
# output, so it stays correct when the data or the filter list changes.
videos_per_author = df["author.uniqueId"].value_counts()
# reindex selects the counts of the filtered authors; an author with no videos
# in df contributes 0 instead of NaN.
n_videos = int(videos_per_author.reindex(filtered_authors, fill_value=0).sum())
print(f"Total number of videos: {n_videos}")

Total number of videos: 1335


In [6]:
# Keep only the rows whose author is in the filter list, then rebuild the
# index so it runs contiguously from 0 again.
author_mask = df["author.uniqueId"].isin(filtered_authors)
filtered_authors_df = df.loc[author_mask].reset_index(drop=True)
# The filtered row count must match the expected total computed earlier.
assert len(filtered_authors_df) == n_videos
filtered_authors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1335 entries, 0 to 1334
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   CategoryType                     1335 non-null   object        
 1   author.downloadSetting           1335 non-null   object        
 2   author.duetSetting               1335 non-null   object        
 3   author.id                        1335 non-null   object        
 4   author.nickname                  1335 non-null   object        
 5   author.openFavorite              1335 non-null   bool          
 6   author.secUid                    1335 non-null   object        
 7   author.signature                 1335 non-null   object        
 8   author.stitchSetting             1335 non-null   object        
 9   author.uniqueId                  1335 non-null   object        
 10  author.verified                  1335 non-null   bool       

In [7]:
# Write the filtered rows to a parquet file; the index is not stored.
output_path = "./filtered_authors.parquet"
filtered_authors_df.to_parquet(output_path, index=False)

In [8]:
# Read the saved file back and print its summary.
reloaded_df = pd.read_parquet("./filtered_authors.parquet")
reloaded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1335 entries, 0 to 1334
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   CategoryType                     1335 non-null   object        
 1   author.downloadSetting           1335 non-null   object        
 2   author.duetSetting               1335 non-null   object        
 3   author.id                        1335 non-null   object        
 4   author.nickname                  1335 non-null   object        
 5   author.openFavorite              1335 non-null   bool          
 6   author.secUid                    1335 non-null   object        
 7   author.signature                 1335 non-null   object        
 8   author.stitchSetting             1335 non-null   object        
 9   author.uniqueId                  1335 non-null   object        
 10  author.verified                  1335 non-null   bool       