# Import thư viện cần thiết

In [1]:
import pandas as pd
import numpy as np
import os
import sys

# Khám phá dữ liệu về video

## Đọc dữ liệu video sau khi đã được xử lý

In [2]:
# Load the preprocessed video dataset and print its schema summary.
# NOTE(review): this path is relative to the working directory and has no
# "data/..." prefix, unlike the other paths used in this notebook — confirm.
video_df = pd.read_parquet("preprocessed_videos.parquet")
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70996 entries, 0 to 70995
Data columns (total 55 columns):
 #   Column                           Non-Null Count  Dtype                           
---  ------                           --------------  -----                           
 0   CategoryType                     70996 non-null  object                          
 1   author.downloadSetting           70996 non-null  object                          
 2   author.duetSetting               70996 non-null  object                          
 3   author.id                        70996 non-null  object                          
 4   author.nickname                  70996 non-null  object                          
 5   author.openFavorite              70996 non-null  object                          
 6   author.secUid                    70996 non-null  object                          
 7   author.signature                 70996 non-null  object                          
 8   author.stitchSet

## Danh sách các user được đề cập trong dữ liệu về video

In [3]:
# Distinct authors appearing in the video data; each one should have a
# matching row in the user data loaded further down.
author_id_column = video_df["author.id"]
unique_author_ids = author_id_column.unique()
print(f"Number of unique author ids: {len(unique_author_ids)}")

Number of unique author ids: 264


# Tiền xử lý dữ liệu về user

## Đọc dữ liệu user

Xác định trước kiểu dữ liệu phù hợp cho các cột trong dữ liệu về người dùng

In [4]:
# Explicit dtype overrides for the raw user CSV: every column listed here is
# read as a generic object column instead of relying on dtype inference.
user_dtypes = dict.fromkeys(
    (
        "user.bioLink.risk",
        "user.commerceUserInfo.category",
        "user.commerceUserInfo.categoryButton",
        "user.downloadSetting",
        "user.duetSetting",
        "user.followingVisibility",
        "user.id",
        "user.nickNameModifyTime",
        "user.nickname",
        "user.profileEmbedPermission",
        "user.profileTab.showQuestionTab",
        "user.relation",
        "user.secUid",
        "user.signature",
        "user.stitchSetting",
        "user.uniqueId",
        "user.bioLink.link",
        "user.roomId",
    ),
    np.object_,
)

In [5]:
# Read the raw user table, applying the explicit dtype overrides from
# user_dtypes, then print its schema summary.
user_df = pd.read_csv("data/interim/final_raw_users.csv", dtype=user_dtypes)
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 39 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   stats.diggCount                             264 non-null    int64  
 1   stats.followerCount                         264 non-null    int64  
 2   stats.followingCount                        264 non-null    int64  
 3   stats.friendCount                           264 non-null    int64  
 4   stats.heart                                 264 non-null    int64  
 5   stats.heartCount                            264 non-null    int64  
 6   stats.videoCount                            264 non-null    int64  
 7   user.bioLink.risk                           119 non-null    object 
 8   user.canExpPlaylist                         264 non-null    bool   
 9   user.commentSetting                         264 non-null    int64  
 10  user.commerceU

## Kiểm tra các user trong dữ liệu về video đều có trong dữ liệu về user

In [6]:
# Verify that every author referenced by a video has a row in user_df.
# This only reports the result; it does not filter either frame.
video_author_ids = set(video_df["author.id"])
known_user_ids = set(user_df["user.id"])
missing_user_ids = video_author_ids.difference(known_user_ids)
print(
    f"Missing user ids in user_df: {missing_user_ids}"
    if missing_user_ids
    else "All user ids in video_df are present in user_df."
)

All user ids in video_df are present in user_df.


## Loại bỏ các cột bị thiếu quá nhiều dữ liệu

Tính tỷ lệ thiếu giá trị của các cột

In [7]:
# Fraction of missing values per column, restricted to columns that actually
# have gaps, ordered worst-first and rendered as percentage strings.
null_fraction = user_df.isna().mean()
missing_rate = (
    null_fraction[null_fraction > 0]
    .sort_values(ascending=False)
    .map(lambda fraction: f"{fraction:.2%}")
    .rename("missing_rate")
    .rename_axis("column_name")
    .to_frame()
)
# The table itself is the cell's last expression so it renders as rich output.
print("Missing rate for each column in user_df:")
missing_rate

Missing rate for each column in user_df:


Unnamed: 0_level_0,missing_rate
column_name,Unnamed: 1_level_1
user.commerceUserInfo.downLoadLink.ios,100.00%
user.commerceUserInfo.downLoadLink.android,100.00%
user.roomId,99.62%
user.bioLink.link,95.45%
user.commerceUserInfo.categoryButton,83.33%
user.commerceUserInfo.category,83.33%
user.profileTab.showQuestionTab,70.83%
user.bioLink.risk,54.92%
user.nickNameModifyTime,6.44%
user.signature,1.89%


Loại bỏ các cột bị thiếu hơn 50% dữ liệu

In [8]:
# A column survives only if at least half of its rows are populated, i.e.
# columns with more than 50% missing values are removed.
min_non_null = len(user_df) * 0.5
user_df = user_df.dropna(axis="columns", thresh=min_non_null)

In [9]:
# Recompute the per-column missing share now that sparse columns are gone,
# to confirm which columns still contain gaps.
null_share = user_df.isnull().mean()
remaining = null_share.loc[null_share.gt(0)].sort_values(ascending=False)
missing_rate = pd.DataFrame(
    {"missing_rate": [f"{share:.2%}" for share in remaining]},
    index=pd.Index(remaining.index, name="column_name"),
)
# Display the remaining gaps as the cell output.
print("Missing rate for each column in user_df:")
missing_rate

Missing rate for each column in user_df:


Unnamed: 0_level_0,missing_rate
column_name,Unnamed: 1_level_1
user.nickNameModifyTime,6.44%
user.signature,1.89%


Loại bỏ các cột chỉ có duy nhất một giá trị

In [10]:
# Drop columns that hold a single distinct value.
# nunique() does not count NaN, so a column with one value plus missing
# entries is treated as constant and dropped as well.
distinct_counts = user_df.nunique()
user_df = user_df.loc[:, distinct_counts.gt(1)]
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   stats.diggCount                     264 non-null    int64 
 1   stats.followerCount                 264 non-null    int64 
 2   stats.followingCount                264 non-null    int64 
 3   stats.friendCount                   264 non-null    int64 
 4   stats.heart                         264 non-null    int64 
 5   stats.heartCount                    264 non-null    int64 
 6   stats.videoCount                    264 non-null    int64 
 7   user.commerceUserInfo.commerceUser  264 non-null    bool  
 8   user.downloadSetting                264 non-null    object
 9   user.duetSetting                    264 non-null    object
 10  user.followingVisibility            264 non-null    object
 11  user.id                             264 non-null    object

Điền giá trị "others" cho các giá trị bị thiếu ở các cột không có kiểu dữ liệu số

In [11]:
# Fill missing values with "others" in every column that is not numeric.
# Columns without missing values pass through unchanged.
non_numeric_columns = user_df.select_dtypes(exclude=[np.number]).columns
user_df = user_df.assign(
    **{col: user_df[col].fillna("others") for col in non_numeric_columns}
)

## Kết quả sau khi tiền xử lý dữ liệu về user

In [12]:
# Schema summary of user_df after the cleaning steps above.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   stats.diggCount                     264 non-null    int64 
 1   stats.followerCount                 264 non-null    int64 
 2   stats.followingCount                264 non-null    int64 
 3   stats.friendCount                   264 non-null    int64 
 4   stats.heart                         264 non-null    int64 
 5   stats.heartCount                    264 non-null    int64 
 6   stats.videoCount                    264 non-null    int64 
 7   user.commerceUserInfo.commerceUser  264 non-null    bool  
 8   user.downloadSetting                264 non-null    object
 9   user.duetSetting                    264 non-null    object
 10  user.followingVisibility            264 non-null    object
 11  user.id                             264 non-null    object

## Feature Engineering cho dữ liệu về user

### Tính số lượng video trung bình mà 1 user đăng tải trong mỗi tuần

In [23]:
# Number of videos per author in the video dataset.
author_video_counts = video_df.groupby(
    "author.id").size().reset_index(name="video_count")

# Timestamp of each author's earliest ("min") and latest ("max") video.
author_first_last = video_df.groupby("author.id")[
    "createTime"].agg(["min", "max"]).reset_index()

# One row per author: video count plus first/last upload time.
author_video_info = pd.merge(
    author_video_counts, author_first_last, on="author.id")

# Span between first and last video, in weeks.
# .dt.days keeps whole days only, so partial days are truncated.
author_video_info["weeks_between"] = (
    author_video_info["max"] - author_video_info["min"]).dt.days / 7

# An author whose videos all fall within less than one day has
# weeks_between == 0. Dividing a positive count by zero yields inf (not NaN),
# which fillna(0) would leave untouched, so zero spans are masked to NaN
# before dividing. Those authors then get a rate of 0.
active_weeks = author_video_info["weeks_between"].replace(0, np.nan)

# Average number of videos posted per week for each author.
author_video_info["avg_videos_per_week"] = (
    author_video_info["video_count"] / active_weeks).fillna(0)

author_video_info.head()

Unnamed: 0,author.id,video_count,min,max,weeks_between,avg_videos_per_week
0,15475469,235,2023-11-27 18:29:27+07:00,2025-03-18 18:49:09+07:00,68.142857,3.448637
1,27202912,184,2023-11-25 19:13:30+07:00,2025-03-08 14:31:28+07:00,66.857143,2.752137
2,63937172755,204,2023-11-26 20:16:51+07:00,2025-03-17 18:42:14+07:00,68.0,3.0
3,6514849123081355266,318,2023-11-26 12:37:27+07:00,2025-03-15 13:25:09+07:00,67.857143,4.686316
4,6520772993827733506,104,2023-11-27 20:53:10+07:00,2025-03-13 20:29:37+07:00,67.285714,1.545648


In [30]:
# Distribution of the weekly posting rate across authors.
author_video_info["avg_videos_per_week"].describe()

count    264.000000
mean       4.541401
std        3.021875
min        0.192797
25%        2.719030
50%        4.052632
75%        5.670334
max       25.846154
Name: avg_videos_per_week, dtype: float64

Chỉ giữ lại các cột cần thiết và đổi tên cột cho phù hợp

In [14]:
# Align the key column name with user_df ("user.id") and keep only the key
# and the engineered feature, ready to be merged into user_df.
author_video_info = (
    author_video_info
    .rename(columns={"author.id": "user.id"})
    .loc[:, ["user.id", "avg_videos_per_week"]]
)

In [15]:
# Sanity check: (rows, columns) of the per-author feature table.
author_video_info.shape

(264, 2)

Merge dữ liệu về user với dữ liệu về video

In [16]:
# Attach the weekly posting rate to each user. The left join keeps every
# row of user_df even when no matching author exists.
user_df = user_df.merge(author_video_info, how="left", on="user.id")

### Tính một số giá trị trung bình khác trên mỗi video của user

Ta sẽ tính thêm một số giá trị khác như:

- Số lượt lưu video trung bình trên mỗi video (video - `statsV2.collectCount`)
- Số lượt bình luận trung bình trên mỗi video (video - `statsV2.commentCount`)
- Số lượt thích trung bình trên mỗi video (video - `statsV2.diggCount`)
- Số lượt xem trung bình trên mỗi video (video - `statsV2.playCount`)
- Số lượt chia sẻ trung bình trên mỗi video (video - `statsV2.shareCount`)
- Thời lượng trung bình của video (video - `video.duration`)
- Số lượng hashtag trung bình trên mỗi video (video - `hashtag_count`)


In [17]:
# Per-video metrics to average for each author, mapped to the name the
# resulting feature takes in user_df.
feature_names = {
    "statsV2.collectCount": "avg_collects_per_video",
    "statsV2.commentCount": "avg_comments_per_video",
    "statsV2.diggCount": "avg_diggs_per_video",
    "statsV2.playCount": "avg_plays_per_video",
    "statsV2.shareCount": "avg_shares_per_video",
    "video.duration": "avg_video_duration",
    "hashtag_count": "avg_hashtags_per_video",
}
interesting_columns = list(feature_names)

# Mean of each metric per author, with the key renamed to "user.id" so it
# lines up with user_df.
avg_user_stats = (
    video_df.groupby("author.id")[interesting_columns]
    .mean()
    .reset_index()
    .rename(columns={"author.id": "user.id", **feature_names})
)

# Left join so that every user row is preserved.
user_df = user_df.merge(avg_user_stats, how="left", on="user.id")

In [18]:
# Sanity check: (rows, columns) of user_df after the feature merges.
user_df.shape

(264, 30)

## Lưu dữ liệu thành file Parquet

In [19]:
# Persist the cleaned, feature-enriched user_df as Parquet.
# index=False leaves the DataFrame index out of the file.
user_df.to_parquet("./data/processed/cleaned_user_info.parquet", index=False)

In [21]:
# Read the file back and print its schema to verify the write round-trips.
pd.read_parquet(
    "./data/processed/cleaned_user_info.parquet").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 30 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   stats.diggCount                     264 non-null    int64  
 1   stats.followerCount                 264 non-null    int64  
 2   stats.followingCount                264 non-null    int64  
 3   stats.friendCount                   264 non-null    int64  
 4   stats.heart                         264 non-null    int64  
 5   stats.heartCount                    264 non-null    int64  
 6   stats.videoCount                    264 non-null    int64  
 7   user.commerceUserInfo.commerceUser  264 non-null    bool   
 8   user.downloadSetting                264 non-null    object 
 9   user.duetSetting                    264 non-null    object 
 10  user.followingVisibility            264 non-null    object 
 11  user.id                             264 non-n

- `stats.diggCount`: Số lượng video hoặc bình luận mà 1 người dùng đã "like"

# ----------------------

# END

# ----------------------


In [1]:
import pandas as pd

In [2]:
# Reload the processed user table for ad-hoc inspection and print its schema.
df = pd.read_parquet(
    "data/processed/cleaned_user_info.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 32 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   stats.diggCount                     264 non-null    int64  
 1   stats.followerCount                 264 non-null    int64  
 2   stats.followingCount                264 non-null    int64  
 3   stats.friendCount                   264 non-null    int64  
 4   stats.heart                         264 non-null    int64  
 5   stats.heartCount                    264 non-null    int64  
 6   stats.videoCount                    264 non-null    int64  
 7   user.canExpPlaylist                 264 non-null    bool   
 8   user.commentSetting                 264 non-null    int64  
 9   user.commerceUserInfo.commerceUser  264 non-null    bool   
 10  user.downloadSetting                264 non-null    object 
 11  user.duetSetting                    264 non-n

In [3]:
# Mean weekly posting rate across all users.
df['avg_videos_per_week'].mean()

np.float64(4.541401010386333)

In [4]:
# Select every column whose name contains "stats" and summarise their
# distributions.
is_stats_column = df.columns.str.contains("stats")
stats_cols = df.columns[is_stats_column]
df[stats_cols].describe()

Unnamed: 0,stats.diggCount,stats.followerCount,stats.followingCount,stats.friendCount,stats.heart,stats.heartCount,stats.videoCount
count,264.0,264.0,264.0,264.0,264.0,264.0,264.0
mean,76.227273,336472.6,250.295455,89.867424,10214600.0,10214600.0,549.643939
std,701.109046,499545.8,760.305674,342.250983,17074460.0,17074460.0,409.126718
min,0.0,3927.0,0.0,0.0,71000.0,71000.0,13.0
25%,0.0,64425.0,21.75,7.0,1500000.0,1500000.0,256.0
50%,0.0,146950.0,76.0,25.0,3600000.0,3600000.0,460.0
75%,0.0,345875.0,229.5,66.25,9500000.0,9500000.0,752.25
max,9782.0,3000000.0,9398.0,4075.0,100200000.0,100200000.0,2298.0


In [6]:
# Frequency of each distinct stats.diggCount value across users.
df["stats.diggCount"].value_counts()

stats.diggCount
0       259
4623      1
9782      1
3137      1
1829      1
753       1
Name: count, dtype: int64