# Import thư viện cần thiết

In [1]:
import pandas as pd
import numpy as np
import os
import sys

# Khám phá dữ liệu về video

## Đọc dữ liệu video sau khi đã được xử lý

In [5]:
# Load the already-preprocessed video table and print its schema summary
# (column names, non-null counts, dtypes).
video_parquet_path = "preprocessed_videos.parquet"
video_df = pd.read_parquet(video_parquet_path)
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70996 entries, 0 to 70995
Data columns (total 55 columns):
 #   Column                           Non-Null Count  Dtype                           
---  ------                           --------------  -----                           
 0   CategoryType                     70996 non-null  object                          
 1   author.downloadSetting           70996 non-null  object                          
 2   author.duetSetting               70996 non-null  object                          
 3   author.id                        70996 non-null  object                          
 4   author.nickname                  70996 non-null  object                          
 5   author.openFavorite              70996 non-null  object                          
 6   author.secUid                    70996 non-null  object                          
 7   author.signature                 70996 non-null  object                          
 8   author.stitchSet

## Danh sách các user được đề cập trong dữ liệu về video

In [9]:
# Collect the distinct author ids that occur in the video table and report
# how many there are.
unique_author_ids = pd.unique(video_df["author.id"])
print(f"Number of unique author ids: {len(unique_author_ids)}")

Number of unique author ids: 264


# Tiền xử lý dữ liệu về user

## Đọc dữ liệu user

Xác định trước kiểu dữ liệu phù hợp cho các cột trong dữ liệu về người dùng

In [20]:
# Columns of the raw user CSV that are read with an explicit object dtype
# instead of letting pandas infer one. The resulting mapping is passed as
# `dtype=` when the user CSV is loaded.
object_columns = (
    "user.bioLink.risk",
    "user.commerceUserInfo.category",
    "user.commerceUserInfo.categoryButton",
    "user.downloadSetting",
    "user.duetSetting",
    "user.followingVisibility",
    "user.id",
    "user.nickNameModifyTime",
    "user.nickname",
    "user.profileEmbedPermission",
    "user.profileTab.showQuestionTab",
    "user.relation",
    "user.secUid",
    "user.signature",
    "user.stitchSetting",
    "user.uniqueId",
    "user.bioLink.link",
    "user.roomId",
)
# Every listed column maps to the same dtype; insertion order follows the tuple.
user_dtypes = dict.fromkeys(object_columns, np.object_)

In [21]:
# Load the raw user table, applying the explicit column dtypes declared in
# user_dtypes, then print its schema summary.
raw_users_path = "data/interim/final_raw_users.csv"
user_df = pd.read_csv(raw_users_path, dtype=user_dtypes)
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 39 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   stats.diggCount                             264 non-null    int64  
 1   stats.followerCount                         264 non-null    int64  
 2   stats.followingCount                        264 non-null    int64  
 3   stats.friendCount                           264 non-null    int64  
 4   stats.heart                                 264 non-null    int64  
 5   stats.heartCount                            264 non-null    int64  
 6   stats.videoCount                            264 non-null    int64  
 7   user.bioLink.risk                           119 non-null    object 
 8   user.canExpPlaylist                         264 non-null    bool   
 9   user.commentSetting                         264 non-null    int64  
 10  user.commerceU

## Chỉ giữ lại các user có trong dữ liệu về video

In [22]:
# Check that every author id in video_df has a matching row in user_df,
# and report any author ids that are missing from the user table.
video_author_ids = set(video_df["author.id"])
missing_user_ids = video_author_ids - set(user_df["user.id"])
if missing_user_ids:
    print(f"Missing user ids in user_df: {missing_user_ids}")
else:
    print("All user ids in video_df are present in user_df.")

# Keep only the users that appear as authors in the video data, as the
# section heading states. Previously the cell only ran the check above and
# never filtered user_df. Re-running this cell yields the same frame.
user_df = user_df[user_df["user.id"].isin(video_author_ids)].reset_index(drop=True)

All user ids in video_df are present in user_df.


## Loại bỏ các cột bị thiếu quá nhiều dữ liệu

Tính tỷ lệ thiếu giá trị của các cột

In [25]:
# Build a small report of the share of missing values per column of user_df.
# Only columns that actually have missing values are listed, sorted from the
# most to the least incomplete, with the share formatted as a percentage.
null_fraction = user_df.isna().mean()
missing_rate = (
    null_fraction[null_fraction > 0]
    .sort_values(ascending=False)
    .map("{:.2%}".format)
    .rename("missing_rate")
    .rename_axis("column_name")
    .to_frame()
)
# Print a caption, then let the notebook render the table as the cell output.
print("Missing rate for each column in user_df:")
missing_rate

Missing rate for each column in user_df:


Unnamed: 0_level_0,missing_rate
column_name,Unnamed: 1_level_1
user.commerceUserInfo.downLoadLink.ios,100.00%
user.commerceUserInfo.downLoadLink.android,100.00%
user.roomId,99.62%
user.bioLink.link,95.45%
user.commerceUserInfo.categoryButton,83.33%
user.commerceUserInfo.category,83.33%
user.profileTab.showQuestionTab,70.83%
user.bioLink.risk,54.92%
user.nickNameModifyTime,6.44%
user.signature,1.89%


Loại bỏ các cột bị thiếu hơn 50% dữ liệu

In [26]:
# Remove all columns with more than 50% missing values: a column is kept only
# when its number of non-null entries reaches half of the row count.
min_non_null = len(user_df) * 0.5
non_null_counts = user_df.count()
user_df = user_df.loc[:, non_null_counts >= min_non_null]

In [27]:
# Recompute the per-column missing-value report for the current user_df.
# Columns without missing values are omitted; the rest are ordered from the
# highest to the lowest missing share and shown as percentages.
column_null_share = user_df.isna().mean()
remaining_null_share = column_null_share[column_null_share > 0].sort_values(
    ascending=False)
missing_rate = pd.DataFrame(
    {"missing_rate": [f"{share:.2%}" for share in remaining_null_share]},
    index=pd.Index(remaining_null_share.index, name="column_name"),
)
# Print a caption, then let the notebook render the table as the cell output.
print("Missing rate for each column in user_df:")
missing_rate

Missing rate for each column in user_df:


Unnamed: 0_level_0,missing_rate
column_name,Unnamed: 1_level_1
user.nickNameModifyTime,6.44%
user.signature,1.89%


Loại bỏ các cột chỉ có duy nhất một giá trị

In [31]:
# Drop the columns that hold a single distinct value; only columns with more
# than one distinct value are kept. nunique() does not count missing values
# by default.
distinct_counts = user_df.nunique()
user_df = user_df.loc[:, distinct_counts > 1]
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   stats.diggCount                     264 non-null    int64 
 1   stats.followerCount                 264 non-null    int64 
 2   stats.followingCount                264 non-null    int64 
 3   stats.friendCount                   264 non-null    int64 
 4   stats.heart                         264 non-null    int64 
 5   stats.heartCount                    264 non-null    int64 
 6   stats.videoCount                    264 non-null    int64 
 7   user.commerceUserInfo.commerceUser  264 non-null    bool  
 8   user.downloadSetting                264 non-null    object
 9   user.duetSetting                    264 non-null    object
 10  user.followingVisibility            264 non-null    object
 11  user.id                             264 non-null    object

Điền giá trị "others" cho các giá trị bị thiếu ở những cột không phải kiểu số

In [32]:
# Fill missing values with "others" in every column that is not numeric.
non_numeric_columns = user_df.select_dtypes(exclude=[np.number]).columns
# One fill value per non-numeric column; numeric columns are left untouched.
fill_values = dict.fromkeys(non_numeric_columns, "others")
user_df = user_df.fillna(value=fill_values)

## Kết quả sau khi tiền xử lý dữ liệu về user

In [34]:
# Print the schema summary of the preprocessed user table
# (column names, non-null counts and dtypes).
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   stats.diggCount                     264 non-null    int64 
 1   stats.followerCount                 264 non-null    int64 
 2   stats.followingCount                264 non-null    int64 
 3   stats.friendCount                   264 non-null    int64 
 4   stats.heart                         264 non-null    int64 
 5   stats.heartCount                    264 non-null    int64 
 6   stats.videoCount                    264 non-null    int64 
 7   user.commerceUserInfo.commerceUser  264 non-null    bool  
 8   user.downloadSetting                264 non-null    object
 9   user.duetSetting                    264 non-null    object
 10  user.followingVisibility            264 non-null    object
 11  user.id                             264 non-null    object

## Lưu dữ liệu thành file Parquet

In [35]:
# Save the cleaned user_df to a new parquet file (without the row index).
cleaned_user_info_path = "./data/processed/cleaned_user_info.parquet"
# Create the output directory first: to_parquet does not create missing parent
# directories, so a fresh checkout would otherwise fail on this cell.
os.makedirs(os.path.dirname(cleaned_user_info_path), exist_ok=True)
user_df.to_parquet(cleaned_user_info_path, index=False)

In [38]:
# Read the saved parquet file back and print its schema summary, so the
# written file can be compared with the in-memory user_df.
saved_user_df = pd.read_parquet("./data/processed/cleaned_user_info.parquet")
saved_user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   stats.diggCount                     264 non-null    int64 
 1   stats.followerCount                 264 non-null    int64 
 2   stats.followingCount                264 non-null    int64 
 3   stats.friendCount                   264 non-null    int64 
 4   stats.heart                         264 non-null    int64 
 5   stats.heartCount                    264 non-null    int64 
 6   stats.videoCount                    264 non-null    int64 
 7   user.commerceUserInfo.commerceUser  264 non-null    bool  
 8   user.downloadSetting                264 non-null    object
 9   user.duetSetting                    264 non-null    object
 10  user.followingVisibility            264 non-null    object
 11  user.id                             264 non-null    object