In [36]:
%pip install plotly nbformat

Note: you may need to restart the kernel to use updated packages.


In [37]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import ast
import re

import os
from tqdm import tqdm
from typing import List
import pandas as pd
from collections import Counter


# Load Data

In [38]:
file_path = "../../data/interim/video_info.csv"
video_info_df = pd.read_csv(file_path, low_memory=False)

# Transform Data

## Loại bỏ các cột có tỷ lệ thiếu dữ liệu lớn hơn 50%

In [39]:
# Calculate the missing ratio
missing_ratio = video_info_df.isna().sum() / len(video_info_df)

# Sort the missing ratio in descending order
missing_ratio = missing_ratio.sort_values(ascending=False)

# Display first 5 rows
missing_ratio.head()

AIGCDescription               1.0
backendSourceEventTracking    1.0
music.coverThumb              1.0
music.coverMedium             1.0
music.coverLarge              1.0
dtype: float64

In [40]:
# Remove columns with missing ratios greater than 0.50
video_info_df = video_info_df.dropna(
    axis='columns', thresh=0.50 * len(video_info_df)
)

In [41]:
# Calculate the missing ratio
missing_ratio = video_info_df.isna().sum() / len(video_info_df)

# Sort the missing ratio in descending order
missing_ratio = missing_ratio.sort_values(ascending=False)

# Display the missing ratio
for column, ratio in missing_ratio[:5].items():
    print(f"{column:50}:{ratio:8.2%}")

duetEnabled                                       :  20.29%
stitchEnabled                                     :  19.87%
video.claInfo.originalLanguageInfo.languageID     :  18.91%
video.claInfo.originalLanguageInfo.languageCode   :  18.91%
video.claInfo.originalLanguageInfo.language       :  18.91%


## Drop stats.*
Vì các cột có tên bắt đầu với `stats.*` chứa cùng thông tin với các cột có tên bắt đầu với `statsV2.*`, nhưng không có thông tin về `repostCount` như `statsV2.*`. Nên ta sẽ loại bỏ các cột có tên bắt đầu với `stats.*`.

In [42]:
# Remove columns starting with "stats."
video_info_df = video_info_df[
    [column for column in video_info_df.columns
            if not column.startswith("stats.")]
]

# video_info_df.info()

## Drop `video.claInfo.originalLanguageInfo.*`
Xóa các cột bắt đầu với `video.claInfo.originalLanguageInfo.*` vì chúng chứa thông tin không cần thiết.


In [43]:
# Remove columns starting with "video.claInfo.originalLanguageInfo."
video_info_df = video_info_df[
    [column for column in video_info_df.columns
            if not column.startswith("video.claInfo.originalLanguageInfo.")]
]

# video_info_df.info()

## Tạo cột `hashtags`
Tạo một cột chứa danh sách các `hashtag` được trích xuất từ mô tả video. Và tính số lượng hashtag trong mỗi video.

In [44]:
# Replace missing values in "desc" column with an empty string
video_info_df["desc"] = video_info_df["desc"].fillna("")
video_info_df["desc"] = video_info_df["desc"].astype(str)
video_info_df["desc"] = video_info_df["desc"].str.strip()

# Create a new column for the hashtags
# and the number of hashtags in each video
video_info_df["hashtags"] = [""] * len(video_info_df)
video_info_df["num_hashtags"] = [0] * len(video_info_df)

# Extract hashtags from the "desc" column
# and Get the number of hashtags in each video
for index in tqdm(range(len(video_info_df))):
    # Get the description of the video
    description = video_info_df.loc[index, "desc"].strip().lower()

    if description:
        # Remove emojis
        description = description.encode('ascii', 'ignore').decode('ascii')

        # Add a space before all "#" characters
        description = description.replace("#", " #")

        # Find all strings starting with "#" and followed by a word
        hashtags = [word[1:] for word in description.split()
                    if word.startswith("#")]

        # Extract hashtags from the description
        video_info_df.loc[index, "hashtags"] = ",".join(hashtags).strip()

        # Get the number of hashtags
        video_info_df.loc[index, "num_hashtags"] = len(hashtags)
    else:
        video_info_df.loc[index, "hashtags"] = ""
        video_info_df.loc[index, "num_hashtags"] = 0

100%|██████████| 32603/32603 [00:08<00:00, 4034.31it/s]


In [45]:
video_info_df[["hashtags", "num_hashtags"]].sample(n=5)

Unnamed: 0,hashtags,num_hashtags
18412,"ncungtiktok,cavienchien,xuhuongtiktok,anvat",4
3740,"annhungkhongbeo,learnontiktok,reviewanngon,anc...",4
26607,"thangmucbang,learnontiktok,reviewanngon,ancung...",4
20356,"learnontiktok,ancungtiktok,muoitoday,gmvmoxi,c...",7
16322,"longervideos,tiktokphilippines,tiktokph,learni...",16


In [46]:
video_info_df

Unnamed: 0,CategoryType,author.commentSetting,author.downloadSetting,author.duetSetting,author.ftc,author.id,author.isADVirtual,author.isEmbedBanned,author.nickname,author.openFavorite,...,video.ratio,video.videoID,video.videoQuality,video.volumeInfo.Loudness,video.volumeInfo.Peak,video.width,collectTime,video.claInfo.captionsType,hashtags,num_hashtags
0,111,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,540p,v10044g50000cv3gi77og65kch2tg7f0,normal,-8.4,1.00000,576,1741176061,,"1phutsaigon,saigon,saigondidau,saigonangi,kemc...",6
1,111,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,540p,v10044g50000cv2renvog65u6nli8ukg,normal,-8.4,1.00000,576,1741176061,,"1phutsaigon,saigon,saigondidau,cafesaigon,lipo...",6
2,111,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,540p,v14044g50000cv1951vog65tqj9a2l20,normal,-8.3,1.00000,576,1741176061,,"1phutsaigon,saigon,saigondidau,nhamcoffee,nham...",5
3,105,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,540p,v14044g50000cv0rc9fog65tk2jqih4g,normal,-7.6,1.00000,576,1741176061,,"1phutsaigon,saigon,saigondidau,trienlamnghethu...",6
4,111,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,540p,v10044g50000cv0778nog65mbjr0l8m0,normal,-5.5,1.00000,576,1741176061,,"1phutsaigon,saigon,saigondidau,rooftopsaigon,c...",6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32598,111,0,3,0,False,6644110999556227073,False,False,Quynh Truong,False,...,540p,v14025g50000clgtndfog65vbe2jtupg,normal,-17.9,0.77625,576,1741177528,1.0,"dememoria,bodymist",2
32599,111,0,3,0,False,6644110999556227073,False,False,Quynh Truong,False,...,540p,v10025g50000cleu8h7og65j2vs1fq7g,normal,-13.8,1.00000,576,1741177528,,,0
32600,111,0,3,0,False,6644110999556227073,False,False,Quynh Truong,False,...,540p,v10025g50000clcab77og65ta44tket0,normal,-14.5,0.94406,576,1741177528,,cloudteamochi,1
32601,111,0,3,0,False,6644110999556227073,False,False,Quynh Truong,False,...,540p,v10025g50000clbl8pfog65q729n4q4g,normal,-11.4,1.00000,576,1741177528,1.0,"mukbang,nutriboost,hngvhylp",3


In [49]:
video_info_df['hashtags'] = video_info_df['hashtags'].apply(lambda x: x.split(',') if isinstance(x, str) and x.strip() else [])
video_info_df["hashtags"] = video_info_df["hashtags"].apply(lambda x: x if isinstance(x, list) else [])


In [50]:
video_info_df['createTime'] = pd.to_datetime(video_info_df['createTime'], unit='s')


In [51]:
video_info_df['createTime']

0       2025-03-04 14:08:17
1       2025-03-03 14:07:35
2       2025-03-01 04:55:00
3       2025-02-28 13:15:37
4       2025-02-27 14:16:53
                ...        
32598   2023-11-25 11:38:37
32599   2023-11-22 11:24:12
32600   2023-11-18 11:55:32
32601   2023-11-17 11:57:23
32602   2025-03-05 12:24:27
Name: createTime, Length: 32603, dtype: datetime64[ns]

# Export into csv

In [63]:
video_info_df.to_csv('../../data/processed/video_infor.csv')

# Visualization

In [52]:
video_info_df['hashtags']

0        [1phutsaigon, saigon, saigondidau, saigonangi,...
1        [1phutsaigon, saigon, saigondidau, cafesaigon,...
2        [1phutsaigon, saigon, saigondidau, nhamcoffee,...
3        [1phutsaigon, saigon, saigondidau, trienlamngh...
4        [1phutsaigon, saigon, saigondidau, rooftopsaig...
                               ...                        
32598                                [dememoria, bodymist]
32599                                                   []
32600                                      [cloudteamochi]
32601                      [mukbang, nutriboost, hngvhylp]
32602                               [cheesecoffee, diudao]
Name: hashtags, Length: 32603, dtype: object

In [60]:

def analyze_hashtag_trends(video_info_df, top_n=5):
    """Analyzes top trending hashtags over time based on views."""

    # ✅ Check required columns
    required_columns = ["hashtags", "statsV2.playCount", "createTime"]
    for col in required_columns:
        if col not in video_info_df.columns:
            raise KeyError(f"⚠️ Missing column: {col}")

    # ✅ Ensure statsV2.playCount is numeric and handle NaN values
    video_info_df["statsV2.playCount"] = pd.to_numeric(video_info_df["statsV2.playCount"], errors="coerce").fillna(0)

    # ✅ Initialize Counter to count total views per hashtag
    hashtag_views = Counter()
    
    for _, row in video_info_df.iterrows():
        if isinstance(row["hashtags"], list) and len(row["hashtags"]) > 0:  # Ensure it's a valid list
            for hashtag in row["hashtags"]:
                hashtag_views[hashtag] += row["statsV2.playCount"]

    # ✅ Get the top N hashtags by total views
    top_hashtags = [hashtag for hashtag, _ in hashtag_views.most_common(top_n)]
    
    if not top_hashtags:
        raise ValueError("⚠️ No top hashtags found! Ensure your dataset contains valid hashtags and engagement metrics.")

    # ✅ Aggregate engagement over time for only the top hashtags
    trends_data = []
    for _, row in video_info_df.iterrows():
        if isinstance(row["hashtags"], list) and len(row["hashtags"]) > 0:
            for hashtag in row["hashtags"]:
                if hashtag in top_hashtags:
                    trends_data.append({
                        "createTime": row["createTime"],
                        "hashtag": hashtag,
                        "views": row["statsV2.playCount"]
                    })

    # ✅ Convert to DataFrame
    trends_df = pd.DataFrame(trends_data)

    if trends_df.empty:
        raise ValueError("⚠️ No trending hashtag data found! Check if your dataset has valid engagement metrics.")

    # ✅ Group by time and hashtag
    trends_df = trends_df.groupby(["createTime", "hashtag"]).sum().reset_index()
    
    return trends_df

In [61]:
def plot_hashtag_trends(trends_df):
    """Plots trending hashtags over time based on views."""

    # ✅ Plot the trends
    fig = px.line(trends_df, 
                  x="createTime", 
                  y="views", 
                  color="hashtag",
                  title="📈 Hashtag Popularity Over Time (Views)",
                  labels={"createTime": "Date", "views": "Total Views"},
                  markers=True)

    fig.update_layout(xaxis_tickangle=-45)
    fig.show()

In [62]:
# Step 1: Analyze hashtag trends
trends_df = analyze_hashtag_trends(video_info_df, top_n=5)

# Step 2: Plot the trends
plot_hashtag_trends(trends_df)

  sf: grouped.get_group(s if len(s) > 1 else s[0])
  v = v.dt.to_pydatetime()


In [59]:
video_info_df['statsV2.playCount']

0          162400
1           25800
2           41600
3           34800
4            3413
           ...   
32598     3800000
32599     4200000
32600     2400000
32601    42900000
32602        7206
Name: statsV2.playCount, Length: 32603, dtype: int64

# Viz 2

In [72]:

def analyze_hashtag_count_effect(df):
    # Count number of hashtags in each video
    df['num_hashtags'] = df['hashtags'].apply(len)
    
    # Define finer bins for hashtag count grouping
    bins = [0, 2, 5, 8, 12, 15, 20, 30, float('inf')]
    labels = ['1-2', '3-5', '6-8', '9-12', '13-15', '16-20', '21-30', '30+']
    
    # Categorize into groups
    df['hashtag_group'] = pd.cut(df['num_hashtags'], bins=bins, labels=labels, right=False)

    # Aggregate engagement metrics for each group
    hashtag_effect = df.groupby(['hashtag_group'], observed=True).agg(
        avg_views=('statsV2.playCount', 'mean'),
        avg_likes=('statsV2.diggCount', 'mean'),
        avg_shares=('statsV2.shareCount', 'mean'),
        avg_comments=('statsV2.commentCount', 'mean'),
        count=('hashtag_group', 'size')
    ).reset_index()

    return hashtag_effect

In [73]:
def plot_hashtag_count_effect(hashtag_effect_df):
    fig = px.bar(
        hashtag_effect_df.melt(id_vars=['hashtag_group'], 
                               value_vars=['avg_views', 'avg_likes', 'avg_shares', 'avg_comments'],
                               var_name='Engagement Metric', value_name='Average Engagement'),
        x='hashtag_group', 
        y='Average Engagement', 
        color='Engagement Metric', 
        barmode='group',
        title="Effect of Hashtag Count on Engagement",
        labels={'hashtag_group': 'Number of Hashtags', 'Average Engagement': 'Average Value'}
    )
    
    fig.update_layout(xaxis_title="Hashtag Count Group", 
                      yaxis_title="Engagement", 
                      xaxis=dict(type='category'))
    return fig


In [74]:
hashtag_count_data = analyze_hashtag_count_effect(video_info_df)  # Assuming your DataFrame is `video_info_df`
print(hashtag_count_data)  # Check the output structure


  hashtag_group     avg_views     avg_likes   avg_shares  avg_comments  count
0           1-2  1.513146e+06  69966.098341  1702.635664    608.510664   1688
1           3-5  8.923527e+05  37542.266991  1202.602242    323.879452  10079
2           6-8  6.249410e+05  23351.218760  1204.777473    229.420958  12740
3          9-12  6.089374e+05  29387.337809  1693.978222    242.005895   6107
4         13-15  5.930710e+05  29292.664762  1452.687619    208.536190   1050
5         16-20  4.856643e+05  21356.308320  1307.755302    165.474715    613
6         21-30  1.122650e+06  54483.259398  3605.364662    660.266917    266
7           30+  1.692547e+06  74074.216667  7690.166667   1049.766667     60


In [75]:
fig_hashtag_count = plot_hashtag_count_effect(hashtag_count_data)
fig_hashtag_count.show()






## Test

In [117]:
# Function to analyze hashtag count effect
def analyze_hashtag_count_effect(df):
    df['num_hashtags'] = df['hashtags'].apply(len)
    bins = [0, 2, 5, 8, 12, 15, 20, 30, float('inf')]
    labels = ['1-2', '3-5', '6-8', '9-12', '13-15', '16-20', '21-30', '30+']
    df['hashtag_group'] = pd.cut(df['num_hashtags'], bins=bins, labels=labels, right=False)

    hashtag_effect = df.groupby(['hashtag_group'], observed=True).agg(
        total_views=('statsV2.playCount', 'sum'),
        total_likes=('statsV2.diggCount', 'sum'),
        total_shares=('statsV2.shareCount', 'sum'),
        total_comments=('statsV2.commentCount', 'sum'),
        count=('hashtag_group', 'size')
    ).reset_index()
    
    return hashtag_effect

In [118]:
# Function to get top hashtags by group
def get_top_hashtags_by_group(df, top_n=5):
    hashtag_groups = df.groupby('hashtag_group')
    top_hashtags_list = []
    
    for group, group_df in hashtag_groups:
        hashtag_counts = {}
        for _, row in group_df.iterrows():
            for hashtag in row['hashtags']:
                if hashtag not in hashtag_counts:
                    hashtag_counts[hashtag] = {'total_views': 0, 'total_likes': 0, 'total_shares': 0, 'total_comments': 0}
                hashtag_counts[hashtag]['total_views'] += row['statsV2.playCount']
                hashtag_counts[hashtag]['total_likes'] += row['statsV2.diggCount']
                hashtag_counts[hashtag]['total_shares'] += row['statsV2.shareCount']
                hashtag_counts[hashtag]['total_comments'] += row['statsV2.commentCount']
        
        hashtag_df = pd.DataFrame.from_dict(hashtag_counts, orient='index').reset_index()
        hashtag_df.rename(columns={'index': 'hashtag'}, inplace=True)
        hashtag_df['hashtag_group'] = group

        top_hashtags_list.append(hashtag_df.nlargest(top_n, 'total_views'))
    
    return pd.concat(top_hashtags_list, ignore_index=True)

In [127]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_interactive_hashtag_analysis(hashtag_effect_df, top_hashtags_df):
    fig = make_subplots(
        rows=1, cols=2, 
        subplot_titles=["Total Engagement by Hashtag Count Group", "Top Hashtags by Engagement Type"],
        shared_yaxes=False
    )

    # Engagement metrics and colors
    engagement_metrics = ['total_views', 'total_likes', 'total_shares', 'total_comments']
    colors = ['blue', 'red', 'green', 'orange']

    # 📌 First subplot: Total Engagement by Hashtag Count Group
    for metric, color in zip(engagement_metrics, colors):
        fig.add_trace(
            go.Bar(
                x=hashtag_effect_df['hashtag_group'],
                y=hashtag_effect_df[metric],
                name=metric.replace("total_", "Total ").title(),
                marker=dict(color=color),
                showlegend=True
            ),
            row=1, col=1
        )

    # 📌 Second subplot: Top Hashtags by Engagement Type (Sorted)
    for metric, color in zip(engagement_metrics, colors):
        sorted_df = top_hashtags_df.sort_values(by=metric, ascending=False)  # ✅ Sorting FIXED
        fig.add_trace(
            go.Bar(
                x=sorted_df['hashtag'],
                y=sorted_df[metric],
                name=metric.replace("total_", "Total ").title(),
                marker=dict(color=color),
                showlegend=True
            ),
            row=1, col=2
        )

    # ✅ Ensure sorting updates dynamically when filtering
    fig.update_layout(
        title_text="Hashtag Count vs Top Hashtags Engagement",
        barmode='group',
        legend_title="Engagement Type",
        xaxis_title="Hashtag Count Group",
        xaxis2_title="Top Hashtags",
        xaxis2=dict(categoryorder="total descending"),  # ✅ Dynamic sorting applied
        legend=dict(itemclick="toggle", itemdoubleclick="toggleothers")  # 🔥 Fully interactive legend
    )

    return fig


In [128]:
# Usage Example:
hashtag_effect_df = analyze_hashtag_count_effect(video_info_df)
top_hashtags_df = get_top_hashtags_by_group(video_info_df)
fig = plot_hashtag_count_with_top_hashtags(hashtag_effect_df, top_hashtags_df)
fig.show()



