In [1]:
!pip install plotly nbformat



In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import ast

In [3]:
file_path = "../../data/processed/merged_data.csv"
df = pd.read_csv(file_path)

In [4]:
df.head()

Unnamed: 0,CategoryType,author.commentSetting,author.downloadSetting,author.duetSetting,author.ftc,author.id,author.isADVirtual,author.isEmbedBanned,author.nickname,author.openFavorite,...,video.videoQuality,video.volumeInfo.Loudness,video.volumeInfo.Peak,video.width,collectTime,video.claInfo.captionsType,hashtags,num_hashtags,video_transcription,url
0,111,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,normal,-7.2,1.0,576,1741176061,,"1phutsaigon,saigon,saigondidau,cafesaigon,yenc...",7,cuối cùng mà mình thấy nhau và rồi tháng mấy đ...,https://www.tiktok.com/@7128234498731803674/vi...
1,105,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,normal,-8.3,1.0,576,1741176061,,"1phutsaigon,saigon,saigondidau,duxuan,tet2025",5,Nghe xuân sang thấy trong lòng mình chứa chan ...,https://www.tiktok.com/@7128234498731803674/vi...
2,111,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,normal,-7.0,1.0,576,1741176062,,"1phutsaigon,saigon,saigondidau,halacoffee,cafe...",6,bồi hồi liên kết sẽ quay về thăm quê em xuân đ...,https://www.tiktok.com/@7128234498731803674/vi...
3,111,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,normal,-10.4,1.0,576,1741176062,,"1phutsaigon,saigon,saigondidau,saigondamdang,s...",9,bánh chưng bánh giò bánh chưng tương lai mấy g...,https://www.tiktok.com/@7128234498731803674/vi...
4,105,0,0,0,False,7128234498731803674,False,False,1 phút Sài Gòn,False,...,normal,-14.8,0.51286,576,1741176063,,"1phutsaigon,saigondidau,langthangsaigon,tungng...",6,chờ mãi đến bây giờ thì anh mới nhận ra em mìn...,https://www.tiktok.com/@7128234498731803674/vi...


In [5]:
df.columns

Index(['CategoryType', 'author.commentSetting', 'author.downloadSetting',
       'author.duetSetting', 'author.ftc', 'author.id', 'author.isADVirtual',
       'author.isEmbedBanned', 'author.nickname', 'author.openFavorite',
       'author.privateAccount', 'author.relation', 'author.secUid',
       'author.secret', 'author.signature', 'author.stitchSetting',
       'author.uniqueId', 'author.verified', 'authorStats.diggCount',
       'authorStats.followerCount', 'authorStats.followingCount',
       'authorStats.friendCount', 'authorStats.heart',
       'authorStats.heartCount', 'authorStats.videoCount', 'collected',
       'createTime', 'desc', 'digged', 'diversificationId', 'duetDisplay',
       'duetEnabled', 'forFriend', 'id', 'isAd', 'itemCommentStatus',
       'item_control.can_repost', 'music.authorName', 'music.duration',
       'music.id', 'music.isCopyrighted', 'music.original', 'music.title',
       'officalItem', 'originalItem', 'privateItem', 'secret', 'shareEnabled',
     

# Video Performance (Views, Likes, Share, Repost)

In [6]:
# Check necessary columns
required_columns = ['video.duration', 'statsV2.playCount', 'statsV2.diggCount', 'statsV2.commentCount', 'statsV2.shareCount', 'CategoryType']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing columns: {missing_columns}")

In [7]:
# Ensure bins are unique and correctly categorized
max_duration = df['video.duration'].max()

bins = [0, 10, 30, 60, 120, 180, 240, 300, max_duration + 1]
labels = ['<10s', '10-30s', '30-60s', '1-2 mins', '2-3 mins', '3-4 mins', '4-5 mins', '>5 mins']

df['video_length_category'] = pd.cut(df['video.duration'], bins=bins, labels=labels, right=False)


In [8]:
# Insight 1.1: Views by Video Length Category
views_by_length = df.groupby('video_length_category')['statsV2.playCount'].mean().reset_index()
fig1 = px.bar(views_by_length, x='video_length_category', y='statsV2.playCount',
              title="Average Views by Video Length Category",
              labels={'statsV2.playCount': 'Average Views', 'video_length_category': 'Video Length Category'},
              color='video_length_category')
fig1.show()

  views_by_length = df.groupby('video_length_category')['statsV2.playCount'].mean().reset_index()


In [9]:
df['CategoryType']

0        111
1        105
2        111
3        111
4        105
        ... 
23094      0
23095    105
23096    111
23097    111
23098    111
Name: CategoryType, Length: 23099, dtype: int64

In [10]:
# Insight 1.2: Optimal Video Length by Industry
if 'CategoryType' in df.columns:
    industry_length = df.groupby(['CategoryType', 'video_length_category'])[['statsV2.playCount', 'statsV2.diggCount', 'statsV2.commentCount', 'statsV2.shareCount']].mean().reset_index()
    fig2 = px.bar(industry_length, x='CategoryType', y='statsV2.playCount', color='video_length_category',
                  title="Average Views by Industry and Video Length Category",
                  labels={'statsV2.playCount': 'Average Views', 'CategoryType': 'Industry', 'video_length_category': 'Video Length Category'},
                  barmode='group')
    fig2.show()





In [11]:
# Insight 1.3: Engagement Rate vs Video Length
for metric in ['statsV2.diggCount', 'statsV2.commentCount', 'statsV2.shareCount']:
    df[f'{metric}_rate'] = df[metric] / df['statsV2.playCount'] * 100

engagement_by_length = df.groupby('video_length_category')[[f'statsV2.diggCount_rate', f'statsV2.commentCount_rate', f'statsV2.shareCount_rate']].mean().reset_index()
fig3 = go.Figure()
for metric in ['statsV2.diggCount_rate', 'statsV2.commentCount_rate', 'statsV2.shareCount_rate']:
    fig3.add_trace(go.Bar(x=engagement_by_length['video_length_category'], y=engagement_by_length[metric], name=metric.replace('statsV2.', '').replace('_rate', '').capitalize()))
fig3.update_layout(title="Engagement Rate by Video Length Category", xaxis_title="Video Length Category", yaxis_title="Engagement Rate", barmode='group')
fig3.show()





# Music

In [12]:
# Insight 2.4: Do effects increase virality?
viral_threshold = df['statsV2.playCount'].quantile(0.90)
df['is_viral'] = df['statsV2.playCount'] > viral_threshold
effects_impact = df.groupby('stitchEnabled')['is_viral'].mean().reset_index()
fig7 = px.bar(effects_impact, x='stitchEnabled', y='is_viral',
              title="Effect Usage and Virality",
              labels={'stitchEnabled': 'Used Effects', 'is_viral': 'Proportion Viral'})
fig7.show()

In [13]:
# Insight 2.3: Does music tempo affect completion rate?
df['music_tempo'] = df['video.volumeInfo.Loudness'].apply(lambda x: 'Fast' if x > df['video.volumeInfo.Loudness'].median() else 'Slow')
completion_by_tempo = df.groupby('music_tempo')[['statsV2.playCount']].mean().reset_index()
fig6 = px.bar(completion_by_tempo, x='music_tempo', y='statsV2.playCount',
              title="Completion Rate by Music Tempo",
              labels={'music_tempo': 'Music Tempo', 'statsV2.playCount': 'Avg Views'})
fig6.show()

In [14]:
df['video.volumeInfo.Loudness']

0        -7.2
1        -8.3
2        -7.0
3       -10.4
4       -14.8
         ... 
23094   -17.3
23095   -17.9
23096   -17.9
23097   -13.8
23098   -13.5
Name: video.volumeInfo.Loudness, Length: 23099, dtype: float64

# Time Posting

In [15]:
df['createTime'] = pd.to_datetime(df['createTime'])  # If it's already in a recognizable datetime format

In [16]:
df['createTime']

0       1970-01-01 00:00:01.739455727
1       1970-01-01 00:00:01.738066138
2       1970-01-01 00:00:01.737296133
3       1970-01-01 00:00:01.736946020
4       1970-01-01 00:00:01.734006160
                     ...             
23094   1970-01-01 00:00:01.701865220
23095   1970-01-01 00:00:01.701517563
23096   1970-01-01 00:00:01.700912317
23097   1970-01-01 00:00:01.700652252
23098   1970-01-01 00:00:01.741177467
Name: createTime, Length: 23099, dtype: datetime64[ns]

In [17]:
# Insight 3.1: Views by Posting Time

df['hour'] = df['createTime'].dt.hour
time_bins = [(0, 12, 'Morning (0-12h)'), (12, 19, 'Afternoon (12-19h)'), (19, 24, 'Evening (19-24h)')]
df['time_category'] = pd.cut(df['hour'], bins=[0, 12, 19, 24], labels=['Morning', 'Afternoon', 'Evening'], right=False)

views_by_time = df.groupby('time_category', observed=False)['statsV2.playCount'].mean().reset_index()
fig4 = px.bar(views_by_time, x='time_category', y='statsV2.playCount',
              title="Average Views by Posting Time",
              labels={'statsV2.playCount': 'Average Views', 'time_category': 'Time of Day'},
              color='time_category')
fig4.show()

In [18]:
# Insight 3.2: Engagement on Weekends vs. Weekdays
df['day_of_week'] = df['createTime'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].apply(lambda x: 'Weekend' if x >= 4 else 'Weekday')

engagement_by_day = df.groupby('is_weekend', observed=False)[['statsV2.diggCount', 'statsV2.commentCount', 'statsV2.shareCount']].mean().reset_index()
fig5 = px.bar(engagement_by_day.melt(id_vars=['is_weekend'], var_name='Metric', value_name='Average Engagement'),
              x='is_weekend', y='Average Engagement', color='Metric',
              title="Engagement Comparison: Weekday vs. Weekend",
              barmode='group')
fig5.show()

# Hashtags

In [19]:
def convert_hashtags(value):
    if isinstance(value, str) and value.startswith("["):  # Ensure it's a string representation of a list
        try:
            return ast.literal_eval(value)  # Safely convert string to list
        except (ValueError, SyntaxError):
            return []  # If conversion fails, return an empty list
    return [] if pd.isna(value) else value  # Preserve actual lists, replace NaN with []

df['hashtags'] = df['hashtags'].apply(convert_hashtags)

In [20]:
df['hashtags']

0        1phutsaigon,saigon,saigondidau,cafesaigon,yenc...
1            1phutsaigon,saigon,saigondidau,duxuan,tet2025
2        1phutsaigon,saigon,saigondidau,halacoffee,cafe...
3        1phutsaigon,saigon,saigondidau,saigondamdang,s...
4        1phutsaigon,saigondidau,langthangsaigon,tungng...
                               ...                        
23094           hotteok,pulmuone,hotteokxucxich,hotteokngo
23095                                   detox,cleanhealthy
23096                                   dememoria,bodymist
23097                                                   []
23098                                  cheesecoffee,diudao
Name: hashtags, Length: 23099, dtype: object

In [21]:
trending_hashtags = {'fyp', 'viral', 'xyzbca', 'trending'}  # Add other trending hashtags if needed

df['has_trending_hashtag'] = df['hashtags'].apply(lambda tags: any(tag in trending_hashtags for tag in tags))
df['has_trending_hashtag']


0        False
1        False
2        False
3        False
4        False
         ...  
23094    False
23095    False
23096    False
23097    False
23098    False
Name: has_trending_hashtag, Length: 23099, dtype: bool

In [22]:
industry_hashtags = {'skincare', 'fashion', 'tech', 'food', 'education'}

df['has_industry_hashtag'] = df['hashtags'].apply(lambda tags: any(tag in industry_hashtags for tag in tags))
df['has_industry_hashtag'] 


0        False
1        False
2        False
3        False
4        False
         ...  
23094    False
23095    False
23096    False
23097    False
23098    False
Name: has_industry_hashtag, Length: 23099, dtype: bool

In [23]:
import plotly.express as px

# Visualization for Trending Hashtags
trending_stats = df.groupby('has_trending_hashtag')['statsV2.playCount'].mean().reset_index()
fig1 = px.bar(trending_stats, x='has_trending_hashtag', y='statsV2.playCount',
              title="Average Views for Videos with vs. without Trending Hashtags",
              labels={'has_trending_hashtag': 'Has Trending Hashtag?', 'statsV2.playCount': 'Average Views'},
              color='has_trending_hashtag')
fig1.show()
