## A. General Setup

In [None]:
%%shell
# Stop at the first failing command so a failed clone or `cd` never lets the
# following git commands run in the wrong directory.
set -e

REPO_DIR=21KHDL-TikTok-Analytics

# Blob-less sparse clone of the `vmphat` branch. Skipped when the checkout
# already exists so the cell can be re-run on a live runtime.
if [ ! -d "$REPO_DIR/.git" ]; then
    git clone --no-checkout --filter=blob:none --sparse --branch vmphat \
        https://github.com/vphuhan/21KHDL-TikTok-Analytics.git "$REPO_DIR"
fi
cd "$REPO_DIR"
# Limit the working tree to data/interim, then materialise those files.
git sparse-checkout set --no-cone data/interim
git checkout

Cloning into '21KHDL-TikTok-Analytics'...
remote: Enumerating objects: 1151, done.[K
remote: Counting objects: 100% (162/162), done.[K
remote: Compressing objects: 100% (91/91), done.[K
remote: Total 1151 (delta 90), reused 99 (delta 69), pack-reused 989 (from 3)[K
Receiving objects: 100% (1151/1151), 238.75 KiB | 1.16 MiB/s, done.
Resolving deltas: 100% (228/228), done.
remote: Enumerating objects: 7, done.[K
remote: Total 7 (delta 0), reused 0 (delta 0), pack-reused 7 (from 2)[K
Receiving objects: 100% (7/7), 26.01 MiB | 6.90 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Updating files: 100% (7/7), done.
Your branch is up to date with 'origin/vmphat'.




In [1]:
#@title Import libraries
import os
from glob import glob

import pandas as pd
import numpy as np
import json
import re

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm
from collections import Counter

In [3]:
#@title Plotly theme
# Name of the Plotly template chosen through the Colab form on this line.
plotly_theme = "seaborn" # @param ['ggplot2', 'seaborn', 'simple_white', 'plotly', 'plotly_white', 'plotly_dark', 'presentation', 'xgridoff', 'ygridoff', 'gridon', 'none']

## B. Data Preprocessing

### user_info_df

In [4]:
#@title Drop NaN threshold
# Percentage selected in the Colab form (stored as a string); the cleaning cell
# below converts it with int(threshold) before dropping sparse columns.
threshold = "80"  #@param ["100", "95", "90", "85", "80"]

In [5]:
# File paths
input_csv = "/content/21KHDL-TikTok-Analytics/data/interim/user_info.csv"
output_csv = "/content/21KHDL-TikTok-Analytics/data/interim/cleaned_user_info.csv"
user_info_df = pd.read_csv(input_csv)

display(user_info_df.head(5))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in display() (which would print "None").
user_info_df.info()

# Drop sparse columns.
# `dropna(thresh=...)` expects the minimum NUMBER of non-NaN values a column
# must contain, not a fraction. Passing 0.8 only ever removed columns that were
# entirely NaN. A column is kept when at most `threshold` percent of its values
# are NaN, i.e. when it has at least ceil(n_rows * (100 - threshold) / 100)
# non-NaN values. Integer arithmetic avoids floating-point rounding surprises.
max_nan_percent = int(threshold)
threshold_percentage = max_nan_percent / 100
min_non_null = -(-len(user_info_df) * (100 - max_nan_percent) // 100)  # integer ceil
cleaned_user_info_df = user_info_df.dropna(axis=1, thresh=min_non_null)
print(f"Removed {user_info_df.shape[1] - cleaned_user_info_df.shape[1]} columns that have more than {threshold}% NaN values")

# Check and remove fully duplicated rows
duplicates = cleaned_user_info_df[cleaned_user_info_df.duplicated()]
cleaned_user_info_df = cleaned_user_info_df.drop_duplicates()
print(f"Removed: {duplicates.shape[0]} duplicated sample.")

# Save the cleaned dataframe
cleaned_user_info_df.to_csv(output_csv, index=False)


Unnamed: 0,stats.diggCount,stats.followerCount,stats.followingCount,stats.friendCount,stats.heart,stats.heartCount,stats.videoCount,user.canExpPlaylist,user.commentSetting,user.commerceUserInfo.commerceUser,...,user.bioLink.link,user.bioLink.risk,user.commerceUserInfo.category,user.commerceUserInfo.categoryButton,user.commerceUserInfo.downLoadLink.android,user.commerceUserInfo.downLoadLink.ios,user.profileTab.showQuestionTab,user.profileTab.showMusicTab,user.uniqueIdModifyTime,user.roomId
0,0,198700,47,28,4100000,4100000,847,True,0,False,...,,,,,,,,,,
1,0,637500,185,61,13300000,13300000,3756,True,0,True,...,Anchoivungtau.vn,3.0,Travel & Tourism,False,,,True,,,
2,0,183500,0,0,4700000,4700000,872,True,0,False,...,,,,,,,,,,
3,0,404000,75,57,14200000,14200000,566,True,0,True,...,,,Food & Beverage,False,,,,,,
4,0,1300000,68,9,17100000,17100000,218,True,0,True,...,anhdaubep.vn,3.0,Food & Beverage,False,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 41 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   stats.diggCount                             109 non-null    int64  
 1   stats.followerCount                         109 non-null    int64  
 2   stats.followingCount                        109 non-null    int64  
 3   stats.friendCount                           109 non-null    int64  
 4   stats.heart                                 109 non-null    int64  
 5   stats.heartCount                            109 non-null    int64  
 6   stats.videoCount                            109 non-null    int64  
 7   user.canExpPlaylist                         109 non-null    bool   
 8   user.commentSetting                         109 non-null    int64  
 9   user.commerceUserInfo.commerceUser          109 non-null    bool   
 10  user.downloadS

None

Removed 2 columns that have more than 80% NaN values
Removed: 0 duplicated sample.


### video_info_df

In [6]:
#@title Drop NaN threshold
# Percentage selected in the Colab form (stored as a string). This re-assigns
# the same `threshold` variable that the user_info section set earlier.
threshold = "80"  #@param ["100", "95", "90", "85", "80"]

In [7]:
# Import necessary libraries
import pandas as pd
import re  # Ensure 're' is imported before usage

# Function to extract hashtags from text
def extract_hashtags(text):
    """Collect every ``#word`` token found in ``text``.

    Args:
        text: A video description, or a missing value (NaN/None).

    Returns:
        The hashtags in order of appearance joined by single spaces, or an
        empty string when ``text`` is missing or contains no hashtag.
    """
    # Missing descriptions contribute no hashtags.
    if pd.isna(text):
        return ""
    # A hashtag is '#' followed by one or more word characters.
    return " ".join(found.group(0) for found in re.finditer(r"#\w+", text))

# Define the required columns
selected_columns = [
    "author.uniqueId", "author.verified", "author.privateAccount",
    "authorStats.followerCount", "authorStats.heartCount", "authorStats.videoCount",
    "stats.playCount", "stats.diggCount", "stats.commentCount", "stats.shareCount",
    "createTime", "desc",
    "video.duration", "video.definition", "video.videoQuality",
    "video.width", "video.height", "video.bitrate", "video.codecType", "video.format",
    "video.claInfo.enableAutoCaption", "video.claInfo.hasOriginalAudio",
    "music.title", "music.authorName", "music.isCopyrighted"
]

# File paths
input_csv = "/content/21KHDL-TikTok-Analytics/data/interim/video_info.csv"
output_csv = "/content/21KHDL-TikTok-Analytics/data/interim/cleaned_video_info.csv"
video_info_df = pd.read_csv(input_csv, usecols=selected_columns)

display(video_info_df.head(5))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in display() (which would print "None").
video_info_df.info()

# Drop sparse columns.
# `dropna(thresh=...)` expects the minimum NUMBER of non-NaN values a column
# must contain, not a fraction. Passing 0.8 only ever removed columns that were
# entirely NaN. A column is kept when at most `threshold` percent of its values
# are NaN, i.e. when it has at least ceil(n_rows * (100 - threshold) / 100)
# non-NaN values. Integer arithmetic avoids floating-point rounding surprises.
max_nan_percent = int(threshold)
threshold_percentage = max_nan_percent / 100
min_non_null = -(-len(video_info_df) * (100 - max_nan_percent) // 100)  # integer ceil
cleaned_video_info_df = video_info_df.dropna(axis=1, thresh=min_non_null)
print(f"Removed {video_info_df.shape[1] - cleaned_video_info_df.shape[1]} columns that have more than {threshold}% NaN values")

# Check and remove fully duplicated rows
duplicates = cleaned_video_info_df[cleaned_video_info_df.duplicated()]
cleaned_video_info_df = cleaned_video_info_df.drop_duplicates()
print(f"Removed: {duplicates.shape[0]} duplicated sample.")

# Derive a space-separated 'hashtags' column from the free-text description
cleaned_video_info_df["hashtags"] = cleaned_video_info_df["desc"].apply(extract_hashtags)

# Save the cleaned dataframe
cleaned_video_info_df.to_csv(output_csv, index=False)


Unnamed: 0,author.privateAccount,author.uniqueId,author.verified,authorStats.followerCount,authorStats.heartCount,authorStats.videoCount,createTime,desc,music.authorName,music.isCopyrighted,...,video.bitrate,video.claInfo.enableAutoCaption,video.claInfo.hasOriginalAudio,video.codecType,video.definition,video.duration,video.format,video.height,video.videoQuality,video.width
0,False,1phutsaigon,False,198700,4100000,847,1741097297,Kem chiên ký tuổi thơ mãi ngon #1phutsaigon #s...,1 phút Sài Gòn,True,...,1521668.0,True,True,h264,540p,22,mp4,1024,normal,576
1,False,1phutsaigon,False,198700,4100000,847,1741010855,Cư dân quận 2 nhất định phải ghé quán cafe đẹp...,1 phút Sài Gòn,True,...,1455659.0,True,True,h264,540p,28,mp4,1024,normal,576
2,False,1phutsaigon,False,198700,4100000,847,1740804900,Quán cafe hot hit nhất nhì Sài Gòn nay có thêm...,1 phút Sài Gòn,False,...,3243905.0,True,True,h264,540p,23,mp4,1024,normal,576
3,False,1phutsaigon,False,198700,4100000,847,1740748537,"Triển lãm Inter-be, Inter-are (Ta có trong nha...",1 phút Sài Gòn,False,...,905530.0,True,True,h264,540p,30,mp4,1024,normal,576
4,False,1phutsaigon,False,198700,4100000,847,1740665813,Bao lâu rồi ta chưa đón hoàng hôn cùng nhau? #...,1 phút Sài Gòn,False,...,2853481.0,True,True,h264,540p,23,mp4,1024,normal,576


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32603 entries, 0 to 32602
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   author.privateAccount            32603 non-null  bool   
 1   author.uniqueId                  32603 non-null  object 
 2   author.verified                  32603 non-null  bool   
 3   authorStats.followerCount        32603 non-null  int64  
 4   authorStats.heartCount           32603 non-null  int64  
 5   authorStats.videoCount           32603 non-null  int64  
 6   createTime                       32603 non-null  int64  
 7   desc                             32522 non-null  object 
 8   music.authorName                 32566 non-null  object 
 9   music.isCopyrighted              32603 non-null  bool   
 10  music.title                      32571 non-null  object 
 11  stats.commentCount               32603 non-null  int64  
 12  stats.diggCount   

None

Removed 0 columns that have more than 80% NaN values
Removed: 0 duplicated sample.


## C. Dashboard Deployment

In [None]:
#@title Install
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m120.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_6

In [9]:
ngrok_config_add_authtoken = "ngrok config add-authtoken 2uGReNm0gnN6Rhmn7HoHK5T9Dnr_7QTMEqwR72SZPtatMeLCr" # @param {"type":"string","placeholder":"Ex: ngrok config add-authtoken 2uGReNm0gnN6Rhmn7Ho"}
!{ngrok_config_add_authtoken}

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [6]:
%%writefile data_loader.py
import pandas as pd

def load_data(data_dir="/content/21KHDL-TikTok-Analytics/data/interim"):
    """Load the cleaned user and video tables written by the preprocessing step.

    Args:
        data_dir: Directory containing ``cleaned_user_info.csv`` and
            ``cleaned_video_info.csv``. Defaults to the location used by the
            Colab checkout, so existing ``load_data()`` calls keep working.

    Returns:
        A ``(cleaned_user_info_df, cleaned_video_info_df)`` tuple. The video
        frame's ``createTime`` column is converted from epoch seconds to
        pandas datetimes.
    """
    import os  # local import: data_loader.py only imports pandas at module level

    cleaned_user_csv_file = os.path.join(data_dir, "cleaned_user_info.csv")
    cleaned_video_csv_file = os.path.join(data_dir, "cleaned_video_info.csv")
    cleaned_user_info_df = pd.read_csv(cleaned_user_csv_file)
    cleaned_video_info_df = pd.read_csv(cleaned_video_csv_file)
    # createTime is stored as Unix epoch seconds in the CSV.
    cleaned_video_info_df['createTime'] = pd.to_datetime(cleaned_video_info_df['createTime'], unit='s')
    return cleaned_user_info_df, cleaned_video_info_df

Writing data_loader.py


In [7]:
%%writefile styles.py
import streamlit as st

def apply_styles():
    """Inject the dashboard-wide CSS into the Streamlit page.

    The stylesheet defines the ``.main-title`` and ``.subheader`` classes used
    by the page markup and restyles ``.stButton>button`` and ``.stExpander``.
    """
    global_css = """
        <style>
        .main-title {
            font-size: 28px;
            font-weight: bold;
            color: #FFFFFF;
            text-align: center;
            padding-bottom: 20px;
        }
        .subheader {
            font-size: 20px;
            font-weight: bold;
            color: #E0E0E0;
            padding-top: 10px;
        }
        .stButton>button {
            background-color: #4CAF50;
            color: white;
            border-radius: 5px;
        }
        .stExpander {
            border-radius: 10px;
            padding: 10px;
        }

        </style>
    """
    # unsafe_allow_html is required for Streamlit to emit the raw <style> tag.
    st.markdown(global_css, unsafe_allow_html=True)

def personal_styles():
    """Inject the CSS used by the personal-analysis page (headings and ``.stMetric`` cards)."""
    page_css = """
        <style>
        h1, h2, h3 { color: #1f2a44; font-family: 'Helvetica', sans-serif; }
        .stMetric { border-radius: 8px; padding: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        </style>
    """
    # unsafe_allow_html is required for Streamlit to emit the raw <style> tag.
    st.markdown(page_css, unsafe_allow_html=True)

def hashtag_song_styles():
    """Inject the CSS used by the hashtag & song page (``h1``/``h3`` colour and font)."""
    page_css = """
        <style>
        h1, h3 { color: #2c3e50; font-family: 'Arial', sans-serif; }
        </style>
    """
    # unsafe_allow_html is required for Streamlit to emit the raw <style> tag.
    st.markdown(page_css, unsafe_allow_html=True)

Writing styles.py


In [12]:
!mkdir pages

In [8]:
%%writefile pages/correlation_analysis.py
import streamlit as st
import plotly.express as px

# Columns analysed on this page, in the order Followers / Likes / Videos.
_METRIC_COLUMNS = ['stats.followerCount', 'stats.heart', 'stats.videoCount']


def correlation_analysis(cleaned_user_info_df):
    """Render the user-level analysis page as three Streamlit tabs.

    Args:
        cleaned_user_info_df: User table. The page reads the columns
            'stats.followerCount', 'stats.heart', 'stats.videoCount' and
            'user.uniqueId'.

    The frame passed in is never modified: derived columns
    ('engagement_rate', 'engagement_ratio') are added to a private copy.
    """
    # Work on a copy so the derived columns do not leak into the caller's frame.
    user_df = cleaned_user_info_df.copy()
    tab1, tab2, tab3 = st.tabs(["📊 Correlation Analysis", "🏆 Top Users", "🔥 Engagement Insights"])

    with tab1:
        _render_correlation_tab(user_df)
    with tab2:
        _render_top_users_tab(user_df)
    with tab3:
        _render_engagement_tab(user_df)


def _render_correlation_tab(user_df):
    """Tab 1: distribution charts, quick stats and a correlation heatmap."""
    st.header("📊 Correlation Analysis")
    st.write("Analyze relationships between followers, likes, and videos.")
    colors_rgba = ["rgba(99, 110, 250, 0.9)", "rgba(239, 85, 59, 0.8)", "rgba(0, 204, 150, 0.7)"]
    st.markdown('<div class="main-title">📊 TikTok User Analysis Dashboard</div>', unsafe_allow_html=True)
    st.markdown("### ⚙️ Customize Your Analysis")
    col_input1, col_input2, col_input3 = st.columns([2, 1, 1])
    with col_input1:
        chart_option = st.selectbox("Select a Visualization", ["Distribution of Followers", "Distribution of Likes", "Distribution of Video Count", "All in One", "Scatter Matrix"])
    with col_input2:
        num_bins = st.slider("Number of Bins", 10, 100, 50, 5)
    with col_input3:
        log_scale = st.checkbox("Logarithmic Scale", value=True)

    # The three single-metric histograms differ only in heading, column and colour.
    single_histograms = {
        "Distribution of Followers": ("📈 Followers Distribution", 'stats.followerCount', colors_rgba[0]),
        "Distribution of Likes": ("❤️ Likes Distribution", 'stats.heart', colors_rgba[1]),
        "Distribution of Video Count": ("🎬 Video Count Distribution", 'stats.videoCount', colors_rgba[2]),
    }

    col1, col2 = st.columns([3, 2])
    with col1:
        with st.spinner(f"Rendering {chart_option.lower()}..."):
            if chart_option in single_histograms:
                heading, column, color = single_histograms[chart_option]
                st.markdown(f'<div class="subheader">{heading}</div>', unsafe_allow_html=True)
                fig = px.histogram(user_df, x=column, nbins=num_bins, log_y=log_scale, color_discrete_sequence=[color], height=500, marginal="box")
                fig.update_layout(template="plotly_dark", bargap=0.1, showlegend=False)
                st.plotly_chart(fig, use_container_width=True)
            elif chart_option == "All in One":
                st.markdown('<div class="subheader">📊 Combined Distribution</div>', unsafe_allow_html=True)
                # Long format: one row per (user, metric) so the metrics can be overlaid.
                melted_df = user_df.melt(value_vars=_METRIC_COLUMNS, var_name="Metric", value_name="Count")
                metric_names = {"stats.followerCount": "Followers", "stats.heart": "Likes", "stats.videoCount": "Videos"}
                melted_df["Metric"] = melted_df["Metric"].map(metric_names)
                fig = px.histogram(melted_df, x="Count", color="Metric", log_y=log_scale, nbins=num_bins, color_discrete_map={"Followers": colors_rgba[0], "Likes": colors_rgba[1], "Videos": colors_rgba[2]}, height=500, opacity=0.7, barmode="overlay")
                fig.update_layout(template="plotly_dark", bargap=0.1, legend_title_text="Metrics")
                st.plotly_chart(fig, use_container_width=True)
            elif chart_option == "Scatter Matrix":
                st.markdown('<div class="subheader">🔍 Scatter Matrix</div>', unsafe_allow_html=True)
                fig = px.scatter_matrix(user_df, dimensions=_METRIC_COLUMNS, color_discrete_sequence=colors_rgba, height=600, opacity=0.6)
                fig.update_traces(diagonal_visible=False)
                fig.update_layout(template="plotly_dark")
                st.plotly_chart(fig, use_container_width=True)
        data_to_download = user_df[_METRIC_COLUMNS]
        st.download_button(label="📥 Download Chart Data", data=data_to_download.to_csv(index=False), file_name=f"{chart_option.lower().replace(' ', '_')}.csv", mime="text/csv")
    with col2:
        with st.expander("📋 Quick Stats", expanded=True):
            st.markdown("#### Summary")
            st.write(f"**Total Users:** {len(user_df):,}")
            st.write(f"**Avg. Followers:** {user_df['stats.followerCount'].mean():,.0f}")
            st.write(f"**Avg. Likes:** {user_df['stats.heart'].mean():,.0f}")
            st.write(f"**Avg. Videos:** {user_df['stats.videoCount'].mean():,.0f}")
        st.markdown('<div class="subheader">📊 Correlation Heatmap</div>', unsafe_allow_html=True)
        correlation_matrix = user_df[_METRIC_COLUMNS].corr()
        fig = px.imshow(correlation_matrix, text_auto=".2f", aspect="equal", color_continuous_scale="Blues", labels=dict(x="Metrics", y="Metrics", color="Correlation"), height=400)
        fig.update_layout(xaxis=dict(tickvals=[0, 1, 2], ticktext=["Followers", "Likes", "Videos"]), yaxis=dict(tickvals=[0, 1, 2], ticktext=["Followers", "Likes", "Videos"]))
        st.plotly_chart(fig, use_container_width=True)


def _render_top_users_tab(user_df):
    """Tab 2: bar/pie charts and a CSV download for the N users ranked by a chosen metric."""
    st.header("🏆 Top Users")
    st.write("View the most influential TikTok users.")
    st.markdown('<div class="main-title" style="font-size: 28px; font-weight: bold; color: #1E90FF;">🏆 Top Users Analysis</div>', unsafe_allow_html=True)
    st.markdown('<p style="color: #666;">Explore the top-performing users based on engagement metrics like likes and video counts.</p>', unsafe_allow_html=True)
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        top_n = st.slider("Select Top N Users", 1, 50, 10, 1)
    with col2:
        chart_option = st.selectbox("Select Chart Type", ["Most Likes", "Most Videos", "Most Followers", "Engagement Rate"])
    with col3:
        sort_order = st.radio("Sort Order", ["Descending", "Ascending"], index=0)
    tab_bar, tab_pie = st.tabs(["Bar Chart", "Pie Chart"])
    with st.spinner(f"Generating visuals for Top {top_n} Users..."):
        if chart_option == "Most Likes":
            metric, title, color_scale, y_label = 'stats.heart', f"Top {top_n} Users with Most Likes", 'reds', "Total Likes (Hearts)"
        elif chart_option == "Most Videos":
            metric, title, color_scale, y_label = 'stats.videoCount', f"Top {top_n} Users with Most Videos", 'greens', "Total Videos"
        elif chart_option == "Most Followers":
            metric, title, color_scale, y_label = 'stats.followerCount', f"Top {top_n} Users with Most Followers", 'blues', "Total Followers"
        else:
            # Likes per video; a video count of 0 is treated as 1 to avoid division by zero.
            user_df['engagement_rate'] = user_df['stats.heart'] / user_df['stats.videoCount'].replace(0, 1)
            metric, title, color_scale, y_label = 'engagement_rate', f"Top {top_n} Users by Engagement Rate (Likes/Video)", 'purples', "Engagement Rate"
        # "Ascending" selects the N smallest values rather than re-sorting the top N.
        top_data = (user_df.nlargest(top_n, metric) if sort_order == "Descending" else user_df.nsmallest(top_n, metric))[['user.uniqueId', metric]]
        with tab_bar:
            st.markdown(f'<div class="subheader" style="color: #333; font-weight: bold;">{title}</div>', unsafe_allow_html=True)
            fig = px.bar(top_data, x='user.uniqueId', y=metric, color=metric, color_continuous_scale=color_scale, text=top_data[metric].apply(lambda x: f'{x:,.1f}' if metric == 'engagement_rate' else f'{x:,}'), height=500)
            fig.update_traces(textposition='outside', textfont_size=12)
            fig.update_layout(xaxis_title="User ID", yaxis_title=y_label, template="plotly_white", xaxis_tickangle=-45, showlegend=False, margin=dict(t=50, b=50), plot_bgcolor="rgba(0,0,0,0)", paper_bgcolor="rgba(0,0,0,0)")
            st.plotly_chart(fig, use_container_width=True)
        with tab_pie:
            st.markdown(f'<div class="subheader" style="color: #333; font-weight: bold;">Distribution of {chart_option}</div>', unsafe_allow_html=True)
            fig = px.pie(top_data, names='user.uniqueId', values=metric, color_discrete_sequence=px.colors.sequential.RdBu, height=500)
            # Pull out the first slice; size the list from the rows actually
            # selected, which can be fewer than top_n on a small table.
            fig.update_traces(textinfo='percent+label', pull=[0.1] + [0] * (len(top_data) - 1))
            fig.update_layout(template="plotly_white", margin=dict(t=50, b=50), plot_bgcolor="rgba(0,0,0,0)", paper_bgcolor="rgba(0,0,0,0)")
            st.plotly_chart(fig, use_container_width=True)
    with st.expander("View Detailed Data", expanded=False):
        st.dataframe(top_data.style.format({metric: "{:,.2f}" if metric == 'engagement_rate' else "{:,}"}), use_container_width=True)
    csv = top_data.to_csv(index=False)
    st.download_button(label="📥 Download Data as CSV", data=csv, file_name=f"top_{top_n}_{chart_option.lower().replace(' ', '_')}_{sort_order.lower()}.csv", mime="text/csv")


def _render_engagement_tab(user_df):
    """Tab 3: scatter of likes vs. followers for one follower/engagement segment."""
    st.header("🔥 Engagement Insights")
    st.write("Understand how engagement levels vary across different user segments.")
    st.markdown('<h2 style="text-align:center;">🏆 Engagement Analysis</h2>', unsafe_allow_html=True)
    st.write("Analyze the relationship between **followers** and **engagement (likes/hearts)**.")
    col1, col2, col3 = st.columns([1, 1, 2])
    with col1:
        follower_level = st.selectbox("📌 Select Follower Level:", ["Low", "Average", "High"])
    with col2:
        engagement_level = st.selectbox("🔥 Select Engagement Level:", ["Low", "Average", "High"])
    with col3:
        plotly_theme = st.selectbox("🎨 Choose Theme:", ["plotly_dark", "seaborn", "ggplot2", "plotly_white"])
    # Likes per follower; a follower count of 0 is treated as 1 to avoid division by zero.
    user_df['engagement_ratio'] = user_df['stats.heart'] / user_df['stats.followerCount'].replace(0, 1)
    # Segment boundaries: the 33rd and 66th percentiles of the whole table.
    percentiles = [0, 0.33, 0.66, 1]
    low_followers, high_followers = user_df['stats.followerCount'].quantile(percentiles[1]), user_df['stats.followerCount'].quantile(percentiles[2])
    low_engagement, high_engagement = user_df['engagement_ratio'].quantile(percentiles[1]), user_df['engagement_ratio'].quantile(percentiles[2])
    if follower_level == "Low":
        filtered_df = user_df[user_df['stats.followerCount'] <= low_followers]
    elif follower_level == "Average":
        filtered_df = user_df[(user_df['stats.followerCount'] > low_followers) & (user_df['stats.followerCount'] <= high_followers)]
    else:
        filtered_df = user_df[user_df['stats.followerCount'] > high_followers]
    if engagement_level == "Low":
        filtered_df = filtered_df[filtered_df['engagement_ratio'] <= low_engagement]
    elif engagement_level == "Average":
        filtered_df = filtered_df[(filtered_df['engagement_ratio'] > low_engagement) & (filtered_df['engagement_ratio'] <= high_engagement)]
    else:
        filtered_df = filtered_df[filtered_df['engagement_ratio'] > high_engagement]
    with st.spinner("📊 Rendering engagement analysis..."):
        st.markdown(f'<h3>📊 Engagement Insights: {follower_level} Followers & {engagement_level} Engagement</h3>', unsafe_allow_html=True)
        if filtered_df.empty:
            # An empty segment would otherwise show "nan" averages and a blank chart.
            st.metric("📌 Number of Users", "0")
            st.info("No users match the selected follower and engagement levels.")
        else:
            fig = px.scatter(filtered_df, x='stats.followerCount', y='stats.heart', size='stats.followerCount', color='engagement_ratio', color_continuous_scale='viridis', hover_data=['user.uniqueId'], height=600, opacity=0.75)
            fig.update_layout(xaxis_title="👥 Follower Count", yaxis_title="❤️ Total Likes", template=plotly_theme, xaxis_type="log", yaxis_type="log", showlegend=True, coloraxis_colorbar_title="Engagement Ratio 🔥")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("📌 Number of Users", f"{len(filtered_df):,}")
            with col2:
                st.metric("❤️ Avg. Likes", f"{filtered_df['stats.heart'].mean():,.0f}")
            with col3:
                st.metric("👥 Avg. Followers", f"{filtered_df['stats.followerCount'].mean():,.0f}")
            st.plotly_chart(fig, use_container_width=True)
    st.markdown("""
    **🔍 Key Takeaways:**
    - Higher engagement doesn't always come from high followers!
    - Some small creators can outperform big ones in engagement.
    - Use trending hashtags and sounds to boost visibility!
    """)

Writing pages/correlation_analysis.py


In [9]:
%%writefile pages/personal_analysis.py
import streamlit as st
import plotly.express as px
import pandas as pd
from styles import personal_styles

def personal_analysis(cleaned_video_info_df):
    """Render the per-creator page: profile overview, posting trend, music and hashtag usage.

    Args:
        cleaned_video_info_df: Video table with a datetime 'createTime' column
            plus 'author.uniqueId', 'author.verified', the 'authorStats.*'
            counters, 'music.authorName' and 'hashtags'.
    """
    personal_styles()
    tiktoker_options = cleaned_video_info_df['author.uniqueId'].unique()
    min_date = cleaned_video_info_df['createTime'].min().date()
    max_date = cleaned_video_info_df['createTime'].max().date()

    with st.sidebar:
        st.title("📊 TikTok Analytics")
        st.markdown("Analyze TikTok trends")
        selected_tiktoker = st.selectbox("👤 Select TikToker", tiktoker_options)
        date_range = st.slider("📅 Date Range", min_value=min_date, max_value=max_date, value=(min_date, max_date), format="MM/DD/YYYY")
        first_day, last_day = date_range
        # NOTE(review): Reset only widens the range for the current rerun; the
        # slider widget itself keeps its previous selection.
        if st.button("🔄 Reset"):
            first_day, last_day = min_date, max_date
        start_date = pd.to_datetime(first_day)
        # The slider yields calendar dates. Converting the last day to a
        # timestamp gives midnight, which used to exclude every video posted
        # later that day. Use an exclusive bound at the start of the next day.
        end_date_exclusive = pd.to_datetime(last_day) + pd.Timedelta(days=1)

    st.header(f"@{selected_tiktoker}'s Analytics")
    tiktoker_data = cleaned_video_info_df[cleaned_video_info_df['author.uniqueId'] == selected_tiktoker]

    if not tiktoker_data.empty:
        # Author-level fields are repeated on every video row; read them from the first one.
        user_info = tiktoker_data.iloc[0]
        with st.container():
            st.subheader("Profile Overview")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Username", user_info['author.uniqueId'])
                st.metric("Followers", f"{user_info['authorStats.followerCount']:,}")
            with col2:
                # NOTE(review): this column is not among the video columns selected
                # in the preprocessing cell, so this likely always shows 'No info'.
                st.metric("Commerce", user_info.get('user.commerceUserInfo.category', 'No info'))
                st.metric("Total Likes", f"{user_info['authorStats.heartCount']:,}")
            with col3:
                st.metric("Verified", "Yes ✅" if user_info['author.verified'] else "No ❌")
                st.metric("Total Videos", f"{user_info['authorStats.videoCount']:,}")

        with st.spinner("Loading data..."):
            in_range = (tiktoker_data['createTime'] >= start_date) & (tiktoker_data['createTime'] < end_date_exclusive)
            filtered_data = tiktoker_data[in_range]

        if not filtered_data.empty:
            with st.expander("📈 Video Trends", expanded=True):
                # Number of videos posted per calendar day.
                video_counts = filtered_data.groupby(filtered_data['createTime'].dt.date).size().reset_index(name='Video Count')
                fig = px.area(video_counts, x='createTime', y='Video Count', title="Video Creation Over Time", template="plotly_white")
                fig.update_traces(line=dict(color="#00b4d8", width=2), fill='tozeroy')
                fig.add_scatter(x=video_counts['createTime'], y=video_counts['Video Count'], mode='markers', marker=dict(size=8, color="#00b4d8"))
                # Annotate the busiest day.
                max_day = video_counts.loc[video_counts['Video Count'].idxmax()]
                fig.add_annotation(x=max_day['createTime'], y=max_day['Video Count'], text=f"Peak: {max_day['Video Count']}", showarrow=True, arrowhead=1)
                fig.update_layout(xaxis_title="Date", yaxis_title="Videos Posted", showlegend=False)
                st.plotly_chart(fig, use_container_width=True)

            with st.expander("🎵 Music Usage"):
                music_counts = filtered_data['music.authorName'].value_counts().head(10).reset_index()
                music_counts.columns = ['Music Author', 'Count']
                fig = px.bar(music_counts, x='Count', y='Music Author', orientation='h', title="Top 10 Music Choices", color='Count', color_continuous_scale='magma')
                fig.update_layout(xaxis_title="Times Used", yaxis_title="", showlegend=False)
                st.plotly_chart(fig, use_container_width=True)

            with st.expander("🏷️ Hashtag Usage"):
                # 'hashtags' holds space-separated tags; split and flatten to one tag per row.
                all_hashtags = filtered_data['hashtags'].dropna().str.split().explode()
                if not all_hashtags.empty:
                    hashtag_counts = all_hashtags.value_counts().head(10).reset_index()
                    hashtag_counts.columns = ['Hashtag', 'Count']
                    fig = px.treemap(hashtag_counts, path=['Hashtag'], values='Count', title="Top 10 Hashtags", color='Count', color_continuous_scale='viridis')
                    fig.update_layout(margin=dict(t=50, l=0, r=0, b=0))
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    st.markdown('<p style="color:#3498db;">ℹ️ No hashtags available.</p>', unsafe_allow_html=True)
        else:
            st.markdown(f'<p style="color:#e67e22;">⚠️ No video data for {selected_tiktoker} in this range.</p>', unsafe_allow_html=True)
    else:
        st.markdown(f'<p style="color:#c0392b;">❌ No data for {selected_tiktoker}.</p>', unsafe_allow_html=True)

Writing pages/personal_analysis.py


In [10]:
%%writefile pages/hashtag_song_analysis.py
import streamlit as st
import plotly.express as px
from styles import hashtag_song_styles
import pandas as pd

def hashtag_song_analysis(cleaned_video_info_df):
    """Render the page ranking the most used hashtags and music authors in a date range.

    Args:
        cleaned_video_info_df: Video table with a datetime 'createTime' column,
            a space-separated 'hashtags' column and 'music.authorName'.
    """
    hashtag_song_styles()
    min_date = cleaned_video_info_df['createTime'].min().date()
    max_date = cleaned_video_info_df['createTime'].max().date()

    with st.sidebar:
        st.title("🔍 Filters")
        date_range = st.slider("📅 Date Range", min_value=min_date, max_value=max_date, value=(min_date, max_date), format="YYYY-MM-DD")
        start_date = pd.to_datetime(date_range[0])
        # The slider yields calendar dates. Converting the last day to a
        # timestamp gives midnight, which used to exclude every video posted
        # later that day. Use an exclusive bound at the start of the next day.
        end_date_exclusive = pd.to_datetime(date_range[1]) + pd.Timedelta(days=1)
        top_k = st.number_input("🔢 Top K", min_value=1, max_value=50, value=10, step=1)

    st.title("🔍 Hashtag & Song Analysis")
    with st.spinner("Analyzing data..."):
        in_range = (cleaned_video_info_df['createTime'] >= start_date) & (cleaned_video_info_df['createTime'] < end_date_exclusive)
        filtered_data = cleaned_video_info_df[in_range]

    if not filtered_data.empty:
        with st.expander("🔥 Most Used Hashtags", expanded=True):
            # 'hashtags' holds space-separated tags; split and flatten to one tag per row.
            all_hashtags = filtered_data['hashtags'].dropna().str.split().explode()
            if not all_hashtags.empty:
                hashtag_counts = all_hashtags.value_counts().reset_index()
                hashtag_counts.columns = ['Hashtag', 'Count']
                top_hashtags = hashtag_counts.head(top_k)
                fig = px.bar(top_hashtags, x='Count', y='Hashtag', orientation='h', title="📌 Most Used Hashtags", color='Count', color_continuous_scale='viridis')
                fig.update_layout(xaxis_title="Usage Count", yaxis_title="Hashtag", template="plotly_white")
                st.plotly_chart(fig, use_container_width=True)
                if st.checkbox("Show Hashtag Table"):
                    st.dataframe(top_hashtags)
            else:
                st.markdown('<p style="color:#e74c3c;">⚠️ No hashtags found.</p>', unsafe_allow_html=True)

        with st.expander("🎵 Most Used Songs", expanded=True):
            # Ranking is by 'music.authorName' (the music's author), not by track title.
            music_counts = filtered_data['music.authorName'].value_counts().reset_index()
            music_counts.columns = ['Music Author', 'Count']
            top_music = music_counts.head(top_k)
            if not top_music.empty:
                fig = px.bar(top_music, x='Count', y='Music Author', orientation='h', title="🎶 Most Used Songs", color='Count', color_continuous_scale='viridis')
                fig.update_layout(xaxis_title="Usage Count", yaxis_title="Music Author", template="plotly_white")
                st.plotly_chart(fig, use_container_width=True)
                if st.checkbox("Show Song Table"):
                    st.dataframe(top_music)
            else:
                st.markdown('<p style="color:#e74c3c;">⚠️ No songs found.</p>', unsafe_allow_html=True)
    else:
        st.markdown('<p style="color:#e74c3c;">⚠️ No data available for this range.</p>', unsafe_allow_html=True)

Writing pages/hashtag_song_analysis.py


In [11]:
%%writefile footer.py
import streamlit as st
from datetime import datetime
import pytz

def display_footer():
    """Render the page footer with a timestamp in the Asia/Ho_Chi_Minh zone.

    The time is read when the function runs, formatted as
    ``YYYY-MM-DD HH:MM:SS`` and written as raw HTML through ``st.markdown``.
    """
    now_in_vietnam = datetime.now(pytz.timezone("Asia/Ho_Chi_Minh"))
    stamp = now_in_vietnam.strftime("%Y-%m-%d %H:%M:%S")
    # Raw HTML is required for the inline positioning styles.
    footer_html = f'''
        <div style="position: absolute; bottom: 10px; right: 20px; margin: -20px; color: #999; font-size: 18px;">
            🕒 Last updated: {stamp}
        </div>
    '''
    st.markdown(footer_html, unsafe_allow_html=True)

Writing footer.py


In [12]:
%%writefile app.py
import logging

import streamlit as st
from styles import apply_styles
from data_loader import load_data
from pages.correlation_analysis import correlation_analysis
from pages.personal_analysis import personal_analysis
from pages.hashtag_song_analysis import hashtag_song_analysis
from footer import display_footer

# Module-level logger. The script logs once routing is done; without this
# definition that call raised NameError on every run, which aborted the
# script before the footer could be rendered.
logger = logging.getLogger(__name__)

# Apply global styles
apply_styles()

# Load data: one frame of per-user rows and one of per-video rows
cleaned_user_info_df, cleaned_video_info_df = load_data()

# Sidebar navigation
st.sidebar.title("📊 TikTok Analysis Dashboard")
page = st.sidebar.radio("Select Page", ["Correlation Analysis", "Personal Analysis", "Hashtag & Song Analysis"])

# Page routing: each page function draws itself from the frame it needs
if page == "Correlation Analysis":
    correlation_analysis(cleaned_user_info_df)
elif page == "Personal Analysis":
    personal_analysis(cleaned_video_info_df)
elif page == "Hashtag & Song Analysis":
    hashtag_song_analysis(cleaned_video_info_df)

# Debug echo of the routed page, shown below the page content
st.write(f"Selected page: {page}")
logger.debug("Minimal app rendered.")
# Display footer
display_footer()

Writing app.py


In [18]:
# Start the Streamlit server for app.py as a background job so the cell returns
# immediately. Assuming the `!` shell is bash, `&>` sends both stdout and stderr
# to logs.txt.
!streamlit run app.py &> logs.txt &


In [26]:
# Publish the locally running Streamlit app through an ngrok tunnel.
from pyngrok import ngrok

ngrok.kill()  # Kill previous tunnels if needed

# Open an HTTP tunnel to local port 8501 (presumably the port the Streamlit
# server started earlier is listening on — confirm if a custom port is used).
public_url = ngrok.connect(8501, "http")  # Specify "http" as the protocol
# `public_url` holds the tunnel object returned by connect(); printing it shows
# both the public address and the local address it forwards to.
print("Streamlit App URL:", public_url)

Streamlit App URL: NgrokTunnel: "https://c02c-34-125-17-100.ngrok-free.app" -> "http://localhost:8501"


In [27]:
ngrok.kill()  # Teardown: stop ngrok, which takes the public URL opened above offline

## D. Testing

In [21]:
%%writefile app2.py
import streamlit as st
import pandas as pd
from datetime import datetime
import plotly.express as px
import pytz

# Custom CSS for improved styling
# Injected once at the top of the script. `.main-title` and `.subheader` are the
# classes used by the raw-HTML headings rendered further down.
st.markdown("""
    <style>
    .main-title {
        font-size: 28px;
        font-weight: bold;
        color: #FFFFFF;
        text-align: center;
        padding-bottom: 20px;
    }
    .subheader {
        font-size: 20px;
        font-weight: bold;
        color: #E0E0E0;
        padding-top: 10px;
    }
    .stButton>button {
        background-color: #4CAF50;
        color: white;
        border-radius: 5px;
    }
    .stExpander {

        border-radius: 10px;
        padding: 10px;
    }
    </style>
""", unsafe_allow_html=True)

# Set custom color palette
# NOTE(review): `colors` does not appear to be read anywhere else in this script.
colors = ["#636EFA", "#EF553B", "#00CC96"]  # Blue, Red, Green
# Default Plotly template; the Engagement Insights tab overwrites this name with
# the theme picked in its selectbox.
plotly_theme = "plotly_white"  # Choose a Plotly theme

# Load dataset
# Both CSVs are read from hard-coded absolute paths on every script run.
cleaned_user_csv_file = "/content/21KHDL-TikTok-Analytics/data/interim/cleaned_user_info.csv"  # Adjust path
cleaned_user_info_df = pd.read_csv(cleaned_user_csv_file)
cleaned_video_csv_file = "/content/21KHDL-TikTok-Analytics/data/interim/cleaned_video_info.csv"
cleaned_video_info_df = pd.read_csv(cleaned_video_csv_file)
# Sidebar navigation: a single page selector whose options match the routing
# branches below. A second "Select Page" radio used to be rendered here too;
# its value was overwritten immediately and two of its options
# ("Top Users statistic", "Engagement Analysis") had no matching page.
st.sidebar.title("📊 TikTok Analysis Dashboard")
page = st.sidebar.radio("Select Page", ["Correlation Analysis", "Personal Analysis", "Hashtag & Song Analysis"])

# ---------------- PAGE 1: Correlation Analysis with Tabs ----------------
if page == "Correlation Analysis":
    # Create Tabs
    tab1, tab2, tab3 = st.tabs(["📊 Correlation Analysis", "🏆 Top Users", "🔥 Engagement Insights"])

    # Tab 1: metric distributions, scatter matrix and correlation heatmap
    with tab1:
        st.header("📊 Correlation Analysis")
        st.write("Analyze relationships between followers, likes, and videos.")

        # Columns and display names of the three per-user metrics used in this tab.
        metric_columns = ['stats.followerCount', 'stats.heart', 'stats.videoCount']
        metric_labels = ["Followers", "Likes", "Videos"]
        label_by_column = dict(zip(metric_columns, metric_labels))

        # RGBA colors with transparency, in metric order (blue, red, green).
        colors_rgba = [
            "rgba(99, 110, 250, 0.9)",
            "rgba(239, 85, 59, 0.8)",
            "rgba(0, 204, 150, 0.7)",
        ]

        # Single-metric histograms keyed by their selectbox label:
        # (subheader HTML, source column, axis label, bar color).
        histogram_specs = {
            "Distribution of Followers": (
                '<div class="subheader">📈 Followers Distribution</div>',
                'stats.followerCount', 'Follower Count', colors_rgba[0],
            ),
            "Distribution of Likes": (
                '<div class="subheader">❤️ Likes Distribution</div>',
                'stats.heart', 'Likes', colors_rgba[1],
            ),
            "Distribution of Video Count": (
                '<div class="subheader">🎬 Video Count Distribution</div>',
                'stats.videoCount', 'Video Count', colors_rgba[2],
            ),
        }

        # Centered main title
        st.markdown('<div class="main-title">📊 TikTok User Analysis Dashboard</div>', unsafe_allow_html=True)

        # Chart controls: visualization, bin count, log-scale toggle
        st.markdown("### ⚙️ Customize Your Analysis")
        col_input1, col_input2, col_input3 = st.columns([2, 1, 1])
        with col_input1:
            chart_option = st.selectbox(
                "Select a Visualization",
                [*histogram_specs, "All in One", "Scatter Matrix"],
                help="Choose a metric or explore combined views."
            )
        with col_input2:
            num_bins = st.slider(
                "Number of Bins", min_value=10, max_value=100, value=50, step=5,
                help="Adjust histogram granularity."
            )
        with col_input3:
            log_scale = st.checkbox("Logarithmic Scale", value=True, help="Toggle logarithmic Y-axis.")

        # Wide column for the chart, narrow column for stats and the heatmap
        col1, col2 = st.columns([3, 2])

        with col1:
            with st.spinner(f"Rendering {chart_option.lower()}..."):
                if chart_option in histogram_specs:
                    heading_html, column, axis_label, bar_color = histogram_specs[chart_option]
                    st.markdown(heading_html, unsafe_allow_html=True)
                    fig_hist = px.histogram(
                        cleaned_user_info_df,
                        x=column,
                        nbins=num_bins,
                        log_y=log_scale,
                        labels={column: axis_label},
                        color_discrete_sequence=[bar_color],
                        title="",
                        height=500,
                        marginal="box",  # box plot drawn above the histogram
                    )
                    fig_hist.update_layout(template="plotly_dark", bargap=0.1, showlegend=False)
                    st.plotly_chart(fig_hist, use_container_width=True)

                elif chart_option == "All in One":
                    st.markdown('<div class="subheader">📊 Combined Distribution</div>', unsafe_allow_html=True)
                    # Long format: one row per (user, metric) so the metric can drive the color.
                    long_df = cleaned_user_info_df.melt(
                        value_vars=metric_columns,
                        var_name="Metric",
                        value_name="Count",
                    )
                    long_df["Metric"] = long_df["Metric"].map(label_by_column)

                    fig_combined = px.histogram(
                        long_df,
                        x="Count",
                        color="Metric",
                        log_y=log_scale,
                        nbins=num_bins,
                        title="",
                        color_discrete_map=dict(zip(metric_labels, colors_rgba)),
                        height=500,
                        opacity=0.7,
                        barmode="overlay",  # bars of the three metrics share the same axis
                    )
                    fig_combined.update_layout(template="plotly_dark", bargap=0.1, legend_title_text="Metrics")
                    st.plotly_chart(fig_combined, use_container_width=True)

                elif chart_option == "Scatter Matrix":
                    st.markdown('<div class="subheader">🔍 Scatter Matrix</div>', unsafe_allow_html=True)
                    fig_scatter = px.scatter_matrix(
                        cleaned_user_info_df,
                        dimensions=metric_columns,
                        labels=label_by_column,
                        color_discrete_sequence=colors_rgba,
                        height=600,
                        opacity=0.6,
                    )
                    fig_scatter.update_traces(diagonal_visible=False)  # hide the diagonal panels
                    fig_scatter.update_layout(template="plotly_dark")
                    st.plotly_chart(fig_scatter, use_container_width=True)

            # The export always contains the three metric columns; only the
            # file name follows the selected visualization.
            st.download_button(
                label="📥 Download Chart Data",
                data=cleaned_user_info_df[metric_columns].to_csv(index=False),
                file_name=f"{chart_option.lower().replace(' ', '_')}.csv",
                mime="text/csv"
            )

        with col2:
            with st.expander("📋 Quick Stats", expanded=True):
                st.markdown("#### Summary")
                st.write(f"**Total Users:** {len(cleaned_user_info_df):,}")
                for column, label in label_by_column.items():
                    st.write(f"**Avg. {label}:** {cleaned_user_info_df[column].mean():,.0f}")

            st.markdown('<div class="subheader">📊 Correlation Heatmap</div>', unsafe_allow_html=True)
            correlation_matrix = cleaned_user_info_df[metric_columns].corr()

            fig_corr = px.imshow(
                correlation_matrix,
                text_auto=".2f",
                aspect="equal",
                title="",
                color_continuous_scale="Blues",
                labels=dict(x="Metrics", y="Metrics", color="Correlation"),
                height=400,
            )
            # Replace the raw column names on both axes with the short display names.
            tick_spec = dict(tickvals=[0, 1, 2], ticktext=metric_labels)
            fig_corr.update_layout(xaxis=dict(tick_spec), yaxis=dict(tick_spec))
            st.plotly_chart(fig_corr, use_container_width=True)

    # Tab 2: ranking of users by a selectable metric, shown as bar and pie charts
    with tab2:
        st.header("🏆 Top Users")
        st.write("View the most influential TikTok users.")
        # Title and short description rendered as raw HTML
        st.markdown('<div class="main-title" style="font-size: 28px; font-weight: bold; color: #1E90FF;">🏆 Top Users Analysis</div>', unsafe_allow_html=True)
        st.markdown('<p style="color: #666;">Explore the top-performing users based on engagement metrics like likes and video counts.</p>', unsafe_allow_html=True)

        # Three equal-width controls: list size, ranking metric, sort direction
        col1, col2, col3 = st.columns([1, 1, 1])
        with col1:
            top_n = st.slider(
                "Select Top N Users", min_value=1, max_value=50, value=10, step=1,
                help="Adjust to see more or fewer top users."
            )
        with col2:
            chart_option = st.selectbox(
                "Select Chart Type",
                ["Most Likes", "Most Videos", "Most Followers", "Engagement Rate"],
                help="Choose a metric to visualize user performance."
            )
        with col3:
            sort_order = st.radio(
                "Sort Order", ["Descending", "Ascending"], index=0,
                help="Change the sorting direction of the chart."
            )

        # One sub-tab per chart type
        tab_bar, tab_pie = st.tabs(["Bar Chart", "Pie Chart"])

        with st.spinner(f"Generating visuals for Top {top_n} Users..."):
            # (column, chart title, color scale, y-axis label) for the options
            # that map directly onto an existing column.
            column_presets = {
                "Most Likes": (
                    'stats.heart', f"Top {top_n} Users with Most Likes",
                    'reds', "Total Likes (Hearts)",
                ),
                "Most Videos": (
                    'stats.videoCount', f"Top {top_n} Users with Most Videos",
                    'greens', "Total Videos",
                ),
                "Most Followers": (
                    'stats.followerCount', f"Top {top_n} Users with Most Followers",
                    'blues', "Total Followers",
                ),
            }
            if chart_option in column_presets:
                metric, title, color_scale, y_label = column_presets[chart_option]
            else:
                # "Engagement Rate": likes per video, added to the frame on demand.
                # Zero video counts are replaced by 1 to avoid dividing by zero.
                videos_nonzero = cleaned_user_info_df['stats.videoCount'].replace(0, 1)
                cleaned_user_info_df['engagement_rate'] = cleaned_user_info_df['stats.heart'] / videos_nonzero
                metric = 'engagement_rate'
                title = f"Top {top_n} Users by Engagement Rate (Likes/Video)"
                color_scale = 'purples'
                y_label = "Engagement Rate"
            is_rate = metric == 'engagement_rate'

            # Pick the N largest or N smallest rows, then keep only id + metric.
            if sort_order == "Descending":
                ranked_users = cleaned_user_info_df.nlargest(top_n, metric)
            else:
                ranked_users = cleaned_user_info_df.nsmallest(top_n, metric)
            top_data = ranked_users[['user.uniqueId', metric]]

            # Bar chart
            with tab_bar:
                st.markdown(f'<div class="subheader" style="color: #333; font-weight: bold;">{title}</div>', unsafe_allow_html=True)
                # Rates get one decimal; raw counts only get thousands separators.
                bar_label_format = '{:,.1f}' if is_rate else '{:,}'
                fig_bar = px.bar(
                    top_data,
                    x='user.uniqueId',
                    y=metric,
                    title=title,
                    color=metric,
                    color_continuous_scale=color_scale,
                    text=top_data[metric].apply(bar_label_format.format),
                    height=500,
                )
                fig_bar.update_traces(textposition='outside', textfont_size=12)
                fig_bar.update_layout(
                    xaxis_title="User ID",
                    yaxis_title=y_label,
                    template="plotly_white",
                    xaxis_tickangle=-45,
                    showlegend=False,
                    margin=dict(t=50, b=50),
                    plot_bgcolor="rgba(0,0,0,0)",
                    paper_bgcolor="rgba(0,0,0,0)",
                )
                st.plotly_chart(fig_bar, use_container_width=True)

            # Pie chart
            with tab_pie:
                st.markdown(f'<div class="subheader" style="color: #333; font-weight: bold;">Distribution of {chart_option}</div>', unsafe_allow_html=True)
                fig_pie = px.pie(
                    top_data,
                    names='user.uniqueId',
                    values=metric,
                    title=f"Distribution of {chart_option} (Top {top_n})",
                    height=500,
                    color_discrete_sequence=px.colors.sequential.RdBu,
                )
                # Pull only the first slice out of the pie.
                slice_offsets = [0.1] + [0] * (top_n - 1)
                fig_pie.update_traces(textinfo='percent+label', pull=slice_offsets)
                fig_pie.update_layout(
                    template="plotly_white",
                    margin=dict(t=50, b=50),
                    plot_bgcolor="rgba(0,0,0,0)",
                    paper_bgcolor="rgba(0,0,0,0)",
                )
                st.plotly_chart(fig_pie, use_container_width=True)

        # Collapsible table with the same rows the charts were built from
        with st.expander("View Detailed Data", expanded=False):
            table_format = "{:,.2f}" if is_rate else "{:,}"
            st.dataframe(top_data.style.format({metric: table_format}), use_container_width=True)

        # CSV export of the current selection
        csv = top_data.to_csv(index=False)
        st.download_button(
            label="📥 Download Data as CSV",
            data=csv,
            file_name=f"top_{top_n}_{chart_option.lower().replace(' ', '_')}_{sort_order.lower()}.csv",
            mime="text/csv",
            help="Download the current dataset as a CSV file."
        )

    # Tab 3: followers-vs-likes scatter for one follower tier and one engagement tier
    with tab3:
        st.header("🔥 Engagement Insights")
        st.write("Understand how engagement levels vary across different user segments.")
        st.markdown('<h2 style="text-align:center;">🏆 Engagement Analysis</h2>', unsafe_allow_html=True)

        st.write("Analyze the relationship between **followers** and **engagement (likes/hearts)**.")

        def _keep_tier(frame, column, level, lower_cut, upper_cut):
            """Return the rows of ``frame`` whose ``column`` lies in the chosen tier.

            "Low" keeps values <= lower_cut, "Average" keeps values in
            (lower_cut, upper_cut], and any other level keeps values > upper_cut.
            """
            values = frame[column]
            if level == "Low":
                return frame[values <= lower_cut]
            if level == "Average":
                return frame[(values > lower_cut) & (values <= upper_cut)]
            return frame[values > upper_cut]

        # Controls: follower tier, engagement tier, Plotly template
        tier_names = ["Low", "Average", "High"]
        with st.container():
            col1, col2, col3 = st.columns([1, 1, 2])

            with col1:
                follower_level = st.selectbox(
                    "📌 Select Follower Level:",
                    tier_names,
                    help="Low: 0-33%, Average: 33-66%, High: 66-100%"
                )

            with col2:
                engagement_level = st.selectbox(
                    "🔥 Select Engagement Level:",
                    tier_names,
                    help="Based on engagement ratio (Likes / Followers)"
                )

            with col3:
                plotly_theme = st.selectbox("🎨 Choose Theme:", ["plotly_dark", "seaborn", "ggplot2", "plotly_white"])

        # Engagement ratio = likes per follower; zero follower counts are
        # replaced by 1 so the division is always defined.
        followers_nonzero = cleaned_user_info_df['stats.followerCount'].replace(0, 1)
        cleaned_user_info_df['engagement_ratio'] = cleaned_user_info_df['stats.heart'] / followers_nonzero

        # Tier boundaries: 33rd and 66th percentiles over all users.
        lower_q, upper_q = 0.33, 0.66
        low_followers = cleaned_user_info_df['stats.followerCount'].quantile(lower_q)
        high_followers = cleaned_user_info_df['stats.followerCount'].quantile(upper_q)
        low_engagement = cleaned_user_info_df['engagement_ratio'].quantile(lower_q)
        high_engagement = cleaned_user_info_df['engagement_ratio'].quantile(upper_q)

        # Narrow by follower tier first, then by engagement tier within that subset.
        filtered_df = _keep_tier(
            cleaned_user_info_df, 'stats.followerCount', follower_level, low_followers, high_followers
        )
        filtered_df = _keep_tier(
            filtered_df, 'engagement_ratio', engagement_level, low_engagement, high_engagement
        )

        # Build the scatter now; it is displayed after the summary metrics below.
        with st.spinner("📊 Rendering engagement analysis..."):
            fig_filtered_users = px.scatter(
                filtered_df,
                x='stats.followerCount',
                y='stats.heart',
                size='stats.followerCount',
                color='engagement_ratio',
                color_continuous_scale='viridis',
                hover_data=['user.uniqueId'],
                height=600,
                opacity=0.75,
            )

            fig_filtered_users.update_layout(
                xaxis_title="👥 Follower Count",
                yaxis_title="❤️ Total Likes",
                template=plotly_theme,
                xaxis_type="log",  # both axes are logarithmic
                yaxis_type="log",
                showlegend=True,
                coloraxis_colorbar_title="Engagement Ratio 🔥"
            )

        st.markdown(f'<h3>📊 Engagement Insights: {follower_level} Followers & {engagement_level} Engagement</h3>', unsafe_allow_html=True)

        # Summary metrics for the filtered subset, one per column
        summary_cards = [
            ("📌 Number of Users", f"{len(filtered_df):,}"),
            ("❤️ Avg. Likes", f"{filtered_df['stats.heart'].mean():,.0f}"),
            ("👥 Avg. Followers", f"{filtered_df['stats.followerCount'].mean():,.0f}"),
        ]
        for card_column, (card_label, card_value) in zip(st.columns(3), summary_cards):
            with card_column:
                st.metric(card_label, card_value)
        st.plotly_chart(fig_filtered_users, use_container_width=True)

        # Static closing notes
        st.markdown("""
        **🔍 Key Takeaways:**
        - Higher engagement doesn't always come from high followers!
        - Some small creators can outperform big ones in engagement.
        - Use trending hashtags and sounds to boost visibility!
        """)
# ---------------- PAGE 4: Personal Analysis ----------------
elif page == "Personal Analysis":
    # Personal Analysis page: profile card plus video, music and hashtag charts
    # for one TikToker, limited to a date range chosen in the sidebar.

    # Custom Styling
    st.markdown("""
        <style>
        h1, h2, h3 { color: #1f2a44; font-family: 'Helvetica', sans-serif; }
        .stMetric { border-radius: 8px; padding: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        </style>
    """, unsafe_allow_html=True)

    # Convert createTime (read as seconds since the epoch) to timestamps
    cleaned_video_info_df['createTime'] = pd.to_datetime(cleaned_video_info_df['createTime'], unit='s')
    tiktoker_options = cleaned_video_info_df['author.uniqueId'].unique()
    min_date = cleaned_video_info_df['createTime'].min().date()
    max_date = cleaned_video_info_df['createTime'].max().date()

    # Sidebar
    with st.sidebar:
        st.title("📊 TikTok Analytics")
        st.markdown("Analyze TikTok trends")
        selected_tiktoker = st.selectbox("👤 Select TikToker", tiktoker_options)
        date_range = st.slider("📅 Date Range", min_value=min_date, max_value=max_date,
                               value=(min_date, max_date), format="MM/DD/YYYY")
        start_date, end_date = pd.to_datetime(date_range[0]), pd.to_datetime(date_range[1])
        # NOTE(review): the button only overrides the range for the rerun it
        # triggers; the slider widget itself keeps its current value.
        if st.button("🔄 Reset"):
            start_date, end_date = pd.to_datetime(min_date), pd.to_datetime(max_date)

    # The slider works on whole days, so both bounds are midnight timestamps.
    # Filtering with `createTime <= end_date` dropped every video posted during
    # the last selected day; use an exclusive bound at the following midnight.
    end_exclusive = end_date + pd.Timedelta(days=1)

    # Main Content
    st.header(f"@{selected_tiktoker}'s Analytics")
    tiktoker_data = cleaned_video_info_df[cleaned_video_info_df['author.uniqueId'] == selected_tiktoker]

    if not tiktoker_data.empty:
        # Profile Card: author-level fields are read from the first video row
        user_info = tiktoker_data.iloc[0]
        with st.container():
            st.subheader("Profile Overview")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Username", user_info['author.uniqueId'])
                st.metric("Followers", f"{user_info['authorStats.followerCount']:,}")
            with col2:
                # Falls back to 'No info' when the column is absent from the row
                st.metric("Commerce", user_info.get('user.commerceUserInfo.category', 'No info'))
                st.metric("Total Likes", f"{user_info['authorStats.heartCount']:,}")
            with col3:
                st.metric("Verified", "Yes ✅" if user_info['author.verified'] else "No ❌")
                st.metric("Total Videos", f"{user_info['authorStats.videoCount']:,}")

        # Filtered Data: videos inside [start_date, end_exclusive)
        with st.spinner("Loading data..."):
            filtered_data = tiktoker_data[(tiktoker_data['createTime'] >= start_date) &
                                          (tiktoker_data['createTime'] < end_exclusive)]

        if not filtered_data.empty:
            # Video Trends: number of videos per calendar day
            with st.expander("📈 Video Trends", expanded=True):
                video_counts = filtered_data.groupby(filtered_data['createTime'].dt.date).size().reset_index(name='Video Count')
                fig_trend = px.area(video_counts, x='createTime', y='Video Count',
                                    title="Video Creation Over Time", template="plotly_white")
                fig_trend.update_traces(line=dict(color="#00b4d8", width=2), fill='tozeroy')
                fig_trend.add_scatter(x=video_counts['createTime'], y=video_counts['Video Count'],
                                      mode='markers', marker=dict(size=8, color="#00b4d8"))
                # Annotate the day with the most uploads
                max_day = video_counts.loc[video_counts['Video Count'].idxmax()]
                fig_trend.add_annotation(x=max_day['createTime'], y=max_day['Video Count'],
                                         text=f"Peak: {max_day['Video Count']}", showarrow=True, arrowhead=1)
                fig_trend.update_layout(xaxis_title="Date", yaxis_title="Videos Posted", showlegend=False)
                st.plotly_chart(fig_trend, use_container_width=True)

            # Music Usage: ten most frequent music authors
            with st.expander("🎵 Music Usage"):
                music_counts = filtered_data['music.authorName'].value_counts().head(10).reset_index()
                music_counts.columns = ['Music Author', 'Count']
                fig_music = px.bar(music_counts, x='Count', y='Music Author', orientation='h',
                                   title="Top 10 Music Choices", color='Count', color_continuous_scale='magma')
                fig_music.update_layout(xaxis_title="Times Used", yaxis_title="", showlegend=False)
                st.plotly_chart(fig_music, use_container_width=True)

            # Hashtag Usage: the hashtags cell is split on whitespace, one tag per row
            with st.expander("🏷️ Hashtag Usage"):
                all_hashtags = filtered_data['hashtags'].dropna().str.split().explode()
                if not all_hashtags.empty:
                    hashtag_counts = all_hashtags.value_counts().head(10).reset_index()
                    hashtag_counts.columns = ['Hashtag', 'Count']
                    fig_hashtags = px.treemap(hashtag_counts, path=['Hashtag'], values='Count',
                                              title="Top 10 Hashtags", color='Count', color_continuous_scale='viridis')
                    fig_hashtags.update_layout(margin=dict(t=50, l=0, r=0, b=0))
                    st.plotly_chart(fig_hashtags, use_container_width=True)
                else:
                    st.markdown('<p style="color:#3498db;">ℹ️ No hashtags available.</p>', unsafe_allow_html=True)
        else:
            st.markdown(f'<p style="color:#e67e22;">⚠️ No video data for {selected_tiktoker} in this range.</p>',
                        unsafe_allow_html=True)
    else:
        st.markdown(f'<p style="color:#c0392b;">❌ No data for {selected_tiktoker}.</p>', unsafe_allow_html=True)
# ---------------- PAGE 4: Hashtag & Song Analysis ----------------
elif page == "Hashtag & Song Analysis":
    # Hashtag & Song Analysis page: most frequent hashtags and music authors
    # across all videos inside a date range chosen in the sidebar.

    # Custom CSS
    st.markdown("""
        <style>

        h1, h3 { color: #2c3e50; font-family: 'Arial', sans-serif; }
        </style>
    """, unsafe_allow_html=True)

    # Ensure datetime format (createTime is read as seconds since the epoch)
    cleaned_video_info_df['createTime'] = pd.to_datetime(cleaned_video_info_df['createTime'], unit='s')
    min_date = cleaned_video_info_df['createTime'].min().date()
    max_date = cleaned_video_info_df['createTime'].max().date()

    # Sidebar filters
    with st.sidebar:
        st.title("🔍 Filters")
        date_range = st.slider("📅 Date Range", min_value=min_date, max_value=max_date, value=(min_date, max_date), format="YYYY-MM-DD")
        start_date, end_date = pd.to_datetime(date_range[0]), pd.to_datetime(date_range[1])
        top_k = st.number_input("🔢 Top K", min_value=1, max_value=50, value=10, step=1)

    # The slider works on whole days, so both bounds are midnight timestamps.
    # Filtering with `createTime <= end_date` dropped every video posted during
    # the last selected day; use an exclusive bound at the following midnight.
    end_exclusive = end_date + pd.Timedelta(days=1)

    st.title("🔍 Hashtag & Song Analysis")

    # Filter data: videos inside [start_date, end_exclusive)
    with st.spinner("Analyzing data..."):
        filtered_data = cleaned_video_info_df[(cleaned_video_info_df['createTime'] >= start_date) &
                                              (cleaned_video_info_df['createTime'] < end_exclusive)]

    if not filtered_data.empty:
        # Hashtag Analysis: the hashtags cell is split on whitespace, one tag per row
        with st.expander("🔥 Most Used Hashtags", expanded=True):
            all_hashtags = filtered_data['hashtags'].dropna().str.split().explode()
            if not all_hashtags.empty:
                hashtag_counts = all_hashtags.value_counts().reset_index()
                hashtag_counts.columns = ['Hashtag', 'Count']
                top_hashtags = hashtag_counts.head(top_k)

                fig_hashtags = px.bar(top_hashtags, x='Count', y='Hashtag', orientation='h',
                                      title="📌 Most Used Hashtags", color='Count', color_continuous_scale='viridis')
                fig_hashtags.update_layout(xaxis_title="Usage Count", yaxis_title="Hashtag", template="plotly_white")
                st.plotly_chart(fig_hashtags, use_container_width=True)
                if st.checkbox("Show Hashtag Table"):
                    st.dataframe(top_hashtags)
            else:
                st.markdown('<p style="color:#e74c3c;">⚠️ No hashtags found.</p>', unsafe_allow_html=True)

        # Song Analysis: counts are taken per music author name
        with st.expander("🎵 Most Used Songs", expanded=True):
            music_counts = filtered_data['music.authorName'].value_counts().reset_index()
            music_counts.columns = ['Music Author', 'Count']
            top_music = music_counts.head(top_k)

            if not top_music.empty:
                fig_music = px.bar(top_music, x='Count', y='Music Author', orientation='h',
                                   title="🎶 Most Used Songs", color='Count', color_continuous_scale='viridis')
                fig_music.update_layout(xaxis_title="Usage Count", yaxis_title="Music Author", template="plotly_white")
                st.plotly_chart(fig_music, use_container_width=True)
                if st.checkbox("Show Song Table"):
                    st.dataframe(top_music)
            else:
                st.markdown('<p style="color:#e74c3c;">⚠️ No songs found.</p>', unsafe_allow_html=True)
    else:
        st.markdown('<p style="color:#e74c3c;">⚠️ No data available for this range.</p>', unsafe_allow_html=True)
# Footer: stamp the page with the current time in Vietnam (UTC+7).
vn_time = datetime.now(pytz.timezone("Asia/Ho_Chi_Minh")).strftime("%Y-%m-%d %H:%M:%S")
# Raw HTML is required for the inline positioning styles.
footer_html = f'''
    <div style="position: absolute; bottom: 10px; right: 20px; margin: -20px ;color: #999; font-size: 18px;">
        🕒 Last updated: {vn_time}
    </div>
'''
st.markdown(footer_html, unsafe_allow_html=True)

Writing app2.py


In [5]:
%%writefile readme.md
# TikTok Analytics Dashboard

A Streamlit-based web application for analyzing TikTok user data, engagement metrics, and trends. This dashboard provides interactive visualizations and insights into follower distributions, top users, engagement levels, personal analytics, and hashtag/song usage patterns.

## Features

- **Correlation Analysis**: Explore relationships between followers, likes, and video counts with histograms, scatter matrices, and correlation heatmaps.
- **Top Users**: Identify influential TikTok users based on likes, video counts, followers, or engagement rates.
- **Engagement Insights**: Analyze how engagement varies across different follower and engagement levels.
- **Personal Analysis**: Dive into an individual TikToker's profile, video trends, music usage, and hashtag patterns.
- **Hashtag & Song Analysis**: Discover the most popular hashtags and songs within a selected date range.
- **Interactive Visualizations**: Built with Plotly for dynamic, customizable charts.
- **Downloadable Data**: Export chart data as CSV files for further analysis.

## Project Structure

```
tiktok_analytics/
│
├── app.py                  # Main entry point for the Streamlit app
├── styles.py               # Custom CSS and styling definitions
├── data_loader.py          # Data loading utilities
├── pages/
│   ├── correlation_analysis.py  # Correlation Analysis page with tabs
│   ├── personal_analysis.py     # Personal Analysis page
│   └── hashtag_song_analysis.py # Hashtag & Song Analysis page
├── footer.py               # Footer with last updated timestamp
└── README.md               # Project documentation (this file)
```

### File Descriptions

- **`app.py`**: The main script that initializes the app, sets up the sidebar navigation, and routes to different pages.
- **`styles.py`**: Contains custom CSS for consistent styling across the app, including global, personal, and hashtag/song-specific styles.
- **`data_loader.py`**: Loads the cleaned TikTok user and video datasets from CSV files.
- **`pages/correlation_analysis.py`**: Implements the "Correlation Analysis" page with three tabs: Correlation Analysis, Top Users, and Engagement Insights.
- **`pages/personal_analysis.py`**: Implements the "Personal Analysis" page for individual TikToker analytics.
- **`pages/hashtag_song_analysis.py`**: Implements the "Hashtag & Song Analysis" page for trending hashtags and songs.
- **`footer.py`**: Displays a timestamp in Vietnam time (UTC+7) at the bottom of the app.

## Prerequisites

- Python 3.8+
- Required Python packages:
  - `streamlit`
  - `pandas`
  - `plotly`
  - `pytz`

## Installation

1. **Clone the Repository** (if hosted on a version control system):
   ```bash
   git clone <repository-url>
   cd tiktok_analytics
   ```
   Alternatively, manually create the directory structure and copy the files.

2. **Install Dependencies**:
   ```bash
   pip install streamlit pandas plotly pytz
   ```

3. **Prepare Data**:
   - Place your `cleaned_user_info.csv` and `cleaned_video_info.csv` files in the directory `/content/21KHDL-TikTok-Analytics/data/interim/`.
   - Update the file paths in `data_loader.py` if your data is stored elsewhere.

## Running the Application

1. Navigate to the project directory:
   ```bash
   cd tiktok_analytics
   ```
2. Run the Streamlit app:
   ```bash
   streamlit run app.py
   ```
3. Open your browser to `http://localhost:8501` to view the dashboard.

## Usage

- **Sidebar Navigation**: Use the radio buttons in the sidebar to switch between pages: "Correlation Analysis", "Personal Analysis", and "Hashtag & Song Analysis".
- **Interactive Controls**: Adjust sliders, dropdowns, and checkboxes to customize visualizations.
- **Download Data**: Click the "Download" buttons to export chart data as CSV files.
- **Date Range Filtering**: Available in "Personal Analysis" and "Hashtag & Song Analysis" pages to focus on specific time periods.

### Example Screenshots

*(You can add screenshots here by running the app and capturing outputs, then linking them in this section.)*

## Data Requirements

- **`cleaned_user_info.csv`**:
  - Expected columns: `stats.followerCount`, `stats.heart`, `stats.videoCount`, `user.uniqueId`, etc.
- **`cleaned_video_info.csv`**:
  - Expected columns: `createTime`, `author.uniqueId`, `authorStats.followerCount`, `authorStats.heartCount`, `authorStats.videoCount`, `author.verified`, `music.authorName`, `hashtags`, etc.
- Ensure the data is pre-cleaned and formatted correctly for the app to function as expected.

## Contributing

1. Fork the repository (if applicable).
2. Create a new branch for your feature or bug fix:
   ```bash
   git checkout -b feature-name
   ```
3. Make changes and test locally.
4. Commit and push your changes:
   ```bash
   git commit -m "Description of changes"
   git push origin feature-name
   ```
5. Submit a pull request with a detailed description of your changes.

## Notes

- The app assumes the data files are available at the specified paths. Adjust `data_loader.py` if your file locations differ.
- The "Last updated" timestamp reflects Vietnam time (Asia/Ho_Chi_Minh timezone).
- For large datasets, rendering times may increase; consider optimizing data loading or caching the loaded DataFrames with `st.cache_data` (the older `st.cache` decorator is deprecated in current Streamlit releases).

## License

This project is open-source and available under the [MIT License](LICENSE). *(Add a LICENSE file if you choose to include one.)*

## Contact

For questions or feedback, feel free to reach out to the maintainers at [hddluc21@fitus.clc.edu.com](mailto:hddluc21@fitus.clc.edu.com).


Overwriting readme.md


In [23]:
# Preview the first rows of the cleaned user-info table (head() defaults to 5 rows).
cleaned_user_info_df.head()

Unnamed: 0,stats.diggCount,stats.followerCount,stats.followingCount,stats.friendCount,stats.heart,stats.heartCount,stats.videoCount,user.canExpPlaylist,user.commentSetting,user.commerceUserInfo.commerceUser,...,user.uniqueId,user.verified,user.bioLink.link,user.bioLink.risk,user.commerceUserInfo.category,user.commerceUserInfo.categoryButton,user.profileTab.showQuestionTab,user.profileTab.showMusicTab,user.uniqueIdModifyTime,user.roomId
0,0,198700,47,28,4100000,4100000,847,True,0,False,...,1phutsaigon,False,,,,,,,,
1,0,637500,185,61,13300000,13300000,3756,True,0,True,...,anchoivungtau72,True,Anchoivungtau.vn,3.0,Travel & Tourism,False,True,,,
2,0,183500,0,0,4700000,4700000,872,True,0,False,...,ancungdaune,False,,,,,,,,
3,0,404000,75,57,14200000,14200000,566,True,0,True,...,angithuongoi,False,,,Food & Beverage,False,,,,
4,0,1300000,68,9,17100000,17100000,218,True,0,True,...,anhdaubep_vn,False,anhdaubep.vn,3.0,Food & Beverage,False,,,,


In [24]:
# List the column names available in the cleaned user-info table.
cleaned_user_info_df.columns

Index(['stats.diggCount', 'stats.followerCount', 'stats.followingCount',
       'stats.friendCount', 'stats.heart', 'stats.heartCount',
       'stats.videoCount', 'user.canExpPlaylist', 'user.commentSetting',
       'user.commerceUserInfo.commerceUser', 'user.downloadSetting',
       'user.duetSetting', 'user.followingVisibility', 'user.ftc', 'user.id',
       'user.isADVirtual', 'user.isEmbedBanned', 'user.nickNameModifyTime',
       'user.nickname', 'user.openFavorite', 'user.privateAccount',
       'user.profileEmbedPermission', 'user.profileTab.showPlayListTab',
       'user.relation', 'user.secUid', 'user.secret', 'user.signature',
       'user.stitchSetting', 'user.ttSeller', 'user.uniqueId', 'user.verified',
       'user.bioLink.link', 'user.bioLink.risk',
       'user.commerceUserInfo.category',
       'user.commerceUserInfo.categoryButton',
       'user.profileTab.showQuestionTab', 'user.profileTab.showMusicTab',
       'user.uniqueIdModifyTime', 'user.roomId'],
      dtype='object')

In [25]:
# List the column names available in the cleaned video-info table.
cleaned_video_info_df.columns

Index(['author.privateAccount', 'author.uniqueId', 'author.verified',
       'authorStats.followerCount', 'authorStats.heartCount',
       'authorStats.videoCount', 'createTime', 'desc', 'music.authorName',
       'music.isCopyrighted', 'music.title', 'stats.commentCount',
       'stats.diggCount', 'stats.playCount', 'stats.shareCount',
       'video.bitrate', 'video.claInfo.enableAutoCaption',
       'video.claInfo.hasOriginalAudio', 'video.codecType', 'video.definition',
       'video.duration', 'video.format', 'video.height', 'video.videoQuality',
       'video.width', 'hashtags'],
      dtype='object')