In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import chardet
import re
from pathlib import Path
import os
import sys
from sklearn.preprocessing import MinMaxScaler
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the libraries
# imported in this cell.
warnings.simplefilter('ignore')

# Plot styling: the matplotlib style call is currently disabled; only the
# seaborn default colour palette is changed.
# plt.style.use('seaborn')
sns.set_palette("husl")

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# --- Load the video-level dataset -------------------------------------------
# NOTE(review): this is an absolute, machine-specific path. Point DATA_PATH at
# your own copy of data/interim/video_info.csv when running elsewhere.
DATA_PATH = 'C:/Users/nguye/OneDrive/Tài liệu/GitHub/21KHDL-TikTok-Analytics/data/interim/video_info.csv'
df = pd.read_csv(DATA_PATH, encoding='utf-8')

# Lower-cased string spellings that are treated as "this video is an ad".
_TRUTHY_STRINGS = {'true', '1', 'yes', 'y', 't'}


def _parse_ad_flag(value) -> bool:
    """Convert one raw ``isAd`` cell into a real boolean.

    A plain ``astype(bool)`` would turn missing values (NaN) and any
    non-empty string -- including ``"False"`` -- into ``True``. This helper
    instead maps:

    * missing values            -> ``False`` (treated as "not an ad")
    * strings                   -> ``True`` only for a known truthy spelling
    * anything else (bool, int) -> ``bool(value)``
    """
    if pd.isna(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() in _TRUTHY_STRINGS
    return bool(value)


# Guarantee a boolean 'isAd' column: parse it when present, otherwise assume
# that no video in the file is an ad.
if 'isAd' in df.columns:
    df['isAd'] = df['isAd'].map(_parse_ad_flag).astype(bool)
else:
    df['isAd'] = False

# --- Validate and clean the numeric metric columns ---------------------------
# Every column listed here is used by at least one figure or statistic below.
required_columns = [
    'authorStats.followerCount', 'authorStats.followingCount', 'authorStats.heartCount',
    'authorStats.videoCount', 'stats.commentCount', 'stats.playCount', 'stats.shareCount',
    'video.VQScore', 'video.duration', 'video.volumeInfo.Loudness'
]

# Fail fast with the full list of absent columns instead of a later KeyError
# on whichever one happens to be accessed first.
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"The following required columns are missing in the DataFrame: {missing_columns}")

rows_before_cleaning = len(df)

# Coerce each metric to a numeric dtype; unparseable values become NaN.
for col in required_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep only rows where every required metric is present. A new frame is bound
# (with an explicit copy) rather than mutating in place, so later column
# assignments act on an independent DataFrame.
df = df.dropna(subset=required_columns).copy()

# Make the data loss visible: coercion + dropna would otherwise discard rows
# without any trace in the notebook output.
rows_dropped = rows_before_cleaning - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} of {rows_before_cleaning} rows with missing or non-numeric metric values.")

# 1. Correlation matrix heatmap for the numeric metrics
# The metrics are exactly the columns validated above. Reusing that list keeps
# a single source of truth; every entry is already known to exist in df
# because a missing column raises KeyError during validation.
metrics = list(required_columns)

# Pairwise correlation between all metrics (DataFrame.corr defaults to
# Pearson correlation).
corr_matrix = df[metrics].corr()

fig1 = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale='RdBu',
    # Pin the colour range to the full correlation range so that 0 always
    # sits at the centre of the diverging colour scale.
    zmin=-1,
    zmax=1,
    colorbar=dict(title='Pearson r')
))
fig1.update_layout(
    title='Correlation Matrix of TikTok Video Metrics',
    width=1000,
    height=1000
)

# 2. Scatter plot matrix for engagement metrics
# Row i / column j plots engagement_metrics[j] on the x axis against
# engagement_metrics[i] on the y axis.
engagement_metrics = ['stats.playCount', 'stats.commentCount', 'stats.shareCount', 'authorStats.heartCount']
n_engagement = len(engagement_metrics)
fig2 = make_subplots(rows=n_engagement, cols=n_engagement)

for i, y_metric in enumerate(engagement_metrics, 1):
    for j, x_metric in enumerate(engagement_metrics, 1):
        fig2.add_trace(
            go.Scatter(
                x=df[x_metric],
                y=df[y_metric],
                mode='markers',
                name=f'{x_metric} vs {y_metric}',
                marker=dict(size=10)
            ),
            row=i,
            col=j
        )
        # The legend is hidden below, so without axis titles the panels are
        # indistinguishable. Label only the outer edge of the grid (bottom
        # row for x, first column for y) to keep the matrix uncluttered.
        if i == n_engagement:
            fig2.update_xaxes(title_text=x_metric, row=i, col=j)
        if j == 1:
            fig2.update_yaxes(title_text=y_metric, row=i, col=j)

fig2.update_layout(
    title='Scatter Plot Matrix of Engagement Metrics',
    height=1000,
    width=1000,
    showlegend=False
)

# 3. Grouped bar chart: mean engagement metrics for ad vs non-ad videos
metrics_to_compare = ['stats.playCount', 'stats.commentCount', 'stats.shareCount', 'authorStats.heartCount']

# Split the frame once on the boolean 'isAd' flag instead of re-filtering
# for every metric.
is_ad_mask = df['isAd']
non_ad_videos = df[~is_ad_mask]
ad_videos = df[is_ad_mask]

fig3 = go.Figure()
for metric in metrics_to_compare:
    # One bar series per metric; each series holds the two group means in
    # the same order as the x labels.
    group_means = [non_ad_videos[metric].mean(), ad_videos[metric].mean()]
    fig3.add_trace(go.Bar(name=metric, x=['Non-Ad', 'Ad'], y=group_means))

fig3.update_layout(
    title='Average Engagement Metrics: Ad vs Non-Ad Videos',
    barmode='group',
    width=1000,
    height=600,
)

# 4. Bubble chart of video performance:
#    x = video duration, y = play count, bubble size = author heart count,
#    bubble colour = video VQScore.
bubble_hover_columns = ['stats.commentCount', 'stats.shareCount', 'video.volumeInfo.Loudness']

fig4 = px.scatter(
    data_frame=df,
    x='video.duration',
    y='stats.playCount',
    size='authorStats.heartCount',
    color='video.VQScore',
    hover_data=bubble_hover_columns,
    title='Video Performance Analysis',
)

# Fixed canvas size, matching the other single-panel figures in this cell.
fig4.update_layout(dict(width=1000, height=600))

# 5. Box plots for the technical (non-engagement) video metrics
technical_metrics = ['video.VQScore', 'video.duration', 'video.volumeInfo.Loudness']

# One box per metric; every raw observation is also drawn as a jittered
# point, offset to the left of its box.
technical_boxes = [
    go.Box(
        y=df[metric_name],
        name=metric_name,
        boxpoints='all',
        jitter=0.3,
        pointpos=-1.8,
    )
    for metric_name in technical_metrics
]

fig5 = go.Figure(data=technical_boxes)
fig5.update_layout(
    title='Distribution of Technical Metrics',
    width=1000,
    height=600,
)

# 6. Histogram grid: one histogram per metric, laid out row by row
# The grid shape is derived from the number of metrics, so adding or removing
# a metric never leaves a trace without a subplot cell.
HIST_COLS = 3
hist_rows = max(1, -(-len(metrics) // HIST_COLS))  # ceiling division
fig6 = make_subplots(rows=hist_rows, cols=HIST_COLS, subplot_titles=metrics)

for position, metric in enumerate(metrics):
    # divmod turns the flat position into zero-based (row, col); plotly's
    # subplot coordinates are one-based.
    row, col = divmod(position, HIST_COLS)
    fig6.add_trace(
        go.Histogram(x=df[metric], name=metric),
        row=row + 1, col=col + 1
    )

fig6.update_layout(
    title='Distribution of All Metrics',
    # 300 px per row of subplots (1200 px for the default four rows).
    height=300 * hist_rows,
    width=1200,
    showlegend=False
)

# Render each figure, in the order it was built
all_figures = (fig1, fig2, fig3, fig4, fig5, fig6)
for fig in all_figures:
    fig.show()

# Summary statistics for every metric
summary_stats = df[metrics].describe()
print("\nSummary Statistics:")
print(summary_stats)

# Engagement rate, as a percentage of the author's follower count.
# Accounts with zero (or negative) followers get NaN instead of an infinite
# rate, so describe() below is not distorted by inf values.
# NOTE(review): 'authorStats.heartCount' is read from the author-level stats
# while comments and shares come from the video-level 'stats.*' columns --
# confirm this mix is intended.
follower_counts = df['authorStats.followerCount']
valid_followers = follower_counts.where(follower_counts > 0)
total_interactions = (
    df['authorStats.heartCount'] + df['stats.commentCount'] + df['stats.shareCount']
)
df['engagement_rate'] = total_interactions / valid_followers * 100
print("\nEngagement Rate Statistics:")
print(df['engagement_rate'].describe())

# Correlation of every metric with play count, strongest first.
# The matrix computed for the heatmap is reused: df[metrics] has not changed
# since then (only the new 'engagement_rate' column was added).
print("\nCorrelations with Play Count:")
correlations = corr_matrix['stats.playCount'].sort_values(ascending=False)
print(correlations)


Summary Statistics:
       authorStats.followerCount  authorStats.followingCount  \
count               1.638000e+03                 1638.000000   
mean                9.851360e+05                  400.686813   
std                 5.460988e+06                 1104.052067   
min                 1.100000e+01                    0.000000   
25%                 4.663750e+03                   12.000000   
50%                 2.885000e+04                   58.500000   
75%                 1.606000e+05                  267.000000   
max                 5.480000e+07                10000.000000   

       authorStats.heartCount  authorStats.videoCount  stats.commentCount  \
count            1.638000e+03             1638.000000         1638.000000   
mean             3.162210e+07              593.235653         1426.673382   
std              1.874936e+08              993.190439         6909.675099   
min              4.300000e+01                1.000000            0.000000   
25%              