In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import chardet
import re
from pathlib import Path
import os
import sys
from sklearn.preprocessing import MinMaxScaler
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# tidy but also hides deprecation notices from the plotting/data libraries,
# so comment it out when debugging unexpected results.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# plt.style.use('seaborn')  # left disabled; NOTE(review): newer matplotlib
#                           # releases name this style 'seaborn-v0_8' - confirm
#                           # against the installed version before re-enabling.
# Use seaborn's "husl" colour palette as the default for all later plots.
sns.set_palette("husl")

In [3]:
def validate_columns(df):
    """Make sure every column the analysis relies on is present in ``df``.

    Each expected column is paired with a type whose zero-argument call
    supplies the fill value (``str()`` -> ``''``, ``float()`` -> ``0.0``).
    Columns that are absent are reported on stdout and then created with that
    constant value. Columns that already exist are left untouched; their
    dtype is not checked or converted here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check. It is modified in place when columns are missing.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    default_factory_by_column = {
        'desc': str,
        'stats.playCount': float,
        'stats.diggCount': float,
        'stats.shareCount': float,
        'stats.commentCount': float,
        'video.duration': float
    }

    absent = []
    for name in default_factory_by_column:
        if name not in df.columns:
            absent.append(name)

    # Nothing to repair: hand the frame straight back.
    if not absent:
        return df

    print(f"Warning: Missing columns: {absent}")
    for name in absent:
        make_default = default_factory_by_column[name]
        df[name] = make_default()

    return df

In [4]:
def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its first ``sample_size`` bytes.

    Parameters
    ----------
    file_path : str or path-like
        File to inspect; opened in binary mode.
    sample_size : int, default 10000
        Number of leading bytes handed to ``chardet.detect``.

    Returns
    -------
    str
        The encoding name reported by chardet, except that ``'ascii'`` and
        "unknown" (``None``) are both returned as ``'utf-8'``.

    Notes
    -----
    Only a prefix of the file is sampled. When that prefix happens to be pure
    ASCII, chardet answers ``'ascii'``, and decoding the full file with a
    strict ASCII codec then fails on the first non-ASCII byte further down.
    UTF-8 decodes ASCII bytes identically, so substituting it is lossless for
    genuinely-ASCII files and avoids that failure. chardet may also report
    ``None`` (e.g. for an empty sample); UTF-8 is used as the fallback there
    as well.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read(sample_size)

    encoding = chardet.detect(raw_data).get('encoding')
    if encoding is None or encoding.lower() == 'ascii':
        return 'utf-8'
    return encoding

def load_and_preprocess_data(file_path):
    """Load the TikTok video table and derive the columns used by the analysis.

    Steps: check the file exists, detect its encoding, read it as a
    tab-separated table (malformed lines are skipped), ensure the required
    columns exist, then add:

    * ``clean_desc``      - lower-cased description with everything except
                            word characters, whitespace and ``#`` removed
    * ``hashtags``        - list of lower-cased hashtag names (without ``#``)
    * ``hashtag_count``   - length of ``hashtags``
    * ``duration_min``    - ``video.duration / 60``
    * ``engagement_rate`` - ``(diggs + shares + comments) / plays``,
                            ``0`` where the play count is not positive

    The four ``stats.*`` columns and ``video.duration`` are coerced to
    numeric; unparseable or missing values become ``0``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the tab-separated input file.

    Returns
    -------
    pandas.DataFrame or None
        The processed frame, or ``None`` when the file is missing or any step
        raises (the error is printed rather than propagated).
    """
    try:
        file_path = Path(file_path)
        if not file_path.exists():
            print(f"Error: File not found at {file_path}")
            return None

        # Detect encoding dynamically
        detected_encoding = detect_encoding(file_path)
        print(f"Detected encoding: {detected_encoding}")

        # Load with detected encoding.
        # NOTE(review): the table is parsed as tab-separated regardless of the
        # file extension - confirm this matches how the input is exported.
        df = pd.read_csv(file_path, sep='\t', encoding=detected_encoding, on_bad_lines='skip')
        print("Successfully loaded data!")

        # Validate required columns
        df = validate_columns(df)

        # Text features
        df['desc'] = df['desc'].fillna('')
        df['clean_desc'] = df['desc'].apply(lambda x: re.sub(r'[^\w\s#]', '', str(x).lower()))
        df['hashtags'] = df['desc'].apply(lambda x: re.findall(r'#(\w+)', str(x).lower()))
        df['hashtag_count'] = df['hashtags'].apply(len)

        # Convert numeric columns
        numeric_columns = ['stats.playCount', 'stats.diggCount', 'stats.shareCount',
                           'stats.commentCount', 'video.duration']
        for col in numeric_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

        # Derived metrics. The summary statistics and plotting code index
        # 'duration_min' and 'engagement_rate', so they must exist on the
        # returned frame.
        # NOTE(review): assumes video.duration is expressed in seconds - confirm.
        df['duration_min'] = df['video.duration'] / 60

        # NOTE(review): engagement rate is taken as interactions per play;
        # adjust if the project uses a different definition.
        interactions = (df['stats.diggCount']
                        + df['stats.shareCount']
                        + df['stats.commentCount'])
        plays = df['stats.playCount']
        # Mask non-positive play counts so the division yields NaN (not inf),
        # then report those rows as zero engagement.
        df['engagement_rate'] = (interactions / plays.where(plays > 0)).fillna(0)

        return df

    except Exception as e:
        # Deliberate best-effort: callers test for None instead of catching.
        print(f"Error loading data: {e}")
        return None

In [5]:
def plot_video_metrics_distribution(df):
    """Draw a 2x2 grid of histograms (with KDE) for the four count metrics.

    For each of play, digg, share and comment counts, rows above
    ``Q3 + 1.5 * IQR`` are left out of the histogram so the bulk of the
    distribution stays readable. A text box in the top-right corner of every
    panel reports the mean, median and max of the full column and the
    percentage of rows that were above the cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns ``stats.playCount``,
        ``stats.diggCount``, ``stats.shareCount`` and ``stats.commentCount``.
    """
    metric_columns = ('stats.playCount', 'stats.diggCount',
                      'stats.shareCount', 'stats.commentCount')
    annotation_box = dict(boxstyle='round', facecolor='white', alpha=0.8)

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Distribution of Video Metrics (Excluding Outliers)', fontsize=16)

    for position, column in enumerate(metric_columns):
        grid_row, grid_col = divmod(position, 2)
        panel = axes[grid_row, grid_col]
        short_name = column.split('.')[-1]

        # Upper Tukey fence; only the high side is trimmed.
        lower_quartile = df[column].quantile(0.25)
        upper_quartile = df[column].quantile(0.75)
        cutoff = upper_quartile + 1.5 * (upper_quartile - lower_quartile)

        within_fence = df[df[column] <= cutoff]
        sns.histplot(data=within_fence, x=column, ax=panel, bins=30, kde=True)

        panel.set_title(f'{short_name} Distribution')
        panel.set_xlabel(short_name)
        panel.set_ylabel('Count')

        # Summary figures come from the untrimmed column.
        summary = df[column].describe()
        outlier_share = (df[column] > cutoff).mean() * 100
        stats_text = '\n'.join([
            f'Mean: {summary["mean"]:,.0f}',
            f'Median: {summary["50%"]:,.0f}',
            f'Max: {summary["max"]:,.0f}',
            f'% Outliers: {outlier_share:.1f}%',
        ])

        panel.text(0.95, 0.95, stats_text,
                   transform=panel.transAxes,
                   verticalalignment='top',
                   horizontalalignment='right',
                   bbox=annotation_box)

    plt.tight_layout()
    plt.show()

In [6]:
def analyze_engagement_patterns(df):
    """Plot the engagement-rate distribution and a metric correlation heatmap.

    Top panel: histogram (with KDE) of ``engagement_rate`` restricted to rows
    at or below its 95th percentile, annotated with the mean, median and 95th
    percentile of the full column.
    Bottom panel: Pearson correlation matrix of the count metrics,
    ``hashtag_count`` and ``duration_min``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``engagement_rate``, ``hashtag_count``, ``duration_min``
        and the four ``stats.*`` count columns, all numeric.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

    rate = df['engagement_rate']
    # Computed once and reused for both the plot filter and the annotation.
    # Series.describe() only reports the 25/50/75% quantiles by default, so a
    # "95%" entry cannot be read from it without requesting that percentile.
    p95 = rate.quantile(0.95)

    # Engagement rate distribution
    sns.histplot(data=df[rate <= p95],
                 x='engagement_rate', bins=30, ax=ax1, kde=True)
    ax1.set_title('Distribution of Engagement Rates (Excluding Top 5%)')
    ax1.set_xlabel('Engagement Rate')
    ax1.set_ylabel('Count')

    # Add engagement statistics
    stats_text = (f'Mean: {rate.mean():.3f}\n'
                  f'Median: {rate.median():.3f}\n'
                  f'95th percentile: {p95:.3f}')
    ax1.text(0.95, 0.95, stats_text,
             transform=ax1.transAxes,
             verticalalignment='top',
             horizontalalignment='right',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Correlation matrix
    metrics = ['stats.playCount', 'stats.diggCount', 'stats.shareCount',
               'stats.commentCount', 'hashtag_count', 'duration_min']
    corr = df[metrics].corr()

    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=ax2, fmt='.2f')
    ax2.set_title('Correlation Between Video Metrics')

    plt.tight_layout()
    plt.show()

def analyze_hashtags_and_duration(df):
    """Plot hashtag usage and video-duration patterns on a 2x2 grid.

    Panels:
    1. histogram of ``hashtag_count`` (one bar per integer count)
    2. horizontal bar chart of the ten most frequent hashtags, or a
       placeholder message when no hashtag occurs at all
    3. histogram (with KDE) of ``duration_min`` at or below its 95th percentile
    4. scatter of ``duration_min`` against ``engagement_rate`` for the same rows

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``hashtag_count`` (numeric), ``hashtags`` (iterables of
        tag strings), ``duration_min`` and ``engagement_rate`` (numeric).
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # Hashtag count distribution.
    # max() is NaN for an empty frame and may be a float if the column was
    # not stored as integers; range() accepts neither, so normalise to int.
    largest_count = df['hashtag_count'].max()
    largest_count = int(largest_count) if pd.notna(largest_count) else 0
    sns.histplot(data=df, x='hashtag_count', bins=range(0, largest_count + 2),
                 ax=ax1, discrete=True)
    ax1.set_title('Distribution of Hashtags per Video')
    ax1.set_xlabel('Number of Hashtags')
    ax1.set_ylabel('Count')

    # Top hashtags
    all_hashtags = [tag for tags in df['hashtags'] for tag in tags]
    if all_hashtags:
        top_hashtags = pd.Series(all_hashtags).value_counts().head(10)
        sns.barplot(x=top_hashtags.values, y=top_hashtags.index, ax=ax2)
    else:
        # Nothing to rank: keep the panel but say so instead of plotting
        # an empty series.
        ax2.text(0.5, 0.5, 'No hashtags found',
                 transform=ax2.transAxes,
                 horizontalalignment='center',
                 verticalalignment='center')
    ax2.set_title('Top 10 Most Used Hashtags')
    ax2.set_xlabel('Count')

    # Rows at or below the 95th duration percentile feed both lower panels.
    typical_duration = df[df['duration_min'] <= df['duration_min'].quantile(0.95)]

    # Video duration distribution
    sns.histplot(data=typical_duration, x='duration_min', bins=30, ax=ax3, kde=True)
    ax3.set_title('Video Duration Distribution (Excluding Top 5%)')
    ax3.set_xlabel('Duration (minutes)')
    ax3.set_ylabel('Count')

    # Duration vs engagement
    sns.scatterplot(data=typical_duration,
                    x='duration_min', y='engagement_rate', ax=ax4, alpha=0.5)
    ax4.set_title('Duration vs Engagement Rate')
    ax4.set_xlabel('Duration (minutes)')
    ax4.set_ylabel('Engagement Rate')

    plt.tight_layout()
    plt.show()

def create_wordcloud(df):
    """Render a word cloud built from the ``clean_desc`` column.

    All descriptions are joined with single spaces and passed to
    ``WordCloud.generate``. If there is no text to draw - the column is
    empty/blank, or the generator rejects the text with ``ValueError`` -
    a message is printed and no figure is produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_desc`` column of description strings.
        Missing values are ignored and other values are stringified.
    """
    # dropna/astype keep str.join from failing on NaN or non-string cells.
    text = ' '.join(df['clean_desc'].dropna().astype(str))
    if not text.strip():
        print("No description text available; skipping word cloud.")
        return

    try:
        wordcloud = WordCloud(width=1200, height=600,
                              background_color='white',
                              max_words=200,
                              collocations=False).generate(text)
    except ValueError as e:
        # Raised by the generator when no usable word remains in the text
        # (e.g. only punctuation or filtered-out words).
        print(f"Could not generate word cloud: {e}")
        return

    plt.figure(figsize=(15, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Video Descriptions', fontsize=16)
    plt.show()

In [None]:
def main(file_path=None):
    """Run the full pipeline: load, summarise, plot, and save the data.

    Parameters
    ----------
    file_path : str or path-like, optional
        Input file. When omitted, the ``TIKTOK_DATA_PATH`` environment
        variable is used, falling back to ``data/interim/video_info.csv``
        relative to the current working directory.

    Returns
    -------
    None
        Returns early (after the loader has printed the reason) when the data
        cannot be loaded. Otherwise writes ``processed_tiktok_data.csv`` to
        the current working directory; a failure to save is printed, not
        raised.
    """
    # A machine-specific absolute path would only work on one computer, so
    # the fallback is project-relative and can be overridden via the
    # argument or the environment variable.
    default_path = "data/interim/video_info.csv"

    if file_path is None:
        file_path = os.getenv('TIKTOK_DATA_PATH', default_path)

    print(f"Attempting to load data from: {file_path}")

    df = load_and_preprocess_data(file_path)
    if df is None:
        return

    print(f"\nSuccessfully loaded {len(df)} records")

    print("\nDataset Info:")
    # DataFrame.info() writes to stdout itself and returns None; wrapping it
    # in print() would add a stray "None" line.
    df.info()

    print("\nBasic Statistics:")
    print(df[['stats.playCount', 'stats.diggCount', 'stats.shareCount',
              'stats.commentCount', 'engagement_rate', 'duration_min']].describe())

    # Generate visualizations
    print("\nGenerating visualizations...")
    plot_video_metrics_distribution(df)
    analyze_engagement_patterns(df)
    analyze_hashtags_and_duration(df)
    create_wordcloud(df)

    # Save processed data
    try:
        output_path = Path('processed_tiktok_data.csv')
        # utf-8-sig adds a BOM to the output file.
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"\nSaved processed data to {output_path}")
    except Exception as e:
        print(f"Error saving processed data: {str(e)}")

In [11]:
# Entry point: run the whole pipeline when this code is executed directly
# (as a script or as a notebook cell) rather than imported as a module.
if __name__ == '__main__':
    main()  

AttributeError: module 'sys' has no attribute 'stdou'