In [4]:
# =============================================
# 🧹 ЭТАП 2: Предобработка данных TikTok (обновлённая версия)
# =============================================

import pandas as pd
import numpy as np
import os

# ---------------------------------------------
# 1️⃣ Load the raw dataset
# ---------------------------------------------
print("📥 Loading raw dataset...")
df = pd.read_csv(filepath_or_buffer='data/tiktok_dataset.csv')
# Announce success and show the first five rows in a single call;
# sep="\n" keeps the output identical to two consecutive prints.
print("✅ Dataset loaded successfully!", df.head(n=5), sep="\n")

# ---------------------------------------------
# 2️⃣ Inspect the structure of the data
# ---------------------------------------------
print("\n📊 Dataset info:")
# DataFrame.info() writes its report (columns, dtypes, non-null counts)
# straight to stdout and returns None, so it is not wrapped in print().
df.info()

# ---------------------------------------------
# 3️⃣ Convert types, then drop duplicates and missing rows
# ---------------------------------------------
# Type conversion runs first on purpose: pd.to_numeric(errors='coerce')
# turns unparseable values into NaN, and those rows must still be visible
# to the dropna() below. Doing it the other way round lets freshly created
# NaN view counts slip into the cleaned dataset.
initial_shape = df.shape

numeric_cols = [
    'video_view_count',
    'video_like_count',
    'video_share_count',
    'video_download_count',
    'video_comment_count',
]

df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# The '#' column looks like a running row number (1, 2, 3, ... in the
# printed sample). If it takes part in the comparison every row is unique
# and drop_duplicates() can never remove anything, so compare rows on all
# other columns only. Falls back to pandas' default (all columns) if '#'
# happens to be the only column.
dedup_cols = [col for col in df.columns if col != '#']
df.drop_duplicates(subset=dedup_cols or None, inplace=True)

# Drop rows that have no video text or no (valid numeric) view count.
df.dropna(subset=['video_transcription_text', 'video_view_count'], inplace=True)
print(f"\n🧼 Removed duplicates and NaNs: {initial_shape} → {df.shape}")

# ---------------------------------------------
# 5️⃣ Add an "engagement" column
# ---------------------------------------------
# Engagement = likes + shares + comments + downloads, added in that order.
# Series.add() without fill_value behaves exactly like the `+` operator,
# so a NaN in any of the four counts still yields NaN for that row.
df['engagement'] = (
    df['video_like_count']
    .add(df['video_share_count'])
    .add(df['video_comment_count'])
    .add(df['video_download_count'])
)

# ---------------------------------------------
# 6️⃣ Check the results
# ---------------------------------------------
# Each heading and its table go through one print call; sep="\n" puts the
# table on the line after the heading, exactly like two separate prints.
print("\n📈 Sample of cleaned data:", df.head(n=5), sep="\n")
print("\n📋 Summary statistics:", df.describe(), sep="\n")

# ---------------------------------------------
# 7️⃣ Save the cleaned dataset
# ---------------------------------------------
# Make sure the target directory exists (no error if it already does),
# then write the frame without the pandas index column.
os.makedirs(name='data', exist_ok=True)
df.to_csv(path_or_buf='data/tiktok_dataset_clean.csv', index=False)

print("\n✅ Clean dataset saved successfully → data/tiktok_dataset_clean.csv")


📥 Loading raw dataset...
✅ Dataset loaded successfully!
   # claim_status    video_id  video_duration_sec  \
0  1        claim  7017666017                  59   
1  2        claim  4014381136                  32   
2  3        claim  9859838091                  31   
3  4        claim  1866847991                  25   
4  5        claim  7105231098                  19   

                            video_transcription_text verified_status  \
0  someone shared with me that drone deliveries a...    not verified   
1  someone shared with me that there are more mic...    not verified   
2  someone shared with me that american industria...    not verified   
3  someone shared with me that the metro of st. p...    not verified   
4  someone shared with me that the number of busi...    not verified   

  author_ban_status  video_view_count  video_like_count  video_share_count  \
0      under review          343296.0           19425.0              241.0   
1            active          140877.