# 預處理流程

清除 emoji、@@username、url -> 存成 cleaned_text 欄位

In [1]:
# pip install emoji
# !pip install pandas



In [2]:
import pandas as pd
import re

In [None]:
# df = pd.read_csv('comments/face2face_comments_merged.csv', encoding='utf-8')
# print(f"一共有 {len(df)} 筆留言")

一共有 105186 筆留言


In [18]:
# 取不重複留言者姓名

def get_name_list(data):
    """
    Return the distinct, non-empty author names found in ``data['author_name']``.

    Every name is converted to ``str`` and stripped of surrounding whitespace.
    Missing values (NaN/None) are skipped; converting them with ``str()`` would
    otherwise add the literal name ``'nan'`` to the result.

    Side effect (kept for backward compatibility): ``data['author_name']`` is
    overwritten in place with the stripped names; missing values stay missing.

    Args:
        data: DataFrame with an 'author_name' column.

    Returns:
        list of str: unique names in order of first appearance.
    """
    raw_names = data['author_name']
    has_name = raw_names.notna()
    stripped = raw_names.astype(str).str.strip()

    # Write the normalised names back, leaving missing entries untouched.
    data['author_name'] = stripped.where(has_name, raw_names)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is deterministic (iterating a plain set gives an arbitrary order).
    return list(dict.fromkeys(name for name in stripped[has_name] if name))


# name_list = get_name_list(df)
# print(">>> name_list:", name_list)  
# print(f"一共有 {len(name_list)} 個不重複留言者")

In [17]:
import re
import emoji

def cleaning_text(comment_data, name_list):
    """
    Build a 'cleaned_text' column from 'comment_text' and drop rows that end up empty.

    Steps applied to every comment, in order:
      1. Replace HTML tags (and whole ``<a href=...>...</a>`` links) with a space.
      2. Delete HTML entities, both named (``&quot;``) and numeric (``&#39;``, ``&#x27;``).
      3. Delete emoji with ``emoji.replace_emoji``.
      4. Replace ``@@username`` mentions of names in ``name_list`` with a space.
      5. Collapse runs of whitespace.
      6. Replace every character that is not a word character, whitespace or in
         the range U+4E00..U+9FA5 with a space.
      7. Replace remaining bare occurrences of names in ``name_list`` with a space
         and collapse whitespace again.
      8. Drop rows whose cleaned text is empty.
      9. Print the total number of dropped rows.

    Args:
        comment_data: DataFrame with 'comment_text' and 'video_title' columns.
            It is not modified; the work is done on a copy.
        name_list: iterable of user-name strings. Empty names are ignored.

    Returns:
        DataFrame: the surviving rows of the copy, with the extra 'cleaned_text' column.
    """
    # Longest names first, so a short name that is a prefix of a longer one
    # ('bob' vs 'bobby') cannot match first and leave a fragment behind.
    # Empty names are dropped: they would match at every position.
    names = sorted(
        dict.fromkeys(name for name in (name_list or []) if name),
        key=len,
        reverse=True,
    )

    html_tag = re.compile(r'<a href=.*?>.*?</a>|<br>|</?b>|<[^>]+>')
    # Named entities plus decimal/hex numeric ones such as &#39; or &#x27;
    html_entity = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);')
    mention = (
        re.compile(r'@@(?:' + '|'.join(map(re.escape, names)) + r')')
        if names else None
    )
    whitespace = re.compile(r'\s+')
    punctuation = re.compile(r'[^\w\s\u4e00-\u9fa5]')

    def clean_one(text):
        text = html_tag.sub(' ', text)
        text = html_entity.sub('', text)
        text = emoji.replace_emoji(text, replace='')
        if mention is not None:
            text = mention.sub(' ', text)
        text = whitespace.sub(' ', text).strip()
        text = punctuation.sub(' ', text)
        # Second pass: names that appear without the '@@' prefix.
        for name in names:
            text = text.replace(name, ' ')
        return whitespace.sub(' ', text).strip()

    df = comment_data.copy()
    # Missing comments become '' (and are dropped below) instead of the text 'nan'.
    df['cleaned_text'] = df['comment_text'].fillna('').astype(str).map(clean_one)

    # Drop comments that are empty after cleaning.
    df_cleaned = df[df['cleaned_text'] != '']

    # Per-title number of dropped rows; the printed figure is their total.
    original_counts = comment_data.groupby('video_title').size()
    after_counts = df_cleaned.groupby('video_title').size().reindex(original_counts.index, fill_value=0)
    drop_count = original_counts - after_counts
    print(f"過濾掉 {int(drop_count.sum())} 筆留言")

    return df_cleaned


In [20]:
# Per-episode check for the 賀瓏 videos: verify that user names are fully removed from 'cleaned_text'
# NOTE(review): range(0, 36) also covers videos 10, 11, 12, 16 and 17, which the later
# cells skip explicitly — confirm those input files exist.

for i in range(0, 36):
    file_path = f"hello_comments/for_bert/video_{i}_ckip.csv"
    temp_df = pd.read_csv(file_path, encoding='utf-8')
    print(f">>> Processing video_{i} with {len(temp_df)} comments")

    # Collect the distinct commenter names of this video
    name_list = get_name_list(temp_df)
    # print(f">>> name_list for video_{i}: {name_list}")

    # Rebuild cleaned_text with those names removed and save the surviving rows
    cleaned_df = cleaning_text(temp_df, name_list)
    cleaned_df.to_csv(f'hello_comments/for_bert/video_{i}_ckip_cleaned.csv', index=False)

>>> Processing video_0 with 1948 comments
過濾掉 1 筆留言
>>> Processing video_1 with 13270 comments


KeyboardInterrupt: 

# 計算互動

- 技術性互動：top_comment/reply/single
- 實質討論：reply/@@username

In [None]:
# df = pd.read_csv('comments/face2face_cleaned.csv')

In [12]:

def tag_interact(df, name_list, output_path='comments/face2face_discussion_group.csv'):
    """
    Tag every comment with an 'interaction_type', video by video.

    Tags:
      - "top_interaction": a 'top_comment' immediately followed by a 'reply'
        (the first comment of a thread).
      - "reply": a comment whose comment_type is 'reply', or a 'top_comment'
        without replies whose text contains ``@@<name>`` for any name in
        ``name_list``.
      - "single": a 'top_comment' without replies and without such a mention.
      - None: any other comment_type.

    A 'top_comment' counts as having no replies whenever the next row of the
    same video is not a 'reply', which includes the last row of a video.

    Side effect (kept for backward compatibility): a 'video_id' column
    (1-based, in order of first appearance of each 'video_title') is added to
    ``df`` in place.

    Args:
        df: DataFrame with 'video_title', 'comment_type' and 'comment_text'
            columns, rows ordered so that replies follow their top comment.
        name_list: iterable of user names to look for as ``@@name`` mentions.
        output_path: CSV path the result is written to; pass None to skip
            writing.

    Returns:
        DataFrame: all rows with 'video_id', 'interaction_type' and an 'index'
        column holding each row's original index label.
    """
    video_id_map = {title: idx + 1 for idx, title in enumerate(df['video_title'].unique())}
    df['video_id'] = df['video_title'].map(video_id_map)

    # Built once; an empty name would turn into a bare '@@' and match too much.
    mention_tokens = [f"@@{str(name)}" for name in name_list if str(name)]

    def classify(current_type, next_type, text):
        if current_type == 'reply':
            return "reply"
        if current_type != 'top_comment':
            return None
        if next_type == 'reply':
            return "top_interaction"
        # A single matching name is enough; str() guards against NaN text.
        text = str(text)
        if any(token in text for token in mention_tokens):
            return "reply"
        return "single"

    tagged_parts = []

    # Each video is handled on its own so threads never span two videos.
    for vid in df['video_id'].unique():
        print(f"=== 處理影片 {vid} ===")
        sub_df = df[df['video_id'] == vid].reset_index()

        types = sub_df['comment_type'].tolist()
        next_types = types[1:] + [None]  # the last row of a video has no successor
        sub_df['interaction_type'] = [
            classify(current_type, next_type, text)
            for current_type, next_type, text in zip(types, next_types, sub_df['comment_text'])
        ]
        tagged_parts.append(sub_df)

    result_df = pd.concat(tagged_parts) if tagged_parts else pd.DataFrame()

    # Save the result to CSV
    if output_path is not None:
        result_df.to_csv(output_path, index=False)

    return result_df

# interaction_tag = tag_interact(df, names)

In [13]:
# count interaction type
def count_dc_group(df: pd.DataFrame, output_path='comments/face2face_discussion_counts.csv') -> pd.DataFrame:
    """
    Count, per video, how many comments take part in an interaction.

    A comment is interactive when it has an 'interaction_type' other than
    'single'. Rows with a missing tag are not counted as interactive; they
    still count towards the total.

    Args:
        df: DataFrame with 'video_id', 'video_title' and 'interaction_type' columns.
        output_path: CSV path the summary is written to; pass None to skip writing.

    Returns:
        DataFrame with one row per video_id (in order of first appearance) and
        the columns video_id, video_title, total_comments, interactive_comments
        and interaction_ratio (rounded to 4 decimals).
    """
    result = []

    # sort=False keeps the videos in the order they first appear in df.
    for vid, sub_df in df.groupby('video_id', sort=False):
        tags = sub_df['interaction_type']

        # Interactive comments: tagged, and tagged as something other than 'single'
        interactive_comments = int((tags.notna() & (tags != 'single')).sum())

        # Total number of comments for this video
        total_comments = len(sub_df)

        result.append({
            'video_id': vid,
            'video_title': sub_df['video_title'].iloc[0],
            'total_comments': total_comments,
            'interactive_comments': interactive_comments,
            'interaction_ratio': round(interactive_comments / total_comments, 4),
        })

    result_df = pd.DataFrame(result)
    if output_path is not None:
        result_df.to_csv(output_path, index=False)

    return result_df

# interaction_counts = count_dc_group(interaction_tag)

# Spam tag

### 客觀條件判斷spam

- 以cleaned_text為主
- 同一使用者發表相同留言超過 3 次
- 留言字數<5

In [12]:
def count_spammer_comments(df):
    """
    Add a 'spam_tag' column to ``df`` using two objective rules.

    A comment is tagged 'spam' when either
      - the same author posted exactly the same 'cleaned_text' more than 3
        times, or
      - its 'cleaned_text' is shorter than 5 characters.
    Every other comment gets the empty string ''.

    Missing 'cleaned_text' values are treated as empty text (and therefore as
    too short). Rows with a missing author are never tagged by the duplicate
    rule.

    Also prints the authors that have more than 3 comments in total.

    Args:
        df: DataFrame with 'author_name' and 'cleaned_text' columns. It is
            modified in place (callers read 'spam_tag' from the same object).

    Returns:
        DataFrame: the same ``df`` object, with 'spam_tag' added.
    """
    user_counts = df['author_name'].value_counts()
    name_count_list = [{name: count} for name, count in user_counts.items() if count > 3]
    print(">>> 留言數量大於 3 的使用者：", name_count_list)

    authors = df['author_name'].tolist()
    has_author = df['author_name'].notna().tolist()
    texts = df['cleaned_text'].fillna('').astype(str).tolist()

    # One pass to count each (author, text) pair, instead of re-filtering the
    # whole frame for every row.
    pair_counts = {}
    for author, text, known in zip(authors, texts, has_author):
        if known:
            pair_counts[(author, text)] = pair_counts.get((author, text), 0) + 1

    df['spam_tag'] = [
        'spam' if (known and pair_counts[(author, text)] > 3) or len(text) < 5 else ''
        for author, text, known in zip(authors, texts, has_author)
    ]

    return df

In [None]:
# Per-file filtering statistics, keyed by 'video_{i}'
count_spam = {}

for i in range(0, 36):
    if i in [10, 11, 12, 16, 17]:
        print(f"跳過 video_{i}")
        continue

    print(f"\n=== 處理 video_{i} ===")
    # NOTE(review): the cleaning cell writes to 'hello_comments/for_bert/...', while this
    # cell reads from 'for_bert/...' — confirm the working directory / path.
    file_name = f'for_bert/video_{i}_ckip_cleaned.csv'
    df = pd.read_csv(file_name, encoding='utf-8')
    original_count = len(df)
    print(f"一共有 {original_count} 筆留言")

    # count_spammer_comments adds 'spam_tag' to df in place and returns that same
    # frame, so tag_df is just another name for df.
    tag_df = count_spammer_comments(df)
    
    # Keep only the non-spam rows and the two columns needed downstream
    filtered_df = df[df['spam_tag'] != 'spam'][['video_title', 'cleaned_text']]
    filtered_count = len(filtered_df)
    print(f"過濾後剩下 {filtered_count} 筆留言")

    # Record the statistics for this file
    count_spam[f'video_{i}'] = {
        'original_count': original_count,
        'filtered_count': filtered_count,
        'spam_count': original_count - filtered_count
    }

    # save to csv
    filtered_df.to_csv(f'for_gpt_tag/video_{i}_filtered_spam.csv', index=False)

# Turn the statistics into a DataFrame (one row per file) and write them to CSV
spam_stats_df = pd.DataFrame.from_dict(count_spam, orient='index')
spam_stats_df.index.name = 'video_file'
spam_stats_df.to_csv('for_gpt_tag/spam_filter_stats.csv')

# Print the overall totals
total_original = spam_stats_df['original_count'].sum()
total_filtered = spam_stats_df['filtered_count'].sum()
print("\n=== 總計 ===")
print(f"總原始留言數: {total_original}")
print(f"過濾後留言數: {total_filtered}")
print(f"過濾掉的留言數: {total_original - total_filtered}")
print(f"總過濾比例: {(total_original - total_filtered) / total_original:.4f}")


=== 處理 video_0 ===
一共有 1948 筆留言
>>> 留言數量大於 3 的使用者： [{'1taichang': 26}, {'翡翠-c9q': 23}, {'哈哈-z1p': 22}, {'image1191': 19}, {'patrickyang5760': 15}, {'gravitywide': 13}, {'alstonc.8964': 12}, {'user-fukakai': 11}, {'wallowes': 9}, {'rayrayrayray7630': 9}, {'UncleJigen': 9}, {'陳志慈-b6g': 8}, {'allen747': 7}, {'kycia12': 6}, {'kevinfan8020': 6}, {'Chou-d8u': 6}, {'蕭秉科-c1r': 5}, {'超營養雞排-o3e': 5}, {'alicelin661': 5}, {'烟一缕': 4}, {'jtang4378': 4}, {'楊繼芬': 4}, {'kokkeonglai2581': 4}, {'香吉士-q1n': 4}, {'推翻共產黨共產黨才是': 4}, {'0鍇': 4}, {'adsmk22': 4}, {'joyoungjin-yc8sb': 4}, {'bobobo0413': 4}]
過濾後剩下 1699 筆留言

=== 處理 video_1 ===
一共有 13270 筆留言
>>> 留言數量大於 3 的使用者： [{'陳小名-r6g': 121}, {'尋思-l7u': 72}, {'Avan-Golf': 65}, {'Alex-q9o1q': 64}, {'kennyko7730': 57}, {'gto-iu1qq': 51}, {'image1191': 50}, {'Fm-qu3ts': 47}, {'GodBlessedyumei': 45}, {'ssf846': 39}, {'mercy204': 37}, {'陳霖涵-r2n': 37}, {'海軍會暈船': 36}, {'kwokgary989': 35}, {'hu5916': 33}, {'Ernst8963': 29}, {'canonhuang4001': 28}, {'473834': 26}, {'bobob

### 人工標注spam/non-spam
spam 的標準：

- 留言過短 （少於5個字）
- 無意義的內容 
    - e.g. 哈哈哈哈哈哈哈、Albee真的好強...、你講支語喔、贺龙好样的
    - 無指涉（缺少主詞） e.g. 這集超好笑 真的超有梗、真的很喜歡夜夜秀這個節目 辛苦了
    - 留言內或留言串中內容一直重複
    - 只有批評或讚美 e.g. 所以才是腦殘阿 根本就一昧的跟從 傻B


In [14]:
"""
Build the data set for manual spam / non-spam labelling.

For every video, comments are sampled at random from its spam-tagged file:
2% of the video's comments, but at least 100 (or all of them when the video
has fewer than 100).
"""

import pandas as pd
import os

comments_for_tag = []
stats_data = []  # per-video sampling statistics

# Process every video
for i in range(0, 36):
    if i in [10, 11, 12, 16, 17]:
        print(f"跳過 video_{i}")
        continue

    file_path = f"spam_tag/video_{i}_ckip_spam_tag.csv"
    # file_path = f"hello_comments/for_bert/video_{i}_ckip_cleaned.csv"
    print(f"Processing video {i} from {file_path}")

    # Skip videos whose file is missing
    if not os.path.exists(file_path):
        print(f"  檔案不存在，跳過")
        continue

    temp_df = pd.read_csv(file_path)
    original_comment_num = len(temp_df)
    video_title = temp_df['video_title'].iloc[0] if len(temp_df) > 0 else f"Video_{i}"

    print(f"  一共有 {original_comment_num} 筆留言")

    # Sample 2% of the comments, but at least 100
    sample_ratio = 0.02
    calculated_sample_size = max(100, int(original_comment_num * sample_ratio))

    # Never ask for more rows than the file has
    sample_size = min(calculated_sample_size, original_comment_num)

    # Fixed random_state keeps the sample reproducible
    sampled_comments = temp_df.sample(n=sample_size, random_state=42)
    comments_for_tag.append(sampled_comments)

    # Record the statistics (one entry per processed video)
    stats_data.append({
        'video_id': i,
        'video_title': video_title,
        'original_comment_num': original_comment_num,
        'tag_comments_num': len(sampled_comments)
    })

    print(f"  抽取了 {len(sampled_comments)} 則留言")

# Build the statistics DataFrame
stats_df = pd.DataFrame(stats_data)
print("\n=== 影片留言統計 ===")
print(stats_df)

# Only write output when at least one video was sampled
if comments_for_tag:
    tag_comments_all = pd.concat(comments_for_tag, ignore_index=True)
    tag_comments_all = tag_comments_all.reset_index(drop=True)
    # NOTE(review): ids are v0, v1, ... in order of first appearance of each title, so
    # they differ from the file index i (stats_df['video_id']) once a video is skipped
    # or missing — confirm this is intended.
    tag_comments_all['video_id'] = tag_comments_all['video_title'].map(
        {title: f"v{idx}" for idx, title in enumerate(tag_comments_all['video_title'].unique())}
    )

    tag_comments_all = tag_comments_all[['video_id','cleaned_text','spam_tag']]

    # Save the sampled comments
    tag_comments_all.to_csv("spam_tag/comments_spam_tag.csv", index=False)
    print(f"\n抽取的留言已儲存到 spam_tag/comments_spam_tag.csv with {len(tag_comments_all)} comments")

    # Save the per-video sampling statistics
    stats_df.to_csv("spam_tag/video_sampling_stats.csv", index=False)
    print(f"統計資料已儲存到 spam_tag/video_sampling_stats.csv")

else:
    print("沒有找到任何資料可供處理")

# Final summary
print(f"\n=== 總計 ===")
print(f"總影片數: {len(stats_df)}")
# With no processed video, stats_df has no columns, so the totals below would
# raise KeyError; they are only printed when there is data.
if not stats_df.empty:
    print(f"總原始留言數: {stats_df['original_comment_num'].sum()}")
    print(f"總抽取留言數: {stats_df['tag_comments_num'].sum()}")
    print(f"抽取比例: {stats_df['tag_comments_num'].sum() / stats_df['original_comment_num'].sum():.4f}")

Processing video 0 from spam_tag/video_0_ckip_spam_tag.csv
  一共有 1948 筆留言
  抽取了 100 則留言
Processing video 1 from spam_tag/video_1_ckip_spam_tag.csv
  一共有 13270 筆留言
  抽取了 265 則留言
Processing video 2 from spam_tag/video_2_ckip_spam_tag.csv
  一共有 1513 筆留言
  抽取了 100 則留言
Processing video 3 from spam_tag/video_3_ckip_spam_tag.csv
  一共有 1099 筆留言
  抽取了 100 則留言
Processing video 4 from spam_tag/video_4_ckip_spam_tag.csv
  一共有 989 筆留言
  抽取了 100 則留言
Processing video 5 from spam_tag/video_5_ckip_spam_tag.csv
  一共有 1401 筆留言
  抽取了 100 則留言
Processing video 6 from spam_tag/video_6_ckip_spam_tag.csv
  一共有 1161 筆留言
  抽取了 100 則留言
Processing video 7 from spam_tag/video_7_ckip_spam_tag.csv
  一共有 2135 筆留言
  抽取了 100 則留言
Processing video 8 from spam_tag/video_8_ckip_spam_tag.csv
  一共有 1942 筆留言
  抽取了 100 則留言
Processing video 9 from spam_tag/video_9_ckip_spam_tag.csv
  一共有 1242 筆留言
  抽取了 100 則留言
跳過 video_10
跳過 video_11
跳過 video_12
Processing video 13 from spam_tag/video_13_ckip_spam_tag.csv
  一共有 6783 筆留言
  抽取了 13