In [None]:
# Make sure you have the latest version of the SDK available to use the Batch API
# %pip install openai --upgrade

In [10]:
import json
from openai import OpenAI
import pandas as pd
from IPython.display import Image, display
from dotenv import load_dotenv


In [None]:
# load openai API key from .env
load_dotenv()

True

In [3]:
# Initializing OpenAI client - see https://platform.openai.com/docs/quickstart?context=python
client = OpenAI()

# Prepare the data for the batch API

In [118]:
# 合併所有的cleaned_text和video_title，準備作為批次上傳的data

all_comments = []

# for i in [7, 23, 33]:  ## test
for i in range(0, 36):
    if i in [10, 11, 12, 16, 17]:
        print(f"Skipping video {i}")
        continue

    print(f"==> Processing video {i}")
    # Read the CSV file
    raw_data = pd.read_csv(f"hello_comments/spam_tag/video_{i}_ckip_spam_tag.csv", encoding='utf-8')
    for idx, row in raw_data.iterrows():
        if row['spam_tag'] == 'spam':
            print(f"Skipping spam comment at index {idx}")
            continue
        video_title = row['video_title']
        cleaned_text = row['cleaned_text']
        raw_data.at[idx, 'cleaned_text'] = cleaned_text.replace('\n', ' ').replace('\r', ' ')
    df = raw_data[['video_title', 'cleaned_text']]

    for index, row in df.iterrows():
        # print(f"Processing row {index + 1} of {len(df)}")
        video_title = row['video_title']
        cleaned_text = row['cleaned_text']
        all_comments.append({
            "video_title": video_title,
            "comment": cleaned_text
        })

# Save the combined data to a JSON file
with open('hello_comments/spam_tag/all_comments.json', 'w', encoding='utf-8') as f:
    json.dump(all_comments, f, ensure_ascii=False, indent=4)

==> Processing video 0
Skipping spam comment at index 5
Skipping spam comment at index 10
Skipping spam comment at index 14
Skipping spam comment at index 15
Skipping spam comment at index 22
Skipping spam comment at index 24
Skipping spam comment at index 30
Skipping spam comment at index 31
Skipping spam comment at index 59
Skipping spam comment at index 60
Skipping spam comment at index 67
Skipping spam comment at index 87
Skipping spam comment at index 88
Skipping spam comment at index 89
Skipping spam comment at index 90
Skipping spam comment at index 91
Skipping spam comment at index 96
Skipping spam comment at index 98
Skipping spam comment at index 101
Skipping spam comment at index 103
Skipping spam comment at index 105
Skipping spam comment at index 107
Skipping spam comment at index 108
Skipping spam comment at index 111
Skipping spam comment at index 118
Skipping spam comment at index 121
Skipping spam comment at index 124
Skipping spam comment at index 128
Skipping spam co

In [119]:
# check length of all_comments
print(f"Total comments processed: {len(all_comments)}")

Total comments processed: 146118


In [120]:
# set up batch input
from gpt_instructions import instruction_combine

tasks = []

for index, item in enumerate(all_comments):
    task = {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4.1-mini",
            "response_format": { 
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system", "content": instruction_combine
                },
                {
                    "role": "user", "content":
                        f"【影片標題】{item['video_title']}\n【留言內容】{item['comment']}"
                }
            ],
            "temperature": 0
        }
    }
    tasks.append(task)

In [121]:
# Creating the file
file_name = "batch_useless_allComments.jsonl"

with open(file_name, 'w', encoding='utf-8') as file:
    for obj in tasks:
        file.write(json.dumps(obj, ensure_ascii=False) + '\n')

# 檢查當前所有Batch

In [123]:
# 檢查所有的batch jobs
client.batches.list()

SyncCursorPage[Batch](data=[Batch(id='batch_684ae93fe98c8190b9da43abc08ca485', completion_window='24h', created_at=1749739839, endpoint='/v1/chat/completions', input_file_id='file-C49vm8dPccpj9hDTep8ai7', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1749742196, error_file_id=None, errors=None, expired_at=None, expires_at=1749826239, failed_at=None, finalizing_at=1749741054, in_progress_at=1749739872, metadata=None, output_file_id='file-GtM9TrQt1x2LgpCRa8ymQH', request_counts=BatchRequestCounts(completed=6498, failed=0, total=6498)), Batch(id='batch_684ae557d5cc8190914795a7c9c6a825', completion_window='24h', created_at=1749738839, endpoint='/v1/chat/completions', input_file_id='file-CcRtv9dM9NEzAUAdNSbpAr', object='batch', status='cancelled', cancelled_at=1749740633, cancelling_at=1749739405, completed_at=None, error_file_id='file-3GeGdfFYRgS5hbqUFW2TYh', errors=None, expired_at=None, expires_at=1749825239, failed_at=None, finalizing_at=None, i

# Upload batch input (jsonl)

#### ===> 已經做好all_comments（共146118筆留言）左右的資料，從這裡執行
- video 10, 11, 12, 16, 17 是不處理的級數 -> 排除

In [58]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)

In [60]:
print(batch_file)

FileObject(id='file-C49vm8dPccpj9hDTep8ai7', bytes=14847737, created_at=1749739749, filename='batch_useless_test.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


# Create Batch

In [61]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Check Batch Status

In [114]:
batch = client.batches.retrieve(batch_job.id)
print(batch)

Batch(id='batch_684ae93fe98c8190b9da43abc08ca485', completion_window='24h', created_at=1749739839, endpoint='/v1/chat/completions', input_file_id='file-C49vm8dPccpj9hDTep8ai7', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1749742196, error_file_id=None, errors=None, expired_at=None, expires_at=1749826239, failed_at=None, finalizing_at=1749741054, in_progress_at=1749739872, metadata=None, output_file_id='file-GtM9TrQt1x2LgpCRa8ymQH', request_counts=BatchRequestCounts(completed=6498, failed=0, total=6498))


### Check Batch Progress

In [112]:
import time

def check_batch_status(batch_id):
    """檢查批次處理狀態"""
    batch = client.batches.retrieve(batch_id)
    print(f"批次狀態: {batch.status}")
    print(f"已完成: {batch.request_counts.completed}")
    print(f"失敗: {batch.request_counts.failed}")
    print(f"總計: {batch.request_counts.total}")
    return batch

# 定期檢查狀態
batch_id = "batch_684ae93fe98c8190b9da43abc08ca485"
current_batch = check_batch_status(batch_id)

批次狀態: completed
已完成: 6498
失敗: 0
總計: 6498


# Cancel Batch

In [None]:
client.batches.cancel(batch_id)

Batch(id='batch_684ae557d5cc8190914795a7c9c6a825', completion_window='24h', created_at=1749738839, endpoint='/v1/chat/completions', input_file_id='file-CcRtv9dM9NEzAUAdNSbpAr', object='batch', status='cancelling', cancelled_at=None, cancelling_at=1749739405, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1749825239, failed_at=None, finalizing_at=None, in_progress_at=1749738901, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=3059, total=6498))

# View ourput result

In [None]:
batch = client.batches.retrieve(batch_job.id)
print(batch)

In [115]:
# 當批次狀態為 'completed' 時，下載結果
if batch.status == 'completed' and batch.output_file_id:
    print("批次已完成，開始下載結果...")
    
    # 下載輸出文件
    file_response = client.files.content(batch.output_file_id)
    output_content = file_response.content.decode('utf-8')
    
    # 保存結果到本地文件
    with open('batch_results.jsonl', 'w', encoding='utf-8') as f:
        f.write(output_content)
    
    print("結果已保存到 batch_results.jsonl")
    
    # 查看前幾個結果
    
    results = []
    for line in output_content.strip().split('\n'):
        if line:
            result = json.loads(line)
            results.append(result)
    
    print(f"總共收到 {len(results)} 個結果")
    
    # 顯示前3個結果
    for i, result in enumerate(results[:3]):
        print(f"\n=== 結果 {i+1} ===")
        print(f"Custom ID: {result['custom_id']}")
        if result.get('response'):
            content = result['response']['body']['choices'][0]['message']['content']
            print(f"GPT 回應: {content[:200]}...")  # 只顯示前200字符
        elif result.get('error'):
            print(f"錯誤: {result['error']}")

else:
    print(f"批次尚未完成，當前狀態: {batch.status}")
    if batch.status == 'in_progress':
        progress = (batch.request_counts.completed / batch.request_counts.total) * 100
        print(f"進度: {progress:.1f}%")

批次已完成，開始下載結果...
結果已保存到 batch_results.jsonl
總共收到 6498 個結果

=== 結果 1 ===
Custom ID: task-0
GPT 回應: {
    "tag1": false,
    "reason1": "留言內容為一句話，雖然簡短但有指涉特定對象，非完全無意義。",
    "tag2": false,
    "reason2": "留言雖然簡短，但並非無實質內容或無法提供有價值資訊，可能是評論或觀點。",
    "tag3": true,
    "reason3": "留言少於5個字，符合無意義留言指標1。"
}...

=== 結果 2 ===
Custom ID: task-1
GPT 回應: {
    "tag1": false,
    "reason1": "留言內容有具體評論，表達對節目變化的看法，具備實質內容。",
    "tag2": false,
    "reason2": "留言提供了對節目內容的觀點，屬於有價值的討論，不屬於無意義留言。",
    "tag3": false,
    "reason3": "留言字數超過5字，內容有意義，且無重複或強烈情緒性用語...

=== 結果 3 ===
Custom ID: task-2
GPT 回應: {
    "tag1": false,
    "reason1": "留言內容有具體政治議題討論，非無意義文字。",
    "tag2": false,
    "reason2": "留言提供了關於柯文哲及政治情勢的看法，具備實質內容與討論價值。",
    "tag3": false,
    "reason3": "留言字數超過5字，內容有意義，無重複，情緒性不強，符合有意義留言標準。...


In [None]:
# 修改版本：展開 GPT 回應為獨立欄位
import os

def process_batch_results():
    if not os.path.exists('batch_results.jsonl'):
        print("結果文件不存在，請先下載結果")
        return
    
    results_data = []
    
    with open('batch_results.jsonl', 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                result = json.loads(line)
                
                # 提取有用信息
                custom_id = result['custom_id']
                task_index = int(custom_id.replace('task-', ''))
                
                if result.get('response'):
                    gpt_response = result['response']['body']['choices'][0]['message']['content']
                    
                    # 嘗試解析 JSON 回應
                    try:
                        gpt_data = json.loads(gpt_response)
                        
                        # 基本資料
                        row_data = {
                            'task_index': task_index,
                            'video_title': all_comments[task_index]['video_title'],
                            'comment': all_comments[task_index]['comment'],
                            'status': 'success'
                        }
                        
                        # 展開 GPT 回應為獨立欄位
                        row_data['tag1'] = gpt_data.get('tag1', None)
                        row_data['reason1'] = gpt_data.get('reason1', '')
                        row_data['tag2'] = gpt_data.get('tag2', None)
                        row_data['reason2'] = gpt_data.get('reason2', '')
                        row_data['tag3'] = gpt_data.get('tag3', None)
                        row_data['reason3'] = gpt_data.get('reason3', '')
                        
                        results_data.append(row_data)
                        
                    except json.JSONDecodeError:
                        results_data.append({
                            'task_index': task_index,
                            'video_title': all_comments[task_index]['video_title'],
                            'comment': all_comments[task_index]['comment'],
                            'gpt_response_raw': gpt_response,
                            'status': 'json_error',
                            'tag1': None, 'reason1': '',
                            'tag2': None, 'reason2': '',
                            'tag3': None, 'reason3': ''
                        })
                elif result.get('error'):
                    results_data.append({
                        'task_index': task_index,
                        'video_title': all_comments[task_index]['video_title'],
                        'comment': all_comments[task_index]['comment'],
                        'error': result['error'],
                        'status': 'error',
                        'tag1': None, 'reason1': '',
                        'tag2': None, 'reason2': '',
                        'tag3': None, 'reason3': ''
                    })
    
    # 轉換為 DataFrame
    results_df = pd.DataFrame(results_data)
    
    # 重新排序欄位
    column_order = [
        'task_index', 'video_title', 'comment',
        'tag1', 'reason1', 'tag2', 'reason2', 'tag3', 'reason3',
        'status'
    ]
    
    # 只選擇存在的欄位
    existing_columns = [col for col in column_order if col in results_df.columns]
    results_df = results_df[existing_columns]
    
    # 保存到 CSV
    results_df.to_csv('batch_analysis_results.csv', index=False, encoding='utf-8')
    
    print(f"處理完成！共 {len(results_df)} 筆結果")
    print(f"成功: {len(results_df[results_df['status'] == 'success'])}")
    print(f"錯誤: {len(results_df[results_df['status'] != 'success'])}")
    
    # 顯示前幾筆結果
    print("\n=== 前5筆結果預覽 ===")
    for idx, row in results_df.head().iterrows():
        print(f"\n第 {idx+1} 筆:")
        print(f"留言: {row['comment'][:50]}...")
        print(f"tag1: {row['tag1']}, reason1: {row['reason1'][:30]}...")
        print(f"tag2: {row['tag2']}, reason2: {row['reason2'][:30]}...")
        print(f"tag3: {row['tag3']}, reason3: {row['reason3'][:30]}...")
    
    return results_df

# 當批次完成後執行
df_results = process_batch_results()

處理完成！共 6498 筆結果
成功: 6498
錯誤: 0

=== 前5筆結果預覽 ===

第 1 筆:
留言: 太有活了贵司...
tag1: False, reason1: 留言內容為一句話，雖然簡短但有指涉特定對象，非完全無意義。...
tag2: False, reason2: 留言雖然簡短，但並非無實質內容或無法提供有價值資訊，可能是評...
tag3: True, reason3: 留言少於5個字，符合無意義留言指標1。...

第 2 筆:
留言: 我的天这段影片的评论区是发生了什么...
tag1: False, reason1: 留言內容有具體評論，表達對節目變化的看法，具備實質內容。...
tag2: False, reason2: 留言提供了對節目內容的觀點，屬於有價值的討論，不屬於無意義留...
tag3: False, reason3: 留言字數超過5字，內容有意義，且無重複或強烈情緒性用語，不符...

第 3 筆:
留言: 原来这就是之后节目里提到的被炎上的事件之一吗...
tag1: False, reason1: 留言內容有具體政治議題討論，非無意義文字。...
tag2: False, reason2: 留言提供了關於柯文哲及政治情勢的看法，具備實質內容與討論價值...
tag3: False, reason3: 留言字數超過5字，內容有意義，無重複，情緒性不強，符合有意義...

第 4 筆:
留言: 突然發現低卡的夥伴們ᶘ ᵒᴥᵒᶅ...
tag1: False, reason1: 留言內容有明確表達對節目的喜愛及期待，具備實質內容。...
tag2: False, reason2: 留言提供了對節目內容的回顧與期待，屬於有價值的討論。...
tag3: False, reason3: 留言字數超過5字，內容有意義，且無重複或強烈情緒性語言。...

第 5 筆:
留言: 怎麼選舉可以有這麼多好笑的內容...
tag1: True, reason1: 留言內容簡短且無實質討論內容，僅為索取個人社交媒體資訊。...
tag2: True, reason2: 留言無法提供有價值的資訊或討論，屬於無意義留言範例。...
tag3: True, reason3: 留言少於5個字，且內容無意義，符合無意義留言指標。...
