In [None]:
# Make sure you have the latest version of the SDK available to use the Batch API
# %pip install openai --upgrade

In [1]:
import json
from openai import OpenAI
import pandas as pd
from IPython.display import Image, display
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the environment so the OpenAI
# client created below can find the API key (presumably OPENAI_API_KEY — confirm).
load_dotenv()

True

In [3]:
# Initialise the OpenAI client used by every Batch API call in this notebook.
# See https://platform.openai.com/docs/quickstart?context=python
client = OpenAI()

# Prepare the data for the batch API

In [91]:
# Merge cleaned_text and video_title from every per-video CSV into one list,
# which is saved as the data source for the batch upload.
#
# The videos are processed in three batches (plan carried over from the
# original note):
#   1. v0-v15, skipping 10, 11, 12, 16, 17
#   2. v18-v35, skipping 30, 31, 32
#   3. v30-31
# This cell is configured for batch 2: edit the three constants below to
# build another batch.
VIDEO_IDS = range(18, 36)
SKIPPED_VIDEO_IDS = {30, 31, 32}
COMBINED_JSON_PATH = 'for_gpt_tag/ver2_comments_for_tag.json'

all_comments = []

for video_id in VIDEO_IDS:
    if video_id in SKIPPED_VIDEO_IDS:
        print(f"跳過 video_{video_id}")
        continue

    file_path = f"for_gpt_tag/video_{video_id}_filtered_spam.csv"
    df = pd.read_csv(file_path, encoding='utf-8-sig')

    # Pair the two columns directly instead of building a Series per row with
    # iterrows(); the resulting records are the same, in the same order.
    # NOTE(review): empty cleaned_text cells load as NaN and would be written
    # as the non-standard JSON token NaN — confirm upstream filtering removes them.
    all_comments.extend(
        {"video_title": video_title, "comment": cleaned_text}
        for video_title, cleaned_text in zip(df['video_title'], df['cleaned_text'])
    )

# Save the combined data to a JSON file (ensure_ascii=False keeps CJK text readable).
with open(COMBINED_JSON_PATH, 'w', encoding='utf-8') as f:
    json.dump(all_comments, f, ensure_ascii=False, indent=4)

跳過 video_30
跳過 video_31
跳過 video_32


In [92]:
# Sanity check: how many comments were collected for this batch.
total_comments = len(all_comments)
print("Total comments processed:", total_comments)

Total comments processed: 44249


## 從instruction2之後只要讀檔

1. for_gpt_tag/ver1_comments_for_tag.json
2. for_gpt_tag/ver2_comments_for_tag.json
3. for_gpt_tag/ver3_comments_for_tag.json

In [4]:
# Pick which combined comments file (ver1 / ver2 / ver3) to reload.
i = 3
comments_path = f'for_gpt_tag/ver{i}_comments_for_tag.json'

with open(comments_path, 'r', encoding='utf-8') as comments_file:
    all_comments = json.loads(comments_file.read())

print(len(all_comments))

46696


In [5]:
# Build one Batch API request per comment.
#
# A separate batch input is uploaded for each instruction (instruction1,
# instruction2, instruction3). Every request has the shape:
#   custom_id: "instruction<N>-task-<index>"
#   messages:  [{"role": "system", "content": <instruction N>},
#               {"role": "user",   "content": "【影片標題】<title>\n【留言內容】<comment>"}]
import gpt_instructions

# Single source of truth for which instruction this batch uses: both the system
# prompt and the custom_id label derive from it, so the IDs cannot claim a
# different instruction than the one actually sent.
INSTRUCTION_NO = 2
SYSTEM_PROMPT = getattr(gpt_instructions, f"instruction{INSTRUCTION_NO}")

# Added to the position within all_comments so task numbers keep counting
# across the ver1/ver2/ver3 files.
# NOTE(review): 86284 looks like the combined size of ver1 + ver2 (ver2 holds
# 44,249 comments) — confirm, and change it when switching input files.
TASK_INDEX_OFFSET = 86284

tasks = [
    {
        "custom_id": f"instruction{INSTRUCTION_NO}-task-{index + TASK_INDEX_OFFSET}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4.1-mini",
            # JSON mode: the model must reply with a single JSON object.
            "response_format": {
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system", "content": SYSTEM_PROMPT
                },
                {
                    "role": "user", "content":
                        f"【影片標題】{item['video_title']}\n【留言內容】{item['comment']}"
                }
            ],
            "temperature": 0
        }
    }
    for index, item in enumerate(all_comments)
]

In [6]:
# Serialise the tasks as JSON Lines: one request object per line.
file_name = "batch_input/instruction2_ver3_comments.jsonl"

with open(file_name, 'w', encoding='utf-8') as jsonl_out:
    jsonl_out.writelines(
        json.dumps(task, ensure_ascii=False) + '\n' for task in tasks
    )

# 檢查當前所有Batch

In [7]:
# List all batch jobs on the account to see what is already queued or running
# before submitting another one.
client.batches.list()

SyncCursorPage[Batch](data=[Batch(id='batch_684bf6bbee048190819505b124197ea4', completion_window='24h', created_at=1749808827, endpoint='/v1/chat/completions', input_file_id='file-3nxJghe61tFku5zu4uYiNm', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1749895227, failed_at=None, finalizing_at=None, in_progress_at=1749808849, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=44249)), Batch(id='batch_684bf5a6e3c88190bb3852e848859c35', completion_window='24h', created_at=1749808550, endpoint='/v1/chat/completions', input_file_id='file-4FC84SMm2N6bXJXxDZuwYv', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1749894950, failed_at=None, finalizing_at=None, in_progress_at=1749808562, metadata=None, output_file_id=None, request_co

# Upload batch input (jsonl)

In [8]:
# Upload the JSONL batch input. Opening it in a context manager guarantees the
# file handle is closed as soon as the upload call returns (or raises).
with open(file_name, "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch"
    )

In [9]:
# Show the uploaded file's metadata; its id is what the batch job is created from.
print(batch_file)

FileObject(id='file-DW9i831EMTas99HRpaatgs', bytes=79142333, created_at=1749808952, filename='instruction2_ver3_comments.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


# Create Batch

In [10]:
# Submit the uploaded file as a batch job against the chat completions endpoint.
# NOTE(review): re-running this cell submits a new batch for the same input file.
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Check Batch Status

In [11]:
# Fetch the current state of the batch that was just created and show it.
batch = client.batches.retrieve(batch_job.id)
print(batch)

Batch(id='batch_684bf76e1a888190b5785faaffda5a0a', completion_window='24h', created_at=1749809006, endpoint='/v1/chat/completions', input_file_id='file-DW9i831EMTas99HRpaatgs', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1749895406, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


### Check Batch Progress

In [12]:
import time

def check_batch_status(batch_id):
    """Retrieve a batch by ID, print its status and request counts, and return it."""
    batch = client.batches.retrieve(batch_id)
    print(f"批次狀態: {batch.status}")
    # Same three counters, in the same order, driven by a label/attribute table.
    for label, field in (("已完成", "completed"), ("失敗", "failed"), ("總計", "total")):
        print(f"{label}: {getattr(batch.request_counts, field)}")
    return batch

# Check the status on demand (re-run this cell to poll again).
# To inspect an earlier batch instead, hard-code its ID here:
# batch_id = "batch_684bf30244cc8190bfce0d1a6783a3ee"

batch_id = batch_job.id
print(f"批次ID: {batch_id}")

current_batch = check_batch_status(batch_id)

批次ID: batch_684bf76e1a888190b5785faaffda5a0a
批次狀態: in_progress
已完成: 0
失敗: 0
總計: 46696


# Cancel Batch

In [8]:
# Cancel one specific batch, identified by a hard-coded ID.
# NOTE(review): this is not the batch_job created above — double-check the ID
# before re-running, and skip this cell during a normal top-to-bottom run.
client.batches.cancel("batch_684bf30244cc8190bfce0d1a6783a3ee")

Batch(id='batch_684bf30244cc8190bfce0d1a6783a3ee', completion_window='24h', created_at=1749807874, endpoint='/v1/chat/completions', input_file_id='file-NseY25fx744Upm9RzYxNBt', object='batch', status='cancelling', cancelled_at=None, cancelling_at=1749808410, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1749894274, failed_at=None, finalizing_at=None, in_progress_at=1749807884, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=1401, failed=0, total=46696))

# View output result

In [None]:
# Refresh the batch object so the download step below sees the latest status.
batch = client.batches.retrieve(batch_job.id)
print(batch)

In [115]:
# Once the batch status is 'completed', download the output file, save it
# locally and preview the first few results.
if batch.status == 'completed' and batch.output_file_id:
    print("批次已完成，開始下載結果...")

    # Download the output file.
    file_response = client.files.content(batch.output_file_id)
    output_content = file_response.content.decode('utf-8')

    # Save the raw JSONL locally so later cells can work from disk.
    with open('batch_results.jsonl', 'w', encoding='utf-8') as f:
        f.write(output_content)

    print("結果已保存到 batch_results.jsonl")

    # Parse one JSON object per line. Split on '\n' only: splitlines() would
    # also break on Unicode line separators that may occur inside a JSON string.
    results = [
        json.loads(line)
        for line in output_content.strip().split('\n')
        if line.strip()
    ]

    print(f"總共收到 {len(results)} 個結果")

    # Show the first 3 results. The loop variable is deliberately not `i`,
    # which an earlier cell uses to select the comments file version.
    for preview_no, result in enumerate(results[:3], start=1):
        print(f"\n=== 結果 {preview_no} ===")
        print(f"Custom ID: {result['custom_id']}")
        response = result.get('response')
        if response:
            try:
                content = response['body']['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError):
                # A non-200 response carries an error body instead of choices.
                print(f"錯誤: {response}")
            else:
                # Only the first 200 characters; content may be None.
                print(f"GPT 回應: {(content or '')[:200]}...")
        elif result.get('error'):
            print(f"錯誤: {result['error']}")

elif batch.status == 'completed':
    # Completed without an output file: report it instead of claiming the
    # batch is still running.
    print("批次已完成，但沒有輸出文件（請求可能全部失敗）")
    if batch.error_file_id:
        print(f"錯誤文件 ID: {batch.error_file_id}")

else:
    print(f"批次尚未完成，當前狀態: {batch.status}")
    counts = batch.request_counts
    # Guard against a zero total so the progress line cannot divide by zero.
    if batch.status == 'in_progress' and counts and counts.total:
        progress = (counts.completed / counts.total) * 100
        print(f"進度: {progress:.1f}%")

批次已完成，開始下載結果...
結果已保存到 batch_results.jsonl
總共收到 6498 個結果

=== 結果 1 ===
Custom ID: task-0
GPT 回應: {
    "tag1": false,
    "reason1": "留言內容為一句話，雖然簡短但有指涉特定對象，非完全無意義。",
    "tag2": false,
    "reason2": "留言雖然簡短，但並非無實質內容或無法提供有價值資訊，可能是評論或觀點。",
    "tag3": true,
    "reason3": "留言少於5個字，符合無意義留言指標1。"
}...

=== 結果 2 ===
Custom ID: task-1
GPT 回應: {
    "tag1": false,
    "reason1": "留言內容有具體評論，表達對節目變化的看法，具備實質內容。",
    "tag2": false,
    "reason2": "留言提供了對節目內容的觀點，屬於有價值的討論，不屬於無意義留言。",
    "tag3": false,
    "reason3": "留言字數超過5字，內容有意義，且無重複或強烈情緒性用語...

=== 結果 3 ===
Custom ID: task-2
GPT 回應: {
    "tag1": false,
    "reason1": "留言內容有具體政治議題討論，非無意義文字。",
    "tag2": false,
    "reason2": "留言提供了關於柯文哲及政治情勢的看法，具備實質內容與討論價值。",
    "tag3": false,
    "reason3": "留言字數超過5字，內容有意義，無重複，情緒性不強，符合有意義留言標準。...


In [None]:
# 修改版本：展開 GPT 回應為獨立欄位
import os

def _parse_task_index(custom_id):
    """Return the integer after the last 'task-' marker in a custom_id.

    Accepts both the legacy 'task-12' form and prefixed IDs such as
    'instruction2-task-86296'.
    """
    suffix = str(custom_id).rpartition('task-')[2]
    try:
        return int(suffix)
    except ValueError:
        raise ValueError(f"cannot parse a task index from custom_id {custom_id!r}") from None


def _extract_message_content(response):
    """Return the assistant message text of a batch response, or None if it has none."""
    try:
        return response['body']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        return None


def _preview(value, limit):
    """Return at most `limit` characters of `value` for display; '' for None/NaN."""
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)[:limit]


def process_batch_results(results_path='batch_results.jsonl',
                          output_csv='batch_analysis_results.csv',
                          comments=None,
                          index_offset=0):
    """Flatten a downloaded batch results file into a DataFrame and a CSV.

    Each result line is matched back to its source comment through the task
    index embedded in its custom_id, and the tag1..tag3 / reason1..reason3
    fields of the GPT JSON reply become separate columns.

    Args:
        results_path: JSONL file holding the downloaded batch output.
        output_csv: Destination CSV path.
        comments: Source records with 'video_title' and 'comment' keys.
            Defaults to the notebook-level `all_comments`.
        index_offset: Value that was added to the list position when the
            custom_ids were generated (0 for plain 'task-<position>' IDs).

    Returns:
        The results DataFrame, or None when `results_path` does not exist.

    Raises:
        ValueError: If a custom_id does not end in an integer task index.
        IndexError: If a task index, after removing `index_offset`, does not
            point inside `comments`.
    """
    if not os.path.exists(results_path):
        print("結果文件不存在，請先下載結果")
        return

    if comments is None:
        comments = all_comments

    tag_fields = [
        (f'tag{n}', f'reason{n}') for n in (1, 2, 3)
    ]
    results_data = []

    with open(results_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            result = json.loads(line)

            task_index = _parse_task_index(result['custom_id'])
            position = task_index - index_offset
            # Reject negative positions explicitly: Python would otherwise wrap
            # around and silently pair the reply with the wrong comment.
            if not 0 <= position < len(comments):
                raise IndexError(
                    f"task index {task_index} (offset {index_offset}) is outside the "
                    f"{len(comments)} loaded comments; check index_offset and the comments file"
                )
            source = comments[position]

            row_data = {
                'task_index': task_index,
                'video_title': source['video_title'],
                'comment': source['comment'],
            }
            # Blank tag/reason values, overwritten below on success.
            for tag_key, reason_key in tag_fields:
                row_data[tag_key] = None
                row_data[reason_key] = ''

            response = result.get('response')
            gpt_response = _extract_message_content(response) if response else None

            if gpt_response is None:
                # Request-level failure: a top-level error, a non-200 response
                # without choices, or a reply with no content.
                row_data['error'] = result.get('error') or response
                row_data['status'] = 'error'
            else:
                try:
                    gpt_data = json.loads(gpt_response)
                    if not isinstance(gpt_data, dict):
                        raise ValueError("GPT reply is not a JSON object")
                except (ValueError, TypeError):
                    # json.JSONDecodeError is a ValueError subclass.
                    row_data['gpt_response_raw'] = gpt_response
                    row_data['status'] = 'json_error'
                else:
                    for tag_key, reason_key in tag_fields:
                        row_data[tag_key] = gpt_data.get(tag_key, None)
                        row_data[reason_key] = gpt_data.get(reason_key, '')
                    row_data['status'] = 'success'

            results_data.append(row_data)

    base_columns = [
        'task_index', 'video_title', 'comment',
        'tag1', 'reason1', 'tag2', 'reason2', 'tag3', 'reason3',
        'status'
    ]
    # Diagnostic columns only exist when at least one row failed; keeping them
    # (last) preserves the failure details in the CSV.
    column_order = base_columns + ['gpt_response_raw', 'error']

    if results_data:
        results_df = pd.DataFrame(results_data)
        results_df = results_df[[col for col in column_order if col in results_df.columns]]
    else:
        # An empty results file still yields a frame with the expected columns.
        results_df = pd.DataFrame(columns=base_columns)

    results_df.to_csv(output_csv, index=False, encoding='utf-8')

    success_count = int((results_df['status'] == 'success').sum())
    print(f"處理完成！共 {len(results_df)} 筆結果")
    print(f"成功: {success_count}")
    print(f"錯誤: {len(results_df) - success_count}")

    # Preview the first few rows; _preview tolerates missing or non-string values.
    print("\n=== 前5筆結果預覽 ===")
    for idx, row in results_df.head().iterrows():
        print(f"\n第 {idx+1} 筆:")
        print(f"留言: {_preview(row['comment'], 50)}...")
        for tag_key, reason_key in tag_fields:
            print(f"{tag_key}: {row[tag_key]}, {reason_key}: {_preview(row[reason_key], 30)}...")

    return results_df

# Run once the batch has completed and batch_results.jsonl has been downloaded.
# NOTE(review): custom_ids built earlier in this notebook carry an "instruction…-"
# prefix and an 86284 index offset — confirm the downloaded results line up with
# all_comments before trusting the output.
df_results = process_batch_results()

處理完成！共 6498 筆結果
成功: 6498
錯誤: 0

=== 前5筆結果預覽 ===

第 1 筆:
留言: 太有活了贵司...
tag1: False, reason1: 留言內容為一句話，雖然簡短但有指涉特定對象，非完全無意義。...
tag2: False, reason2: 留言雖然簡短，但並非無實質內容或無法提供有價值資訊，可能是評...
tag3: True, reason3: 留言少於5個字，符合無意義留言指標1。...

第 2 筆:
留言: 我的天这段影片的评论区是发生了什么...
tag1: False, reason1: 留言內容有具體評論，表達對節目變化的看法，具備實質內容。...
tag2: False, reason2: 留言提供了對節目內容的觀點，屬於有價值的討論，不屬於無意義留...
tag3: False, reason3: 留言字數超過5字，內容有意義，且無重複或強烈情緒性用語，不符...

第 3 筆:
留言: 原来这就是之后节目里提到的被炎上的事件之一吗...
tag1: False, reason1: 留言內容有具體政治議題討論，非無意義文字。...
tag2: False, reason2: 留言提供了關於柯文哲及政治情勢的看法，具備實質內容與討論價值...
tag3: False, reason3: 留言字數超過5字，內容有意義，無重複，情緒性不強，符合有意義...

第 4 筆:
留言: 突然發現低卡的夥伴們ᶘ ᵒᴥᵒᶅ...
tag1: False, reason1: 留言內容有明確表達對節目的喜愛及期待，具備實質內容。...
tag2: False, reason2: 留言提供了對節目內容的回顧與期待，屬於有價值的討論。...
tag3: False, reason3: 留言字數超過5字，內容有意義，且無重複或強烈情緒性語言。...

第 5 筆:
留言: 怎麼選舉可以有這麼多好笑的內容...
tag1: True, reason1: 留言內容簡短且無實質討論內容，僅為索取個人社交媒體資訊。...
tag2: True, reason2: 留言無法提供有價值的資訊或討論，屬於無意義留言範例。...
tag3: True, reason3: 留言少於5個字，且內容無意義，符合無意義留言指標。...
