<a href="https://colab.research.google.com/github/ychoi-kr/llm-api-prog/blob/main/4_openai/openai_batch_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OpenAI Batch API를 활용한 감성 분석

In [1]:
!pip install openai

Collecting openai
  Downloading openai-1.37.1-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.37.1-py3-none-any.whl (337 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-an

In [2]:
from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

## 단일 샘플 감성 분석 테스트

In [3]:
from openai import OpenAI

# OpenAI 클라이언트 초기화
client = OpenAI()

# 분석할 리뷰
review = "이 영화는 정말 재미있고 감동적이에요!"

# OpenAI API를 사용한 감성 분석 요청
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "Analyze the sentiment of the following movie review and categorize it strictly as 1 for positive or 0 for negative without providing any explanation or reasoning."},
        {"role": "user", "content": "핵노잼"},
        {"role": "assistant", "content": "0"},
        {"role": "user", "content": "개꿀잼"},
        {"role": "assistant", "content": "1"},
        {"role": "user", "content": review}
    ],
    max_tokens=60
)

# 예측 결과 출력
message = response.choices[0].message.content.strip()
print(f"Review: {review}")
print(f"Sentiment: {message}")


Review: 이 영화는 정말 재미있고 감동적이에요!
Sentiment: 1


## 데이터 준비

In [4]:
import json
import pandas as pd
import urllib.request

In [5]:
# 데이터 다운로드(Naver sentiment movie corpus v1.0)
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7f3f72d4f130>)

In [6]:
# 데이터 로드
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

# Few-shot 예제를 위한 샘플 5개 추출
few_shot_samples = train_data.sample(5, random_state=42)

# 데이터 확인
few_shot_samples

Unnamed: 0,id,document,label
59770,8932939,수OO만에 다시보네여,1
21362,3681731,일방적인 영화다. 관객 좀 고려해주시길,0
127324,9847174,세상을 초월하는 한 사람의 선한 마음,1
140509,8506899,멍하다.. 여러생각이 겹치는데 오랜만에 영화 보고 이런 느낌 느껴본다,1
144297,9991656,"우와 별 반개도 아까운판에 밑에 CJ 알바생들 쩐다.. 전부 만점이야 ㅎㅎㅎ..,....",0


In [7]:
# Few-shot 프롬프트
few_shot_examples = []
for idx, row in few_shot_samples.iterrows():
    example = [
        {"role": "user", "content": row['document']},
        {"role": "assistant", "content": str(row['label'])}
    ]
    few_shot_examples.extend(example)


## Batch API로 감성 분석

In [8]:
# 테스트 데이터에서 100개의 샘플을 사용
test_data_sample = test_data.sample(100, random_state=42)

tasks = []

for idx, row in test_data_sample.iterrows():
    messages = [
        {"role": "system", "content": "Analyze the sentiment of the following movie review and categorize it strictly as 1 for positive or 0 for negative without providing any explanation or reasoning."}
    ]
    messages.extend(few_shot_examples)
    messages.append({"role": "user", "content": row['document']})

    task = {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": messages,
            "max_tokens": 60
        }
    }
    tasks.append(task)

# JSONL 파일로 저장
file_name = "batch_tasks_naver_reviews.jsonl"
with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')


In [9]:
# 배치 파일 업로드
batch_input_file = client.files.create(
    file=open(file_name, "rb"),
    purpose="batch"
)

In [10]:
# 배치 작업 생성
batch_job = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

In [11]:
import time

# 배치 상태 확인
batch_id = batch_job.id
while True:
    batch_status = client.batches.retrieve(batch_id)
    print("Batch 상태:", batch_status)
    if batch_status.status in ['completed', 'failed']:
        break
    time.sleep(60)  # 1분 간격으로 상태 확인


Batch 상태: Batch(id='batch_xRGBALbiNsrue12MWrz7p93r', completion_window='24h', created_at=1722255563, endpoint='/v1/chat/completions', input_file_id='file-kmyGiGpBX9JwxQ6tC6ZjjReZ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722341963, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
Batch 상태: Batch(id='batch_xRGBALbiNsrue12MWrz7p93r', completion_window='24h', created_at=1722255563, endpoint='/v1/chat/completions', input_file_id='file-kmyGiGpBX9JwxQ6tC6ZjjReZ', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722341963, failed_at=None, finalizing_at=None, in_progress_at=1722255564, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(complete

In [12]:
# 결과 파일 가져오기
result_file_id = batch_status.output_file_id
result_content = client.files.content(result_file_id).content

# 결과 파일 저장
result_file_name = "batch_job_results_naver_reviews.jsonl"
with open(result_file_name, 'wb') as file:
    file.write(result_content)

In [13]:
# 결과 파일 로드
results = []
with open(result_file_name, 'r') as file:
    for line in file:
        results.append(json.loads(line.strip()))

results

[{'id': 'batch_req_FQhcFqapxh0zVN4LBCdWEZcu',
  'custom_id': 'task-33553',
  'response': {'status_code': 200,
   'request_id': '7ab5964212df90fd2e055fc05056739f',
   'body': {'id': 'chatcmpl-9qJe2JYjje8jt3dji6XEPz1ZwiRUU',
    'object': 'chat.completion',
    'created': 1722255598,
    'model': 'gpt-4o-mini-2024-07-18',
    'choices': [{'index': 0,
      'message': {'role': 'assistant', 'content': '1'},
      'logprobs': None,
      'finish_reason': 'stop'}],
    'usage': {'prompt_tokens': 272,
     'completion_tokens': 1,
     'total_tokens': 273},
    'system_fingerprint': 'fp_ba606877f9'}},
  'error': None},
 {'id': 'batch_req_ScB6Zq2kyE4F3j2nhOtDm6No',
  'custom_id': 'task-9427',
  'response': {'status_code': 200,
   'request_id': '8883660258d57455634d3b5d0c03e838',
   'body': {'id': 'chatcmpl-9qJeDMyGkhEe3FOZxRyR99kfVlOaD',
    'object': 'chat.completion',
    'created': 1722255609,
    'model': 'gpt-4o-mini-2024-07-18',
    'choices': [{'index': 0,
      'message': {'role': 'assi

## 평가

In [14]:
# 결과 출력 및 메트릭 계산
actuals = test_data_sample['label'].tolist()
predictions = []

for res in results:
    prediction = res['response']['body']['choices'][0]['message']['content'].strip()
    predictions.append(int(prediction))

# 메트릭 계산
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(actuals, predictions)
precision = precision_score(actuals, predictions)
recall = recall_score(actuals, predictions)
f1 = f1_score(actuals, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9100
Precision: 0.9020
Recall: 0.9200
F1 Score: 0.9109
