In [None]:
import pandas as pd
import ollama
import json
import re
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # Import tqdm for progress bar

# -----------------------------
# Classify a batch of articles
# -----------------------------
def _format_articles(df_chunk):
    """Render the rows of *df_chunk* as the numbered article list for the prompt.

    Articles are numbered 1..len(df_chunk) by position (not by index label),
    so the ids the model is asked to echo back always match the batch order.
    """
    entries = []
    for position, (_, row) in enumerate(df_chunk.iterrows(), start=1):
        # Convert Series to dict for safe access
        row_dict = row.to_dict()
        tags = row_dict.get("tags", "")
        if isinstance(tags, list):
            tags_str = ", ".join(str(t) for t in tags)
        elif isinstance(tags, str):
            tags_str = tags
        else:
            # Missing / NaN / unexpected tag values are rendered as empty.
            tags_str = ""

        entries.append(f"""
{position}. Title: "{row_dict['title']}"
     Abstract: "{row_dict['description']}"
     Keywords: "{tags_str}"
""")
    return "".join(entries)


def _parse_categories(content, expected):
    """Extract exactly *expected* category strings from the model reply.

    Tries, in order: the whole reply as JSON, the outermost ``[...]`` span as
    JSON (covers replies wrapped in code fences or extra prose), and finally a
    regex scan for ``"category": "..."`` pairs. Any result that does not
    contain exactly *expected* categories is rejected, and a list of
    ``"Uncategorized"`` placeholders is returned instead.
    """
    candidates = [content]
    bracketed = re.search(r"\[.*\]", content, re.DOTALL)
    if bracketed:
        candidates.append(bracketed.group(0))

    for text in candidates:
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            continue
        if not isinstance(parsed, list) or len(parsed) != expected:
            continue
        if not all(
            isinstance(item, dict) and isinstance(item.get("category"), str)
            for item in parsed
        ):
            continue
        # If the model echoed a clean 1..n id set, trust it for ordering.
        ids = [item.get("id") for item in parsed]
        if all(isinstance(i, int) for i in ids) and sorted(ids) == list(
            range(1, expected + 1)
        ):
            parsed = sorted(parsed, key=lambda item: item["id"])
        return [item["category"] for item in parsed]

    categories = re.findall(r'"category"\s*:\s*"([^"]+)"', content)
    if len(categories) == expected:
        return categories
    return ["Uncategorized"] * expected


def classify_batch(df_chunk):
    """Classify every article in *df_chunk* with one call to the Ollama model.

    Each row must provide ``title`` and ``description``; ``tags`` is optional
    and may be a list or a string. The rows are sent to ``gemma3:4b`` via
    ``ollama.chat`` in a single prompt.

    Returns:
        A list of category strings, one per row and in row order. The list
        always has ``len(df_chunk)`` entries; when the reply cannot be parsed
        into that many categories, every entry is ``"Uncategorized"``. An
        empty chunk returns ``[]`` without calling the model.
    """
    expected = len(df_chunk)
    if expected == 0:
        return []

    articles_text = _format_articles(df_chunk)

    # Full prompt (no shortening)
    prompt = f"""
You are a professional news analyst. 
Your task is to classify each news article into **exactly one primary category** 
out of the six predefined categories below. 

Use both the **title**, **abstract**, and **keywords** to decide, 
and follow the **priority rules** when multiple aspects are present.  

---

### Categories and Rules:

1. Research (학술)
- 포함: 논문, 프리프린트, 학회 채택/수상, 벤치마크·데이터셋 공개.
- 제외: 제품 릴리스 노트(→ Technology).
- 경계 규칙: “학회/논문 성과가 리드”면 Research 우선.

2. Technology & Product (기술/제품)
- 포함: 모델/제품 릴리스, 성능 업데이트, 모델 카드/리드미 변경, 기능 개선.
- 제외: 자금/거래 중심 기사(→ Market & Corporate).
- 경계 규칙: “연구 성과”보다 “제품·기능” 전달이 리드면 여기.

3. Market & Corporate (시장/기업)
- 포함: 투자·M&A·IPO·실적, 리더십/조직개편, 제휴·상용화 계약, 상업 로드맵.
- 제외: 공공 규제·지원(→ Policy & Regulation).
- 경계 규칙: 금액·거래·실적·지배구조가 리드면 여기.

4. Policy & Regulation (정책/규제)
- 포함: 법·규제·가이드라인, 공공자금(보조금·RFP), 수출통제, 표준화·거버넌스.
- 제외: 기업 자체 정책(가격·상업 전략, → Market & Corporate).
- 경계 규칙: 공공 주체의 룰/지원이 핵심이면 여기.

5. Society & Culture (사회/문화)
- 포함: 대중 활용 트렌드, 창작·교육·밈, 저작권/윤리 공론의 사회적 논의(정책화 전 단계).
- 제외: 순수 기술 업데이트(→ Technology), 입법·규제(→ Policy).
- 경계 규칙: “사회적 파급·수용”이 리드면 여기.

6. Incidents & Safety (사건/안전/운영)
- 포함: 서비스 장애, 보안 사고/유출, 모델 오남용/중대한 안전 이슈, 리콜/중단.
- 제외: 일반 제품 릴리스(→ Technology), 법·제재(→ Policy).
- 경계 규칙: “사건/사건/리스크 대응”이 중심이면 여기.

---

### Priority Rules (apply in order if overlaps):
1. Incidents & Safety
2. Policy & Regulation
3. Market & Corporate
4. Research
5. Technology & Product
6. Society & Culture

---

### Output format:
Return ONLY a valid JSON list, with no extra text, like this:
[
  {{"id": 1, "category": "Research"}},
  {{"id": 2, "category": "Policy & Regulation"}}
]

Here are the articles:
{articles_text}
"""

    # Call Ollama
    response = ollama.chat(
        model="gemma3:4b",
        messages=[{"role": "user", "content": prompt}]
    )

    content = response["message"]["content"].strip()

    # Parse defensively: the result must line up one-to-one with the rows.
    return _parse_categories(content, expected)


# -----------------------------
# Helper to split dataframe into batches
# -----------------------------
def chunkify(df, chunk_size=5):
    """Split *df* into consecutive positional slices of at most *chunk_size* rows.

    Returns a list of DataFrame slices in row order; the last slice may be
    shorter. An empty frame yields an empty list.
    """
    batches = []
    for start in range(0, len(df), chunk_size):
        stop = start + chunk_size
        batches.append(df.iloc[start:stop])
    return batches


# -----------------------------
# Threaded batch classification with tqdm
# -----------------------------
def classify_dataframe_batch_parallel_thread(df, chunk_size=5, max_workers=2):
    """Classify all rows of *df* in batches on a thread pool, with a progress bar.

    The frame is split with ``chunkify`` and each batch is passed to
    ``classify_batch``. The per-batch results are concatenated in batch order
    and written to a ``category`` column on *df* itself, which is also
    returned.
    """
    batches = chunkify(df, chunk_size)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # tqdm wraps the lazy map iterator so the bar advances per batch.
        progress = tqdm(
            pool.map(classify_batch, batches),
            total=len(batches),
            desc="Classifying articles",
        )
        per_batch_labels = list(progress)

    df["category"] = [label for labels in per_batch_labels for label in labels]
    return df


# -----------------------------
# Single-row test helper
# -----------------------------
def classify_single_row(df, row_index=0):
    """Classify the row at position *row_index* and print its title and category.

    The row is wrapped in a one-row DataFrame so it can go through
    ``classify_batch``; the first (only) predicted category is returned.
    """
    selected = df.iloc[row_index]
    single_row_frame = pd.DataFrame([selected])
    predictions = classify_batch(single_row_frame)  # one-element list
    category = predictions[0]
    print(f"Title: {selected['title']}")
    print(f"Predicted category: {category}")
    return category


# -----------------------------
# Main function to process JSONL file
# -----------------------------
def process_jsonl(input_file, output_file=r".\data\output\all_entries_categories.jsonl"):
    """Load a JSONL file, classify every entry, and write the result as JSONL.

    Args:
        input_file: Path to the JSON-lines file to read.
        output_file: Path of the JSON-lines file to write.

    Returns:
        The first rows (``DataFrame.head()``) of the classified frame, or
        ``None`` when the input file cannot be read or parsed.
    """
    import os  # local import: only needed for output directory handling

    # Load JSONL from file. Depending on the pandas version, a missing file
    # surfaces as FileNotFoundError (an OSError) rather than ValueError, so
    # both are treated as "could not read the input".
    try:
        df = pd.read_json(input_file, lines=True)
    except (ValueError, OSError) as e:
        print(f"Error reading JSONL file: {e}")
        return None

    # Classify with progress bar
    df = classify_dataframe_batch_parallel_thread(df)

    # Make sure the target directory exists so the classification work is
    # not lost to a failed write at the very end.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Output to JSONL file
    df.to_json(output_file, orient='records', lines=True)
    print(f"output saved to {output_file}")
    return df.head()


# Example usage: run the full pipeline on the keyword-annotated entries,
# writing to process_jsonl's default output path.
jsonl_file = r".\data\output\all_entries_keywords.jsonl"
process_jsonl(input_file=jsonl_file)

Output saved to output.jsonl


Unnamed: 0,guid,source,title,link,pub_date,description,author,category,tags,group,scraped_at
0,https://openai.com/index/accelerating-life-sci...,"OpenAI Blog (공식, 변경 가능성 주의)",Accelerating life sciences research,https://openai.com/index/accelerating-life-sci...,"Fri, 22 Aug 2025 08:30:00 GMT","Discover how a specialized AI model, GPT-4b mi...",,Research,"[org/OpenAI, model/GPT-4b, topic/Robotics, dom...",frontier_lab,2025-08-25 02:51:21.590956
