- Yake 키워드 추출 

In [None]:
import json
import yake
import ollama
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def extract_keywords_yake(input_file: str, output_file: str, max_keywords: int = 5) -> None:
    """Add YAKE keywords to every record of a JSONL file.

    Each non-blank line of ``input_file`` is parsed as a JSON object. The
    record's ``title`` and ``description`` are joined with a space and passed
    to a YAKE extractor; the resulting keyword strings (scores discarded) are
    stored under ``entry['keywords']`` and the record is written as one line
    of ``output_file``. Records with no text get an empty keyword list.

    Args:
        input_file: Path of the source JSONL file (read as UTF-8).
        output_file: Path of the JSONL file to create/overwrite (UTF-8).
        max_keywords: Value passed as ``top`` to ``yake.KeywordExtractor``.

    Lines that fail to parse or to process are reported on stdout and are
    not written to ``output_file``.
    """
    kw_extractor = yake.KeywordExtractor(top=max_keywords, stopwords=None)

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip them instead of
                # reporting a spurious JSON parse error.
                continue
            try:
                entry = json.loads(line)
                # A JSON null decodes to None, which .get()'s default does
                # not replace; coalesce it so the record is kept rather
                # than dropped by a TypeError on concatenation.
                title = entry.get('title') or ''
                description = entry.get('description') or ''
                text = (title + " " + description).strip()
                if text:
                    keywords = [kw for kw, _score in kw_extractor.extract_keywords(text)]
                else:
                    keywords = []
                entry['keywords'] = keywords
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON line: {e}")
            except Exception as e:
                # Best-effort: one bad record must not abort the whole file.
                print(f"Error processing entry: {e}")

---


In [None]:
# Controlled vocabulary: tag category -> preferred keywords for that category.
# The whole mapping is rendered into the tagging prompt as "category: kw1, kw2, ..."
# lines, in the order the categories are listed here.
controlled_vocab = {
    "org": [
        "OpenAI", "Anthropic", "Naver", "Google", "Microsoft", "NVIDIA", "MIT",
        "Facebook", "Apple", "Intel", "Sony", "Honeywell", "Oracle", "SenseTime",
    ],
    "model": [
        "GPT-6", "Claude-3.7", "Genie", "Assistant", "Azure", "Mini Cheetah",
        "Smart Compose",
    ],
    "domain": ["Healthcare", "Fintech", "Education", "Transportation", "Robotics"],
    "topic": ["Multimodal", "RAG", "Agents", "Safety", "Robotics"],
    "event": ["NeurIPS2025", "GoogleIO", "WWDC", "MAX"],
    "geo": ["KR", "US", "EU", "CN"],
    "biz": ["M&A", "Funding", "Earnings", "Pricing", "Hiring"],
    "policy": ["Regulation", "Standard", "Grant"],
}

def get_tags_with_ollama(title, content, yake_keywords, vocab, model_name="gemma3:4b"):
    """Ask an Ollama chat model for 'category/Keyword' tags for one article.

    Args:
        title: Article title inserted into the prompt.
        content: Article body/description inserted into the prompt.
        yake_keywords: Iterable of keyword strings, joined with ", " in the prompt.
        vocab: Mapping of category name -> list of keyword strings; rendered
            as one "category: kw1, kw2" line per category.
        model_name: Model identifier passed to ``ollama.chat``.

    Returns:
        The model's reply split on commas, each piece stripped, with empty
        pieces removed. An empty list if the call or reply handling raises.
    """
    # Render the vocabulary as one line per category.
    vocab_lines = []
    for category, terms in vocab.items():
        vocab_lines.append(f"{category}: {', '.join(terms)}")
    vocabulary_block = "\n".join(vocab_lines)
    keyword_line = ", ".join(yake_keywords)

    prompt = f"""
You are an expert tagger for AI-related articles. Your task is to generate relevant tags in the format 'category/keyword' based on the provided controlled vocabulary and YAKE keywords.

**Controlled Vocabulary**:
{vocabulary_block}

**YAKE Keywords**:
{keyword_line}

**Rules**:
1. Prioritize tags from the controlled vocabulary when the title or content matches exactly or closely.
2. If a YAKE keyword or content term doesn't match the vocabulary but is relevant, propose a new tag within allowed categories.
3. Capitalize keywords in tags for consistency.
4. Output only comma-separated tags in 'category/Keyword'.

**Article**:
Title: {title}
Content: {content}

**Output**:
"""
    conversation = [{"role": "user", "content": prompt}]
    try:
        reply = ollama.chat(model=model_name, messages=conversation)
        answer = reply['message']['content'].strip()

        # The prompt asks for comma-separated tags; keep the non-empty pieces.
        tags = []
        for piece in answer.split(","):
            cleaned = piece.strip()
            if cleaned:
                tags.append(cleaned)
        return tags
    except Exception as e:
        # Best-effort: a failed model call yields no tags rather than an error.
        print(f"Error calling Ollama: {e}")
        return []

# Worker function for a batch
def process_batch(batch):
    """Tag every entry of one batch and return the tagged entries.

    For each dict in ``batch`` the 'title', 'description' and 'keywords'
    values (defaulting to '', '' and []) are sent to
    ``get_tags_with_ollama`` together with ``controlled_vocab``; the
    returned tag list is stored under ``entry['tags']``.

    The entries are modified in place; the returned list holds the same
    dict objects, in batch order.
    """
    tagged = []
    # Inner progress bar for the entries of this batch (removed when done).
    progress = tqdm(batch, desc="Tagging entries", leave=False)
    for article in progress:
        article['tags'] = get_tags_with_ollama(
            article.get('title', ''),
            article.get('description', ''),
            article.get('keywords', []),
            controlled_vocab,
        )
        tagged.append(article)
    return tagged

# Main parallel processing function
def generate_tags_parallel(input_file, output_file, batch_size=5, max_workers=2):
    """Tag all entries of a JSONL file using a pool of worker threads.

    Every non-blank line of ``input_file`` is parsed as JSON, the entries
    are cut into consecutive batches of ``batch_size``, and each batch is
    handed to ``process_batch`` on a ``ThreadPoolExecutor``. The tagged
    entries are then written to ``output_file`` as JSONL.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to create/overwrite (UTF-8).
        batch_size: Number of entries per submitted batch.
        max_workers: Number of worker threads in the pool.

    Output order follows the input order regardless of which batch finishes
    first. A batch whose worker raises is reported on stdout and its entries
    are omitted from the output.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        all_entries = [json.loads(line.strip()) for line in f if line.strip()]

    batches = [all_entries[i:i + batch_size] for i in range(0, len(all_entries), batch_size)]
    # One slot per batch: as_completed() yields futures in completion order,
    # so results are stored by batch index to restore the input order later.
    results_by_batch = [None] * len(batches)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_batch, batch): index
            for index, batch in enumerate(batches)
        }
        # Outer progress bar: advances once per finished batch.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing batches"):
            try:
                results_by_batch[futures[future]] = future.result()
            except Exception as e:
                # Best-effort: keep going; the failed batch's slot stays None.
                print(f"Error processing a batch: {e}")

    with open(output_file, 'w', encoding='utf-8') as f:
        for batch_result in results_by_batch:
            if batch_result is None:
                continue  # batch failed; already reported above
            for entry in batch_result:
                json.dump(entry, f, ensure_ascii=False)
                f.write('\n')

    print(f"Final JSONL with tags saved to: {output_file}")

# ------------------ Main Execution ------------------

if __name__ == "__main__":
    # Pipeline file locations. The paths are relative (".\" prefix, backslash
    # separators), so they resolve against the current working directory.
    raw_input_file = r".\Data\Input\all_entries_20250825_025249 (1).jsonl"
    keyworded_file = r".\Data\Output\articles_with_keywords.jsonl"
    final_tagged_file = r".\Data\Output\all_entries_tags.jsonl"

    # Step 1 (YAKE keyword extraction) is currently disabled. Step 2 reads
    # keyworded_file, so that file must already exist; uncomment the two
    # lines below to regenerate it from raw_input_file.
    #print("Step 1: Extracting YAKE keywords...")
    #extract_keywords_yake(raw_input_file, keyworded_file, max_keywords=5)

    # Step 2: tag every keyworded entry, 5 entries per batch on 2 threads.
    print("Step 2: Generating tags via Ollama...")
    generate_tags_parallel(keyworded_file, final_tagged_file, batch_size=5, max_workers=2)

Step 2: Generating tags via Ollama...
