In [2]:
import pandas as pd

# Read the news CSV as UTF-8; if the bytes are not valid UTF-8, re-read the
# same file as latin-1 instead.
input_csv = r".\Data\Input\ai_news_large.csv"
try:
    df = pd.read_csv(input_csv, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(input_csv, encoding="latin-1")

- YAKE 키워드 추출

In [3]:
## yake
import pandas as pd
import yake

# Build the text YAKE reads: title and abstract joined by a single space.
# Missing values are replaced by empty strings first, because `str + NaN`
# yields NaN and a row lacking one field would otherwise lose the other too.
df['text_for_keywords'] = (
    df['Title'].fillna('').astype(str) + " " + df['Abstract'].fillna('').astype(str)
).str.strip()


# YAKE settings
language = "en"
max_ngram_size = 3  # can extract up to 3-word phrases
num_keywords = 5    # top 5 keywords per article
dedup_threshold = 0.9  # avoid very similar keywords

# Initialize YAKE extractor
yake_extractor = yake.KeywordExtractor(lan=language,
                                       n=max_ngram_size,
                                       dedupLim=dedup_threshold,
                                       top=num_keywords,
                                       features=None)

# Extract keywords per article; extract_keywords yields (keyword, score) pairs
# and only the keyword text is kept.
df['keywords'] = df['text_for_keywords'].apply(
    lambda text: [kw for kw, _ in yake_extractor.extract_keywords(text)]
)

# Comma-separated string form of the same keywords
df['keywords_str'] = df['keywords'].apply(", ".join)

# Preview
print(df[['Title', 'Abstract', 'keywords_str']].head())

                                               Title  \
0  Google discontinues Clips, the AI-powered came...   
1  AI can help doctors spot brain hemorrhages faster   
2  Pentagon's draft AI ethics guidelines fight bi...   
3  Google Says New AI-Powered Search Update Is 'H...   
4  Sonar drone helps find a WWII Japanese aircraf...   

                                            Abstract  \
0  While Google was busy showcasing its latest de...   
1  AI is already capable of discovering medical c...   
2  Tech companies might have trouble establishing...   
3  Google is injecting its search engine with new...   
4  The late Paul Allen's underwater robotics are ...   

                                        keywords_str  
0  latest devices yesterday, Google discontinues ...  
1  discovering medical conditions, doctors spot b...  
2  rogue machines Tech, machines Tech companies, ...  
3  Huge Step Forward, AI-Powered Search Update, c...  
4  Paul Allen underwater, late Paul Allen, Allen ..

In [4]:
# Display the per-article keyword lists stored in the 'keywords' column.
df['keywords']

0      [latest devices yesterday, Google discontinues...
1      [discovering medical conditions, doctors spot ...
2      [rogue machines Tech, machines Tech companies,...
3      [Huge Step Forward, AI-Powered Search Update, ...
4      [Paul Allen underwater, late Paul Allen, Allen...
                             ...                        
142    [Big Tech tackles, diversion opioid crisis, Bi...
143    [performing assistive tasks, Boston Dynamics, ...
144    [Withstand Google Constant, Google Constant Up...
145    [Kepler Space Telescope, Systems Kepler Discov...
146    [Assistant starts reaching, Assistant upgrades...
Name: keywords, Length: 147, dtype: object

- Ollama와 Instruction table(from Notion)을 이용한 태깅

In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import ollama

# ---------------- Controlled Vocabulary ----------------
# Category name -> keywords the tagger should prefer for that category.
# Each entry is rendered into the prompt as "category: kw1, kw2, ...".
controlled_vocab = dict(
    org=[
        'OpenAI', 'Anthropic', 'Naver', 'Google', 'Microsoft', 'NVIDIA', 'MIT',
        'Facebook', 'Apple', 'Intel', 'Sony', 'Honeywell', 'Oracle', 'SenseTime',
    ],
    model=[
        'GPT-6', 'Claude-3.7', 'Genie', 'Assistant', 'Azure', 'Mini Cheetah',
        'Smart Compose',
    ],
    domain=['Healthcare', 'Fintech', 'Education', 'Transportation', 'Robotics'],
    topic=['Multimodal', 'RAG', 'Agents', 'Safety', 'Robotics'],
    event=['NeurIPS2025', 'GoogleIO', 'WWDC', 'MAX'],
    geo=['KR', 'US', 'EU', 'CN'],
    biz=['M&A', 'Funding', 'Earnings', 'Pricing', 'Hiring'],
    policy=['Regulation', 'Standard', 'Grant'],
)

# ---------------- Ollama tagging function ----------------
def get_tags_with_ollama(title, content, yake_keywords, vocab, model="gemma2:latest"):
    """Ask an Ollama chat model for 'category/Keyword' tags describing one article.

    Args:
        title: Article title, interpolated into the prompt.
        content: Article body or abstract, interpolated into the prompt.
        yake_keywords: Keywords shown to the model as extra context. An
            iterable of keywords is expected; a single string is treated as
            one keyword, and a non-iterable value (e.g. None or NaN) as
            "no keywords".
        vocab: Mapping of category name -> list of keywords, rendered into the
            prompt as one "category: kw1, kw2" line per category.
        model: Name of the Ollama model to query.

    Returns:
        list[str]: The stripped, non-empty pieces of the model reply, split on
        commas and line breaks. An empty list if anything in the call or the
        parsing raises (the error is printed, not re-raised).
    """
    vocab_text = "\n".join([f"{k}: {', '.join(v)}" for k, v in vocab.items()])

    # A bare string would otherwise be joined character by character.
    if isinstance(yake_keywords, str):
        yake_keywords = [yake_keywords]
    try:
        yake_text = ", ".join(str(kw) for kw in yake_keywords)
    except TypeError:
        # Not iterable (None, NaN, ...): tag without keyword context.
        yake_text = ""

    prompt = f"""
You are an expert tagger for AI-related articles. Your task is to generate relevant tags in the format 'category/keyword' based on the provided controlled vocabulary and YAKE keywords.

**Controlled Vocabulary**:
{vocab_text}

**YAKE Keywords** (for additional context):
{yake_text}

**Rules**:
1. Prioritize tags from the controlled vocabulary when the title or content matches exactly or closely.
2. If a YAKE keyword or content term doesn't match the vocabulary but is relevant, propose a new tag within allowed categories.
3. Capitalize keywords in tags for consistency.
4. Output only the tags, comma-separated, in the format 'category/Keyword'.

**Article**:
Title: {title}
Content: {content}

**Output**:
Comma-separated tags in the format 'category/Keyword'
"""

    try:
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        tags_text = response["message"]["content"].strip()
        # Treat line breaks as separators too, so a reply with one tag per
        # line does not collapse into a single tag with embedded newlines.
        pieces = tags_text.replace("\n", ",").split(",")
        return [piece.strip() for piece in pieces if piece.strip()]
    except Exception as e:
        # Best effort: one failed article must not abort the whole run.
        print(f"Error calling Ollama for '{title}': {e}")
        return []

# ---------------- Process a batch ----------------
def _normalize_keywords(raw):
    """Return `raw` as a list of keywords without ever executing it.

    A string is first parsed as a Python literal (e.g. "['a', 'b']" as written
    by DataFrame.to_csv); if that fails or does not produce a sequence, it is
    split on commas. Other iterables are copied into a list, and non-iterable
    values (None, NaN) become an empty list.
    """
    import ast  # local import: only needed for stringified keyword lists

    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            return [str(kw) for kw in parsed]
        return [part.strip() for part in raw.split(",") if part.strip()]
    try:
        return list(raw)
    except TypeError:
        return []


def process_batch(batch_rows):
    """Tag every article in one batch of rows.

    Args:
        batch_rows: DataFrame slice with 'Title' and 'Abstract' columns and an
            optional 'keywords' column (a list, or a string form of one).

    Returns:
        list: One entry per row, in row order, each being whatever
        get_tags_with_ollama returned for that row.
    """
    batch_tags = []
    # Per-batch progress bar, labelled with the number of rows in the batch
    for _, row in tqdm(batch_rows.iterrows(), total=len(batch_rows), desc=f"Tagging {len(batch_rows)} articles"):
        title = row['Title']
        abstract = row['Abstract']
        # Keywords may arrive as text (e.g. after a CSV round trip); parse them
        # as a literal rather than with eval(), which would run arbitrary code.
        yake_kw = _normalize_keywords(row.get('keywords', []))
        tags = get_tags_with_ollama(title, abstract, yake_kw, controlled_vocab)
        batch_tags.append(tags)
    return batch_tags

# ---------------- Parallel batch processing ----------------
def process_dataframe_parallel(df, batch_size=5, max_workers=2):
    """Tag all rows of `df` in batches on a thread pool and store them in df['tags'].

    Args:
        df: DataFrame to tag; it is split positionally into batches and gains
            (or has overwritten) a 'tags' column in place.
        batch_size: Maximum number of rows handed to one process_batch call.
        max_workers: Number of worker threads.

    Returns:
        The same `df` object, with 'tags' holding one entry per row in row
        order. Rows of a batch that raised, or that returned the wrong number
        of results, get an empty list.
    """
    # Split df into batches
    batches = [df.iloc[i:i+batch_size] for i in range(0, len(df), batch_size)]
    # One slot per batch, so results land at their original position no matter
    # which batch finishes first.
    tags_by_batch = [None] * len(batches)
    total_rows_processed = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_batch, batch): idx for idx, batch in enumerate(batches)}
        # as_completed yields futures in completion order, not submission order
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing {len(df)} rows in {len(batches)} batches"):
            idx = futures[future]
            batch_len = len(batches[idx])
            try:
                batch_tags = list(future.result())
                if len(batch_tags) != batch_len:
                    raise ValueError(
                        f"batch {idx} returned {len(batch_tags)} results for {batch_len} rows"
                    )
            except Exception as e:
                print(f"Batch processing error: {e}")
                # Keep the column aligned with df: failed rows get no tags.
                tags_by_batch[idx] = [[] for _ in range(batch_len)]
                continue
            tags_by_batch[idx] = batch_tags
            total_rows_processed += batch_len
            tqdm.write(f"Processed {total_rows_processed}/{len(df)} rows")

    df['tags'] = [tags for batch_tags in tags_by_batch for tags in batch_tags]
    return df

# ---------------- Usage ----------------
# Demo input: five hand-written articles, one (title, abstract, keywords)
# record per article, regrouped below into column-oriented form.
demo_articles = [
    (
        "OpenAI Unveils GPT-6 for Healthcare Applications",
        "OpenAI's GPT-6 is designed for healthcare, improving diagnostics with multimodal capabilities.",
        ['healthcare', 'diagnostics', 'multimodal'],
    ),
    (
        "Google's New AI Model at NeurIPS2025",
        "Google showcases a new AI model at NeurIPS2025, targeting education and transportation.",
        ['education', 'transportation', 'neural networks'],
    ),
    (
        "Naver Develops Robotics Solution for Fintech",
        "Naver's robotics solution integrates AI for fintech, launched in KR.",
        ['robotics', 'fintech', 'korea'],
    ),
    (
        "Microsoft Azure Enhances Multimodal AI",
        "Microsoft Azure's latest update supports multimodal AI for various domains.",
        ['multimodal', 'cloud computing'],
    ),
    (
        "Anthropic's Claude-3.7 Focuses on Safety",
        "Anthropic's Claude-3.7 emphasizes safety in AI agents.",
        ['safety', 'agents'],
    ),
]
sample_data = {
    'Title': [title for title, _, _ in demo_articles],
    'Abstract': [abstract for _, abstract, _ in demo_articles],
    'keywords': [keywords for _, _, keywords in demo_articles],
}

# NOTE(review): this rebinds `df`, replacing the frame read from
# ai_news_large.csv with the five demo rows - confirm that is intended,
# since the final cell writes `df` to ai_news_large_tagged.csv.
df = pd.DataFrame(sample_data)

# Tag the demo rows: batches of two articles on two worker threads
df = process_dataframe_parallel(df, batch_size=2, max_workers=2)

# Show each title next to the tags it received
print("\nResults:")
print(df[['Title', 'tags']].to_string(index=False))


Tagging 2 articles: 100%|██████████| 2/2 [01:56<00:00, 58.38s/it]


[A[A                                                               
[A                                                              
[A



Processed 2/5 rows


Tagging 2 articles: 100%|██████████| 2/2 [02:44<00:00, 82.13s/it], 116.76s/it][A[A
                                                         

Tagging 1 articles:   0%|          | 0/1 [00:47<?, ?it/s]                     
[A
[A

Processed 4/5 rows


In [None]:
# Write the current `df` to the output CSV without the index column.
# NOTE(review): when the cells run top to bottom, `df` at this point is the
# 5-row demo frame built in the tagging cell, not the frame read from
# ai_news_large.csv - confirm this is the frame that should be saved.
df.to_csv(r".\Data\Output\ai_news_large_tagged.csv", index=False)