In [None]:
import pandas as pd

# Forward slashes are used on purpose. Written with backslashes inside a normal
# string literal, the "\a" in "...\ai_news_large.csv" is read as the ASCII bell
# character, so the resulting path does not name the intended file. Forward
# slashes are accepted on Windows as well as on POSIX systems.
DATA_PATH = "./Data/ai_news_large.csv"

# Try UTF-8 first
try:
    df = pd.read_csv(DATA_PATH, encoding="utf-8")
except UnicodeDecodeError:
    # Fallback to latin-1 if utf-8 fails
    df = pd.read_csv(DATA_PATH, encoding="latin-1")

 - TF-IDF version 키워드 추출

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Combine title + abstract. Missing values are replaced by empty strings first:
# str + NaN evaluates to NaN, and TfidfVectorizer refuses NaN documents.
df['text_for_keywords'] = df['Title'].fillna('') + " " + df['Abstract'].fillna('')

# Fit TF-IDF on the whole corpus
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))  # unigrams + bigrams
tfidf_matrix = vectorizer.fit_transform(df['text_for_keywords'])
# Vocabulary terms; position i corresponds to column i of tfidf_matrix
feature_names = vectorizer.get_feature_names_out()

# Function to get top N keywords per article
def get_top_keywords_per_article(row_vec, feature_names, top_n=5):
    """Return the highest-scoring terms of a single document, best first.

    Args:
        row_vec: One row of the score matrix; any object exposing ``toarray()``.
        feature_names: Sequence mapping a column index to its term.
        top_n: Maximum number of terms to return.

    Returns:
        A list of at most ``top_n`` terms ordered by descending score. Terms
        whose score is zero are left out, so a document with few scored terms
        yields a shorter list instead of being padded with unrelated terms.
    """
    if top_n <= 0:
        # Guard needed because row.argsort()[-0:] selects every column, not none.
        return []
    row = row_vec.toarray().flatten()
    top_indices = row.argsort()[-top_n:][::-1]  # indices of top scores
    # A score of zero means the term does not occur in this document.
    return [feature_names[i] for i in top_indices if row[i] > 0]

# Walk the TF-IDF matrix one document at a time and collect a comma-separated
# keyword string for each; row order matches the DataFrame.
keyword_strings = []
for doc_idx in range(tfidf_matrix.shape[0]):
    top_terms = get_top_keywords_per_article(tfidf_matrix[doc_idx], feature_names, top_n=5)
    keyword_strings.append(", ".join(top_terms))
df['keywords'] = keyword_strings

# Show the first rows next to their extracted keywords
preview_columns = ['Title', 'Abstract', 'keywords']
print(df[preview_columns].head())


                                               Title  \
0  Google discontinues Clips, the AI-powered came...   
1  AI can help doctors spot brain hemorrhages faster   
2  Pentagon's draft AI ethics guidelines fight bi...   
3  Google Says New AI-Powered Search Update Is 'H...   
4  Sonar drone helps find a WWII Japanese aircraf...   

                                            Abstract  \
0  While Google was busy showcasing its latest de...   
1  AI is already capable of discovering medical c...   
2  Tech companies might have trouble establishing...   
3  Google is injecting its search engine with new...   
4  The late Paul Allen's underwater robotics are ...   

                                            keywords  
0  clips, camera, google, users meant, promised r...  
1  brain hemorrhages, hemorrhages, brain, accurac...  
2  trouble establishing, trouble, companies troub...  
3  search, google, says new, queries handles, des...  
4  carrier, japanese, aircraft, aircraft carrier,..

In [9]:
import pandas as pd

# Remove the column-width cap so long cell values are printed in full.
# This changes the pandas display setting for the rest of the session.
pd.options.display.max_colwidth = None

# Print the complete 'keywords' column
print(df.loc[:, 'keywords'])



0                          clips, camera, google, users meant, promised record
1           brain hemorrhages, hemorrhages, brain, accuracy, radiologists test
2      trouble establishing, trouble, companies trouble, rogue, pentagon draft
3             search, google, says new, queries handles, described significant
4             carrier, japanese, aircraft, aircraft carrier, japanese aircraft
                                        ...                                   
142       opioid, drug diversion, stealing drugs, stealing, pharmacists nurses
143                           alphabet, boston dynamics, boston, dynamics, day
144                 steps, seeking, searchers seeking, searchers, simple steps
145                        multi planet, multi, planet systems, planet, kepler
146                                  pixel, pixel like, older, compact, phones
Name: keywords, Length: 147, dtype: object


- Yake 키워드 추출

In [10]:
## yake 
import pandas as pd
import yake

# Rebuild the combined title + abstract text (nothing is read from disk here).
# Missing values become empty strings because str + NaN evaluates to NaN,
# which is not text the extractor can work on.
df['text_for_keywords'] = df['Title'].fillna('') + " " + df['Abstract'].fillna('')


# YAKE settings
language = "en"
max_ngram_size = 3  # can extract up to 3-word phrases
num_keywords = 5    # top 5 keywords per article
dedup_threshold = 0.9  # avoid very similar keywords

# Initialize YAKE extractor
yake_extractor = yake.KeywordExtractor(lan=language,
                                       n=max_ngram_size,
                                       dedupLim=dedup_threshold,
                                       top=num_keywords,
                                       features=None)

# Extract keywords per article, keeping only the phrase of each (phrase, score) pair.
# NOTE: this overwrites df['keywords']; from here on the column holds a list of
# YAKE phrases per row rather than a comma-separated string.
df['keywords'] = df['text_for_keywords'].apply(
    lambda x: [kw for kw, _score in yake_extractor.extract_keywords(x)]
)

# Optionally convert to comma-separated string
df['keywords_str'] = df['keywords'].apply(lambda kws: ", ".join(kws))

# Preview
print(df[['Title', 'Abstract', 'keywords_str']].head())

                                                                 Title  \
0    Google discontinues Clips, the AI-powered camera you forgot about   
1                    AI can help doctors spot brain hemorrhages faster   
2  Pentagon's draft AI ethics guidelines fight bias and rogue machines   
3      Google Says New AI-Powered Search Update Is 'Huge Step Forward'   
4              Sonar drone helps find a WWII Japanese aircraft carrier   

                                                                                                                                                                                                                                                                                                                                                                                                                                                               Abstract  \
0  While Google was busy showcasing its latest devices yesterday, it was also, more quietly, pul

In [11]:
# Display the keyword column as the cell's output value
df.loc[:, 'keywords']

0                          [latest devices yesterday, Google discontinues Clips, devices yesterday, pulling the plug, busy showcasing]
1                 [discovering medical conditions, doctors spot brain, brain hemorrhages faster, spot brain hemorrhages, doctors spot]
2      [rogue machines Tech, machines Tech companies, ethics guidelines fight, guidelines fight bias, trouble establishing groundwork]
3                                        [Huge Step Forward, AI-Powered Search Update, change top executives, Huge Step, Step Forward]
4          [Paul Allen underwater, late Paul Allen, Allen underwater robotics, discovering long-lost warships, WWII Japanese aircraft]
                                                                    ...                                                               
142                                         [Big Tech tackles, diversion opioid crisis, Big Tech, drug diversion opioid, Tech tackles]
143          [performing assistive tasks, Boston Dynami

- Ollama 와 Instruction table (from Notion) 이용한 태깅 

In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import ollama

# ---------------- Controlled Vocabulary ----------------
# Maps each tag category to the keywords preferred for it. Tags are written as
# 'category/Keyword'. 'Robotics' is listed under both 'domain' and 'topic'.
controlled_vocab = dict(
    org=[
        'OpenAI', 'Anthropic', 'Naver', 'Google', 'Microsoft', 'NVIDIA', 'MIT',
        'Facebook', 'Apple', 'Intel', 'Sony', 'Honeywell', 'Oracle', 'SenseTime',
    ],
    model=[
        'GPT-6', 'Claude-3.7', 'Genie', 'Assistant', 'Azure',
        'Mini Cheetah', 'Smart Compose',
    ],
    domain=['Healthcare', 'Fintech', 'Education', 'Transportation', 'Robotics'],
    topic=['Multimodal', 'RAG', 'Agents', 'Safety', 'Robotics'],
    event=['NeurIPS2025', 'GoogleIO', 'WWDC', 'MAX'],
    geo=['KR', 'US', 'EU', 'CN'],
    biz=['M&A', 'Funding', 'Earnings', 'Pricing', 'Hiring'],
    policy=['Regulation', 'Standard', 'Grant'],
)

# ---------------- Ollama tagging function ----------------
def get_tags_with_ollama(title, content, yake_keywords, vocab, model="gemma2:latest"):
    """Ask an Ollama chat model for 'category/Keyword' tags describing one article.

    Args:
        title: Article title, inserted into the prompt.
        content: Article body or abstract, inserted into the prompt.
        yake_keywords: Keywords given to the model as extra context. May be a
            list/iterable of keywords, a single string, or a missing value
            (None / NaN); anything that cannot be iterated is treated as empty.
        vocab: Mapping of category name -> list of preferred keywords.
        model: Name of the Ollama model to query.

    Returns:
        A list of tag strings. Only pieces of the reply that contain a '/' are
        kept. If the Ollama call fails, the error is printed and an empty list
        is returned (best-effort behaviour).
    """
    vocab_text = "\n".join(f"{k}: {', '.join(v)}" for k, v in vocab.items())

    # Normalise the keywords before joining. A bare string would otherwise be
    # joined character by character, and None/NaN would raise here, outside the
    # try block below, and abort the caller's whole batch.
    if isinstance(yake_keywords, str):
        keyword_items = [yake_keywords]
    else:
        try:
            keyword_items = list(yake_keywords)
        except TypeError:
            keyword_items = []
    yake_text = ", ".join(str(kw).strip() for kw in keyword_items if str(kw).strip())

    prompt = f"""
You are an expert tagger for AI-related articles. Your task is to generate relevant tags in the format 'category/keyword' based on the provided controlled vocabulary and YAKE keywords.

**Controlled Vocabulary**:
{vocab_text}

**YAKE Keywords** (for additional context):
{yake_text}

**Rules**:
1. Prioritize tags from the controlled vocabulary when the title or content matches exactly or closely.
2. If a YAKE keyword or content term doesn't match the vocabulary but is relevant, propose a new tag within allowed categories.
3. Capitalize keywords in tags for consistency.
4. Output only the tags, comma-separated, in the format 'category/Keyword'.

**Article**:
Title: {title}
Content: {content}

**Output**:
Comma-separated tags in the format 'category/Keyword'
"""

    try:
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        tags_text = response["message"]["content"].strip()
        tags = []
        # The reply is not guaranteed to be a single comma-separated line, so
        # line breaks are treated as separators too.
        for piece in tags_text.replace("\n", ",").split(","):
            # Drop markdown decoration (bullets, quotes, backticks, emphasis).
            tag = piece.strip(" \t\r`'\"*-•")
            # The requested format is 'category/Keyword'; anything without a
            # slash is surrounding chatter rather than a tag.
            if tag and "/" in tag:
                tags.append(tag)
        return tags
    except Exception as e:
        print(f"Error calling Ollama for '{title}': {e}")
        return []

# ---------------- Process a batch ----------------
def process_batch(batch_rows):
    """Tag every article in a slice of the DataFrame.

    Args:
        batch_rows: DataFrame slice with 'Title' and 'Abstract' columns and an
            optional 'keywords' column. A keywords cell may be a list, a
            stringified list (e.g. "['a', 'b']"), a comma-separated string,
            or a missing value.

    Returns:
        A list with one entry per row of ``batch_rows``, in row order; each
        entry is the list of tags returned by ``get_tags_with_ollama``.
    """
    import ast  # local import: only used to parse stringified keyword lists

    batch_tags = []
    for _, row in batch_rows.iterrows():
        title = row['Title']
        abstract = row['Abstract']
        yake_kw = row.get('keywords', [])
        if isinstance(yake_kw, str):
            raw_kw = yake_kw
            # literal_eval accepts Python literals only. eval() would execute
            # whatever expression happens to be stored in the data.
            try:
                yake_kw = ast.literal_eval(raw_kw)
            except (ValueError, SyntaxError, TypeError):
                yake_kw = None
            if not isinstance(yake_kw, (list, tuple, set)):
                # Not a list literal: treat the text as comma-separated keywords.
                yake_kw = raw_kw.split(",")
        else:
            # Missing values (None / NaN) are not iterable and become an empty list.
            try:
                yake_kw = list(yake_kw)
            except TypeError:
                yake_kw = []
        # Keep clean, non-empty strings only so the keywords can be joined safely.
        yake_kw = [str(kw).strip() for kw in yake_kw if str(kw).strip()]
        tags = get_tags_with_ollama(title, abstract, yake_kw, controlled_vocab)
        batch_tags.append(tags)
    return batch_tags

# ---------------- Parallel batch processing ----------------
def process_dataframe_parallel(df, batch_size=5, max_workers=2):
    """Tag all rows of ``df`` in parallel batches and store them in ``df['tags']``.

    Args:
        df: DataFrame to tag; it is modified in place (a 'tags' column is set).
        batch_size: Number of rows handed to each ``process_batch`` call.
        max_workers: Number of worker threads.

    Returns:
        The same DataFrame, with one list of tags per row in 'tags'. Rows of a
        batch that failed get an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Split df into batches
    batches = [df.iloc[i:i+batch_size] for i in range(0, len(df), batch_size)]
    # One slot per batch, pre-filled with an empty tag list per row, so a failed
    # batch still contributes the right number of entries.
    results = [[[] for _ in range(len(batch))] for batch in batches]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which batch each future belongs to: as_completed yields
        # futures in completion order, not in submission order.
        futures = {executor.submit(process_batch, batch): idx
                   for idx, batch in enumerate(batches)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing batches"):
            idx = futures[future]
            try:
                batch_tags = future.result()
            except Exception as e:
                print(f"Batch processing error: {e}")
                continue
            if len(batch_tags) == len(batches[idx]):
                results[idx] = batch_tags
            else:
                # Keep the placeholders rather than misalign tags and rows.
                print(f"Batch processing error: batch {idx} returned "
                      f"{len(batch_tags)} results for {len(batches[idx])} rows")

    # Flatten in batch order so entry k belongs to row k of df.
    df['tags'] = [tags for batch_tags in results for tags in batch_tags]
    return df

# ---------------- Usage ----------------
# Tag the DataFrame that is already in memory, five rows per batch on two threads.
df = process_dataframe_parallel(df, max_workers=2, batch_size=5)

# Show the first titles together with their tags
print(df.loc[:, ['Title', 'tags']].head())
