In [9]:
import pandas as pd

# Read the article CSV as UTF-8; if its bytes are not valid UTF-8,
# read the same file again decoding as latin-1 instead.
csv_path = "ai_news_large.csv"
try:
    df = pd.read_csv(csv_path, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding="latin-1")

In [10]:
# Display the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0.1,Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,3958,N2804,news,newsscienceandtechnology,"Google discontinues Clips, the AI-powered came...",While Google was busy showcasing its latest de...,https://assets.msn.com/labs/mind/AAIT1gq.html,"[{""Label"": ""Google"", ""Type"": ""O"", ""WikidataId""...","[{""Label"": ""Google Store"", ""Type"": ""O"", ""Wikid..."
1,8804,N96829,news,newsscienceandtechnology,AI can help doctors spot brain hemorrhages faster,AI is already capable of discovering medical c...,https://assets.msn.com/labs/mind/AAJgflU.html,"[{""Label"": ""Artificial intelligence"", ""Type"": ...","[{""Label"": ""University of California, San Fran..."
2,12538,N118770,news,newsscienceandtechnology,Pentagon's draft AI ethics guidelines fight bi...,Tech companies might have trouble establishing...,https://assets.msn.com/labs/mind/AAJKAXe.html,"[{""Label"": ""Artificial intelligence"", ""Type"": ...","[{""Label"": ""Artificial intelligence"", ""Type"": ..."
3,12690,N110386,news,newsscienceandtechnology,Google Says New AI-Powered Search Update Is 'H...,Google is injecting its search engine with new...,https://assets.msn.com/labs/mind/AAJknM1.html,"[{""Label"": ""Google"", ""Type"": ""O"", ""WikidataId""...","[{""Label"": ""Google"", ""Type"": ""O"", ""WikidataId""..."
4,16892,N83002,news,newsscienceandtechnology,Sonar drone helps find a WWII Japanese aircraf...,The late Paul Allen's underwater robotics are ...,https://assets.msn.com/labs/mind/AAJ5fru.html,"[{""Label"": ""World War II"", ""Type"": ""E"", ""Wikid...","[{""Label"": ""Percival Petrel"", ""Type"": ""V"", ""Wi..."
...,...,...,...,...,...,...,...,...,...
142,288430,N51029,news,newsscienceandtechnology,Big Tech tackles the drug diversion opioid cri...,"Some 10% of pharmacists, nurses, and anesthesi...",https://assets.msn.com/labs/mind/BBWZFbL.html,"[{""Label"": ""Big Four tech companies"", ""Type"": ...",[]
143,288877,N105010,news,newsscienceandtechnology,Alphabet's rebooted robotics program starts wi...,For all the advances made by robot companies l...,https://assets.msn.com/labs/mind/BBXa8XG.html,"[{""Label"": ""Alphabet Inc."", ""Type"": ""O"", ""Wiki...","[{""Label"": ""Alphabet Inc."", ""Type"": ""O"", ""Wiki..."
144,290798,N78685,news,newsscienceandtechnology,2 Simple Steps to Withstand Google's Constant ...,You must do the best job on the internet at de...,https://assets.msn.com/labs/mind/BBWsVjJ.html,"[{""Label"": ""Google"", ""Type"": ""O"", ""WikidataId""...",[]
145,291546,N86909,video,science,Wild Animation Shows All the Multi-Planet Syst...,The tiny dots represent all the multi-planet s...,https://assets.msn.com/labs/mind/BBX3Lcd.html,[],"[{""Label"": ""NASA"", ""Type"": ""O"", ""WikidataId"": ..."


In [11]:
## yake 
import pandas as pd
import yake

# Build the text YAKE reads: title and abstract joined by a single space.
# Missing values are replaced with "" first -- `str + NaN` evaluates to NaN,
# which would hand the extractor a float and drop the title of any article
# that has no abstract. The final strip removes the dangling separator that
# is left when one side is empty.
df['text_for_keywords'] = (
    df['Title'].fillna("") + " " + df['Abstract'].fillna("")
).str.strip()


# YAKE settings
language = "en"
max_ngram_size = 3  # can extract up to 3-word phrases
num_keywords = 5    # top 5 keywords per article
dedup_threshold = 0.9  # avoid very similar keywords

# Initialize YAKE extractor
yake_extractor = yake.KeywordExtractor(lan=language,
                                       n=max_ngram_size,
                                       dedupLim=dedup_threshold,
                                       top=num_keywords,
                                       features=None)

# Extract keywords per article; extract_keywords yields (keyword, score)
# pairs and only the keyword text is kept.
df['keywords'] = df['text_for_keywords'].apply(
    lambda text: [kw for kw, _score in yake_extractor.extract_keywords(text)]
)

# Comma-separated string form of the same keywords
df['keywords_str'] = df['keywords'].apply(lambda kws: ", ".join(kws))

# Preview
print(df[['Title', 'Abstract', 'keywords_str']].head())

                                               Title  \
0  Google discontinues Clips, the AI-powered came...   
1  AI can help doctors spot brain hemorrhages faster   
2  Pentagon's draft AI ethics guidelines fight bi...   
3  Google Says New AI-Powered Search Update Is 'H...   
4  Sonar drone helps find a WWII Japanese aircraf...   

                                            Abstract  \
0  While Google was busy showcasing its latest de...   
1  AI is already capable of discovering medical c...   
2  Tech companies might have trouble establishing...   
3  Google is injecting its search engine with new...   
4  The late Paul Allen's underwater robotics are ...   

                                        keywords_str  
0  latest devices yesterday, Google discontinues ...  
1  discovering medical conditions, doctors spot b...  
2  rogue machines Tech, machines Tech companies, ...  
3  Huge Step Forward, AI-Powered Search Update, c...  
4  Paul Allen underwater, late Paul Allen, Allen ..

In [12]:
# Inspect the first row, including the text_for_keywords / keywords / keywords_str columns.
df.iloc[0]

Unnamed: 0                                                        3958
News ID                                                          N2804
Category                                                          news
SubCategory                                   newsscienceandtechnology
Title                Google discontinues Clips, the AI-powered came...
Abstract             While Google was busy showcasing its latest de...
URL                      https://assets.msn.com/labs/mind/AAIT1gq.html
Title Entities       [{"Label": "Google", "Type": "O", "WikidataId"...
Abstract Entities    [{"Label": "Google Store", "Type": "O", "Wikid...
text_for_keywords    Google discontinues Clips, the AI-powered came...
keywords             [latest devices yesterday, Google discontinues...
keywords_str         latest devices yesterday, Google discontinues ...
Name: 0, dtype: object

In [13]:
import pandas as pd
import ollama
import json
import re
from concurrent.futures import ThreadPoolExecutor

# -----------------------------
# Classify a batch of articles
# -----------------------------
def _format_articles(df_chunk):
    """Render the rows of ``df_chunk`` as the numbered article list for the prompt.

    Articles are numbered by position (1..n) rather than by index label, so the
    ids match the example in the prompt and can be mapped back to rows no
    matter what index the chunk carries.
    """
    entries = []
    for position, (_, row) in enumerate(df_chunk.iterrows(), start=1):
        # Convert Series to dict for safe access
        row_dict = row.to_dict()
        keywords = row_dict.get("keywords", "")
        if isinstance(keywords, list):
            keywords_str = ", ".join(str(k) for k in keywords)
        elif isinstance(keywords, str):
            keywords_str = keywords
        else:
            keywords_str = ""

        entries.append(f"""
{position}. Title: "{row_dict['Title']}"
     Abstract: "{row_dict['Abstract']}"
     Keywords: "{keywords_str}"
""")
    return "".join(entries)


def _parse_categories(content, expected):
    """Turn the model's reply into exactly ``expected`` category strings.

    Strategy, in order:
      1. Parse the outermost ``[...]`` span as JSON (this also tolerates
         Markdown code fences or stray prose around the list) and place each
         category at the position named by its ``id``; positions the model
         did not answer become "Uncategorized".
      2. If no usable ids were found, accept the categories in list order,
         but only when their count matches ``expected``.
      3. Fall back to a regex scan for ``"category": "..."`` pairs, again
         only when the count matches.
      4. Otherwise return "Uncategorized" for every article.
    """
    start, end = content.find("["), content.rfind("]")
    if start != -1 and end > start:
        try:
            parsed = json.loads(content[start:end + 1])
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, list):
            items = [
                item for item in parsed
                if isinstance(item, dict) and isinstance(item.get("category"), str)
            ]
            by_position = {}
            for item in items:
                try:
                    position = int(item.get("id"))
                except (TypeError, ValueError):
                    continue
                if 1 <= position <= expected:
                    # Keep the first answer if the model repeats an id.
                    by_position.setdefault(position, item["category"])
            if by_position:
                return [
                    by_position.get(position, "Uncategorized")
                    for position in range(1, expected + 1)
                ]
            if len(items) == expected:
                return [item["category"] for item in items]

    categories = re.findall(r'"category"\s*:\s*"([^"]+)"', content)
    if len(categories) == expected:
        return categories
    return ["Uncategorized"] * expected


def classify_batch(df_chunk):
    """Classify every article in ``df_chunk`` with one Ollama chat call.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        Rows to classify. Reads the "Title" and "Abstract" columns and, when
        present, "keywords" (a list or a string).

    Returns
    -------
    list of str
        One category per row, in row order, always ``len(df_chunk)`` long.
        Rows the reply does not cover, or every row when the reply cannot be
        parsed, are labelled "Uncategorized". An empty chunk returns ``[]``
        without calling the model.
    """
    expected = len(df_chunk)
    if expected == 0:
        return []

    articles_text = _format_articles(df_chunk)

    # Full prompt (no shortening)
    prompt = f"""
You are a professional news analyst. 
Your task is to classify each news article into **exactly one primary category** 
out of the six predefined categories below. 

Use both the **title**, **abstract**, and **keywords** to decide, 
and follow the **priority rules** when multiple aspects are present.  

---

### Categories and Rules:

1. Research (학술)
- 포함: 논문, 프리프린트, 학회 채택/수상, 벤치마크·데이터셋 공개.
- 제외: 제품 릴리스 노트(→ Technology).
- 경계 규칙: “학회/논문 성과가 리드”면 Research 우선.

2. Technology & Product (기술/제품)
- 포함: 모델/제품 릴리스, 성능 업데이트, 모델 카드/리드미 변경, 기능 개선.
- 제외: 자금/거래 중심 기사(→ Market & Corporate).
- 경계 규칙: “연구 성과”보다 “제품·기능” 전달이 리드면 여기.

3. Market & Corporate (시장/기업)
- 포함: 투자·M&A·IPO·실적, 리더십/조직개편, 제휴·상용화 계약, 상업 로드맵.
- 제외: 공공 규제·지원(→ Policy & Regulation).
- 경계 규칙: 금액·거래·실적·지배구조가 리드면 여기.

4. Policy & Regulation (정책/규제)
- 포함: 법·규제·가이드라인, 공공자금(보조금·RFP), 수출통제, 표준화·거버넌스.
- 제외: 기업 자체 정책(가격·상업 전략, → Market & Corporate).
- 경계 규칙: 공공 주체의 룰/지원이 핵심이면 여기.

5. Society & Culture (사회/문화)
- 포함: 대중 활용 트렌드, 창작·교육·밈, 저작권/윤리 공론의 사회적 논의(정책화 전 단계).
- 제외: 순수 기술 업데이트(→ Technology), 입법·규제(→ Policy).
- 경계 규칙: “사회적 파급·수용”이 리드면 여기.

6. Incidents & Safety (사건/안전/운영)
- 포함: 서비스 장애, 보안 사고/유출, 모델 오남용/중대한 안전 이슈, 리콜/중단.
- 제외: 일반 제품 릴리스(→ Technology), 법·제재(→ Policy).
- 경계 규칙: “사건/사고/리스크 대응”이 중심이면 여기.

---

### Priority Rules (apply in order if overlaps):
1. Incidents & Safety
2. Policy & Regulation
3. Market & Corporate
4. Research
5. Technology & Product
6. Society & Culture

---

### Output format:
Return ONLY a valid JSON list, with no extra text, like this:
[
  {{"id": 1, "category": "Research"}},
  {{"id": 2, "category": "Policy & Regulation"}}
]

Here are the articles:
{articles_text}
"""

    # Call Ollama
    response = ollama.chat(
        model="gemma3:4b",
        messages=[{"role": "user", "content": prompt}]
    )

    content = response["message"]["content"].strip()

    # Always hand back exactly one label per row so callers can assign the
    # result to a column without a length mismatch.
    return _parse_categories(content, expected)


# -----------------------------
# Helper to split dataframe into batches
# -----------------------------
def chunkify(df, chunk_size=5):
    """Split ``df`` into consecutive positional slices of at most ``chunk_size`` rows.

    Returns a list of DataFrame slices in row order; the last one may be
    shorter. An empty frame yields an empty list.
    """
    pieces = []
    for start in range(0, len(df), chunk_size):
        stop = start + chunk_size
        pieces.append(df.iloc[start:stop])
    return pieces


# -----------------------------
# Threaded batch classification
# -----------------------------
def classify_dataframe_batch_parallel_thread(df, chunk_size=5, max_workers=2):
    """Classify every row of ``df`` in batches, running batches on a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles to classify. Modified in place: a "category" column is added
        (or overwritten).
    chunk_size : int
        Number of rows sent to ``classify_batch`` per call.
    max_workers : int
        Number of worker threads.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the "category" column filled in.
    """
    import warnings  # local import: only needed for the mismatch warning below

    chunks = chunkify(df, chunk_size)
    categories = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so zipping pairs
        # each chunk with its own result.
        for chunk, batch_categories in zip(chunks, executor.map(classify_batch, chunks)):
            batch_categories = list(batch_categories)
            if len(batch_categories) != len(chunk):
                # A wrong-length batch would make the column assignment below
                # raise after every model call has already been paid for, so
                # mark that batch as unclassified instead of guessing at the
                # alignment.
                warnings.warn(
                    f"classify_batch returned {len(batch_categories)} labels "
                    f"for {len(chunk)} rows; marking the batch as Uncategorized"
                )
                batch_categories = ["Uncategorized"] * len(chunk)
            categories.extend(batch_categories)

    df["category"] = categories
    return df


# -----------------------------
# Single-row test helper
# -----------------------------
def classify_single_row(df, row_index=0):
    """Classify the single row at position ``row_index`` and report the result.

    The row is wrapped in a one-row DataFrame and passed to ``classify_batch``;
    the title and predicted category are printed, and the category is returned.
    """
    target = df.iloc[row_index]
    single_row_frame = pd.DataFrame([target])
    predictions = classify_batch(single_row_frame)  # list with one entry
    predicted = predictions[0]
    print(
        f"Title: {target['Title']}",
        f"Predicted category: {predicted}",
        sep="\n",
    )
    return predicted



# Run the threaded batch classifier over every row using its default
# chunk_size and max_workers; the returned frame carries a "category" column.
df = classify_dataframe_batch_parallel_thread(df)  # classify all articles
# print(df.head())


In [15]:
# Count how many articles were assigned to each predicted category.
df['category'].value_counts()

category
Technology & Product    43
Society & Culture       33
Market & Corporate      22
Uncategorized           20
Research                16
Policy & Regulation      8
Incidents & Safety       5
Name: count, dtype: int64

- TODO: move the `Uncategorized` rows into a `Misc` category.

In [18]:
# Rows the classifier could not label, limited to the columns needed for review.
df.loc[df['category'] == "Uncategorized", ['Title', 'Abstract', 'category', 'keywords']]

Unnamed: 0,Title,Abstract,category,keywords
15,Facebook emphasizes women's safety on social m...,"It's using AI, machine learning and policy to ...",Uncategorized,"[Facebook emphasizes women, emphasizes women s..."
16,Activists warn UN about dangers of using AI to...,A Nobel Peace prize winner has warned against ...,Uncategorized,"[Nobel Peace prize, Peace prize winner, Nobel ..."
17,Google researchers taught an AI to recognize s...,"For decades, perfumers and scientists have str...",Uncategorized,"[smells For decades, Google researchers taught..."
18,These Researchers Are Using AI Drones to More ...,Researchers are looking to new advances in com...,Uncategorized,"[Safely Track Wildlife, Track Wildlife Researc..."
19,Microsoft is poised to add machine-reading res...,Microsoft looks ready to commercialize more of...,Uncategorized,"[Microsoft Search Microsoft, comprehension cap..."
110,Humana partners with Microsoft to 'reimagine h...,Humana will utilize the Washington-based tech ...,Uncategorized,"[giant Azure cloud, Washington-based tech gian..."
111,Facebook AI can 'hide' people from facial reco...,Facebook has already stopped using facial reco...,Uncategorized,"[facial recognition Facebook, facial recogniti..."
112,Mark Zuckerberg Teases AI 'Brain Chip' But It ...,If Silicon Valley were to put a team of tech b...,Uncategorized,"[Mark Zuckerberg Teases, man Elon Musk, Mark Z..."
113,Healthcare risk algorithm had 'significant rac...,There's more evidence of algorithms demonstrat...,Uncategorized,"[significant racial bias, demonstrating racial..."
114,Microsoft to Attend Pentagon Summit on Project...,The Defense Department is talking to companies...,Uncategorized,"[Attend Pentagon Summit, artificial intelligen..."


In [None]:
#df.to_csv(r".\Data\MINDS\ai_news_categorized.csv", index=False)