In [1]:
import os
import pandas as pd
import numpy as np
from keybert import KeyBERT
from kiwipiepy import Kiwi
from transformers import BertModel
from tqdm import tqdm
import json

model = BertModel.from_pretrained('skt/kobert-base-v1')
kw_model = KeyBERT(model)
kiwi = Kiwi()

np.random.seed(123)

In [2]:
data_dir = "../Data/News/Processed"
file_dirs = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith('.csv')]

file_dirs

['../Data/News/Processed\\005930_2019.csv',
 '../Data/News/Processed\\005930_2020.csv',
 '../Data/News/Processed\\005930_2021.csv',
 '../Data/News/Processed\\005930_2022.csv',
 '../Data/News/Processed\\005930_2023.csv',
 '../Data/News/Processed\\005930_2024.csv',
 '../Data/News/Processed\\005930_2025.csv']

Keyword Categories

In [3]:
category_dict = {}

def Count_dict(dictionary: dict, entity: str) -> dict:
    if entity not in dictionary:
        dictionary[entity] = 1
    else:
        dictionary[entity] += 1
    return dictionary

for file in file_dirs:
    df = pd.read_csv(file, encoding="utf-8")
    if "Section" in df.columns:
        sections = df["Section"].tolist()
        for entity in sections:
            if isinstance(entity, str):
                cats = entity.split("/")
                for e in cats:
                    category_dict = Count_dict(category_dict, e.strip())
                    

total_count = sum(category_dict.values())
for key in category_dict:
    category_dict[key] = category_dict[key] / total_count

category_dict

{'경제': 0.46494109651110105,
 'IT': 0.16740296027790363,
 '정치': 0.062415043044857274,
 '세계': 0.09052635553541762,
 '사회': 0.187320646428032,
 '생활': 0.021956653073553843,
 '오피니언': 0.0037758646730101193,
 '총선': 0.0016613804561244525}

In [4]:
ID_dict = {}

for file in file_dirs:
    df = pd.read_csv(file, encoding="utf-8")
    if "Section" in df.columns and "ID" in df.columns:
        for idx, row in df.iterrows():
            article_id = row["ID"]
            section = row["Section"]
            if isinstance(section, str):
                cats = section.split("/")
                for cat in cats:
                    cat = cat.strip()
                    if cat:
                        if cat not in ID_dict:
                            ID_dict[cat] = []
                        ID_dict[cat].append(article_id)
                        
# ID_dict

In [5]:
select_N = 5000

sample_counts = {}
for key, ratio in category_dict.items():
    sample_counts[key] = int(np.ceil(ratio * select_N))

Selected_ID_dict = {}
for key in ID_dict:
    n_available = len(ID_dict[key])
    n_sample = min(n_available, sample_counts.get(key, 0))
    if n_sample > 0:
        selected_ids = np.random.choice(ID_dict[key], n_sample, replace=False)
        Selected_ID_dict[key] = selected_ids.tolist()
    else:
        Selected_ID_dict[key] = []

# Selected_ID_dict

In [None]:
select = 'Title'

In [None]:
selected_ids_all = set()
for ids in Selected_ID_dict.values():
    selected_ids_all.update(ids)

print("전체 샘플링된 기사 ID 개수:", len(selected_ids_all))

result_keywords_by_category = {cat: {} for cat in ID_dict.keys()}

for file in file_dirs:
    df = pd.read_csv(file, encoding="utf-8")
    if "Body" in df.columns and "ID" in df.columns and "Section" in df.columns:
        sampled_df = df[df["ID"].isin(selected_ids_all)]
        print(f"파일: {os.path.basename(file)}, 샘플링된 기사 개수: {len(sampled_df)}")
        for index, row in tqdm(sampled_df.iterrows(), total=len(sampled_df)):
            text = row[select]
            section = row["Section"]
            if isinstance(text, str) and isinstance(section, str):
                nouns_list = []
                for sentence in kiwi.analyze(text):
                    for token in sentence[0]:
                        if token.tag.startswith('NN'):
                            nouns_list.append(token.form)
                if nouns_list:
                    result_text = ' '.join(nouns_list)
                    keywords = kw_model.extract_keywords(result_text,
                                                         keyphrase_ngram_range=(1, 1),
                                                         stop_words=None,
                                                         top_n=20)
                    categories = [cat.strip() for cat in section.split("/") if cat.strip()] 
                    for cat in categories:
                        if cat in result_keywords_by_category:
                            for kw, score in keywords:
                                result_keywords_by_category[cat][kw] = result_keywords_by_category[cat].get(kw, 0) + score

for cat, kw_dict in result_keywords_by_category.items():
    sorted_kw = dict(sorted(kw_dict.items(), key=lambda x: x[1], reverse=True))
    result_keywords_by_category[cat] = sorted_kw

print("카테고리별 누적 키워드 점수:")
for cat, kw_dict in result_keywords_by_category.items():
    print(f"{cat}: {kw_dict}")


with open(f'../Data/News/Keywords/keywords_by_category_{select.lower()}.json', 'w', encoding='utf-8') as f:
    json.dump(result_keywords_by_category, f, ensure_ascii=False, indent=4)

전체 샘플링된 기사 ID 개수: 4762
파일: 005930_2019.csv, 샘플링된 기사 개수: 760


100%|██████████| 760/760 [00:12<00:00, 61.82it/s]


파일: 005930_2020.csv, 샘플링된 기사 개수: 723


100%|██████████| 723/723 [00:10<00:00, 67.12it/s]


파일: 005930_2021.csv, 샘플링된 기사 개수: 655


100%|██████████| 655/655 [00:09<00:00, 71.07it/s]


파일: 005930_2022.csv, 샘플링된 기사 개수: 695


100%|██████████| 695/695 [00:09<00:00, 73.23it/s]


파일: 005930_2023.csv, 샘플링된 기사 개수: 765


100%|██████████| 765/765 [00:10<00:00, 72.88it/s]


파일: 005930_2024.csv, 샘플링된 기사 개수: 942


100%|██████████| 942/942 [00:13<00:00, 69.85it/s]


파일: 005930_2025.csv, 샘플링된 기사 개수: 222


100%|██████████| 222/222 [00:03<00:00, 68.43it/s]

카테고리별 누적 키워드 점수:
경제: {'삼성전자': 579.2688000000005, '삼성': 400.21260000000007, '종합': 251.9826999999999, '반도체': 233.44389999999987, '코스피': 204.47390000000001, '갤럭시': 179.5323, '이재용': 137.27729999999997, '출시': 99.48790000000001, '기업': 98.24300000000005, '미국': 89.5876999999999, '부회장': 85.58270000000003, '한국': 80.13739999999999, '중국': 62.4724, '회장': 62.393199999999965, '코스닥': 60.04210000000001, '분기': 54.6892, '상승': 50.9525, '하락': 48.66910000000001, '연합뉴스': 45.074400000000054, '공개': 45.05469999999999, '투자': 40.3927, '기술': 40.296399999999984, '일본': 39.84719999999999, '공장': 39.357499999999995, '스마트폰': 38.1654, '실적': 36.86679999999999, '국내': 36.84010000000001, '이건희': 36.697300000000006, '헤드라인': 36.39999999999996, '기관': 34.88019999999999, 'sk하이닉스': 34.81510000000001, '제품': 33.896, '시각': 32.80000000000003, '코로나': 32.4372, '매수': 32.3855, '글로벌': 32.379000000000005, '최대': 31.649900000000017, '세계': 31.6132, '특징': 31.4739, '작년': 31.042099999999994, '시장': 30.95980000000001, '초반': 30.728200000000005, '참석':




In [None]:
keyword_to_categories = {}
for category, kw_dict in result_keywords_by_category.items():
    for kw in kw_dict:
        if kw not in keyword_to_categories:
            keyword_to_categories[kw] = set()
        keyword_to_categories[kw].add(category)

# 2. 각 카테고리에서 고유한 키워드(다른 카테고리에는 없는 키워드)만 추출합니다.
unique_keywords = {}
for category, kw_dict in result_keywords_by_category.items():
    unique_keywords[category] = {kw: score for kw, score in kw_dict.items() if len(keyword_to_categories[kw]) == 1}

# 결과 출력
print(unique_keywords)