In [9]:
import random
import re
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from tqdm.auto import tqdm
from transformers import DebertaTokenizer, DebertaModel


In [2]:
# Seed every random number generator this notebook imports so that a
# Restart & Run All starts each library from the same RNG state.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)  # torch keeps its own RNG, separate from numpy/random

In [3]:
# Load the news data.
df = pd.read_csv('./data/news.csv')

# Combine title and contents into a single text field. read_csv turns empty
# fields into NaN, and NaN + str stays NaN, so missing parts are replaced by
# empty strings first to keep every row a real string for the regex-based
# preprocessing that follows.
df['text'] = df['title'].fillna('') + ' : ' + df['contents'].fillna('')

In [4]:
# Text preprocessing
def preprocess_text(text):
    """Normalize one raw document for embedding.

    Steps, in order: drop URLs, hashtags and @mentions; strip every
    non-ASCII character; remove digits; collapse whitespace; lowercase.

    Args:
        text: Raw document text. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing field) is treated as empty.

    Returns:
        The cleaned, lowercased string with single spaces between tokens and
        no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    # `http\S+` already covers https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'\d+', '', text)
    # Whitespace is normalized last: every removal above can leave doubled or
    # dangling spaces behind (e.g. "in 2020 we" -> "in  we").
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Clean every combined text element-wise; the result is stored next to the raw text.
df['processed_text'] = df['text'].map(preprocess_text)

In [5]:
# Load the DeBERTa tokenizer and model.
# Device preference: CUDA GPU, then Apple-silicon MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

MODEL_NAME = 'microsoft/deberta-base'  # single source for the checkpoint name
tokenizer = DebertaTokenizer.from_pretrained(MODEL_NAME)
model = DebertaModel.from_pretrained(MODEL_NAME).to(device)
# The model is only used for inference here; make evaluation mode explicit
# (dropout disabled) rather than relying on the loader's default.
model.eval()

In [10]:
# Extract one embedding per document, processing the texts in batches.
batch_size = 32
texts = df['processed_text'].tolist()
embedding_batches = []

# tqdm wraps the batch loop to show progress.
for start in tqdm(range(0, len(texts), batch_size), desc="Extracting Features"):
    batch_texts = texts[start:start + batch_size]
    encoded = tokenizer(
        batch_texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=256,
    ).to(device)
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
        # Mean-pool over real tokens only. Each batch is padded to its longest
        # text, so a plain mean over the sequence axis would also average the
        # padding positions and make an embedding depend on its batch mates.
        mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled = (hidden * mask).sum(dim=1) / token_counts
    embedding_batches.append(pooled.cpu())

# Concatenate all per-batch embeddings along the document axis.
all_embeddings = torch.cat(embedding_batches, dim=0)

# Run K-Means clustering on the document embeddings.
# n_init is set explicitly: leaving it out raises a FutureWarning because the
# library default is changing. 10 is the default this run used, so results
# stay the same and are pinned across scikit-learn versions.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=SEED)
df['kmeans_cluster'] = kmeans.fit_predict(all_embeddings.numpy())


Extracting Features:   0%|          | 0/1875 [00:00<?, ?it/s]

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Print the first three texts of each of the six clusters.
for cluster in range(6):
    print(f"Cluster {cluster}:")
    in_cluster = df['kmeans_cluster'] == cluster
    for sample in df.loc[in_cluster, 'text'].head(3):
        print(sample)
    print()


In [None]:
# Map each K-Means cluster id to its category label (input for the submission file).
cluster_category_pairs = [
    (0, 4),  # Tech
    (1, 2),  # Politics
    (2, 1),  # Entertainment
    (3, 0),  # Business
    (4, 5),  # World
    (5, 3),  # Sports
]
mapping_dict = dict(cluster_category_pairs)

# Look up every row's cluster id; an id missing from the mapping raises KeyError.
df['mapping'] = df['kmeans_cluster'].apply(lambda cluster_id: mapping_dict[cluster_id])

In [None]:
# Build the submission frame: one row per document with its id and category.
# df['mapping'] is already computed in the cell that defines mapping_dict, so
# it is not recomputed here. rename() returns a new frame, which avoids
# relabelling the columns of a slice of df after the fact.
submission = df[['id', 'mapping']].rename(columns={'mapping': 'category'})

In [None]:
# Write the submission file. The path is defined once so the confirmation
# message always names the file that was actually written (the previous
# message said "DeBerTa_submission.csv" while the file was
# "DeBerTa_submissions.csv").
submission_path = Path('./submissions/DeBerTa_submissions.csv')
# to_csv does not create missing directories, so make sure the target exists.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission file saved as {submission_path}")

In [None]:
# Display the final DataFrame as the cell output, including the columns added
# above ('text', 'processed_text', 'kmeans_cluster', 'mapping').
df