Import 

In [1]:
import re
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize

Load Data & Pre-processing

In [2]:
df = pd.read_csv('./data/news.csv')
# 제목 + 내용
df['text'] = df['title'] + ' : ' + df['contents']

def preprocess_text(text):
    # URL 제거  
    text = re.sub(r'//www.huffingtonpost.com/entry/', '', text)
    text = re.sub(r'//www.climatecentral.org/news/', '', text)
    text = re.sub(r'//www.washingtonpost.com/', '', text)
    text = re.sub(r'//www.torrentfreak.com/', '', text)
    text = re.sub(r'//www.businessweek.com/articles/', '', text)
    text = re.sub(r'//allthingsd.com/', '', text)
    text = re.sub(r'//www.cbsnews.com/news/', '', text)
    text = re.sub(r'//www.huffingtonpost.com/entry/', '', text)
    text = re.sub(r'//www.buzzfeed.com/', '', text)
    text = re.sub(r'//insidemovies.ew.com/', '', text)
    text = re.sub(r'//www.huffingtonpost.com/entry/', '', text)
    text = re.sub(r'//www.ew.com/', '', text)
    text = re.sub(r'//nymag.com/daily/', '', text)
    text = re.sub(r'//nymag.com/thecut/', '', text)
    text = re.sub(r'//insidemovies.ew.com/', '', text)
    text = re.sub(r'//thinkprogress.org/', '', text)
    text = re.sub(r'//hoh.rollcall.com/', '', text)
    text = re.sub(r'//www.bloomberg.com/', '', text)
    text = re.sub(r'//www.huffingtonpost.com/entry/', '', text)
    text = re.sub(r'//www.fastcompany.com/', '', text)
    text = re.sub(r'//www.gossipcop.com/', '', text)
    text = re.sub(r'//www.bloomberg.com/', '', text)
    text = re.sub(r'//www.engadget.com/', '', text)
    text = re.sub(r'//247wallst.com/', '', text)
    text = re.sub(r'//247wallst.com/special-report/', '', text)
    text = re.sub(r'//gizmodo.com/', '', text)
    text = re.sub(r'//graphics.latimes.com/', '', text)
    text = re.sub(r'//fivethirtyeight.com/features/', '', text)
    text = re.sub(r'//apps.bostonglobe.com/', '', text)
    text = re.sub(r'//pubx.co/.*', '', text)
     
    text = re.sub(r'html', '', text)
    text = re.sub(r'short_description', '', text)
    text = re.sub(r'\\', ' ', text)
    
    text = re.sub(r'#\w+', '', text)
    
    text = re.sub(r'@\w+', '', text)
    
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'\d+', '', text)
    
    return text.lower()

def remove_single_char_func(text, threshold=1):
    words = word_tokenize(text)
    text = ' '.join([word for word in words if len(word) > threshold])
    return text

def remove_punctuation_func(text):
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)

def remove_extra_whitespaces_func(text):
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()

df['processed_text'] = df['text'].apply(preprocess_text)
df['processed_text'] = df['processed_text'].apply(remove_single_char_func)
df['processed_text'] = df['processed_text'].apply(remove_punctuation_func)
df['processed_text'] = df['processed_text'].apply(remove_extra_whitespaces_func)
df.head()

Unnamed: 0,id,title,contents,text,processed_text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row : MADR...,spanish coach facing action in race row madrid...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city : In Bosnia,...",bruce lee statue for divided city in bosnia wh...
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...,only lovers left alive s tilda swinton talks a...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores : Macrom...,macromedia contributes to ebay stores macromed...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...,qualcomm plans to phone it in on cellular repa...


Feature Extraction

In [4]:
model = SentenceTransformer('thenlper/gte-large')
sentence_embeddings = model.encode(df['text'].tolist())
df_embeddings = pd.DataFrame(sentence_embeddings)

Clustering

In [5]:
# Sentence BERT 임베딩을 사용하여 군집화 수행
kmeans = KMeans(n_clusters=6, random_state=42)

df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)

  super()._check_params_vs_input(X, default_n_init=10)


Post-processing

In [11]:
df[df['kmeans_cluster'] == 5]['text'].head()

7     Bump Stock Maker Resumes Sales One Month After...
19    Congress Spikes Handout For Private Equity aut...
20    Deere's Color Is Green : With big tractors, bi...
27    Kmart-Sears merger about price, quality : Aver...
49    Bribery Considered, Halliburton Notes Suggest ...
Name: text, dtype: object

Mapping

In [12]:
# 0 busin
# 1 enter
# 2 politics
# 3 sport
# 4 tech
# 5 world
mapping_dict = {
    0: 5, 
    1: 3,
    2: 2, 
    3: 4,
    4: 1,
    5: 0,
}

In [13]:
df['mapping'] = df['kmeans_cluster'].apply(lambda x: mapping_dict[x])
sample = pd.read_csv('./data/sample_submission.csv')
sample['category'] = df['mapping'].values
sample['category'].head()
sample.to_csv('./data/final_submit.csv', index=False)