## Import

In [1]:
import re
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

## Random Seed

In [2]:
SEED = 0

np.random.seed(SEED)
random.seed(SEED)

## Load Data

In [4]:
df = pd.read_csv('./data/news.csv')
df.head()

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


In [5]:
# 제목 + 내용
df['text'] = df['title'] + ' : ' + df['contents']
df.head()

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row : MADR...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city : In Bosnia,..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores : Macrom...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


## Pre-processing

In [6]:
def preprocess_text(text):
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)
    
    # 멘션 제거
    text = re.sub(r'@\w+', '', text)
    
    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # 공백 및 특수문자 제거
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 숫자 제거
    text = re.sub(r'\d+', '', text)
    
    return text.lower()

In [7]:
df['processed_text'] = df['text'].apply(preprocess_text)

## Feature Extraction

In [8]:
# Sentence BERT 모델 로드
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# 텍스트 feature 추출
sentence_embeddings = model.encode(df['text'].tolist())

# 추출한 feature를 데이터프레임에 저장
df_embeddings = pd.DataFrame(sentence_embeddings)

Downloading (…)7f4ef/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f279f7f4ef/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)79f7f4ef/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)279f7f4ef/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7f4ef/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)279f7f4ef/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)9f7f4ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

## Clustering

In [9]:
# Sentence BERT 임베딩을 사용하여 군집화 수행
kmeans = KMeans(n_clusters=6, random_state=SEED)

df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)

  super()._check_params_vs_input(X, default_n_init=10)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Post-processing

### Entertainment: 0 -> 1

In [10]:
df[df['kmeans_cluster'] == 0]['text'].head(3)

18    A Fair Way to Choose Candidates for Republican...
25    Be on TOP : //www.huffingtonpost.com/entry/be-...
33    Memo To EPA Chief Pruitt : //www.huffingtonpos...
Name: text, dtype: object

In [28]:
print(df['text'][18])
print(df['text'][25])
print(df['text'][33])

A Fair Way to Choose Candidates for Republican Debate : //www.huffingtonpost.com/entry/a-fair-way-to-choose-cand_b_7922194.html short_description
Be on TOP : //www.huffingtonpost.com/entry/be-on-top-amazon-best-sel_b_12508618.html short_description
Memo To EPA Chief Pruitt : //www.huffingtonpost.com/entry/memo-to-epa-chief-pruitt-lets-end-subsidies-for-fossil_us_59ee9567e4b0b8a51417bcc6 short_description


### Sports: 1 -> 3

In [12]:
df[df['kmeans_cluster'] == 1]['text'].head(3)

0     Spanish coach facing action in race row : MADR...
13    GAME DAY PREVIEW Game time: 6:00 PM : CHARLOTT...
22    College Basketball: Georgia Tech, UConn Win : ...
Name: text, dtype: object

In [13]:
print(df['text'][0])
print(df['text'][13])
print(df['text'][22])

Spanish coach facing action in race row : MADRID (AFP) - Spanish national team coach Luis Aragones faces a formal investigation after Spain #39;s Football Federation decided to open disciplinary proceedings over racist comments about Thierry Henry of France and Arsenal.
GAME DAY PREVIEW Game time: 6:00 PM : CHARLOTTE, North Carolina (Ticker) -- The Detroit Shock face a critical road test Saturday when they take on the Charlotte Sting at Charlotte Coliseum.
College Basketball: Georgia Tech, UConn Win : ATLANTA (Sports Network) - BJ Elder poured in a game-high 27 points to lead fourth-ranked Georgia Tech to a convincing 99-68 win over Michigan in the ACC-Big Ten Challenge at Alexander Memorial Coliseum.


### Politics: 2 -> 2

In [14]:
df[df['kmeans_cluster'] == 2]['text'].head(3)

11    Kerry rolls out tax-cut plan for middle class ...
20    Deere's Color Is Green : With big tractors, bi...
50    UN Predicts Boom In Robot Labor : The use of r...
Name: text, dtype: object

In [15]:
print(df['text'][2])
print(df['text'][6])
print(df['text'][7])

Only Lovers Left Alive's Tilda Swinton Talks About Almost Quitting Acting and Yasmine Hamdan Performs 'Hal' Live In NYC   (HuffPo Exclusive Videos) authors : Yasmine Hamdan performs 'Hal' which she also sings in the film during a scene when two world-weary vampires begin to heal and find a way to continue living as they remember the power and mystery of creation itself.
Time to Talk Baseball : It's time to talk about the serious risks and potential benefits of building an expensive ballpark in Washington.
Bump Stock Maker Resumes Sales One Month After Las Vegas Mass Shooting authors : Move along nothing to see here.


### Business: 3 -> 0

In [16]:
df[df['kmeans_cluster'] == 3]['text'].head(3)

1     Bruce Lee statue for divided city : In Bosnia,...
10    Harry #39;s argy-bargy : PRINCE Charles has as...
16    Fischer's Fiancee: Marriage Plans Genuine (AP)...
Name: text, dtype: object

In [17]:
print(df['text'][11])
print(df['text'][20])
print(df['text'][50])

Kerry rolls out tax-cut plan for middle class : After two weeks of focusing on Iraq, Democratic presidential challenger John Kerry turned his emphasis to the economy Saturday, delivering what he called a plan for  quot;middle-class families.
Deere's Color Is Green : With big tractors, big sales, and big earnings, Deere's hoeing a profitable row.
UN Predicts Boom In Robot Labor : The use of robots around the home to mow lawns, vacuum floors and manage other chores is set to surge sevenfold by 2007 as more consumers snap up smart machines, the United Nations said.


### Tech: 4 -> 4

In [18]:
df[df['kmeans_cluster'] == 4]['text'].head(3)

2    Only Lovers Left Alive's Tilda Swinton Talks A...
6    Time to Talk Baseball : It's time to talk abou...
7    Bump Stock Maker Resumes Sales One Month After...
Name: text, dtype: object

In [19]:
print(df['text'][3])
print(df['text'][4])
print(df['text'][5])

Macromedia contributes to eBay Stores : Macromedia has announced a special version of its Contribute website editing application designed to simplify the creation and customisation of eBay Stores.
Qualcomm plans to phone it in on cellular repairs : Over-the-air fixes for cell phones comes to Qualcomm's CDMA.
Thomson to Back Both Blu-ray and HD-DVD : Company, one of the core backers of Blu-ray, will also support its rival format.


### World: 5 -> 5

In [20]:
df[df['kmeans_cluster'] == 5]['text'].head(3)

3    Macromedia contributes to eBay Stores : Macrom...
4    Qualcomm plans to phone it in on cellular repa...
5    Thomson to Back Both Blu-ray and HD-DVD : Comp...
Name: text, dtype: object

In [21]:
print(df['text'][18])
print(df['text'][25])
print(df['text'][33])

A Fair Way to Choose Candidates for Republican Debate : //www.huffingtonpost.com/entry/a-fair-way-to-choose-cand_b_7922194.html short_description
Be on TOP : //www.huffingtonpost.com/entry/be-on-top-amazon-best-sel_b_12508618.html short_description
Memo To EPA Chief Pruitt : //www.huffingtonpost.com/entry/memo-to-epa-chief-pruitt-lets-end-subsidies-for-fossil_us_59ee9567e4b0b8a51417bcc6 short_description


In [30]:
# 각 클러스터에 대해 처음 3개의 텍스트를 출력합니다.
for cluster in range(6):
    print(f"Cluster {cluster}:")
    texts = df[df['kmeans_cluster'] == cluster]['text'].head(5).tolist()
    for text in texts:
        print(text)
    print()


Cluster 0:
A Fair Way to Choose Candidates for Republican Debate : //www.huffingtonpost.com/entry/a-fair-way-to-choose-cand_b_7922194.html short_description
Be on TOP : //www.huffingtonpost.com/entry/be-on-top-amazon-best-sel_b_12508618.html short_description
Memo To EPA Chief Pruitt : //www.huffingtonpost.com/entry/memo-to-epa-chief-pruitt-lets-end-subsidies-for-fossil_us_59ee9567e4b0b8a51417bcc6 short_description
Satire Will Not Save Us : //www.huffingtonpost.com/entry/tal-fortgang-satire-will-not-save-us_b_5283369.html short_description
WATCH : //www.huffingtonpost.com/entry/perrish-cox-flop-49ers-saints_n_6129774.html short_description

Cluster 1:
Spanish coach facing action in race row : MADRID (AFP) - Spanish national team coach Luis Aragones faces a formal investigation after Spain #39;s Football Federation decided to open disciplinary proceedings over racist comments about Thierry Henry of France and Arsenal.
GAME DAY PREVIEW Game time: 6:00 PM : CHARLOTTE, North Carolina (Tick

### Mapping

In [31]:
mapping_dict = {
    0: 2,  # Politics
    1: 3,  # Sports
    2: 0,  # Business
    3: 5,  # World
    4: 1,  # Entertainment
    5: 4   # Tech
}


In [32]:
df['mapping'] = df['kmeans_cluster'].apply(lambda x: mapping_dict[x])

## Submission

In [33]:
sample = pd.read_csv('./submissions/sample_submission.csv')

In [34]:
sample['category'] = df['mapping'].values
sample['category'].head()

0    3
1    5
2    1
3    4
4    4
Name: category, dtype: int64

In [35]:
sample.to_csv('./submissions/baseline_submit.csv', index=False)