In [2]:
import string
import collections
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import re
import os
from glob import glob
from tqdm import tqdm
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Shared Sastrawi helpers. They are created on first use and then reused, so
# the remover and stemmer are not rebuilt for every single document.
_SASTRAWI_TOOLS = {}


def _get_sastrawi_tools():
    """Return the shared ``(stop-word remover, stemmer)`` pair, building it on first use."""
    if not _SASTRAWI_TOOLS:
        _SASTRAWI_TOOLS["stopword"] = StopWordRemoverFactory().create_stop_word_remover()
        _SASTRAWI_TOOLS["stemmer"] = StemmerFactory().create_stemmer()
    return _SASTRAWI_TOOLS["stopword"], _SASTRAWI_TOOLS["stemmer"]


def preprocessing(data):
    """Normalise one raw text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, replace every character that is not
    a space, an ASCII letter or a digit with a space, remove stop words with
    Sastrawi, tokenise with NLTK and stem every token with Sastrawi.

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    lowertext = data.lower()
    replacetxt = re.sub('[^ a-zA-Z0-9]', ' ', lowertext)

    stopword, stemmer = _get_sastrawi_tools()
    stopwordRem = stopword.remove(replacetxt)

    token = nltk.word_tokenize(stopwordRem)
    return ' '.join(stemmer.stem(word) for word in token)

In [4]:
# Inertia (within-cluster sum of squares) of every fit made by cluster_texts,
# appended in call order.
rss = []


def cluster_texts(texts, clusters=3):
    """Transform texts to Tf-Idf coordinates and cluster them using K-Means.

    Parameters
    ----------
    texts : iterable of str
        Documents to cluster.
    clusters : int, default 3
        Number of K-Means clusters to fit.

    Returns
    -------
    collections.defaultdict
        Maps each cluster label (plain ``int``) to the list of positions in
        ``texts`` assigned to that cluster.

    Side effects
    ------------
    Appends the fitted model's ``inertia_`` to the module-level ``rss`` list.
    """
    vectorizer = TfidfVectorizer()
    tfidf_model = vectorizer.fit_transform(texts)

    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases (10 vs. 'auto'), which would otherwise change the clustering
    # and silence/raise a FutureWarning depending on the installed version.
    km_model = KMeans(n_clusters=clusters, n_init=10, random_state=25)
    km_model.fit(tfidf_model)
    rss.append(km_model.inertia_)

    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        # Plain int keys compare/hash like the NumPy labels but print cleanly.
        clustering[int(label)].append(idx)

    return clustering

In [5]:
base_path = "."
data_path = os.path.join(base_path, "data testing clustering")

In [6]:
# glob returns matches in arbitrary order; sorting keeps the document indices
# used by the later cells stable from run to run.
# Without recursive=True the "**" segment matches exactly one directory level.
files = sorted(glob(data_path + "/**/*.news"))
inp_cluster = 6  # number of clusters requested from cluster_texts

In [7]:
kategori = []   # category tag of each document, aligned with `documents`
documents = []  # preprocessed text of each document
for filename in tqdm(files):
    # The context manager closes the file even if reading raises.
    with open(filename, "r") as f:
        lines = f.read().split('\n')

    # The category is the name of the folder that directly contains the file.
    # os.path handles both '/' and '\\' and does not depend on how many
    # components precede the category folder in the path.
    tag = os.path.basename(os.path.dirname(filename))

    # Only the fifth line of each file is used as the text to cluster.
    data = preprocessing(lines[4])
    documents.append(data)
    kategori.append(tag)

100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [03:38<00:00,  3.46s/it]


In [8]:
# Assign each distinct category an integer id, in the order np.unique returns them.
kat = np.unique(kategori)
c = dict(zip(kat, range(len(kat))))

In [9]:
# Cluster the preprocessed documents, then show cluster label -> document indices.
clusters = cluster_texts(texts=documents, clusters=inp_cluster)
pprint(dict(clusters.items()))

{0: [26, 54, 55, 56, 58, 59, 61],
 1: [10, 11, 12, 13, 14, 15, 16, 18, 20, 52],
 2: [3, 4, 8, 9, 25, 41, 48, 50, 51, 62],
 3: [0, 2, 5, 19, 21, 22, 23, 31, 53, 57, 60],
 4: [1, 6, 7, 17, 42, 43, 44, 45, 46, 49],
 5: [24, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 47]}


In [10]:
def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    When several elements share the highest count, the one that appears first
    in ``List`` is returned.

    Parameters
    ----------
    List : sequence
        Non-empty sequence of items to count.

    Returns
    -------
    object
        The most frequent element.

    Raises
    ------
    IndexError
        If ``List`` is empty.
    """
    if len(List) == 0:
        raise IndexError("most_frequent() requires a non-empty sequence")

    try:
        # Single pass instead of calling List.count() once per element.
        counts = collections.Counter(List)
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys; fall back to
        # equality-based counting. max() keeps the first of any tied items.
        return max(List, key=List.count)

    # Counter preserves first-seen order and max() returns the first maximum,
    # so ties resolve to the earliest element.
    return max(counts, key=counts.get)

In [11]:
asosiasi = {}                   # majority category -> document indices of the cluster(s) voted into it
y_true = [''] * len(kategori)   # actual category of every document
y_pred = [''] * len(kategori)   # majority category of the cluster each document belongs to

# Iterate over the clusters that actually exist. Indexing the defaultdict with
# range(len(clusters)) would create empty entries and skip the highest labels
# whenever a cluster id is missing.
for label, members in sorted(clusters.items()):
    taglist = [kategori[item] for item in members]
    tagres = most_frequent(taglist)

    for item, tag_true in zip(members, taglist):
        y_true[item] = tag_true
        y_pred[item] = tagres

    # Two clusters can share the same majority category; merge their members
    # instead of letting the later cluster overwrite the earlier one. Extending
    # a fresh list also avoids aliasing the lists stored in `clusters`.
    asosiasi.setdefault(tagres, []).extend(members)

In [12]:
# Show the majority category chosen per cluster with that cluster's document indices.
asosiasi

{'Olahraga': [26, 54, 55, 56, 58, 59, 61],
 'Edukasi': [10, 11, 12, 13, 14, 15, 16, 18, 20, 52],
 'Bisnis': [3, 4, 8, 9, 25, 41, 48, 50, 51, 62],
 'Internasional': [0, 2, 5, 19, 21, 22, 23, 31, 53, 57, 60],
 'Nasional': [1, 6, 7, 17, 42, 43, 44, 45, 46, 49],
 'Metropolitan': [24, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 47]}

In [13]:
# Show the K-Means inertia values recorded by cluster_texts so far.
rss

[53.18941922553139]

In [14]:
# Show the category name -> integer id mapping.
c

{'Bisnis': 0,
 'Edukasi': 1,
 'Internasional': 2,
 'Metropolitan': 3,
 'Nasional': 4,
 'Olahraga': 5}

In [15]:
# Translate category names into their integer ids from `c`.
y_true = list(map(c.__getitem__, y_true))
y_pred = list(map(c.__getitem__, y_pred))

In [16]:
from sklearn.metrics import accuracy_score

def purity_score(y_true, y_pred):
    """Compute clustering purity.

    Every predicted cluster is credited with the size of its largest
    ground-truth class; purity is the sum of those sizes divided by the
    number of samples.

    The inputs are not modified, and labels may be any values that
    ``np.unique`` can sort (negative integers, strings, ...).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class label of each sample.
    y_pred : array-like
        Cluster assignment of each sample, same length as ``y_true``.

    Returns
    -------
    float
        Purity in the range (0, 1].

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Work on flattened views/copies so the caller's arrays are never relabelled.
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()

    if true_labels.shape != pred_labels.shape:
        raise ValueError("y_true and y_pred must have the same number of samples")
    if true_labels.size == 0:
        raise ValueError("purity_score requires at least one sample")

    majority_total = 0
    for cluster in np.unique(pred_labels):
        # Size of the most common true class inside this cluster.
        _, counts = np.unique(true_labels[pred_labels == cluster], return_counts=True)
        majority_total += int(counts.max())

    return float(majority_total / true_labels.size)

In [17]:
# Convert both label lists to NumPy arrays and display the clustering purity.
y_true, y_pred = np.array(y_true), np.array(y_pred)
purity_score(y_true, y_pred)

0.6031746031746031