In [1]:
!pip install tomotopy

Collecting tomotopy
  Downloading tomotopy-0.12.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tomotopy
Successfully installed tomotopy-0.12.4


In [2]:
import tomotopy as tp
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.base import BaseEstimator
from IPython.display import clear_output
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset CSV read
# later in this notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import ast
# Read the cleaned dataset. The 'isi_stemmed' column is stored in the CSV as the
# text of a Python literal, so it is parsed back into Python objects here.
data_clean = pd.read_csv(
    '/content/drive/MyDrive/Bahan Penelitian/dataset penelitian/data_bersih.csv'
).assign(isi_stemmed=lambda frame: frame['isi_stemmed'].apply(ast.literal_eval))

# Build the tomotopy corpus from every non-empty document.
corpus = tp.utils.Corpus()
for tokens in filter(None, data_clean['isi_stemmed']):
    corpus.add_doc(tokens)

In [4]:
class TopicWrapper(BaseEstimator):
    """scikit-learn style estimator wrapping a tomotopy ``PAModel``.

    Exposes ``fit`` / ``predict`` / ``score`` so that the number of super
    topics (``k1``) and sub topics (``k2``) can be tuned with
    ``GridSearchCV``.

    Parameters
    ----------
    k1 : int
        Number of super topics passed to ``tp.PAModel``.
    k2 : int
        Number of sub topics passed to ``tp.PAModel``.
    top_n : int, default=25
        Number of top words per topic used when computing the coherence
        score in :meth:`score`.
    train_iter : int, default=500
        Number of training iterations passed to ``model.train``.
    random_state : int, default=42
        Seed passed to ``tp.PAModel``.
    num_workers : int, default=1
        Number of workers passed to ``model.train``.
    """

    def __init__(self, k1:int, k2:int, top_n:int=25, train_iter:int=500,
                 random_state:int=42, num_workers:int=1) -> None:
        super().__init__()
        self.random_state = random_state
        self.k1 = k1
        self.k2 = k2
        self.train_iter = train_iter
        self.top_n = top_n
        self.num_workers = num_workers
        # Trained tomotopy model; stays None until fit() is called.
        self.model = None

    def __init_model__(self):
        """Create a fresh, untrained ``tp.PAModel`` from the current parameters."""
        return tp.PAModel(tw=tp.TermWeight.PMI, min_cf=10,
                          k1=self.k1, k2=self.k2, seed=self.random_state)

    def fit(self, X, y=None, **kwargs):
        """Train a new model on the documents in ``X``.

        ``X`` is an iterable of documents; falsy (empty) documents are
        skipped. ``y`` and any extra keyword arguments are accepted only for
        scikit-learn API compatibility and are ignored.

        Returns
        -------
        self
        """
        corpus = tp.utils.Corpus()
        for doc in X:
            if doc:
                corpus.add_doc(doc)
        # Always start from a new model so repeated fits do not accumulate state.
        self.model = self.__init_model__()
        self.model.add_corpus(corpus)
        self.model.burn_in = 100
        self.model.train(self.train_iter, workers=self.num_workers)
        return self

    def predict(self, X):
        """Run ``model.infer`` on ``X`` and return ``(inferred_corpus, log_likelihood)``."""
        infered_corpus, ll = self.model.infer(X)
        return infered_corpus, ll

    def score(self, *args, **kwargs) -> float:
        """Return the ``c_v`` coherence of the fitted model.

        Any arguments (e.g. the held-out fold passed by ``GridSearchCV``) are
        ignored: the score is computed from the trained model itself, using
        the top ``top_n`` words of each topic.
        """
        # top_n was previously accepted by __init__ but never forwarded, so the
        # coherence was always computed with tomotopy's default number of words.
        return tp.coherence.Coherence(self.model, coherence="c_v",
                                      top_n=self.top_n).get_score()

    def set_params(self, **params):
        """Update parameters and discard any trained model, which would be stale."""
        self.model = None
        return super().set_params(**params)

In [None]:
# Grid search over (k1, k2): k1 runs over 1..4 and k2 from k1 up to 19.
model = TopicWrapper(k1=1, k2=1, top_n=50, num_workers=1, train_iter=500, random_state=42)

# One single-combination grid per pair, so only pairs with k2 >= k1 are tried.
param_grid = [
    {"k1": [n_super], "k2": [n_sub]}
    for n_super in range(1, 5)
    for n_sub in range(n_super, 20)
]

search = GridSearchCV(model, param_grid, cv=4, verbose=2)
result = search.fit(corpus)

# Wipe the verbose per-fit log before reporting the winning combination.
clear_output()
print("Best Params :")
print(result.best_params_)

Best Params :
{'k1': 3, 'k2': 16}


In [5]:
# Final model, using the k1/k2 pair reported as "Best Params" by the grid search.
best_k1, best_k2 = 3, 16

model = tp.PAModel(
    tw=tp.TermWeight.PMI,
    min_cf=10,
    k1=best_k1,
    k2=best_k2,
    corpus=corpus,
    seed=42,
)
model.burn_in = 100
# Single worker, 1000 training iterations.
model.train(1000, workers=1)


In [None]:
# Display the k2 value of the trained model (it was constructed with k2=best_k2).
model.k2

16

In [8]:
# For each super topic, list the ids of its top sub topics.
top_n = 3
for super_id in range(best_k1):
    print("SUPER TOPIC", super_id)
    print("sub topic:")
    ranked_subs = model.get_sub_topics(super_id, top_n=top_n)
    print([sub_id for sub_id, _weight in ranked_subs])
    print("==========================")

# There are 3 super topics, each listed with its constituent sub topics.
# For example, the first super topic (index 0) consists of sub topics [14, 15, 6].

SUPER TOPIC 0
sub topic:
[14, 15, 6]
SUPER TOPIC 1
sub topic:
[15, 6, 0]
SUPER TOPIC 2
sub topic:
[15, 14, 0]


In [None]:
# Print the normalized sub-topic distribution of every super topic.
for super_id in range(best_k1):
    print(list(model.get_sub_topic_dist(super_id, normalize=True)))

In [7]:
# Print the top words of every topic.
top_n = 10
for topic_id in range(best_k2):
    print("TOPIC", topic_id)
    print("Words:")
    ranked_words = model.get_topic_words(topic_id, top_n=top_n)
    print([word for word, _weight in ranked_words])
    print("==========================")


TOPIC 0
Words:
['bantu', 'ganjar', 'jateng', 'desa', 'tani', 'banjir', 'rumah', 'kabupaten', 'program', 'nelayan']
TOPIC 1
Words:
['jakarta', 'piala dunia', 'heru budi', 'fifa', 'dki jakarta', 'heru', 'israel', 'gubernur', 'gubernur dki', 'tuan rumah']
TOPIC 2
Words:
['pemilu', 'kpu', 'bawaslu', 'gugat', 'kampanye', 'partai', 'putus', 'partai ummat', 'serta milu', 'atur']
TOPIC 3
Words:
['giat', 'ganjar', 'masyarakat', 'latih', 'milenial', 'srikandi ganjar', 'ganjar pranowo', 'pemuda', 'perempuan', 'budaya']
TOPIC 4
Words:
['jatim', 'nu', 'khofifah', 'surabaya', 'jalan', 'prabowo', 'kiai', 'rp', 'ulama', 'jawa timur']
TOPIC 5
Words:
['aceh', 'spanduk', 'baliho', 'sepeda', 'pasang', 'anies', 'transjakarta', 'halte', 'jalan', 'sopir truk']
TOPIC 6
Words:
['gerindra', 'ppp', 'kib', 'pkb', 'golkar', 'prabowo', 'sandiaga', 'koalisi', 'imin', 'partai gerindra']
TOPIC 7
Words:
['pdip', 'jokowi', 'ganjar', 'hasto', 'rawan', 'megawati', 'dukung', 'pdi juang', 'rudy', 'puan']
TOPIC 8
Words:
['pr

In [None]:
# Print the normalized word distribution of every topic.
for topic_id in range(best_k2):
    print("TOPIC", topic_id)
    print("distribusi:")
    print(list(model.get_topic_word_dist(topic_id, normalize=True)))
    print("==========================")


In [None]:
# Get the count for each super topic (one entry per super topic).
model.get_count_by_super_topic()

# Number of words allocated to each super topic.

array([379247, 388253, 367516], dtype=uint64)

In [None]:
# Get the count for each topic. The recorded output has 16 entries, matching
# k2; presumably these are the words allocated to each sub topic (confirm
# against the tomotopy documentation).
model.get_count_by_topics()

array([111541,  62432,  64801,  67935,  22603,  29413, 110243,  96304,
        43910,  72728,  32703,  43537,  68927,  48347,  95855, 163737],
      dtype=uint64)