In [6]:
import string
from collections import OrderedDict

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from importtxt import ImportTxt
from queryexpansion import QueryExpansion
from praprocessing import PraProcessing
from tfidf import TfIdf
from vsm import VectorSpaceModel

In [7]:
# Sastrawi helpers: build both factories, then derive the stemmer and the
# stop-word list that the query-preparation cell relies on.
factory = StemmerFactory()
stopwords_factory = StopWordRemoverFactory()
stemmer = factory.create_stemmer()
stopwords = stopwords_factory.get_stop_words()

# Project-local pipeline stages; one shared instance of each is reused by the
# cells below (loading, query expansion, preprocessing, TF-IDF, VSM).
impor, queryExpansion, praProcess = ImportTxt(), QueryExpansion(), PraProcessing()
tfIdf, vSM = TfIdf(), VectorSpaceModel()

In [8]:
# --- Query preparation -------------------------------------------------------
kata_kunci = "dampak pandemi terhadap pengusaha, pariwisata, hotel, serta mall"

# Expand the raw query through the project's QueryExpansion helper; the expanded
# string replaces the original and is what gets passed to the document loader.
kata_kunci = queryExpansion.stateForQueryExpansion(kata_kunci)
print('QE Kata Kunci ', kata_kunci)

# Build the ordered, de-duplicated list of stemmed query terms.
# The stemmer strips punctuation on its own ("pengusaha," -> "usaha"), so the
# stop-word test must look at the same normalised token; otherwise a stop word
# followed by a comma, or written with a capital letter, slips past the filter.
words = []
for word in kata_kunci.split():
    token = word.strip(string.punctuation).lower()
    if not token or token in stopwords:
        # Skip punctuation-only tokens and stop words before paying for a stem.
        continue
    stemmed_word = stemmer.stem(token)
    # Guard against an empty stem so '' never ends up in the term list.
    if stemmed_word and stemmed_word not in words:
        words.append(stemmed_word)

print('words ', words)

# --- Corpus loading ----------------------------------------------------------
path = '../data/*.txt'
docs = impor.export_from_txt(path, kata_kunci)
print("\nDocument Length : ", len(docs))

list_of_word = praProcess.get_list_of_word(docs)
print('list of word ', len(list_of_word))

# --- Term frequency ----------------------------------------------------------
print("=====================================TFIDF=====================================")
# create_term_frequency returns one counter per entry of `docs`, indexed in the
# same order as docs.values(); only terms already present in a counter are
# incremented, anything else is ignored.
term_frequency = tfIdf.create_term_frequency(list_of_word, len(docs))
for index, sentence in enumerate(docs.values()):
    for word in stemmer.stem(sentence).split(" "):
        if word in term_frequency[index]:
            term_frequency[index][word] += 1
print('Term Frequency ', len(term_frequency))

words ['dampak', 'pandemi', 'terhadap', 'pengusaha,', 'pariwisata,', 'hotel,', 'serta', 'mall']
QE Kata Kunci  pengusaha, juragan produser saudagar usahawan wiraswasta wirausaha dampak akibat buah buntut efek ekor ekses hasil imbas impak impresi konsekuensi pengaruh resultan benturan hantaman tumbukan terhadap pandemi endemi epidemi hawar pagebluk taun wabah serta bersama beserta dan dengan juga mall mercantile establishment outlet retail store sales outlet hotel, building edifice pariwisata, pelancongan turisme wisata 
words  ['usaha', 'juragan', 'produser', 'saudagar', 'usahawan', 'wiraswasta', 'wirausaha', 'dampak', 'akibat', 'buah', 'buntut', 'efek', 'ekor', 'ekses', 'hasil', 'imbas', 'impak', 'impresi', 'konsekuensi', 'pengaruh', 'resultan', 'bentur', 'hantam', 'tumbu', 'pandemi', 'endemi', 'epidemi', 'hawar', 'pagebluk', 'taun', 'wabah', 'sama', 'serta', 'mall', 'mercantile', 'establishment', 'outlet', 'retail', 'store', 'sales', 'hotel', 'building', 'edifice', 'pariwisata', 'lan

In [9]:


# Document frequency: for each term, count the entries whose term frequency is
# non-zero.
document_frequency = tfIdf.create_document_frequency(list_of_word)
for doc_idx, tf_row in enumerate(term_frequency):
    if doc_idx == 0:
        # Entry 0 is excluded from the count -- presumably it is the query
        # itself (the corpus size used below is likewise len(docs) - 1).
        continue
    for term, count in tf_row.items():
        if not count:
            continue
        document_frequency[term] += 1
print('Document Frequency ', len(document_frequency))

# D / df, using the number of entries minus the excluded first one.
corpus_size = len(docs) - 1
d_df = tfIdf.get_d_df(corpus_size, document_frequency)
print('D / df ', len(d_df))

# IDF derived from the D / df ratios (a logarithm, going by the printed label).
idf = tfIdf.get_idf(d_df)
print("Idf Log(df) ", len(idf))

# Combine term frequencies with IDF into the TF-IDF weights.
w_q_t = tfIdf.get_w_q_t(term_frequency, idf)
print("TF-IDF atau Wqt ", len(w_q_t))

Document Frequency  4670
D / df  4670
Idf Log(df)  4670
TF-IDF atau Wqt  409


In [10]:
print("=======================Vector Space Model=======================")

# The helper's own progress messages describe these four steps as: squared and
# square-rooted weights, the dj.q sums, the |dj|.|q| products, and finally
# dj.q / |dj|.|q| (the similarity score).
q_d = vSM.get_q_d(w_q_t)
sum_dj_q = vSM.get_dj_q(w_q_t)
sum_qd = vSM.get_sum_of_qd(q_d)
sim = vSM.get_sim(sum_dj_q, sum_qd)

# Map each file name to its score. Positions follow the order of `docs`, and
# the 'kata_kunci' entry is skipped while still consuming its position.
kesimpulan = {
    name: sim[pos]
    for pos, name in enumerate(docs.keys())
    if name != 'kata_kunci'
}

# Highest score first.
sorted_value = OrderedDict(
    sorted(kesimpulan.items(), key=lambda item: item[1], reverse=True)
)

# Print rank, file name and score for every document at or above the cut-off.
SIMILARITY_THRESHOLD = 0.1
for rank, (name, score) in enumerate(sorted_value.items(), start=1):
    if score >= SIMILARITY_THRESHOLD:
        print(rank, name, score)



Q/D WQT dipangkatkn 2 dan diakar

Perhitungan Sum (tfidf * tfidf_query) atau dj.q

Perhitungan |dj|.|q| (jarak dokumen * jarak query)
Perhitungan dj.q / |dj|.|q|
1 perjalanan_84.txt 0.315
2 pengusaha_dan_bisnis_73.txt 0.312
3 pengusaha_dan_bisnis_5.txt 0.301
4 pengusaha_dan_bisnis_99.txt 0.295
5 pengusaha_dan_bisnis_81.txt 0.293
6 pengusaha_dan_bisnis_50.txt 0.28
7 pengusaha_dan_bisnis_18.txt 0.279
8 pengusaha_dan_bisnis_12.txt 0.271
9 guru_dan_siswa_48.txt 0.27
10 pengusaha_dan_bisnis_91.txt 0.27
11 pengusaha_dan_bisnis_7.txt 0.255
12 pengusaha_dan_bisnis_85.txt 0.252
13 pengusaha_dan_bisnis_15.txt 0.249
14 pengusaha_dan_bisnis_1.txt 0.249
15 pengusaha_dan_bisnis_75.txt 0.241
16 pengusaha_dan_bisnis_83.txt 0.239
17 tokoh_agama_dan_masyarakat_57.txt 0.238
18 pengusaha_dan_bisnis_82.txt 0.237
19 perjalanan_65.txt 0.235
20 pengusaha_dan_bisnis_65.txt 0.235
21 perjalanan_63.txt 0.228
22 guru_dan_siswa_4.txt 0.226
23 pengusaha_dan_bisnis_52.txt 0.221
24 pengusaha_dan_bisnis_32.txt 0.22
25 