Main.py
import Preprocessing as pre
import TermWeighting as termW
import InformationRetrieval as IR
import TextSummarization as summary
import numpy
from Output import Output
# Read the source corpus into memory (the file handle closes automatically)
with open("clustering-class.txt", "r") as source_file:
    source = source_file.read()
output = Output()
# Preprocessing
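# Tokenize the raw text, stem each token, then filter the tokens (presumably
# stopword removal); the exact behaviour of each step lives in the local
# Preprocessing module.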
tokenized = pre.tokenization(source)
stemmed = pre.stemming(tokenized)
documents = pre.filtering(stemmed)
# Term Weighting
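# Build the vocabulary and per-document term-frequency matrices. rawWeight
# holds plain counts; logWeight is presumably the standard sublinear scaling
# 1 + log(tf) for tf > 0 (the exact formula is in TermWeighting).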
terms = pre.termFromDocuments(documents)
# binaryWeight = termW.binaryTermWeighting(terms, documents)
rawWeight = termW.rawTermWeighting(terms, documents)
logWeight = termW.logTermWeighting(terms, documents)
# Information Retrieval
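# Classic vector-space retrieval: document frequency and idf per term, tf-idf
# document vectors, then length normalization so that cosine similarity
# reduces to a dot product. idf is presumably log(N / df), with N the number
# of documents.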
df = termW.documentFrequency(terms, documents)
idf = termW.inverseDocumentFrequency(df, documents)
tf_idf = termW.tf_idf(logWeight, idf)
wtd_normalized = IR.normalization(tf_idf)
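# The query goes through the same preprocessing and weighting pipeline as the
# documents. 'burung terbang.' is Indonesian for "birds fly".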
query = pre.tokenization('burung terbang.')
query = pre.stemming(query)
query = pre.filtering(query)
query_weight = termW.logTermWeighting(terms, query)
query_weight = termW.tf_idf(query_weight, idf)
query_normalized = IR.normalization(query_weight)
query_normalized = numpy.array(query_normalized[0])
cosine_similarity = IR.cosine_similarity(wtd_normalized, query_normalized)
distances = IR.distance(cosine_similarity)
ranked = IR.ranked_retrieval(distances)
print('similarity')
distances = list(distances)
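# NOTE: list.index returns the first match, so two documents with an
# identical similarity score would print under the same index.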
for similarity in ranked:
    print(distances.index(similarity), similarity)
# Summarization
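# Score documents for the summary: sum the raw term frequencies, apply a log
# transform, re-weight, and collapse to a single weight per document; the
# exact formulas live in the local TextSummarization module.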
base_tf = summary.sum_tf(rawWeight)
base_tf = summary.log_tf(base_tf)
base_tf = summary.new_tf(rawWeight, base_tf)
base_tf = summary.document_weight(base_tf)
base_tf = list(base_tf)
print('summary')
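# Report the top half of the entries by weight, highest first.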
ranked_weights = numpy.sort(base_tf)[::-1]
for i in range(len(ranked_weights) // 2):
    print(base_tf.index(ranked_weights[i]), ranked_weights[i])
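# Optional: export the intermediate results to an Excel workbook via the
# local Output helper (uncomment to enable).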
# output.write_pre(documents, "tokenization")
# output.write_pre(documents, "stemming")
# output.write_pre(documents, "filtering")
# output.column_number = 0
# output.write_term_weight(terms, binaryWeight, "Binary term frequency")
# output.write_term_weight(terms, rawWeight, "Raw term frequency")
# output.write_term_weight(terms, logWeight, "Log term frequency")
# output.write_doc_frequency(df, "Document frequencies")
# output.write_doc_frequency(idf, "Inverse Document Frequencies")
# output.write_term_weight(terms, tf_idf, "tf * idf")
# output.save("result.xls")