-
Notifications
You must be signed in to change notification settings - Fork 0
/
topic_model.py
96 lines (74 loc) · 2.99 KB
/
topic_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from gensim.models import LdaModel
import pickle
import spacy
from random import randint
class topicMod(object):
    """Describe the topics of a text using a pre-trained LDA model.

    On construction the class loads, from ``./pickled_files``, a list of
    test documents, a bigram and a trigram phrase model, an LDA model and
    the dictionary used to build bag-of-words vectors, plus the spaCy
    ``'en'`` pipeline. ``topic_names`` maps the LDA topic ids (0-19) to
    hand-assigned, human-readable labels.
    """

    def __init__(self):
        """Load every model artefact and the filtered test documents."""
        # NOTE: pickle.load can execute arbitrary code; these files must
        # come from a trusted source (they are project-local artefacts).
        documents = self._load_pickle(
            "./pickled_files/test_documents_stripped.pickle")
        # Keep only documents with more than 150 whitespace-separated words.
        self.test_documents = [
            text for text in documents if len(text.split()) > 150
        ]
        self.bigram_model = self._load_pickle(
            "./pickled_files/bigram_model.pickle")
        self.trigram_model = self._load_pickle(
            "./pickled_files/trigram_model.pickle")
        self.nlp = spacy.load('en')
        self.lda = LdaModel.load('./pickled_files/lda_final_model')
        self.trigram_dictionary = self._load_pickle(
            "./pickled_files/trigram_dictionary.pickle")
        # Labels printed by lda_description for each LDA topic id.
        self.topic_names = {
            0: 'miscellaneous',
            1: 'medicine',
            2: 'miscellaneous 1',
            3: 'religion',
            4: 'sports',
            5: 'software',
            6: 'encryption',
            7: 'guns',
            8: 'politics',
            9: 'computing',
            10: 'foreign politics',
            11: 'politics',
            12: 'cars',
            13: 'politics 1',
            14: 'middle east/cars mix',
            15: 'computing',
            16: 'christianity',
            17: 'space',
            18: 'computing 1',
            19: 'religious/atheist discussion',
        }

    @staticmethod
    def _load_pickle(path):
        """Return the object unpickled from *path*, always closing the file."""
        with open(path, "rb") as pickled_file:
            return pickle.load(pickled_file)

    def get_document(self):
        """Return one test document chosen uniformly at random.

        Raises:
            IndexError: if there are no test documents to choose from.
        """
        if not self.test_documents:
            raise IndexError("no test documents available")
        # randint includes BOTH endpoints, so the upper bound must be
        # len - 1; using len would occasionally index past the end.
        index = randint(0, len(self.test_documents) - 1)
        return self.test_documents[index]

    def takeSecond(self, elem):
        """Return the second item of *elem* (used as a sort key)."""
        return elem[1]

    def punct_space(self, token):
        """Return whether *token* is punctuation, whitespace or a stop word."""
        return token.is_punct or token.is_space or token.is_stop

    def lda_description(self, text, min_topic_freq=0.05):
        """Print *text* followed by its most relevant topics.

        The text is parsed with the spaCy pipeline, reduced to lemmas
        (skipping punctuation, whitespace and stop words), passed through
        the bigram and trigram models, converted to a bag-of-words vector
        and scored by the LDA model. Topics are printed in descending
        order of weight.

        Args:
            text: the raw document to describe.
            min_topic_freq: topics whose weight is below this value are
                not printed.

        Returns:
            None. The result is written to standard output.
        """
        parsed_doc = self.nlp(text)

        unigram_doc = []
        for token in parsed_doc:
            if self.punct_space(token):
                continue
            if token.lemma_ == '-PRON-':
                # Pronouns have a placeholder lemma; keep the surface form.
                unigram_doc.append(token.orth_)
            # NOTE(review): `spacy.en` looks like the spaCy 1.x module path
            # (later versions use `spacy.lang.en.stop_words`) - confirm
            # against the installed spaCy version.
            elif token.lemma_ not in spacy.en.STOP_WORDS:
                unigram_doc.append(token.lemma_)

        bigram_doc = self.bigram_model[unigram_doc]
        # Drop single-character terms after phrase detection.
        trigram_doc = [
            term for term in self.trigram_model[bigram_doc] if len(term) > 1
        ]

        doc_bow = self.trigram_dictionary.doc2bow(trigram_doc)
        # Sort the (topic id, weight) pairs by weight, highest first.
        doc_lda = sorted(self.lda[doc_bow], key=self.takeSecond, reverse=True)

        print("{} \n".format(text))
        print("Topics found: \n")
        for topic_number, freq in doc_lda:
            # The list is sorted, so every remaining topic is below the
            # threshold as well.
            if freq < min_topic_freq:
                break
            # print the most highly related topic names and frequencies
            print('{:35} {}'.format(self.topic_names[topic_number],
                                    round(freq, 3)))