-
Notifications
You must be signed in to change notification settings - Fork 0
/
LDAmodel.py~
124 lines (88 loc) · 4.03 KB
/
LDAmodel.py~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
import MeCab
import pandas as pd
import gensim
import re
# Tokenization helpers built on MeCab
def get_words(contents):
    """Tokenize every document in *contents*.

    Returns a list with one entry per document, in input order; each entry
    is whatever ``get_words_main`` returns for that document.
    """
    return [get_words_main(document) for document in contents]
def get_words_main(content):
    """Tokenize a single document by delegating to ``tokenlize``."""
    tokens = tokenlize(content)
    return tokens
def tokenlize(text):
    """Collect the noun surfaces found in *text* by MeCab.

    The text is UTF-8 encoded and parsed node by node with a tagger created
    with the '-Ochasen' option. A node's surface is kept when the first
    comma-separated field of its feature string is '名詞' (noun) and the
    surface length is greater than 1. Surfaces are returned in parse order.
    """
    encoded = text.encode('utf-8')
    chasen = MeCab.Tagger('-Ochasen')
    nouns = []
    current = chasen.parseToNode(encoded)
    while current:
        part_of_speech = current.feature.split(",")[0]
        if part_of_speech == '名詞' and len(current.surface) > 1:
            nouns.append(current.surface)
        # Advance unconditionally so the walk always reaches the end.
        current = current.next
    return nouns
# LDA model construction and similarity queries (gensim)
class lda_parts(object):
    """Tokenized corpus together with its gensim dictionary, LDA model and
    similarity index.

    Typical call order, as the methods depend on each other's attributes:
    ``dictionary_corpus`` (sets ``dictionary`` and ``corpus``), then
    ``LDA_model`` (sets ``lda`` and, optionally, ``matrix``), then
    ``similarity_vector``.
    """

    def __init__(self, sentencelist):
        """Tokenize every sentence in *sentencelist* via ``get_words``."""
        self.wordslist = get_words(sentencelist)

    def dictionary_corpus(self, filter=True, load=None, save=None, show=False):
        """Build or load the dictionary and convert the documents to bags of words.

        Args:
            filter: Accepted for interface compatibility; currently unused.
            load: Path of a saved dictionary to load. When None, a new
                dictionary is built from the tokenized documents.
            save: Path to save the dictionary to; nothing is saved when None.
            show: Accepted for interface compatibility; currently unused.
        """
        if load is None:
            dictionary = gensim.corpora.Dictionary(self.wordslist)
        else:
            dictionary = gensim.corpora.Dictionary.load(load)
        self.dictionary = dictionary
        if save is not None:
            self.dictionary.save(save)
        self.corpus = [self.dictionary.doc2bow(words) for words in self.wordslist]

    def LDA_model(self, num_topics=150, save=None, load=None, show=False, set_matrix=True):
        """Train or load the LDA model.

        Args:
            num_topics: Number of topics used when training a new model.
            save: Path to save a newly trained model to; nothing is saved
                when None.
            load: Path of a saved model to load. When None, a new model is
                trained on ``self.corpus`` with ``self.dictionary``.
            show: When true, print every topic of the model.
            set_matrix: When true, build the similarity index afterwards.
        """
        if load is None:
            self.lda = gensim.models.LdaModel(
                corpus=self.corpus,
                id2word=self.dictionary,
                num_topics=num_topics,
            )
            # Saving is optional: calling save(None) would raise, so a
            # freshly trained model is only persisted when a path is given.
            if save is not None:
                self.lda.save(save)
        else:
            self.lda = gensim.models.LdaModel.load(load)
        if show:
            for topic in self.lda.show_topics(-1):
                print(topic)
        if set_matrix:
            self.similarity_matrix()

    def similarity_matrix(self):
        """Index the LDA representation of the corpus for similarity queries."""
        self.matrix = gensim.similarities.MatrixSimilarity(self.lda[self.corpus])

    def similarity_vector(self, p_sentence):
        """Return the similarities between *p_sentence* and the indexed documents.

        The sentence is tokenized with ``tokenlize``, converted to a bag of
        words with the dictionary, projected through the LDA model and looked
        up in the similarity index built by ``similarity_matrix``.
        """
        p_corpus = self.dictionary.doc2bow(tokenlize(p_sentence))
        return self.matrix[self.lda[p_corpus]]
class News(object):
    """Find stored news entries related to a new title/description pair.

    Two ``lda_parts`` models are kept, one over the entry titles and one over
    the entry descriptions; both load their dictionary and LDA model from
    files under ``./model/``.
    """

    def __init__(self, PKs, titles, links, descriptions, filters=True, show=False):
        """Load the title and description models and build their indexes.

        Args:
            PKs: Entry identifiers, positionally aligned with *titles* and
                *descriptions*.
            titles: Title text of each entry.
            links: Link of each entry (stored, not otherwise used here).
            descriptions: Description text of each entry.
            filters: Forwarded to ``lda_parts.dictionary_corpus`` as ``filter``.
            show: Forwarded to the ``lda_parts`` methods' ``show`` argument.
        """
        self.entryPKs = PKs
        self.links = links

        self.title_lda = lda_parts(titles)
        self.title_lda.dictionary_corpus(
            filter=filters, load=("./model/titles.dictionary"), show=show)
        # set_matrix=True builds the similarity index once; a second explicit
        # similarity_matrix() call would only repeat the same work.
        self.title_lda.LDA_model(
            load=("./model/titles.model"), show=show, set_matrix=True)

        self.description_lda = lda_parts(descriptions)
        self.description_lda.dictionary_corpus(
            filter=filters, load=("./model/descriptions.dictionary"))
        self.description_lda.LDA_model(
            load=("./model/descriptions.model"), show=show, set_matrix=True)

        # Matches anything that looks like a markup tag so it can be stripped.
        self.p = re.compile(r"<[^>]*?>")

    def predictRelavent(self, p_title, p_description, rate=1.0, threhold=0):
        """Return the identifiers of up to three most related entries.

        Tags are stripped from both inputs, each is scored against its own
        model, and the two scores of every entry are combined as
        ``(rate + 1) * s1 * s2 / (s1 + rate * s2)`` (the harmonic mean when
        ``rate`` is 1). Entries with a zero title or description score are
        skipped, as are entries whose combined score is not above *threhold*.

        Args:
            p_title: Title text to compare against the stored titles.
            p_description: Description text to compare against the stored
                descriptions.
            rate: Weighting factor in the combined score.
            threhold: Minimum combined score (exclusive) for an entry to
                qualify.

        Returns:
            A list of at most three entries of ``entryPKs``, ordered from the
            highest combined score to the lowest.
        """
        title_scores = self.title_lda.similarity_vector(self.p.sub("", p_title))
        description_scores = self.description_lda.similarity_vector(
            self.p.sub("", p_description))

        scored = []
        for index, (s1, s2) in enumerate(zip(title_scores, description_scores)):
            # A zero on either side means no evidence of relatedness and
            # would also risk a zero denominator below.
            if s1 == 0 or s2 == 0:
                continue
            combined = (rate + 1) * s1 * s2 / (s1 + rate * s2)
            if combined > threhold:
                scored.append((combined, self.entryPKs[index]))

        # Rank by score (best first); the stable sort keeps entry order on ties.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [pk for _, pk in scored[:3]]