https://mp.weixin.qq.com/s?__biz=MzI0NTQ1NDc4Nw==&mid=2247485451&idx=1&sn=eb7a785dbd8728ff2d16c72429d7f461&chksm=e94f0aa7de3883b10f51c8de23702ea1277f2d70eae64570c60711c89cf159bd5b7004dee9f4&scene=21#wechat_redirect

In [1]:

from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full
import numpy

In [2]:

texts =  [
  ['苹果','叶子','椭圆形','树上'],           
  ['植物','叶子','绿色','落叶乔木'],           
  ['水果','苹果','红彤彤','味道'],            
  ['苹果','落叶乔木','树上','水果'],           
  ['植物','营养','维生素'],           
  ['营养','维生素','苹果','成分'],            
  ['互联网','电脑','智能手机','高科技'],         
  ['苹果','公司','互联网','品质'],          
  ['乔布斯','苹果','硅谷'],          
  ['电脑','智能手机','苹果','乔布斯'],         
  ['苹果','电脑','品质','生意'],         
  ['电脑','品质','乔布斯'],          
  ['苹果','公司','生意','硅谷']]

In [3]:

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [4]:

numpy.random.seed(2019) # 设置随机种子以获得每次相同的结果。
model = ldamodel.LdaModel(corpus, id2word=dictionary, iterations=100, num_topics=2)
model.show_topics(num_words=10) #展示主题模型，显示每个主题下的TOP10主题词

[(0,
  '0.170*"苹果" + 0.087*"电脑" + 0.064*"品质" + 0.063*"乔布斯" + 0.053*"智能手机" + 0.052*"水果" + 0.051*"互联网" + 0.049*"树上" + 0.049*"公司" + 0.042*"生意"'),
 (1,
  '0.093*"苹果" + 0.082*"植物" + 0.075*"维生素" + 0.074*"营养" + 0.060*"叶子" + 0.058*"落叶乔木" + 0.051*"成分" + 0.049*"绿色" + 0.048*"硅谷" + 0.045*"乔布斯"')]

In [5]:

model.num_terms

22

In [6]:

doc1 = ['树上', '叶子', '植物']
doc2 = ['乔布斯', '智能手机', '互联网']
doc3 = [ '营养', '苹果','维生素']

In [7]:

bow1 = model.id2word.doc2bow(doc1)  
bow2 = model.id2word.doc2bow(doc2)   
bow3 = model.id2word.doc2bow(doc3)  

In [9]:

lda_bow1 = model[bow1]
lda_bow2 = model[bow2]
lda_bow3 = model[bow3]

In [10]:

hellinger(lda_bow1, lda_bow3)

0.046444595428538994

In [11]:
hellinger(lda_bow1, lda_bow2)

0.4615085597971655

In [12]:

hellinger(lda_bow2, lda_bow3)

0.505161552997494

In [13]:

hellinger(lda_bow2, lda_bow3)

0.505161552997494

In [14]:

kullback_leibler(lda_bow1, lda_bow3)

0.008893967

In [15]:

kullback_leibler(lda_bow2, lda_bow3)

1.0357668

In [16]:

# 如您所见，两者的值不相等。我们稍后会介绍其中的细节
kullback_leibler(lda_bow3, lda_bow2)

1.1175619

In [18]:

print(model.get_document_topics(bow1))
print(model.get_document_topics(bow2))
print(model.get_document_topics(bow3))

[(0, 0.24727881), (1, 0.7527212)]
[(0, 0.8584863), (1, 0.14151369)]
[(0, 0.19269583), (1, 0.8073042)]


In [19]:

jaccard(bow1, bow2)

1.0

In [20]:

jaccard(doc1, doc2)

1.0

In [21]:
jaccard(['苹果','大树','营养'], ['苹果','大树','营养'])

0.0

In [22]:
topic_公司, topic_水果 = model.show_topics() 
#一些预处理，以使距离度量以可接受的数据形式来获得主题
def make_topics_bow(topic):
# 获取由model.show_topics（）返回的字符串，分割字符串以便分别获取主题和概率    
  topic = topic.split('+')#用于存储主题bows的list    
  topic_bow = []
  for word in topic:#分隔概率和词汇       
   prob, word = word.split('*')#去掉空格       
   word = word.replace(" ","")       
   word = word.replace('"','')#词汇表示转换
   #print(word)       
   word = model.id2word.doc2bow([word])[0][0]       
   topic_bow.append((word, float(prob)))
   return topic_bow

In [23]:

公司_distribution = make_topics_bow(topic_公司[1])
水果_distribution = make_topics_bow(topic_水果[1])
# 以主题词汇加权表示的“水果”主题
水果_distribution

[(3, 0.093)]

In [24]:

公司_distribution = make_topics_bow(topic_公司[1])
水果_distribution = make_topics_bow(topic_水果[1])
# 以主题词汇加权表示的“水果”主题
水果_distribution

[(3, 0.093)]

In [25]:
kullback_leibler(公司_distribution,水果_distribution)

IndexError: index 3 is out of bounds for axis 0 with size 1

In [26]:

topic1, topic2 = model.show_topics(num_words=len(model.id2word))
#再次进行词袋表示转换
公司_distribution = make_topics_bow(topic1[1])
水果_distribution = make_topics_bow(topic2[1])
# 返回kullback_leibler值
kullback_leibler(水果_distribution, 公司_distribution,22)

0.0

In [27]:

# 常规的Hellinger
hellinger(水果_distribution, 公司_distribution)

0.07590900821378677

In [28]:
hellinger(公司_distribution, 水果_distribution)

0.07590900821378677

In [29]:
hellinger(水果_distribution, 水果_distribution)

0.0

In [30]:
hellinger(lda_bow1, lda_bow2)

0.4615085597971655

In [31]:

hellinger(lda_bow1, lda_bow2) + hellinger(lda_bow1, lda_bow3)

0.5079531552257045

In [32]:
kullback_leibler(水果_distribution, 公司_distribution,22)

0.0

In [33]:

kullback_leibler(公司_distribution, 水果_distribution,22)

0.0