In [75]:
from pyhanlp import *
import pandas as pd
import numpy as np
from gensim import *
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from gensim.models import CoherenceModel
import pickle
from googletrans import Translator

### Load the models and corpus

In [76]:
# Alternative, larger sweep of topic counts (disabled):
#topic_nums = [10,20,30,40,50,75,100,150,200,300]

# Topic counts of the saved LDA models that this notebook works with
topic_nums = [10,50]

# Load one saved LDA model per topic count; the dict is keyed by that count
LDAmodels = {
    n_topics: models.LdaModel.load('topic_models_without_places/topic_bow_train{}'.format(n_topics))
    for n_topics in topic_nums
}

# Load the pickled corpus ("15:20" per the original note).
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load files from a trusted source.
with open('corpus_fifteen.dms', 'rb') as corpus_file:
    corpus_list = pickle.load(corpus_file)

# List 2

### Documents to Topics

In [77]:
# List 2: the document -> topic network.
# `news_topics` gets one inner list per document in `corpus_list`; each inner list
# holds one dict per (model, topic) pair that the model returns for that document.

# Topic labels depend only on the model, not on the document, so they are formatted
# once per model here instead of once per (document, topic) pair inside the loops.
# All `i` topics are requested, exactly as the per-row call did before, so the
# positional lookup below returns the same label as it always has.
topic_labels = {}
for i in topic_nums:
    topic_labels[i] = LDAmodels[i].print_topics(num_topics = i, num_words = 3)

news_topics = []
# loop over each document
for c in corpus_list:
    # dicts for every topic that any of the models returns for this document
    list_c = []
    # loop over the models
    for i in topic_nums:
        model = LDAmodels[i]
        labels = topic_labels[i]
        # loop over the (topic index, proportion) pairs returned for this document
        for topic_id, proportion in model[c]:
            list_c.append({
                'Model': i,
                'TopicIndex': str(i) + '_' + str(topic_id),
                'TopicName': labels[topic_id][1],
                'TopicProportion': proportion,
            })
    news_topics.append(list_c)

In [85]:
# Persist List 2 (document -> topic records) to disk
with open('list2_doc_topic_list.pickle', 'wb') as out_file:
    pickle.dump(news_topics, out_file)

# List 4

In [82]:
# List 4: one record per topic of every model (the topic nodes of the network).
list_topic = []
for i in topic_nums:
    model = LDAmodels[i]  # one particular model, e.g. model30
    # Fetch the labels of all i topics in a single call and key them by topic id.
    # Asking gensim for fewer topics than the model has returns a subset chosen by
    # topic weight (alpha), not topics 0..k in order, so the earlier per-topic call
    # with num_topics=topic+1 could pair a TopicIndex with another topic's words.
    topic_labels = dict(model.print_topics(num_topics = i, num_words = 3))
    for topic in range(i):
        d = {}
        d['Model'] = i
        d['TopicIndex'] = str(i) + '_' + str(topic)
        d['TopicName'] = topic_labels[topic]
        list_topic.append(d)

In [86]:
# Persist List 4 (topic records) to disk
with open('list4_topic.pickle', 'wb') as out_file:
    pickle.dump(list_topic, out_file)

# List 3

### Topics to Keywords

In [87]:
# Entity tags kept as word nodes in the network, each mapped to a readable label
meaning = {
    'ni': 'organization',
    'nic': 'subordinate organization',
    'nit': 'educational institution',
    'nr': 'person',
    'nrf': 'person',
    'ns': 'place',
    'nsf': 'place',
    'nt': 'organization',
    'ntc': 'company',
    'ntcb': 'bank',
    'ntcf': 'factory',
    'ntch': 'hotel',
    'nth': 'hospital',
    'nto': 'government',
    'nts': 'middle and primary school',
    'ntu': 'university',
}

# The tags themselves, in the order they are listed in `meaning`
words = list(meaning)

In [90]:
# List 3: the topic -> keyword network.
# `topics_words` gets one inner list per topic of every model; each inner list holds
# one dict per top keyword whose tag is one of the entity tags in `words`.

# Time the cell with the `datetime` class already imported at the top of the
# notebook. A second `import datetime` here would rebind that name to the module
# and break any code that relies on the class import.
start = datetime.now()
topics_words = []
# loop over each model
for i in topic_nums:
    model = LDAmodels[i]
    # Top 10 words of all i topics, fetched once per model: the result does not
    # depend on the topic being processed, so there is no need to recompute it per topic.
    all_topics = model.show_topics(formatted=False, num_topics = i, num_words = 10)
    # loop over each topic
    for j in range(i):
        word_list = []
        key_words = all_topics[j][1]
        for k in key_words:
            # Tag of the keyword: the text after the first '/' in the printed
            # segmentation result, with the closing bracket removed
            pos = HanLP.segment(k[0]).toString().split('/')[1].replace(']', '')
            if pos in words:
                d = {}
                d['Model'] = i
                d['TopicIndex'] = str(i) + '_' + str(j)
                d['WordName'] = k[0]
                d['WordType'] = meaning[pos]
                d['WordWeight'] = k[1]
                word_list.append(d)
        topics_words.append(word_list)
end = datetime.now()
print (end-start)

0:00:05.934958


In [92]:
# Persist List 3 (topic -> keyword records) to disk
with open('list3_topic_word_list.pickle', 'wb') as out_file:
    pickle.dump(topics_words, out_file)

# List 5

### News to words (with or without nouns)

In [96]:
# Read the full news table, then keep the rows at positions 15:20 and renumber
# them from 0 so they can be addressed as 0..4 afterwards
path = 'chinese-newspaper-data/trade-news.csv'
whole_dateset = pd.read_csv(path)
dataset = whole_dateset.iloc[15:20].reset_index(drop=True)

In [97]:
# Strip the spaces out of every article body
content_nospace = [text.replace(' ', '') for text in dataset['content']]

# Re-segment each article: the printed segmentation result is cut at ', ' into
# tokens, and the '[' / ']' characters are removed from every token.
# The result is one list of tokens (word with its POS tag) per document.
content_seg = []
for text in content_nospace:
    raw_tokens = str(HanLP.segment(text)).split(', ')
    content_seg.append([tok.replace('[', '').replace(']', '') for tok in raw_tokens])

In [130]:
# Entity tags used for the news -> word network: the same tags as the
# topic -> keyword step, plus 'n' (labelled 'noun').
# NOTE(review): this rebinds the notebook-level `words` and `meaning`; an earlier
# cell that reads them will see this extended version if it is re-run afterwards.
meaning = {'ni':'organization', 'nic': 'subordinate organization',
           'nit': 'educational institution', 'nr': 'person',
           'nrf': 'person', 'ns':'place', 'nsf':'place', 'nt': 'organization',
           'ntc':'company','ntcb':'bank','ntcf':'factory','ntch':'hotel',
           'nth':'hospital','nto':'government','nts':'middle and primary school',
           'ntu':'university','n':'noun'}

# The tags themselves, in the order they are listed in `meaning`
words = list(meaning)

def name_entity(arr):
    """Pick out the tokens of one document whose tag is listed in `words`.

    Args:
        arr: iterable of 'word/tag' strings.

    Returns:
        A list of {'WordName': word, 'WordType': label} dicts, in the iteration
        order of `arr`. Tokens without a '/' or with a tag that is not in
        `words` are skipped.
    """
    word_list = []
    for token in arr:
        # Split on the LAST '/' only, so a word that itself contains '/' keeps
        # its full text and the tag is still read from the end of the token.
        name, sep, tag = token.rpartition('/')
        # No separator means there is no tag to look at; skip instead of failing.
        if sep and tag in words:
            word_list.append({'WordName': name, 'WordType': meaning[tag]})
    return word_list

# Extract the entity/noun tokens of every document. Repeated tokens are dropped
# with dict.fromkeys, which keeps first-occurrence order; a set would order the
# strings differently from one interpreter session to the next (string hash
# randomization), so the saved list would not be reproducible.
news_words_list = [name_entity(dict.fromkeys(doc)) for doc in content_seg]

In [132]:
with open('list5_news_word_list.pickle', 'wb') as f:
    pickle.dump(news_words_list, f)

# List 1

In [134]:
# List 1: one record per news article (title, newspaper, date).
# Each column is looked up by the labels 0..len(dataset)-1.
list_news = [
    {'NewsName': dataset.title[row],
     'newspaper': dataset.newspaper[row],
     'date': dataset.date[row]}
    for row in range(len(dataset))
]
# display the records
list_news

[{'NewsName': '“文化中国·四海同春”精彩亮相', 'newspaper': '甘肃日报', 'date': '2012-01-31'},
 {'NewsName': '巴基斯坦总理吉拉尼称本国政治危机已经缓解',
  'newspaper': '甘肃日报',
  'date': '2012-01-31'},
 {'NewsName': '伊朗称“不久”将停止向欧洲某些国家供油',
  'newspaper': '甘肃日报',
  'date': '2012-01-31'},
 {'NewsName': '讨论《政府工作报告（征求意见稿）》', 'newspaper': '甘肃日报', 'date': '2012-02-01'},
 {'NewsName': '欧盟25国通过“财政契约”草案', 'newspaper': '甘肃日报', 'date': '2012-02-01'}]

In [135]:
# Persist List 1 (news records) to disk
with open('list1_news.pickle', 'wb') as out_file:
    pickle.dump(list_news, out_file)