In [30]:
from pyhanlp import *
import pandas as pd
import numpy as np
from gensim import *
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from gensim.models import CoherenceModel
import pickle
from googletrans import Translator
import re
# pip install py2neo
from py2neo import Graph,Node,Relationship,PropertyDict,Subgraph,NodeMatcher

# Data loading and preparation

### Load in dataset, models and corpus

In [32]:
# Only use 5 documents to test the code 

In [29]:
# Importing the dataset
path = 'chinese-newspaper-data/trade-news.csv'
whole_dateset = pd.read_csv(path)
dataset = whole_dateset[15:20]
dataset.reset_index(drop=True, inplace=True)

In [2]:
# Load models from the 'topic_10_50_with_POS' file. 
topic_nums = [10,50]
LDAmodels = {}
for i in topic_nums:
    LDAmodels[i]= models.LdaModel.load('topic_10_50_with_POS/topic_bow_train{}'.format(i))
# load corpus 15:20
with open('corpus_fifteen.dms', 'rb') as f:
    corpus_list = pickle.load(f)

### Build function to get word, word type, change topic name

In [3]:
# Function to get word and wordtype
def get_word_wordtype(word_and_type):
    words = ['ni','nic','nit','nr','nrf','ns','nsf','nt','ntc','ntcb','ntcf','ntch',
         'nth','nto','nts','ntu','n']

    meaning = {'ni':'organization', 'nic': 'subordinate organization', 
           'nit': 'educational institution', 'nr': 'person',
           'nrf': 'person', 'ns':'place', 'nsf':'place', 'nt': 'organization',
           'ntc':'company','ntcb':'bank','ntcf':'factory','ntch':'hotel',
           'nth':'hospital','nto':'government','nts':'middle and primary school',
           'ntu':'university','n':'noun'}
    word = word_and_type.split('/')[0]
    word_type = meaning[word_and_type.split('/')[1]]
    return(word,word_type)

In [4]:
# Function to get topic name: ‘0.145*"经济/n" + 0.032*"我国/n" + 0.022*"企业/n" ' to '经济 我国 企业'
def get_topic_name(long_topic_name):
    topic_name = re.sub('[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@，。?、…【】《》？“”‘’！[\\]^_`{|}~\s]+', " ", \
                        long_topic_name).strip()
    return topic_name

# List #1: list of dictionary: list of news. 

This list is made of many dictionaries, which contain the informaction of each news. 

It is created to create News Node(NewsName,Date, Newspaper)

In [24]:
#大的list是由很多的dictionary组成的,每个dictionary是news的信息
#用来建立News Node(NewsName,Date,Newspaper)

In [None]:
# list of dictionary: list of news. List #1
list_news = []
for i in range(len(dataset)):
    list_news.append( {'NewsName':dataset.title[i],\
                       'newspaper':dataset.newspaper[i],\
                       'date':dataset.date[i]})
    # news_list
list_news

In [None]:
with open('list1_news.pickle', 'wb') as f:
    pickle.dump(list_news, f)

# List #2 list of list of dictionary: list of news, list of topic and news

This list is made of many lists which is the information of a news. Each news list is made of many dictionaries. Each dictionary is the topics covered by this news. 

It includes not only the topics from one topic model, for example, topic model with 10 topics, but includes the topics from other topic model for example, topic model with 50 topics as well.

This list is used to build Topic Node(Model,TopicIndex,TopicName) and News_Covers_Topic Relationship（TopicProportion）

In [23]:
#大的list是由很多list组成的，
#每一个list是一个news，这个list是由很多dictionary组成的，每个dictionary是这个news包含的topic,
#不光包括一个model的topic，包括所有model的topic
#比如topic10_1,topic50_5
#顺序是model从少到多，index从小到大

#用来建立 Topic Node(Model,TopicIndex,TopicName)
#用来建立 News_Covers_Topic Relationship（TopicProportion）

In [5]:
# One list for each document
# crete an empty list to fill in by lists of each document, representing the network from doc to topic
news_topics = []
# loop each document
for c in corpus_list:
    # crete an empty list to fill in by dictionaries of all topics for this document
    list_c = []
    # loop models
    for i in topic_nums:
        model = LDAmodels[i]
        # loop topic proportions
        for r in model[c]:
            d = {}
            d['Model'] = i
            d['TopicIndex'] = str(i) +  '_' +str(r[0])
            d['TopicName'] = get_topic_name(model.print_topics(num_topics = i, num_words = 3)[r[0]][1])
            d['TopicProportion'] = r[1]
            list_c.append(d)
    news_topics.append(list_c)

In [6]:
with open('list2_doc_topic_list.pickle', 'wb') as f:
    pickle.dump(news_topics, f)

In [20]:
news_topics[0]

[{'Model': 10,
  'TopicIndex': '10_3',
  'TopicName': '产业 企业 项目',
  'TopicProportion': 0.9525966},
 {'Model': 50,
  'TopicIndex': '50_9',
  'TopicName': '省 我省 全省',
  'TopicProportion': 0.1527766},
 {'Model': 50,
  'TopicIndex': '50_12',
  'TopicName': '规划 产业 新区',
  'TopicProportion': 0.33370557},
 {'Model': 50,
  'TopicIndex': '50_18',
  'TopicName': '产业 科技 人才',
  'TopicProportion': 0.27084327},
 {'Model': 50,
  'TopicIndex': '50_36',
  'TopicName': '城市 国际 市',
  'TopicProportion': 0.1942375}]

# List #3:  list of list of dictionary: list of topic, list of topic and words

This list is made of many lists which is the information of a topic. Each topic list is made of many dictionaries. Each dictionary is the information of words contained in this topic. The word information includes word name, word type (person/organization/government/company.....), weight of this word with the topic. 

This list is used to build Word Node (WordName,WordType) and Topics_Contains_Word Relationship(WordWeight)

In [22]:
#大的list是由很多的list组成的
#每个list是一个topic的信息，由很多dictionary组成的
#topics的顺序是model从少到多，index从小到大
#每个dictionary是一个topic中的某一个word的的信息，包括wordname，wordtype
#还有wordweight用来建立topic和word之间的relationship

#用来建立 Word Node(WordName,WordType)
#用来建立 Topics_Contains_Word Relationship(WordWeight)

In [10]:
import datetime
start = datetime.datetime.now()
#run_function():
# One list for each topic
# crete an empty list to fill in by lists of topics, representing the network from topic to word
topics_words = []
# loop each model
for i in topic_nums:
    model = LDAmodels[i]
    # loop each topic:
    for j in range(i):
        word_list = []
        key_words = model.show_topics(formatted=False, num_topics = i, num_words = 10)[j][1]
        for k in key_words:
            d = {}
            d['Model'] = i
            d['TopicIndex'] = str(i) + '_' + str(j)
            d['WordName'] = get_word_wordtype(k[0])[0]
            d['WordType'] = get_word_wordtype(k[0])[1]
            d['WordWeight'] = k[1]             
            word_list.append(d)
        topics_words.append(word_list)
end = datetime.datetime.now()
print (end-start)

0:00:09.090820


In [12]:
with open('list3_topic_word_list.pickle', 'wb') as f:
    pickle.dump(topics_words, f)

# List #4: list of topic: list of topic

This list is made of many dictionaries, which contain the informaction of each topic. The information contains model，topicsname，topicindex. 

It is created to create Topic Node(Model,TopicIndex,TopicName)

In [26]:
# 大的list是由很多的dictionary组成的
# 每个dictionary是一个topic的信息，包括model，topicsname，topicindex
# topics的顺序是model从少到多，index从小到大
# 用来建立 Topic Node(Model,TopicIndex,TopicName)

In [None]:
list_topic = []
for i in topic_nums: 
    model = LDAmodels[i] # 某一个model，比如model30
    for topic in range(i):
        d = {}
        d['Model'] = i
        d['TopicIndex'] = str(i) + '_' + str(topic)
        d['TopicName'] = get_topic_name(model.print_topics(num_topics =topic+1,num_words = 3)[topic][1])
        list_topic.append(d)

In [None]:
with open('list4_topic.pickle', 'wb') as f:
    pickle.dump(list_topic, f)

# List #5: list of list of dictionary: list of news, list of word in this news

This list is made of many lists which is the information of a news. Each news list is made of many dictionaries. 

Each dictionary is the information of words contained in this topic. The word information includes word name, word type (person/organization/government/company.....), weight of this word with the topic. 

This list is used to build Word Node (WordName,WordType) and News_Has_Word Relationship

In [28]:
# 大的list是由很多的list组成的
# 每个list是一个news的信息，由很多dictionary组成的
# 每个dictionary是一个news中的某一个word的的信息，包括wordname，wordtype

# 用来建立 Word Node（WordName,WordType）
# 用来建立 News_Has_Word Relationship

In [14]:
def text_preprocess(text):
    # Delete Space and truncate the text 
    text = text.replace(' ','')
    if len(text)>50:
        text = text[20:-20]
    # resegmentation, result is a list of words with their POS tagging for each document
    text = str(HanLP.segment(text)).replace('[','').replace(']','').split(', ')
    return text

content_seg = list(map(text_preprocess, dataset['content']))

In [15]:
# entity we choose to use in network    
words = ['ni','nic','nit','nr','nrf','ns','nsf','nt','ntc','ntcb','ntcf','ntch',
         'nth','nto','nts','ntu','n']

meaning = {'ni':'organization', 'nic': 'subordinate organization', 
           'nit': 'educational institution', 'nr': 'person',
           'nrf': 'person', 'ns':'place', 'nsf':'place', 'nt': 'organization',
           'ntc':'company','ntcb':'bank','ntcf':'factory','ntch':'hotel',
           'nth':'hospital','nto':'government','nts':'middle and primary school',
           'ntu':'university','n':'noun'}

def name_entity(arr):
        word_list = []
        ner = words 
        for x in arr:
            result = {}
            temp = x.split("/")
            if(temp[1] in ner):
                result['WordName'] = temp[0]
                result['WordType'] =  meaning[temp[1]]
                word_list.append(result)
        return word_list

news_words_list = [name_entity(set(doc)) for doc in content_seg] 

In [16]:
with open('list5_news_word_list.pickle', 'wb') as f:
    pickle.dump(news_words_list, f)

# Load list inputs

In [31]:
# list 1 
list_news_topic = pd.read_pickle("list1_news.pickle")
# list 2 
list_news_topic = pd.read_pickle("list2_doc_topic_list.pickle")
# list 3
list_topic_word =  pd.read_pickle('list3_topic_word_list.pickle')
# list 4
list_topic =  pd.read_pickle('list4_topic.pickle')
# list 5
list_news_word =  pd.read_pickle('list5_news_word_list.pickle')

# Build network using neo4j

In [None]:
g = Graph('http://localhost:7474',username='neo4j',password='test')
g.delete_all()

start = datetime.datetime.now()

tx = g.begin()

### Build relationship 2: Topic contains word

for i in range(len(list_topic)):
    topic = list_topic[i]
   # Build node of news
    topic_node = Node("Topic",**topic)
    topic_node.__primarylabel__ = "Topic"
    topic_node.__primarykey__ = "TopicIndex"
    tx.merge(topic_node)
    
    for word in list_topic_word[i]:   
        type_ = word['WordType']
        weight = word['WordWeight']
        # Build node of topic
        word_node = Node(type_,WordName=word['WordName'])
        word_node.__primarylabel__ = type_
        word_node.__primarykey__ = "WordName" 
        tx.merge(word_node)
        
        # Build topic-word relationship
        Topic_Word = Relationship.type("Topic_Contains_Word")
        tx.merge(Topic_Word(topic_node,word_node,WordWeight=str(weight)))

### Build relationship 1:News covers topic and relationship 3: News_Has_Word

for i in range(len(list_news)):
    news = list_news[i]
   # Build node of news
    news_node = Node("News",**news)
    news_node.__primarylabel__ = "News"
    news_node.__primarykey__ = "NewsName"
    tx.merge(news_node)
       
    ### Build Relationship 1: News covers topic
    
    for topic in list_news_topic[i]:   
       
        # Topic node: only remains attribute: Model,TopicIndex,TopicName
        new_topic_node = dict(topic)
        del(new_topic_node[('TopicProportion')])
        
        # Build node of topic
        topic_node = Node('Topic',**new_topic_node)
        topic_node.__primarylabel__ = 'Topic'
        topic_node.__primarykey__ = "TopicIndex"    
        tx.merge(topic_node)
        
        # Build topic-news relationship
        Topic_News = Relationship.type("News_Covers_Topic")
        tx.merge(Topic_News(news_node,topic_node,\
                            TopicProportion=str(topic['TopicProportion'])))
        
    # Build Relationship 3 : News_Has_Word
    
    for word in list_news_word[i]:   
       
        # Build node of word
        type_ = word['WordType']
        word_node = Node(type_,WordName=word['WordName'])
        word_node.__primarylabel__ = type_
        word_node.__primarykey__ = "WordName"    
        tx.merge(word_node)
        
        # Build word-news relationship
        Word_News = Relationship.type("News_Has_Word")
        tx.merge(Word_News(news_node,word_node))
           
tx.commit() 

end = datetime.datetime.now()
print ('Time Spent:')
print (end-start)