In [148]:
import pandas as pd
import datetime
# pip install py2neo
from py2neo import Graph,Node,Relationship,PropertyDict,Subgraph,NodeMatcher

# Prepare inputs: Lists

### List #1: list of dictionary: list of news. 

In [None]:
# The outer list is made up of many dictionaries.
# Each dictionary holds the information for one news item.

# Used to build the News node (NewsName, Date, Newspaper, #NewsIndex).

In [286]:
# News records: a list of dictionaries, one per news item.
# Bound to `list_news` (not `list_news_topic`): the graph-building cell iterates
# over `list_news`, while `list_news_topic` is loaded separately from
# "list2_doc_topic_list.pickle" and would otherwise overwrite this value.
list_news = pd.read_pickle("list1_news.pickle")

In [287]:
#list_news_topic

### List #2: list of list of dictionary: list of news, list of topic and news

In [None]:
# The outer list is made up of many inner lists.
# Each inner list corresponds to one news item and is made up of dictionaries; each dictionary is a topic covered by that news item.
# Topics from all models are included, not just those of a single model,
# e.g. topic10_1, topic30_5.
# Ordering: models from smallest to largest, topic index ascending.

# Used to build the Topic node (Model, TopicIndex, TopicName).
# Used to build the News_Covers_Topic relationship (TopicProportion).

In [299]:
# Get topic data from LJY's file
# Per-news topic lists: list_news_topic[i] is iterated later as the topics of the i-th news item.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
list_news_topic = pd.read_pickle("list2_doc_topic_list.pickle")

In [288]:
#list_news_topic[0] # all topics covered by news1

### List #3: list of list of dictionary: list of topic, list of topic and words

In [None]:
# The outer list is made up of many inner lists.
# Each inner list holds the information for one topic and is made up of dictionaries.
# Topic ordering: models from smallest to largest, topic index ascending.
# Each dictionary describes one word of the topic, including WordName and WordType,
# plus WordWeight, which is used for the relationship between the topic and the word.

# Used to build the Word node (WordName, WordType).
# Used to build the Topic_Contains_Word relationship (WordWeight).

In [289]:
# Per-topic word lists: list_topic_word[i] is iterated later as the words of list_topic[i].
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
list_topic_word =  pd.read_pickle('list3_topic_word_list.pickle')
#list_topic_word[1]

### List #4: list of dictionary: list of topics

In [None]:
# The outer list is made up of many dictionaries.
# Each dictionary holds the information for one topic, including Model, TopicName and TopicIndex.
# Topic ordering: models from smallest to largest, topic index ascending.

# Used to build the Topic node (Model, TopicIndex, TopicName).

In [290]:
# Topic records: each entry is later expanded into the properties of one Topic node.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
list_topic =  pd.read_pickle('list4_topic.pickle')
#list_topic

### List #5: list of list of dictionary: list of news, list of word in this news

In [255]:
# The outer list is made up of many inner lists.
# Each inner list holds the information for one news item and is made up of dictionaries.
# Each dictionary describes one word of that news item, including WordName and WordType.

# Used to build the Word node (WordName, WordType).
# Used to build the News_Has_Word relationship.

In [267]:
# Per-news word lists: list_news_word[i] is iterated later as the words of the i-th news item.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
list_news_word =  pd.read_pickle('list5_news_word_list.pickle')

# Build Neo4j

In [301]:
### Build the Neo4j graph: Topic, Word and News nodes plus the three
### relationships Topic_Contains_Word, News_Covers_Topic and News_Has_Word.

def _merge_topic_node(transaction, properties):
    """Merge a Topic node, keyed on its TopicIndex property, and return it.

    transaction -- open py2neo transaction the merge is queued on
    properties  -- dict of node properties (must contain "TopicIndex")
    """
    node = Node("Topic", **properties)
    node.__primarylabel__ = "Topic"
    node.__primarykey__ = "TopicIndex"
    transaction.merge(node)
    return node


def _merge_word_node(transaction, word):
    """Merge a word node and return it.

    The node is labelled with the word's WordType and keyed on WordName, so the
    same WordName under two different WordTypes yields two distinct nodes.

    transaction -- open py2neo transaction the merge is queued on
    word        -- dict with at least the keys 'WordType' and 'WordName'
    """
    label = word['WordType']
    node = Node(label, WordName=word['WordName'])
    node.__primarylabel__ = label
    node.__primarykey__ = "WordName"
    transaction.merge(node)
    return node


# Check the parallel inputs BEFORE wiping the database: an undefined or too-short
# list would otherwise only surface after g.delete_all() has already run.
if len(list_topic_word) < len(list_topic):
    raise ValueError("list_topic_word has fewer entries than list_topic")
if len(list_news_topic) < len(list_news):
    raise ValueError("list_news_topic has fewer entries than list_news")
if len(list_news_word) < len(list_news):
    raise ValueError("list_news_word has fewer entries than list_news")

# NOTE(review): credentials are hard-coded here; consider reading them from the
# environment before sharing this notebook.
g = Graph('http://localhost:7474',username='neo4j',password='test')
# Destructive: removes every node and relationship currently in the database.
g.delete_all()

start = datetime.datetime.now()

tx = g.begin()

# Relationship classes are loop-invariant, so create each one once.
Topic_Word = Relationship.type("Topic_Contains_Word")
Topic_News = Relationship.type("News_Covers_Topic")
Word_News = Relationship.type("News_Has_Word")

try:
    ### Build relationship 2: Topic contains word

    for topic, topic_words in zip(list_topic, list_topic_word):
        # Build node of topic
        topic_node = _merge_topic_node(tx, topic)

        for word in topic_words:
            # Build node of word
            word_node = _merge_word_node(tx, word)

            # Build topic-word relationship (the weight is stored as a string)
            tx.merge(Topic_Word(topic_node, word_node,
                                WordWeight=str(word['WordWeight'])))

    ### Build relationship 1&3:

    for news, news_topics, news_words in zip(list_news, list_news_topic, list_news_word):
        # Build node of news
        news_node = Node("News", **news)
        news_node.__primarylabel__ = "News"
        news_node.__primarykey__ = "NewsName"
        tx.merge(news_node)

        ### Build Relationship 1: News covers topic

        for topic in news_topics:
            # Topic node keeps every attribute except TopicProportion, which
            # belongs on the relationship rather than on the node.
            topic_properties = {key: value for key, value in topic.items()
                                if key != 'TopicProportion'}
            topic_node = _merge_topic_node(tx, topic_properties)

            # Build topic-news relationship (the proportion is stored as a string)
            tx.merge(Topic_News(news_node, topic_node,
                                TopicProportion=str(topic['TopicProportion'])))

        # Build Relationship 3 : News_Has_Word

        for word in news_words:
            # Build node of word
            word_node = _merge_word_node(tx, word)

            # Build word-news relationship
            tx.merge(Word_News(news_node, word_node))
except BaseException:
    # Includes KeyboardInterrupt: do not leave a half-built transaction open.
    tx.rollback()
    raise
else:
    tx.commit()

end = datetime.datetime.now()
print ('Time Spent:')
print (end-start)

Time Spent:
0:00:03.704862


# Query

In [302]:
# Matcher for all nodes carrying the label "university".
university_nodes = g.nodes.match("university")
# Fetch one node of that label (not shown: only the cell's last expression is displayed).
university_nodes.first()
# Count the nodes of that label.
len(university_nodes)

17

In [303]:
# Look up a few nodes of one label and return selected properties as a list of dicts.
topic_sample_query = "MATCH (a:Topic) RETURN a.Model, a.TopicIndex,a.TopicName LIMIT 2"
g.run(topic_sample_query).data()

[{'a.Model': 10,
  'a.TopicIndex': '10_1',
  'a.TopicName': '0.025*"阿里巴巴" + 0.017*"京东" + 0.013*"腾讯"'},
 {'a.Model': 10,
  'a.TopicIndex': '10_2',
  'a.TopicName': '0.019*"现代" + 0.012*"奥巴马" + 0.012*"特朗普"'}]

In [305]:
# Look up nodes of one label that satisfy a condition.
# The value is supplied as a Cypher parameter written `$x`; the legacy `{x}`
# parameter form was removed in Neo4j 4.0, while `$x` works on 3.x and later.
g.run("MATCH (a:Topic) WHERE a.Model=$x RETURN a.TopicName LIMIT 5", x=10).data()

[{'a.TopicName': '0.025*"阿里巴巴" + 0.017*"京东" + 0.013*"腾讯"'},
 {'a.TopicName': '0.019*"现代" + 0.012*"奥巴马" + 0.012*"特朗普"'},
 {'a.TopicName': '0.014*"国际会展中心" + 0.007*"公安部" + 0.006*"市工商局"'},
 {'a.TopicName': '0.016*"现代" + 0.012*"中国银行" + 0.007*"控股集团"'},
 {'a.TopicName': '0.072*"习近平" + 0.039*"联合国" + 0.024*"新华社"'}]