In [5]:
import networkx  as nx
import pandas as pd
from pyhanlp import *
from itertools import combinations as comb

### import data

In [6]:
# Read the complete newspaper CSV into a DataFrame
path = 'chinese-newspaper-data/trade-news.csv'
whole_dateset = pd.read_csv(filepath_or_buffer=path)

In [7]:
# Work on a small sample of the corpus: rows 15-19 (positional).
# The sample is built in a single expression instead of calling an in-place
# method on a slice of `whole_dateset`, so re-running the cell always gives
# the same frame. The index is reset so rows can be addressed as 0..n-1.
dataset = whole_dateset.iloc[15:20].reset_index(drop=True)

### Segment

In [46]:
# Delete spaces before re-segmentation. Missing (NaN) content is treated as an
# empty document so that the result stays aligned row-for-row with `dataset`.
content_nospace = ['' if pd.isna(cont) else str(cont).replace(' ', '')
                   for cont in dataset['content']]
# Re-segmentation: one list of "word/POS" tokens per document.
# The terms returned by HanLP are iterated directly instead of parsing the
# string form of the whole list. This keeps '[' / ']' characters that belong
# to the text, and an empty document yields an empty token list rather than
# a single empty token (which has no "/POS" part and breaks later parsing).
content_seg = [[str(term) for term in HanLP.segment(cont)] for cont in content_nospace]


### Extract words (new version): separate entities by type

In [9]:
# POS tags of the entities we choose to use in the network
words = ['ni','nic','nit','nr','nrf','ns','nsf','nt','ntc','ntcb','ntcf','ntch',
         'nth','nto','nts','ntu']

# Entity type assigned to each POS tag
meaning = {'ni':'organization', 'nic': 'subordinate organization', 
           'nit': 'educational institution', 'nr': 'person',
           'nrf': 'person', 'ns':'place', 'nsf':'place', 'nt': 'organization',
           'ntc':'company','ntcb':'bank','ntcf':'factory','ntch':'hotel',
           'nth':'hospital','nto':'government','nts':'middle and primary school',
           'ntu':'university'}


def get_words_and_POS(arr):
    """Extract the entity words of one document according to their POS tags.

    Parameters
    ----------
    arr : iterable of str
        Tokens of the form "word/POS".

    Returns
    -------
    dict
        Maps each word whose POS tag is listed in `words` to the entity type
        given by `meaning`. If a word occurs several times, the entity type
        of its last occurrence is kept.
    """
    result = {}
    for token in arr:
        # Split on the LAST slash so that words which themselves contain '/'
        # keep their full text and the real tag is read.
        word, sep, tag = token.rpartition('/')
        # Tokens without any '/' (e.g. an empty string) carry no tag: skip
        # them instead of failing with an IndexError.
        if sep and tag in words:
            result[word] = meaning[tag]
    return result

# One {word: entity type} dictionary per document, in document order
words_list = list(map(get_words_and_POS, content_seg))


### Connect to Neo4j and build the news graph

In [11]:
# pip install py2neo
from py2neo import Graph,Node,Relationship,PropertyDict,Subgraph,NodeMatcher

In [12]:
# Properties of each news node: its row number (as a string), newspaper and date
news_list = [
    {'NewsName': str(row), 'newspaper': dataset.newspaper[row], 'date': dataset.date[row]}
    for row in range(len(dataset))
]

In [98]:
# Connect to the local Neo4j server.
# NOTE(review): the credentials are hardcoded; consider reading them from the
# environment before sharing this notebook.
g = Graph('http://localhost:7474',username='neo4j',password='test')

# Clear the existing graph, then stage all merges in a single transaction
g.delete_all()
tx = g.begin()

# Relationship type linking a news node to the entity words it mentions
Has = Relationship.type("has")

for doc_idx, news in enumerate(news_list):
    # One "News" node per document, merged on its NewsName property
    news_node = Node("News", **news)
    news_node.__primarylabel__ = "News"
    news_node.__primarykey__ = "NewsName"
    tx.merge(news_node)

    for word, type_ in words_list[doc_idx].items():
        # The entity type is used as the node label; nodes are merged on WordName
        words_node = Node(type_, WordName=word)
        words_node.__primarylabel__ = type_
        words_node.__primarykey__ = "WordName"
        tx.merge(words_node)

        tx.merge(Has(news_node, words_node))

tx.commit()

### Topic: Preparation

In [78]:
# Load the topic data from Tony's file.
# NOTE(review): this is a pickle file; unpickling can execute arbitrary code,
# so only load files from a trusted source.
topic_data = pd.read_pickle(filepath_or_buffer="topic_50_words.dms")

In [79]:
# Change topic data to the proper format (list of dictionaries).
# Every topic word is segmented and contributes one "word/POS" token per
# term. Previously the whole segmentation result was collapsed into a single
# string, so a topic word split into several terms produced a garbled token
# ("a/n, b/v") whose tag could not be read, and an empty result produced a
# token without any tag.
topic_word_entity = [
    [str(term) for word in topic for term in HanLP.segment(word)]
    for topic in topic_data
]

# get_words_and_POS is the function defined in the entity-extraction cell
# above; it is reused here instead of being defined a second time.
topic_word_entity_list = [get_words_and_POS(doc) for doc in topic_word_entity]

In [96]:
# Properties of each topic node: the name is "Topic <index>".
# Further properties (e.g. an inferred nickname for the topic) can be added here.
topic_list = [{'TopicName': 'Topic ' + str(idx)} for idx in range(len(topic_data))]

### Topic: Build the Neo4j graph

In [99]:
# Connect to the local Neo4j server (the existing graph is kept).
# NOTE(review): the credentials are hardcoded; consider reading them from the
# environment before sharing this notebook.
g = Graph('http://localhost:7474',username='neo4j',password='test')

# Stage all topic merges in a single transaction
tx = g.begin()

# Relationship type linking a topic node to the entity words it contains
Has = Relationship.type("contains")

for topic_idx, topics in enumerate(topic_list):
    # One "Topics" node per topic, merged on its TopicName property
    topics_node = Node("Topics", **topics)
    topics_node.__primarylabel__ = "Topics"
    topics_node.__primarykey__ = "TopicName"
    tx.merge(topics_node)

    for word, type_ in topic_word_entity_list[topic_idx].items():
        # The entity type is used as the node label; nodes are merged on WordName
        words_node = Node(type_, WordName=word)
        words_node.__primarylabel__ = type_
        words_node.__primarykey__ = "WordName"
        tx.merge(words_node)

        tx.merge(Has(topics_node, words_node))

tx.commit()