In [None]:
from __future__ import unicode_literals


import networkx as nx
import en_core_web_sm
from spacy import displacy
from spacy.symbols import nsubj, VERB,  NOUN, PROPN, PRON, ADP, CCONJ,PUNCT

from spacy.errors import Errors
import matplotlib.pyplot as plt


import pandas as pd
import glob

import swifter

import sys, os, re
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

nlp = en_core_web_sm.load()
import collections

In [3]:
files = glob.glob('./incoPat_225509/*.xls')

dfs = []
for f in files:
    try:
        dfs.append(pd.read_excel(f))
    except Exception as e:
        print("{0}: {1}".format(f, e))

df = pd.concat(dfs)

In [4]:
df = df.reset_index(drop=True)

In [5]:
def noun_chunks(doclike):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = [
        "nsubj",
        "dobj",
        "nsubjpass",
        "pcomp",
        "pobj",
        "dative",
        "appos",
        "attr",
        "ROOT",
    ]
    doc = doclike.doc  # Ensure works on both Doc and Span.

    if not doc.is_parsed:
        raise ValueError(Errors.E029)

    np_deps = [doc.vocab.strings.add(label) for label in labels]
    conj = doc.vocab.strings.add("conj")
    np_label = doc.vocab.strings.add("NP")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            prev_end = word.i
            yield word.left_edge.i, word.i +1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i
                yield word.left_edge.i, word.i+1, np_label

In [6]:
# https://spacy.io/docs/usage/processing-text
def construct_graph(x, edges, labels):
    sl = (PUNCT, ADP, CCONJ)
    kl = (NOUN, PROPN, PRON)
    doc = nlp(x)
#     print('document: {0}'.format(doc))
    # Load spacy's dependency tree into a networkx graph
    with doc.retokenize() as retokenizer:
        for nc in list(noun_chunks(doc)):
            if nc[0] != nc[1]:
                retokenizer.merge(doc[nc[0]:nc[1]])
                edges.append((doc[nc[0]:nc[1]].lemma_.upper(), doc[nc[0]:nc[1]].lemma_.upper()))
    for token in doc:
        if token.pos not in kl:
            continue
        # FYI https://spacy.io/docs/api/token
        for child in token.children:
            if child.pos not in kl:
                continue
            # edges.append(('{0}-{1}'.format(token.lower_,token.i),'{0}-{1}'.format(child.lower_,child.i)))
            edges.append((token.lemma_.upper(), child.lemma_.upper()))
            #labels[(token.lower_, child.lower_)] = child.dep_
        # edges.append((token.head.lower_, token.lower_))
    return doc
# https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.shortest_paths.html
# print(nx.shortest_path_length(graph, source='robots-0', target='awesomeness-11'))
# print(nx.shortest_path(graph, source='robots-0', target='awesomeness-11'))
# print(nx.shortest_path(graph, source='robots-0', target='agency-15'))

##### 是否需要过滤低词频的词

In [220]:
edges = []
labels = {}
df['parsed_title'] = df['标题(英)'].apply(lambda x: construct_graph(x, edges, labels))
graph = nx.DiGraph(edges)

In [212]:
# end_nodes = [x for x in graph.nodes() if (graph.out_degree(x)<=1 and graph.in_degree(x)<=2) or (x.count(' ') >= 4 or  x.count(' ') < 1)]
graph.remove_edges_from(nx.selfloop_edges(graph))
end_nodes = [x for x in graph.nodes() if graph.in_degree(x)<=1] # TODO something wrong ,self edge exclude?
# TODO something wrong ,self edge exclude?
for en in end_nodes:
    graph.remove_node(en)

graph.remove_edges_from(nx.selfloop_edges(graph))
sub_nodes = []
for nd in graph.nodes():
    if graph.out_degree(nd) + graph.in_degree(nd) > 10:
        sub_nodes.append(nd)

In [219]:
len(graph.nodes())

78

In [214]:
nx.write_graphml(graph, './tmp/kws1000_graphml.graphml')

In [None]:
list(noun_chunks(nlp(u'ADDITIVELY MANUFACTURED HEAT TRANSFER DEVICE')))
doc = nlp(u'POLAR ADDITIVE FOR THE SYNTHESIS OF COPOLYMERS OF VINYLAROMATIC MONOMER AND CONJUGATED DIENE MONOMER HAVING HIGH VINYLAROMATIC AND LOW VINYL CONTENTS')
list(doc.noun_chunks)
# list(noun_chunks(doc))
doc = construct_graph(u'POLAR ADDITIVE FOR THE SYNTHESIS OF COPOLYMERS OF VINYLAROMATIC MONOMER AND CONJUGATED DIENE MONOMER HAVING HIGH VINYLAROMATIC AND LOW VINYL CONTENTS',[],[])
options = {"color": "white", "collapse_phrases" : True, "bg": "#000000"}
displacy.serve(doc, style="dep",options=options)

In [1]:
plt.figure(figsize=(64, 64))
nx.draw(graph, pos=nx.bipartite_layout(graph, sub_nodes),width=0.2, style='dashdot',with_labels=True) # pos=nx.circular_layout(graph),nx.spring_layout(graph,scale=4, k= 0.6)
# plt.show()
plt.savefig("./tmp/kws_graph.png")
#nx.draw_networkx_edge_labels(graph, pos = nx.spring_layout(graph), edge_labels = labels)

NameError: name 'plt' is not defined

---

In [None]:
pr = nx.pagerank_scipy(graph, alpha=0.9)

In [49]:
filter_dict = collections.OrderedDict()
re_ = r"^A |^THE |^THEIR |^ITS |^THIS |^AN |^SUCH A |^AS |^TO |^AND |^-PRON- |AT LEAST|USE THEREOF|^OR | THEREOF| THEREFOR| THERETO| THEREFROM| THEREBY"
for k, v in sorted(pr.items(), key=lambda item: item[1], reverse=True):
    k = re.sub(re_, "", k.upper())
    if  k.count(' ') < 4 and  k.count(' ') >= 1 and k.count('(') == 0 and k.count(')') == 0:
        filter_dict[k] = v
filter_df = pd.DataFrame.from_dict(filter_dict, orient='index', columns=['score'])

In [50]:
filter_df.head(1000).to_excel('./out/kws_dp_pagerank.xlsx')

In [320]:
h, a = nx.hits(graph)
{k: v for k, v in sorted(a.items(), key=lambda item: item[1], reverse=True)}

In [None]:
filter_dict

In [139]:
k = 10
graph.remove_edges_from(nx.selfloop_edges(graph))
sub_graph_k_core = nx.k_core(graph, k=k)

In [195]:
len(sub_graph_k_core.nodes())

72

In [141]:
nx.write_graphml(sub_graph_k_core, './tmp/sub_graph_k{0}.graphml'.format(k))

In [154]:
onion_dict = nx.onion_layers(graph.to_undirected())

In [None]:
onion_dict

In [173]:
ko = 1
onion_k = {k: v for k, v in onion_dict.items() if v==ko}
onion_k_graph = graph.subgraph(list(onion_k.keys()))

In [174]:
len(onion_k_graph.edges())

0

In [144]:
nx.write_graphml(onion_k_graph, './tmp/onion_k{0}.graphml'.format(ko))

In [221]:
kt =6
k_truss = nx.k_truss(graph.to_undirected(),kt)

In [222]:
len(k_truss.nodes())

230503

In [201]:
nx.write_graphml(k_truss, './tmp/k_truss_k{0}.graphml'.format(kt))