In [None]:
## Copy the text data used from Chapter04 (run Chapter04/Sentiment.ipynb beforehand).
# Both commands recursively copy a directory from ../Chapter04 into the current directory.
!cp -r ../Chapter04/text ./
!cp -r ../Chapter04/sisyou_db ./

In [None]:
import spacy
from spacy.matcher import Matcher
import pandas as pd
import collections

# Load the "ja_ginza" spaCy pipeline and create a token-pattern Matcher that
# shares the pipeline's vocabulary. Both names are used by later cells.
nlp = spacy.load("ja_ginza")
matcher = Matcher(nlp.vocab)


In [None]:
def extract_words(sent, pos_tags, stopwords):
    """Return the lemmas of the tokens in ``sent`` that pass both filters.

    Args:
        sent: Iterable of token-like objects exposing ``pos_`` and ``lemma_``
            (for example a spaCy sentence span).
        pos_tags: Collection of POS tags to keep. A single tag passed as a
            bare string (``('PROPN')`` is just ``'PROPN'``) is treated as a
            one-element collection.
        stopwords: Collection of lemmas to drop.

    Returns:
        list: The selected lemmas, in token order.
    """
    # ``tag in "PROPN"`` is a substring test, so a bare string would also
    # accept an empty tag or any fragment of the tag. Wrap it so that the
    # membership test is an exact match.
    if isinstance(pos_tags, str):
        pos_tags = (pos_tags,)
    return [token.lemma_ for token in sent
            if token.pos_ in pos_tags and token.lemma_ not in stopwords]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def count_cooccurrence(tokens):
    """Count words and their document-level co-occurrences.

    Args:
        tokens: Iterable of documents, each a single string that
            ``CountVectorizer`` tokenizes with its default settings.
            NOTE(review): scikit-learn's default token pattern keeps only
            tokens of two or more word characters, so one-character words
            are dropped - confirm this is intended for this corpus.

    Returns:
        tuple: ``(words, word_counts, Xc, X)`` where
            ``words`` is the list of vocabulary entries,
            ``word_counts`` is a 1-D array with the total count of each word,
            ``Xc`` is the sparse word-by-word matrix ``X.T @ X``, and
            ``X`` is the sparse document-by-word matrix clipped to 0/1.
    """
    count_model = CountVectorizer(ngram_range=(1, 1))  # unigram model
    X = count_model.fit_transform(tokens)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back on older versions.
    if hasattr(count_model, "get_feature_names_out"):
        words = count_model.get_feature_names_out().tolist()
    else:
        words = count_model.get_feature_names()
    # Totals are taken before the counts are clipped below.
    word_counts = np.asarray(X.sum(axis=0)).reshape(-1)

    X[X > 0] = 1  # count each word at most once per document
    # Explicit matrix product: entry (i, j) is the number of documents that
    # contain both word i and word j.
    Xc = X.T @ X
    return words, word_counts, Xc, X

In [None]:
def word_weights(words, word_counts):
    """Pair every word with its count scaled by the largest count.

    Args:
        words: Sequence of words.
        word_counts: Array of counts aligned with ``words``; must support
            ``.max()``.

    Returns:
        list: ``(word, {'weight': count / max_count})`` tuples, one per word.
    """
    peak = word_counts.max()
    nodes = []
    for word, count in zip(words, word_counts):
        nodes.append((word, {'weight': count / peak}))
    return nodes

def cooccurrence_weights(words, Xc):
    """List the co-occurring word pairs with their scaled co-occurrence value.

    Only entries above the diagonal (row index < column index) are used, so
    each pair appears once. Pairs whose value is not greater than 1% of the
    largest entry of ``Xc`` are skipped.

    Args:
        words: Sequence of words indexed like the rows/columns of ``Xc``.
        Xc: Square matrix supporting ``.max()``, ``.nonzero()`` and
            ``Xc[i, j]`` indexing.

    Returns:
        list: ``(word_i, word_j, value / max_value)`` tuples.
    """
    peak = Xc.max()
    threshold = 0.01 * peak
    rows, cols = Xc.nonzero()

    edges = []
    for row, col in zip(rows, cols):
        if row >= col:
            continue  # diagonal or lower triangle
        value = Xc[row, col]
        if value > threshold:
            edges.append((words[row], words[col], value / peak))
    return edges

In [None]:
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt

def create_network(words, word_counts, Xc):
    """Build an undirected graph of words linked by co-occurrence.

    Nodes come from ``word_weights`` and edges from ``cooccurrence_weights``;
    both store their value under the ``'weight'`` attribute. Nodes left
    without any edge are removed before the graph is returned.

    Args:
        words: Sequence of words.
        word_counts: Array of counts aligned with ``words``.
        Xc: Word-by-word co-occurrence matrix.

    Returns:
        networkx.Graph: The weighted graph.
    """
    graph = nx.Graph()
    graph.add_nodes_from(word_weights(words, word_counts))
    graph.add_weighted_edges_from(cooccurrence_weights(words, Xc))

    # Materialise the isolated nodes first: the graph is modified by the removal.
    isolated = list(nx.isolates(graph))
    graph.remove_nodes_from(isolated)
    return graph

def pyplot_network(G, k=0.1, seed=None, font_family='IPAexGothic'):
    """Draw the graph with matplotlib, sizing nodes and edges by weight.

    Args:
        G: networkx graph whose nodes and edges all carry a ``'weight'``
            attribute.
        k: Spacing parameter passed to ``nx.spring_layout``.
        seed: Random seed passed to ``nx.spring_layout``. The default
            ``None`` keeps the previous behaviour (a different layout on
            every call); pass an integer for a reproducible figure.
        font_family: Font used for the node labels. NOTE(review): the font
            has to be available to matplotlib for the labels to render.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 10))
    pos = nx.spring_layout(G, k=k, seed=seed)

    # Node area scales with the node's weight.
    weights_n = np.array(list(nx.get_node_attributes(G, 'weight').values()))
    nx.draw_networkx_nodes(G, pos, node_size=300 * weights_n)

    # Line width scales with the edge's weight.
    weights_e = np.array(list(nx.get_edge_attributes(G, 'weight').values()))
    nx.draw_networkx_edges(G, pos, width=20 * weights_e)

    nx.draw_networkx_labels(G, pos, font_family=font_family)

    plt.axis("off")
    plt.show()

def nx2pyvis_G(G):
    """Copy a weighted networkx graph into a pyvis ``Network``.

    Node size is 30 times the node's ``'weight'`` and edge width is 20 times
    the edge's ``'weight'``. Each node's hover title is its own name.

    Args:
        G: networkx graph whose nodes and edges all carry ``'weight'``.

    Returns:
        pyvis.network.Network: An 800px x 800px notebook-mode network.
    """
    net = Network(width='800px', height='800px', notebook=True)
    # Nodes and edges are copied by hand instead of calling net.from_nx(G):
    # per the original author's note, from_nx did not carry the attributes
    # over in the pyvis version in use.
    for name, data in G.nodes(data=True):
        net.add_node(name, title=name, size=30 * data['weight'])
    for source, target, data in G.edges(data=True):
        net.add_edge(source, target, width=20 * data['weight'])
    return net

In [None]:
# Read the corpus and build the proper-noun co-occurrence data.
input_fn = "text/kageotoko.corpus.txt"
# Name the encoding so the read does not depend on the platform default.
# NOTE(review): assumes the corpus file is UTF-8 - confirm.
with open(input_fn, encoding="utf-8") as f:
    text = f.read()

# The trailing comma makes this a tuple; ('PROPN') is just the string
# 'PROPN', which would turn the POS filter into a substring test.
include_pos = ('PROPN',)
stopwords = ()

doc = nlp(text)
# One space-separated string of selected lemmas per sentence.
sents = [' '.join(extract_words(sent, include_pos, stopwords))
          for sent in doc.sents]
words, word_counts, Xc, X = count_cooccurrence(sents)

In [None]:
# Build the co-occurrence graph, draw it with matplotlib, then convert it to
# a pyvis network and write it to mygraph.html.
G = create_network(words, word_counts, Xc)
pyplot_network(G)
pyvis_G = nx2pyvis_G(G)
# Kept as the last expression of the cell so the notebook displays whatever
# show() returns.
pyvis_G.show("mygraph.html")

In [None]:
# Candidate phrase patterns: runs of 2-4 NOUN tokens, runs of 2-4 PROPN
# tokens, and four NOUN/PROPN mixes.
patterns = [[{"POS": "NOUN"}] * n for n in [2,3,4]]
patterns.extend([[{"POS": "PROPN"}] * n for n in [2,3,4]])
patterns.append([{"POS": "NOUN"},{"POS": "PROPN"}])
patterns.append([{"POS": "NOUN"},{"POS": "PROPN"},{"POS": "PROPN"}])
patterns.append([{"POS": "PROPN"},{"POS": "NOUN"}])
patterns.append([{"POS": "PROPN"},{"POS": "NOUN"},{"POS": "NOUN"}])

# Matcher.add changed signature between spaCy 2 (used by GiNZA 4.0.X) and
# spaCy 3 (used by GiNZA 5.0.X). Pick the call form from the installed
# version so the cell runs on either without editing.
spacy_major = int(spacy.__version__.split(".")[0])
for pattern in patterns:
    # Patterns of the same length are registered under the same key.
    name = f'propn_{len(pattern)}'
    if spacy_major >= 3:
        matcher.add(name, [pattern])       # spaCy 3: add(key, patterns)
    else:
        matcher.add(name, None, pattern)   # spaCy 2: add(key, on_match, *patterns)

In [None]:
# Read the whole corpus into ``text``.
input_fn = "text/kageotoko.corpus.txt"

# Name the encoding so the read does not depend on the platform default.
# NOTE(review): assumes the corpus file is UTF-8 - confirm.
with open(input_fn, encoding="utf-8") as f:
    text = f.read()

In [None]:
# Count how often each matched phrase occurs in the corpus.
counter = collections.Counter()
for doc in nlp.pipe([text]):
    matches = matcher(doc)
    # Each match is (match_id, start, end); keep only the text of the span.
    nps = [doc[start:stop].text for _, start, stop in matches]
    counter.update(nps)

In [None]:
# Save the 200 most frequent matched phrases, one per line, as the raw
# candidate list. UTF-8 is named explicitly so the Japanese text is written
# the same way regardless of the platform default encoding.
with open("characters_raw.txt", "w", encoding="utf-8") as fout:
    for word, _count in counter.most_common(200):
        fout.write("{}\n".format(word))

## 辞書のシーズとなる人名リストを作成
* cp characters_raw.txt characters.txt
* characters.txt を目視でチェックして、人名のみを残す

In [None]:
## Create the dictionary source file for sudachipy

file_character = "characters.txt"
file_dic = "dic_characters.txt"

# One CSV row per name; the name fills the three {} slots.
# NOTE(review): the fixed fields (4789, 4789, 5000 and the part-of-speech
# columns) are taken as-is from the original cell - confirm them against the
# Sudachi user-dictionary documentation.
entry_format = "{},4789,4789,5000,{},名詞,固有名詞,一般,*,*,*,*,{},*,*,*,*,*"

# Both files are opened in one ``with`` so the input handle is closed too,
# and UTF-8 is named explicitly instead of relying on the platform default.
with open(file_character, encoding="utf-8") as fin, \
     open(file_dic, "w", encoding="utf-8") as fout:
    for word in fin:
        word = word.rstrip()
        if not word:
            # A blank line would produce a row with an empty headword.
            continue
        entry = entry_format.format(word, word, word)
        fout.write(entry + "\n")
        print(entry)

In [None]:
## Build the user dictionary (user.dic)
# NOTE(review): the system dictionary path is hard-coded to a pyenv Python
# 3.9.5 installation; adjust it to the local environment before running.
!sudachipy ubuild -s ~/.pyenv/versions/3.9.5/lib/python3.9/site-packages/sudachidict_core/resources/system.dic  dic_characters.txt

## sudachi.jsonを編集
!vi  ~/.pyenv/versions/3.9.5/lib/python3.9/site-packages/sudachipy/resources/sudachi.json 
```
"characterDefinitionFile" : "char.def",
"userDict" : ["/content/user.dic"],      ← ユーザー辞書ファイルのパスを指定（この値は例。実際に生成された user.dic の絶対パスに置き換える）
```

In [None]:
# Check how a sample sentence containing a character name is tokenized.
text = "明智小五郎っていう私立探偵知ってるでしょう？"

# The pipeline is loaded again here.
# NOTE(review): presumably so that the edited sudachi.json (user dictionary)
# takes effect - confirm.
nlp = spacy.load("ja_ginza")
doc = nlp(text)
# Print every token with its POS tag and lemma.
for token in doc:
    print(token,token.pos_,token.lemma_)


In [None]:
# Read the corpus again and rebuild the proper-noun co-occurrence data with
# the pipeline currently bound to ``nlp``.
input_fn = "text/kageotoko.corpus.txt"
# Name the encoding so the read does not depend on the platform default.
# NOTE(review): assumes the corpus file is UTF-8 - confirm.
with open(input_fn, encoding="utf-8") as f:
    text = f.read()

# The trailing comma makes this a tuple; ('PROPN') is just the string
# 'PROPN', which would turn the POS filter into a substring test.
include_pos = ('PROPN',)
stopwords = ()

doc = nlp(text)
# One space-separated string of selected lemmas per sentence.
sents = [' '.join(extract_words(sent, include_pos, stopwords))
          for sent in doc.sents]
words, word_counts, Xc, X = count_cooccurrence(sents)

In [None]:
# Rebuild and redraw the graph from the current words / word_counts / Xc.
# This writes to mygraph.html again, replacing any file of that name.
G = create_network(words, word_counts, Xc)
pyplot_network(G)
pyvis_G = nx2pyvis_G(G)
# Kept as the last expression of the cell so the notebook displays whatever
# show() returns.
pyvis_G.show("mygraph.html")