# ginzaの使い方

## 単語の共起

単語が出現する頻度を単独で調べるだけでなく，単語間の関係を調べることもできる．
ここでは，2つの単語が同じ文の中にともに出現することを「共起」と呼ぶ．
共起を求める関数を定義し，題材の小説「影男」で共起を求めてみる．

In [None]:
import spacy

# Path of the corpus text file that is read and analysed further down.
input_fn = 'text/kageotoko.corpus.txt'

# Part-of-speech tags (matched against token.pos_) of the words to keep.
include_pos = ('NOUN', 'VERB', 'ADJ')
# Lemmas (matched against token.lemma_) that are discarded even when their
# part of speech is one of the tags above.
stopwords = ('する', 'ある', 'ない', 'いう', 'もの', 'こと', 'よう', 'なる', 'ほう', 'いる', 'くる')

# Load the "ja_ginza" spaCy pipeline used to analyse the text.
nlp = spacy.load("ja_ginza")

In [None]:
def extract_words(sent, pos_tags, stopwords):
    """Collect the lemmas of the tokens in ``sent`` that pass both filters.

    Args:
        sent: Iterable of tokens exposing ``pos_`` and ``lemma_`` attributes.
        pos_tags: Container of part-of-speech tags; a token is kept only if
            its ``pos_`` is in this container.
        stopwords: Container of lemmas to drop; a token is skipped if its
            ``lemma_`` is in this container.

    Returns:
        A list of lemmas, in the order the tokens appear in ``sent``.
    """
    kept = []
    for tok in sent:
        # Filter on part of speech first, then on the stop-word list.
        if tok.pos_ not in pos_tags:
            continue
        lemma = tok.lemma_
        if lemma in stopwords:
            continue
        kept.append(lemma)
    return kept

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def count_cooccurrence(sents, token_length='{2,}'):
    """Count words and their per-document co-occurrences.

    Args:
        sents: Iterable of documents passed to
            ``CountVectorizer.fit_transform``; each document is a string of
            words.
        token_length: Regex quantifier (braces included) applied to the
            word-character class of the token pattern. The default
            ``'{2,}'`` keeps only tokens of two or more word characters.

    Returns:
        A tuple ``(words, word_counts, Xc, X)`` where

        * ``words`` is the vocabulary returned by
          ``get_feature_names_out()``,
        * ``word_counts`` is a 1-D array with the total number of
          occurrences of each word over all documents (computed before
          binarization),
        * ``Xc`` is the sparse co-occurrence matrix ``X.T @ X``: entry
          ``(i, j)`` is the number of documents containing both word ``i``
          and word ``j``; the diagonal holds the number of documents
          containing word ``i``,
        * ``X`` is the sparse document-term matrix after binarization
          (1 where the word occurs in the document).
    """
    # Raw f-string: same pattern as before without doubled backslashes.
    token_pattern = rf'\b\w{token_length}\b'
    count_model = CountVectorizer(token_pattern=token_pattern)

    X = count_model.fit_transform(sents)
    words = count_model.get_feature_names_out()
    # Total counts must be taken before X is binarized below.
    word_counts = np.asarray(X.sum(axis=0)).reshape(-1)

    # Count each word at most once per document.
    X[X > 0] = 1
    # Use the explicit matrix-product operator: `*` is a matrix product only
    # for the legacy sparse *matrix* classes and is element-wise for sparse
    # arrays, whereas `@` is a matrix product for both.
    Xc = X.T @ X
    return words, word_counts, Xc, X

In [None]:
# Read the whole corpus into memory.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm that it matches the encoding of the corpus file.
with open(input_fn) as f:
    text = f.read()

# Analyse the full text in a single pipeline call.
doc = nlp(text)

# One space-separated string of kept lemmas per sentence; these strings are
# the "documents" handed to the co-occurrence counter.
sents = [
    ' '.join(lemmas)
    for lemmas in (extract_words(sent, include_pos, stopwords)
                   for sent in doc.sents)
]

words, _, Xc, X = count_cooccurrence(sents)

共起頻度の高い単語の組を，上位10組表示する．

In [None]:
from collections import Counter

# Convert to COO once so that every stored entry can be read as an
# (i, j, count) triple. This replaces one scalar sparse lookup `Xc[i, j]`
# per pair, which is very slow when done in a Python loop.
pairs = Xc.tocoo()
counter = Counter()

for i, j, c in zip(pairs.row, pairs.col, pairs.data):
    # Keep only i < j: this skips the diagonal (a word with itself) and
    # counts each unordered pair once. Explicitly stored zeros are ignored.
    if i >= j or c == 0:
        continue
    counter[(i, j)] += c

# Print the 10 most frequent pairs as "<count> (<word i>, <word j>)".
for (i, j), c in counter.most_common(10):
    print('{:>3d} ({}, {})'.format(c, words[i], words[j]))

「世界」と「地底」が共起する原文を表示する．

In [None]:
def find_sentence_by_cooccurrence(X, idxs):
    """Return the row indices of ``X`` where every column in ``idxs`` is positive.

    Args:
        X: Sparse matrix supporting column slicing, comparison with a scalar
            and element-wise ``multiply``.
        idxs: Non-empty sequence of column indices.

    Returns:
        The array of row indices (first element of ``nonzero()``) of the rows
        whose entries are greater than zero in all of the given columns.
    """
    # Start from the presence mask of the first column ...
    mask = X[:, idxs[0]] > 0
    # ... and intersect it with the mask of each remaining column.
    for col in idxs[1:]:
        present = X[:, col] > 0
        mask = mask.multiply(present)
    rows, _ = mask.nonzero()
    return rows

# Materialise the sentences so they can be indexed by row number.
sents_orig = list(doc.sents)
# Map each vocabulary word to its column index.
words_lookup = dict(zip(words, range(len(words))))
idxs = [words_lookup[word] for word in ['世界', '地底']]

# Print the index and original text of every sentence containing both words.
for i in find_sentence_by_cooccurrence(X, idxs):
    print(f"{i:>5d}: {sents_orig[i]}")