In [1]:
import pandas as pd


In [75]:
from pandas import DataFrame
# Load the parsed text as a flat table. Each line is split on tabs and commas,
# and only the fields at positions 0, 1, 2 and 7 are kept (surface form, part of
# speech, POS sub-category, base form).
# A regex separator is only supported by the python parser; naming the engine
# explicitly avoids the ParserWarning pandas emits when it has to fall back.
df: DataFrame = pd.read_table(
    './data/ai.ja.txt.parsed',
    sep='\t|,',
    engine='python',
    header=None,
    usecols=[0, 1, 2, 7],
    names=['surface', 'pos', 'pos1', 'base'],
    skiprows=1,
)
# Positionally drop the first four parsed rows and the final row.
# NOTE(review): presumably these are leading/trailing lines that are not part of
# the text body -- confirm against the input file.
df = df.iloc[4:-1]
df.head(20)

  return func(*args, **kwargs)


Unnamed: 0,surface,pos,pos1,base
4,* 0 17D 1/1 0.388993,,,
5,人工,名詞,一般,人工
6,知能,名詞,一般,知能
7,* 1 17D 2/3 0.613549,,,
8,（,記号,括弧開,（
9,じん,名詞,一般,じん
10,こうち,名詞,一般,こうち
11,のう,助詞,終助詞,のう
12,、,記号,読点,、
13,、,記号,読点,、


In [76]:
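# 40. Reading dependency-parse results: morphemes (係り受け解析結果の読み込み（形態素）)
# 41. Reading dependency-parse results: chunks and dependencies (係り受け解析結果の読み込み（文節・係り受け）)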

from typing import List

class Morph:
    """A single morpheme taken from one row of the parsed table.

    Attributes:
        surface: Surface form as it appears in the text.
        base: Base (dictionary) form.
        pos: Part of speech (e.g. '名詞').
        pos1: Part-of-speech sub-category (e.g. '一般').
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1

    def __str__(self):
        return 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'.format(self.surface, self.base, self.pos, self.pos1)

    def __repr__(self):
        # Containers format their elements with repr(); without this, a list of
        # morphs prints as a row of opaque "<Morph object at 0x...>" entries.
        return 'Morph(surface={!r}, base={!r}, pos={!r}, pos1={!r})'.format(
            self.surface, self.base, self.pos, self.pos1
        )

class Chunk:
    """A chunk (bunsetsu): a run of morphemes plus its dependency links.

    Attributes:
        morphs: Morphemes that make up the chunk, in order.
        dst: Index of the chunk this one depends on.
        srcs: Indices of the chunks that depend on this one; starts empty and
            is filled in by code outside this class.
    """

    def __init__(self, morphs, dst):
        self.srcs = set()
        self.dst = dst
        self.morphs: List[Morph] = morphs

    def __str__(self):
        template = 'morphs[{}]\tdst[{}]\tsrcs[{}]'
        return template.format(self.morphs, self.dst, self.srcs)

    def surface(self):
        """Return the chunk text: the morpheme surface forms joined with no separator."""
        pieces = (morph.surface for morph in self.morphs)
        return ''.join(pieces)


In [77]:
from typing import Tuple

def parse_cabocha_str(line: str) -> Tuple[int, int]:
    """
    Parse a CaboCha chunk header line into ``(chunk_id, dst)``.

    The header looks like ``'* 2 3D 0/0 0.758984'``: the second field is the
    chunk index and the third is the index of the chunk it depends on,
    followed by the letter ``D``.

    Args:
        line: One chunk header line.

    Returns:
        A tuple ``(chunk_id, dst)`` of ints.

    Raises:
        ValueError: If the line has fewer than three fields or the fields are
            not integers.
    """
    # split() with no argument tolerates repeated spaces and trailing newlines.
    fields = line.split()
    if len(fields) < 3:
        raise ValueError('not a CaboCha chunk header: {!r}'.format(line))
    chunk_id, dst_field = fields[1], fields[2]
    return int(chunk_id), int(dst_field.rstrip('D'))

# Quick manual check on a sample chunk header; the cell displays the parsed tuple.
test_cabocha_str = '* 2 3D 0/0 0.758984'
parse_cabocha_str(test_cabocha_str)

(2, 3)

In [78]:
from pandas import Series
def is_cabocha_str(line: Series) -> bool:
    """
    Tell whether a row is a CaboCha chunk header line.

    A row counts as a header when its ``surface`` starts with ``'*'`` and its
    ``pos`` field is missing. The ``pos`` test is what separates a header from
    a real morpheme whose surface happens to be an asterisk.

    Args:
        line: One row of the parsed table, with at least ``surface`` and ``pos``.

    Returns:
        True for a chunk header row, False otherwise.
    """
    surface = line['surface']
    # A missing or empty surface cannot be a header (and must not raise).
    if not isinstance(surface, str) or not surface.startswith('*'):
        return False
    # Missing fields may be encoded as None or as NaN depending on how the
    # frame was built; pd.isna accepts both.
    return bool(pd.isna(line['pos']))

# Quick manual check: a row holding a chunk header with the remaining fields missing.
test_cabocha_series = pd.Series(['* 2 3D 0/0 0.758984', None, None, None], index=['surface', 'pos', 'pos1', 'base'])
is_cabocha_str(test_cabocha_series)

True

In [79]:
def is_eos(line: Series) -> bool:
    """
    Tell whether a row is the end-of-sentence marker.

    Args:
        line: One row of the parsed table, with at least a ``surface`` field.

    Returns:
        True when the row's ``surface`` is exactly ``'EOS'``.
    """
    surface = line['surface']
    return surface == 'EOS'

# Quick manual check: a row holding only the end-of-sentence marker.
test_eos_series = pd.Series(['EOS', None, None, None], index=['surface', 'pos', 'pos1', 'base'])
is_eos(test_eos_series)

True

In [80]:
from typing import List

def dataframe_to_sentences(df: DataFrame) -> List[List[Chunk]]:
    """
    Group the rows of the parsed table into sentences of chunks.

    Rows are read in order. A chunk header row opens a new ``Chunk``, an EOS
    row closes the current sentence, and every other row is added as a
    ``Morph`` to the chunk that is currently open.

    Args:
        df: Parsed table with ``surface``, ``pos``, ``pos1`` and ``base`` columns.

    Returns:
        One list of chunks per sentence. Sentences with no chunks are omitted.

    Raises:
        ValueError: If a morpheme row appears while no chunk is open.
    """
    sentences: List[List[Chunk]] = []
    sentence: List[Chunk] = []
    chunk = None
    for _, row in df.iterrows():  # iterrows yields each row as a Series
        if is_eos(row):
            # Close the open chunk and the sentence, then start afresh.
            if chunk is not None:
                sentence.append(chunk)
            if sentence:
                sentences.append(sentence)
            sentence = []
            chunk = None
        elif is_cabocha_str(row):
            # A new header ends the previous chunk of the same sentence.
            if chunk is not None:
                sentence.append(chunk)
            _, dst = parse_cabocha_str(row['surface'])
            chunk = Chunk([], dst)
        else:
            if chunk is None:
                # Without this guard the morpheme would be attached to a chunk
                # that was already closed, or fail with an unrelated error.
                raise ValueError(
                    'morpheme row {!r} appears before any chunk header'.format(row['surface'])
                )
            chunk.morphs.append(Morph(row['surface'], row['base'], row['pos'], row['pos1']))
    # Keep the last sentence when the input does not end with an EOS row
    # (for example when the final row was sliced off upstream).
    if chunk is not None:
        sentence.append(chunk)
    if sentence:
        sentences.append(sentence)
    return sentences

# Build the sentence/chunk structure once; later cells read and annotate `sentences`.
sentences = dataframe_to_sentences(df)


In [82]:
def embed_src_to_sentence(sentence: List[Chunk]):
    """
    Fill in ``srcs`` on every chunk from the ``dst`` links, in place.

    For each chunk at position ``i`` that depends on another chunk, ``i`` is
    added to that chunk's ``srcs`` set. Calling this again on the same
    sentence changes nothing because ``srcs`` is a set.

    Args:
        sentence: Chunks of one sentence, indexed by chunk position.
    """
    for src_index, chunk in enumerate(sentence):
        # CaboCha gives the root chunk dst == -1 (no head). Indexing with -1
        # would silently pick the last chunk and record a bogus dependency.
        if chunk.dst < 0:
            continue
        sentence[chunk.dst].srcs.add(src_index)

# Annotate every sentence in place with its reverse dependency links (srcs).
for sentence in sentences:
    embed_src_to_sentence(sentence)


In [83]:
# 42. Displaying source and destination chunks of each dependency (係り元と係り先の文節の表示)

def extract_src_dst(sentence: List[Chunk]) -> List[str]:
    """
    Render each chunk together with the chunks that depend on it.

    Args:
        sentence: Chunks of one sentence whose ``srcs`` sets are already filled.

    Returns:
        One string per chunk: the chunk's surface, a tab, then the surfaces of
        its source chunks joined by tabs. A chunk with no sources yields its
        surface followed by a single tab.
    """
    src_dst_list = []
    for chunk in sentence:
        # srcs is a set, so its iteration order is not guaranteed to follow
        # sentence order; sort the indices to get stable, left-to-right output.
        src_surfaces = [sentence[src].surface() for src in sorted(chunk.srcs)]
        src_dst_list.append(chunk.surface() + '\t' + '\t'.join(src_surfaces))
    return src_dst_list

# One list of lines per sentence, then flattened into a single list of lines.
src_dst_2d_list = [extract_src_dst(sentence) for sentence in sentences]
src_dst_list = [item for sublist in src_dst_2d_list for item in sublist]

# FIXME: strip punctuation marks from the chunk text before output (句読点の削除)


In [88]:
# 43. Extract noun-containing chunks that depend on verb-containing chunks (名詞を含む文節が動詞を含む文節に係るものを抽出)

def has_pos(chunk: Chunk, pos: str) -> bool:
    """
    Tell whether a chunk contains a morpheme with the given part of speech.

    Args:
        chunk: Chunk whose ``morphs`` are inspected.
        pos: Part-of-speech label to look for, compared with ``==``.

    Returns:
        True if at least one morpheme's ``pos`` equals ``pos``; False
        otherwise, including for a chunk with no morphemes.
    """
    for morph in chunk.morphs:
        if morph.pos == pos:
            return True
    return False

# Quick manual check: does the first chunk of the first sentence contain a noun?
has_pos(sentences[0][0], '名詞')


True

In [92]:
def extract_noun_depends_on_verb(sentence: List[Chunk]) -> List[str]:
    """
    List the noun-chunk -> verb-chunk dependencies of one sentence.

    Args:
        sentence: Chunks of one sentence, indexed by chunk position.

    Returns:
        One string per dependency where the source chunk contains a noun
        ('名詞') and the chunk it depends on contains a verb ('動詞'), formatted
        as source surface, a tab, destination surface.
    """
    noun_depends_on_verb_list = []
    for chunk in sentence:
        # The root chunk has dst == -1 (no head). Indexing with -1 would pick
        # the last chunk, typically the root itself, and report a false pair.
        if chunk.dst < 0:
            continue
        if not has_pos(chunk, '名詞'):
            continue
        dst_chunk = sentence[chunk.dst]
        if has_pos(dst_chunk, '動詞'):
            noun_depends_on_verb_list.append(chunk.surface() + '\t' + dst_chunk.surface())
    return noun_depends_on_verb_list

# One list of pairs per sentence, then flattened into a single list of lines.
noun_depends_on_verb_2d_list = [extract_noun_depends_on_verb(sentence) for sentence in sentences]
noun_depends_on_verb_list = [item for sublist in noun_depends_on_verb_2d_list for item in sublist]
# FIXME: strip punctuation marks from the chunk text before output (句読点の削除)


In [93]:
from graphviz import Digraph
# Smoke test of the graphviz API on a tiny hand-built graph, unrelated to the
# dependency data. NOTE(review): looks adapted from the graphviz package's
# introductory example -- confirm whether this cell is still needed.
dot = Digraph(comment='The Round Table')
for node_name, node_label in (
    ('A', 'King Arthur'),
    ('B', 'Sir Bedevere the Wise'),
    ('L', 'Sir Lancelot the Brave'),
):
    dot.node(node_name, node_label)
# Each two-character string is a (tail, head) pair of node names.
dot.edges(['AB', 'AL'])
dot.edge('B', 'L', constraint='false')
print(dot.source)
# Writes the DOT source, renders it, and opens the result in a viewer (view=True).
dot.render('data/round-table.gv', view=True)

// The Round Table
digraph {
	A [label="King Arthur"]
	B [label="Sir Bedevere the Wise"]
	L [label="Sir Lancelot the Brave"]
	A -> B
	A -> L
	B -> L [constraint=false]
}



'test-output/round-table.gv.pdf'

In [96]:
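# 44. Visualizing the dependency tree (係り受け木の可視化)
# Is there a library that extracts the parse as a directed graph from the start? (始めから有向グラフで抽出してくれるライブラリはないだろうか...)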

def sentence_to_graph(sentence: List[Chunk]) -> Digraph:
    """
    Build a directed graph of one sentence's dependency structure.

    Every chunk becomes a node named by its position and labelled with its
    surface text; every dependency becomes an edge from the dependent chunk
    to the chunk it depends on.

    Args:
        sentence: Chunks of one sentence, indexed by chunk position.

    Returns:
        The populated ``Digraph``; nothing is rendered or written here.
    """
    graph = Digraph(comment='Dependency tree')
    for i, chunk in enumerate(sentence):
        graph.node(str(i), chunk.surface())
        # The root chunk has dst == -1 (no head); drawing that edge would add
        # a spurious node named "-1" to the tree.
        if chunk.dst >= 0:
            graph.edge(str(i), str(chunk.dst))
    return graph

# Render the first sentence's dependency tree to disk and open it in a viewer (view=True).
sentence_to_graph(sentences[0]).render('data/係り受け木.gv', view=True)

'data/係り受け木.gv.pdf'

In [None]:
# 45. Extracting verb case patterns (動詞の格パターンの抽出)

