### Preparation

In [12]:
import sys, os
import re
import CaboCha
import numpy as np

In [6]:
with open('data/neko.txt') as input_neko, \
         open('data/neko.txt.cabocha', mode='w') as output_neko:
    cabocha = CaboCha.Parser()
    for line in input_neko:
        output_neko.write(
            cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
        )

### 40. 係り受け解析結果の読み込み（形態素）

In [41]:
class Morph:
    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1
    def __str__(self):
        return 'Morph({}, [{}, {}, {}])'.format(self.surface, self.base, self.pos, self.pos1)

In [82]:
sentence = []
sentences = []
with open('data/neko.txt.cabocha') as neko_cabocha:
    for line in neko_cabocha:
        m = re.search(r'(?P<surface>.*?)\t(?P<pos>[^,]*),(?P<pos1>[^,]*),[^,]*,[^,]*,[^,]*,[^,]*,(?P<base>[^,]*).*', line)
        if m:
            # print(m.group('surface'), m.group('base'), m.group('pos'), m.group('pos1'))
            sentence.append(Morph(surface=m.group('surface'), base=m.group('base'), 
                                    pos=m.group('pos'), pos1=m.group('pos1')))
        elif line=='EOS\n' and not sentence == []:
            sentences.append(sentence)
            sentence = []
print([str(morph) for morph in sentences[3]])

['Morph(\u3000, [\u3000, 記号, 空白])', 'Morph(どこ, [どこ, 名詞, 代名詞])', 'Morph(で, [で, 助詞, 格助詞])', 'Morph(生れ, [生れる, 動詞, 自立])', 'Morph(た, [た, 助動詞, *])', 'Morph(か, [か, 助詞, 副助詞／並立助詞／終助詞])', 'Morph(とんと, [とんと, 副詞, 一般])', 'Morph(見当, [見当, 名詞, サ変接続])', 'Morph(が, [が, 助詞, 格助詞])', 'Morph(つか, [つく, 動詞, 自立])', 'Morph(ぬ, [ぬ, 助動詞, *])', 'Morph(。, [。, 記号, 句点])']


### 41. 係り受け解析結果の読み込み（文節・係り受け）
40に加えて，文節を表すクラスChunkを実装せよ．このクラスは形態素（Morphオブジェクト）のリスト（morphs），係り先文節インデックス番号（dst），係り元文節インデックス番号のリスト（srcs）をメンバ変数に持つこととする．さらに，入力テキストのCaboChaの解析結果を読み込み，１文をChunkオブジェクトのリストとして表現し，8文目の文節の文字列と係り先を表示せよ．第5章の残りの問題では，ここで作ったプログラムを活用せよ．



In [83]:
class Chunk:
    def __init__(self, morphs, dst, srcs):
        self.morphs = morphs
        self.dst = dst
        self.srcs = srcs
    def __str__(self):
        srcs = self.srcs
        dst = self.dst
        morphs = ''.join([morph.surface for morph in self.morphs])
        return '{}{}{}'.format(srcs, morphs, dst)

In [92]:
with open('data/neko.txt.cabocha') as neko_cabocha:
    chunks = []
    sentence, dst, srcs = [], None, []
    for line in neko_cabocha:
        # 形態素を取得
        m = re.search(r'(?P<surface>.*?)\t(?P<pos>[^,]*),(?P<pos1>[^,]*),[^,]*,[^,]*,[^,]*,[^,]*,(?P<base>[^,]*),.*', line)
        if m:
            sentence.append(Morph(surface=m.group('surface'), base=m.group('base'), 
                                    pos=m.group('pos'), pos1=m.group('pos1')))
            continue
        # 係り受け情報を取得    
        m1 = re.search(r'\*\ (?P<src>[0-9]*)\ (?P<dst>[0-9]*)D', line)
        if m1:
            print(m1.group('dst'), m1.group('src'))
            dst = m1.group('dst')
            srcs.append(src)
            sentence.append(Chunk(morphs=sentence, dst=dst, srcs=srcs))
        
        sentence, dst, srcs = [], None, []
            
        
print(len(chunks))
print(chunks[8])

2 0
2 1
2 0
2 1
1 0
4 1
4 2
4 3
1 0
3 1
3 2
5 3
5 4
7 5
7 6
5 0
2 1
3 2
4 3
5 4
8 0
2 1
8 2
8 3
5 4
8 5
7 6
8 7
1 0
7 1
4 2
4 3
5 4
6 5
7 6
9 0
2 1
5 2
4 3
5 4
9 5
7 6
9 7
9 8
3 0
2 1
3 2
5 3
5 4
6 5
8 6
8 7
9 8
10 9
1 0
3 1
3 2
6 3
5 4
6 5
8 6
8 7
9 8
10 9
1 0
3 1
3 2
4 3
5 4
7 5
7 6
1 0
2 1
3 2
5 3
5 4
3 0
3 1
3 2
9 3
5 4
9 5
7 6
8 7
9 8
4 0
2 1
4 2
4 3
5 0
2 1
5 2
5 3
5 4
1 0
3 1
3 2
4 0
2 1
3 2
4 3
5 4
9 5
9 6
8 7
9 8
9 0
2 1
3 2
4 3
8 4
6 5
7 6
8 7
9 8
14 9
11 10
14 11
13 12
14 13
1 0
3 1
3 2
4 3
7 4
7 5
7 6
1 0
1 0
2 1
8 2
5 3
5 4
8 5
8 6
8 7
1 0
7 1
7 2
4 3
7 4
6 5
7 6
6 0
3 1
3 2
4 3
6 4
6 5
1 0
2 1
4 2
4 3
1 0
3 1
3 2
1 0
2 1
3 2
5 3
5 4
1 0
8 0
3 1
3 2
8 3
5 4
6 5
8 6
8 7
6 0
2 1
6 2
6 3
5 4
6 5
8 0
2 1
4 2
4 3
8 4
8 5
7 6
8 7
8 0
2 1
3 2
5 3
5 4
6 5
7 6
8 7
3 0
2 1
3 2
1 0
2 1
5 2
5 3
5 4
6 5
1 0
2 1
3 2
4 3
6 4
6 5
5 0
2 1
5 2
5 3
5 4
8 5
8 6
8 7
2 0
2 1
2 0
2 1
1 0
15 1
15 2
6 3
5 4
6 5
7 6
15 7
10 8
10 9
11 10
15 11
15 12
15 13
15 14
2 0
2 1
1 0
3 1
3 2
9 3
5 4
9 5
7 6
8 

True
