# 第5章: 係り受け解析

In [1]:
import re
import CaboCha

In [2]:
cabocha = CaboCha.Parser()
space_pattern = re.compile(r'\u3000')
chapter_pattern = re.compile(r'([\n|\r\n|\r]|(?<=^))([一二三四五六七八九十]+)[\n|\r\n|\r]')

with open("../data/neko.txt") as fr,\
     open("../data/neko.txt.cabocha", "w") as fw:
    text = fr.read()
    text = re.sub(space_pattern, '', text)
    text = re.sub(chapter_pattern, '', text)
    
    for sentence in text.split('。'):
        sentence = sentence + '。'
        tree = cabocha.parse(sentence)
        print(tree.toString(CaboCha.FORMAT_LATTICE), file=fw)

## 40. 係り受け解析結果の読み込み（形態素）

In [3]:
class Morph(object):
    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1
        self.info = {
            'surfce': self.surface,
            'base': self.base,
            'pos': self.pos,
            'pos1': self.pos1,
        }
    
    def as_dict(self):
        return self.info
    
    def __str__(self):
        return str(self.info)
    
    def __repr__(self):
        return str(self.info)

In [4]:
neko_cabocha = []
with open('../data/neko.txt.cabocha') as fr:
    sentence_cabocha = []
    for line in fr:
        line = line.rstrip()
        
        if line == 'EOS':
            neko_cabocha.append(sentence_cabocha)
            sentence_cabocha = []
            continue
        
        if len(line) == 0\
        or line[0] == '*':
            continue
        
        m = re.split('[\t,]', line)
        m = Morph(m[0], m[-3], m[1], m[2])
        sentence_cabocha.append(m)

In [5]:
neko_cabocha[2]

[{'surfce': 'どこ', 'base': 'どこ', 'pos': '名詞', 'pos1': '代名詞'},
 {'surfce': 'で', 'base': 'で', 'pos': '助詞', 'pos1': '格助詞'},
 {'surfce': '生れ', 'base': '生れる', 'pos': '動詞', 'pos1': '自立'},
 {'surfce': 'た', 'base': 'た', 'pos': '助動詞', 'pos1': '*'},
 {'surfce': 'か', 'base': 'か', 'pos': '助詞', 'pos1': '副助詞／並立助詞／終助詞'},
 {'surfce': 'とんと', 'base': 'とんと', 'pos': '副詞', 'pos1': '一般'},
 {'surfce': '見当', 'base': '見当', 'pos': '名詞', 'pos1': 'サ変接続'},
 {'surfce': 'が', 'base': 'が', 'pos': '助詞', 'pos1': '格助詞'},
 {'surfce': 'つか', 'base': 'つく', 'pos': '動詞', 'pos1': '自立'},
 {'surfce': 'ぬ', 'base': 'ぬ', 'pos': '助動詞', 'pos1': '*'},
 {'surfce': '。', 'base': '。', 'pos': '記号', 'pos1': '句点'}]

## 41. 係り受け解析結果の読み込み（文節・係り受け）

In [7]:
class Chunk(object):
    def __init__(self, morphs, dst, srcs):
        self.morphs = morphs
        self.dst = dst
        self.srcs = srcs
        self.info = {
            'morphs': morphs,
            'dst': dst,
            'srcs': srcs,
        }
        
    def as_dict(self):
        return self.info
    
    def __str__(self):
        return str(self.info)
    
    def __repr__(self):
        return str(self.info)