In [1]:
import sys,pprint,pydot

In [54]:
class cabocha_tree(object):
    r"""parse the cabocha_tree
    input:
        input_str:    the cabocha f1 result block. starting with * and end with EOS
    output:
        class of pased tree structure with pos taggings
        """

    def __init__(self, input_str_list):
        self.tree_d = dict()

        for line in input_str_list:
            if line.startswith("*"):
                _, constituent_num, father_num, head_func_num, dep_score = line.split(
                    " ")
                self.tree_d[int(constituent_num)] = {
                    "father": father_num,
                    "head/func": head_func_num,
                    "dep_score": dep_score,
                    "words": {},
                    "children": []
                }
                word_num = 0
            elif line.startswith("EOS"):
                pass
            elif line == "":
                pass
            else:
                surface_form, att = line.split("\t")
                pos, pos1, pos2, pos3, hy1, hy2, rt, *rd = att.split(",")

                self.tree_d[int(constituent_num)]["words"][word_num] = {
                    "表層形": surface_form,
                    "品詞": pos,
                    "品詞細分類1": pos1,
                    "品詞細分類2": pos2,
                    "品詞細分類3": pos3,
                    "活用型": hy1,
                    "活用形": hy2,
                    "原形": rt,
                }
                word_num += 1
        for w in self.tree_d:
            _father = int(self.tree_d[w]["father"][:-1])
            if _father != -1:
                self.tree_d[_father]["children"].append(w)

    def print_tree_dict(self):
        pprint.pprint(self.tree_d)
    def print_sentence(self):
        for p in self.tree_d:
            self.print_phrase(p)

    def print_phrase(self, n, with_punctuation=True, root=True):
        output = ""
        if root:
            if with_punctuation:
                for w in self.tree_d[n]["words"]:
                    output += self.tree_d[n]["words"][w]['原形']
            else:
                for w in self.tree_d[n]["words"]:
                    if self.tree_d[n]["words"][w]["品詞"] != "記号":
                        output += self.tree_d[n]["words"][w]['原形']
        else:
            if with_punctuation:
                for w in self.tree_d[n]["words"]:
                    output += self.tree_d[n]["words"][w]['表層形']
            else:
                for w in self.tree_d[n]["words"]:
                    if self.tree_d[n]["words"][w]["品詞"] != "記号":
                        output += self.tree_d[n]["words"][w]['表層形']
        #print(output,end="")
        return output

    def print_knock47(self):
        output = {}
        for p in self.tree_d:
            father_p = int(self.tree_d[p]["father"][:-1])
            if father_p != -1:

                h_f = self.tree_d[p]["head/func"].split("/")

                len_p = len(self.tree_d[p]["words"])

                father_h_f = self.tree_d[father_p]["head/func"].split("/")
                father_p_head = int(father_h_f[0])
                father_p_pos = self.tree_d[father_p]["words"][father_p_head][
                    '品詞細分類1']

                if father_p_pos == "サ変接続":
                    print(p)
                    self.print_sentence()
                    predicate = self.print_phrase(
                        p, with_punctuation=False,
                        root=False) + self.print_phrase(
                            father_p, with_punctuation=False, root=False)
                    print(predicate,end="\t")
                    other_children=self.tree_d[father_p]["children"]
                    other_children=[i for i in other_children if i!=p]
                    print(other_children)
                    particles = []
                    phrases = []
                    
                    for c in other_children:
                        if c:
                            len_c=len(self.tree_d[c]["words"])
                            particles.append(self.tree_d[c]["words"][len_c-1]["表層形"])
                            phrases.append(self.print_phrase(c))
                            
                    print(" ".join(particles)," ".join(phrases),sep="\t",end="\n")
                    


# In[8]:

In [3]:
def gen_sent_cabocha(f_name):
    with open(f_name,"r",encoding="utf-8") as f:
        output=[]
        for line in f:
            if line.startswith("EOS"):
                output.append(line)
                if len(output)>1:
                    yield output
                output=[]
            else:
                output.append(line)
            

In [56]:
for ind,sent_list in enumerate(gen_sent_cabocha("./neko.txt.cabocha")):
    if ind<10:
        print("sentence "+str(ind+1)+":")
        parse_=cabocha_tree(sent_list)
        parse_.print_knock47()
        parse_.print_tree_dict()

sentence 1:
sentence 2:
sentence 3:
sentence 4:
sentence 5:
sentence 6:
sentence 7:
sentence 8:
1
書生というのは話である	[6]
という	食うという
6
食うという話である	[1]
は	書生というのは
sentence 9:
sentence 10:


In [82]:
test_str="""
* 0 5D 0/1 -1.514009
吾輩	名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
* 1 2D 0/1 1.311423
ここ	名詞,代名詞,一般,*,*,*,ここ,ココ,ココ
で	助詞,格助詞,一般,*,*,*,で,デ,デ
* 2 3D 0/1 0.123057
始め	動詞,自立,*,*,一段,連用形,始める,ハジメ,ハジメ
て	助詞,接続助詞,*,*,*,*,て,テ,テ
* 3 4D 0/1 1.440044
人間	名詞,一般,*,*,*,*,人間,ニンゲン,ニンゲン
という	助詞,格助詞,連語,*,*,*,という,トイウ,トユウ
* 4 5D 0/1 -1.514009
もの	名詞,非自立,一般,*,*,*,もの,モノ,モノ
を	助詞,格助詞,一般,*,*,*,を,ヲ,ヲ
* 5 -1D 0/1 0.000000
見	動詞,自立,*,*,一段,連用形,見る,ミ,ミ
た	助動詞,*,*,*,特殊・タ,基本形,た,タ,タ
。	記号,句点,*,*,*,*,。,。,。
EOS
"""

In [35]:
temp=cabocha_tree(test_str.split("\n"))

In [36]:
temp.print_tree_dict()

{0: {'children': [],
     'dep_score': '-1.514009',
     'father': '5D',
     'head/func': '0/1',
     'words': {0: {'原形': '吾輩',
                   '品詞': '名詞',
                   '品詞細分類1': '代名詞',
                   '品詞細分類2': '一般',
                   '品詞細分類3': '*',
                   '活用型': '*',
                   '活用形': '*',
                   '発音': 'ワガハイ',
                   '表層形': '吾輩',
                   '読み': 'ワガハイ'},
               1: {'原形': 'は',
                   '品詞': '助詞',
                   '品詞細分類1': '係助詞',
                   '品詞細分類2': '*',
                   '品詞細分類3': '*',
                   '活用型': '*',
                   '活用形': '*',
                   '発音': 'ワ',
                   '表層形': 'は',
                   '読み': 'ハ'}}},
 1: {'children': [],
     'dep_score': '1.311423',
     'father': '2D',
     'head/func': '0/1',
     'words': {0: {'原形': 'ここ',
                   '品詞': '名詞',
                   '品詞細分類1': '代名詞',
                   '品詞細分類2': '一般',
                   '品詞細

In [57]:
a,b=(1,2,3)

ValueError: too many values to unpack (expected 2)