In [83]:
import CaboCha
import pandas as pd


In [84]:
from pandas import DataFrame
df: DataFrame = pd.read_table('./data/ai.ja/ai.ja.txt', header=None, names=['sentence'])

In [85]:
from CaboCha import Tree
from typing import List

def printTree(tree: Tree):
    print(tree.toString(CaboCha.FORMAT_LATTICE))

def headTrees(trees: List[Tree], from_int: int, to_int: int) -> None:
    for tree in trees[from_int:to_int]:
        printTree(tree)



In [86]:
# parse()の返り値が最新のTreeオブジェクトになっている問題に対する試み(1)
# toString() を呼び出せばポイント参照から実態の参照へと切り替わるのでは？と期待したのだが、失敗。
# def noLazyParse(sentence: str) -> Tree:
#     parsed = c.parse(sentence)
#     parsed.toString(CaboCha.FORMAT_LATTICE)
#     return parsed

# trees = df['sentence'].map(noLazyParse)
# trees.head()

In [87]:
# parse()の返り値が最新のTreeオブジェクトになっている問題に対する試み(2)
# いっそ要素ごとにParserを作ってしまおう、とやってみたが、iPhythonのKernelが落ちる。
# trees = df['sentence'].map(CaboCha.Parser().parse)
# trees.head()

In [88]:
# parse()の返り値が最新のTreeオブジェクトになっている問題に対する試み(3)
# parse()実行直後にプロパティをコピーする。
# 上手く行ったと思っていたが、後々問題があることが分かった。ChunkかTokenのオブジェクトがSwingObjectなのだろうか？

from CaboCha import Tree
class TreeObject:
    def __init__(self, tree: Tree):
        self._str = tree.toString(CaboCha.FORMAT_LATTICE)
        self.sentence = tree.sentence()
        self.chunks = [tree.chunk(i) for i in range(tree.chunk_size())]
        self.tokens = [tree.token(i) for i in range(tree.token_size())]

    def __str__(self) -> str:
        return self._str

    def __repr__(self) -> str:
        return self._str

c = CaboCha.Parser()

trees = df['sentence'].map(lambda s: TreeObject(c.parse(s)))
trees.head()

0    * 0 -1D 1/1 0.000000\n人工\t名詞,一般,*,*,*,*,人工,ジンコ...
1    * 0 17D 1/1 0.388993\n人工\t名詞,一般,*,*,*,*,人工,ジンコ...
2    * 0 1D 6/7 3.194287\n『\t記号,括弧開,*,*,*,*,『,『,『\n...
3    * 0 1D 0/1 1.813378\n人間\t名詞,一般,*,*,*,*,人間,ニンゲン...
4    * 0 1D 1/2 0.906609\nプログラミング\t名詞,サ変接続,*,*,*,*,...
Name: sentence, dtype: object

In [89]:
# 40. 係り受け解析結果の読み込み（形態素）
class Morph:
    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1

    def __str__(self):
        return 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'.format(self.surface, self.base, self.pos, self.pos1)

In [90]:
# 41. 係り受け解析結果の読み込み（文節・係り受け）
class ChunkObject():
    def __init__(self, morphs, dst):
        self.morphs = morphs
        self.dst = dst
        self.srcs = []

    def __str__(self):
        return 'morphs[{}]\tdst[{}]\tsrcs[{}]'.format(self.morphs, self.dst, self.srcs)

In [92]:
def debugCabochaChunk(chunk: CaboCha.Chunk, tokens: List[CaboCha.Token]) -> None:
    print(f"chunk.link: {chunk.link}")
    print(f"chunk.token_pos: {chunk.token_pos}")
    print(f"chunk.token_size: {chunk.token_size}")
    print(f"tokens: {list(map(lambda t: t.surface, tokens[chunk.token_pos:chunk.token_pos + chunk.token_size]))}")

[debugCabochaChunk(chunk, trees[1].tokens) for chunk in trees[1].chunks[0:10]]

chunk.link: 20
chunk.token_pos: 0
chunk.token_size: 2
tokens: ['対談', 'で']
chunk.link: 16
chunk.token_pos: 2
chunk.token_size: 2
tokens: ['須藤', 'は']
chunk.link: 6
chunk.token_pos: 4
chunk.token_size: 3
tokens: ['「', 'これ', 'まで']
chunk.link: 6
chunk.token_pos: 7
chunk.token_size: 1
tokens: ['けっこう']
chunk.link: 6
chunk.token_pos: 8
chunk.token_size: 1
tokens: ['長時間']
chunk.link: 6
chunk.token_pos: 9
chunk.token_size: 2
tokens: ['議論', 'を']
chunk.link: 15
chunk.token_pos: 11
chunk.token_size: 6
tokens: ['行っ', 'て', 'き', 'まし', 'た', '。']
chunk.link: 10
chunk.token_pos: 17
chunk.token_size: 3
tokens: ['おかげ', 'で', '、']
chunk.link: 9
chunk.token_pos: 20
chunk.token_size: 2
tokens: ['意見', 'の']
chunk.link: 10
chunk.token_pos: 22
chunk.token_size: 2
tokens: ['違い', 'は']


[None, None, None, None, None, None, None, None, None, None]

In [55]:
def debugCabochaToken(token: CaboCha.Token) -> None:
    print(f"surface: {token.surface}")
    print(f"pos: {token.normalized_surface}")

debugCabochaToken(trees[1].tokens[0])

surface: 対談
pos: 対談


In [65]:
def cabochaTokenFeatureToMorph(token: CaboCha.Token) -> Morph:
    features = token.feature.split(',')
    try:
        return Morph(token.surface, features[6], features[0], features[1])
    except (IndexError, UnicodeEncodeError) as e:        
        print(f"token.feature: {token.feature}")
        return Morph(token.surface, token.surface, '*', '*')

In [61]:
def cabochaChunkToChunkObject(chunk: CaboCha.Chunk, tokens: List[CaboCha.Token]) -> ChunkObject:
    tokens: List[CaboCha.Token] = tokens[chunk.token_pos:chunk.token_pos + chunk.token_size]
    morphs = list(map(cabochaTokenFeatureToMorph, tokens))
    return ChunkObject(morphs, chunk.link)

cabochaChunkToChunkObject(trees[0].chunks[0], trees[0].tokens)

<__main__.ChunkObject at 0x1323206d0>

In [67]:
chunksArray = [ 
    [cabochaChunkToChunkObject(chunk, tree.tokens) for chunk in tree.chunks] for tree in trees
]

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/Users/hiroga/.ghq/github.com/xhiroga/til/computer-science/nlp/nlp100/.venv/lib/python3.9/site-packages/jupyter_client/session.py", line 97, in json_packer
    return json.dumps(
UnicodeEncodeError: 'utf-8' codec can't encode character '\udc80' in position 138: surrogates not allowed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/hiroga/.ghq/github.com/xhiroga/til/computer-science/nlp/nlp100/.venv/lib/python3.9/site-packages/zmq/eventloop/zmqstream.py", line 431, in _run_callback
    callback(*args, **kwargs)
  File "/Users/hiroga/.ghq/github.com/xhiroga/til/computer-science/nlp/nlp100/.venv/lib/python3.9/site-packages/ipykernel/iostream.py", line 126, in _handle_event
    event_f()
  File "/Users/hiroga/.ghq/github.com/xhiroga/til/computer-science/nlp/nlp100/.venv/lib/python3.9/site-packages/ipykernel/iostrea

In [79]:
def embedSrcsToChunk(index: int, chunk: ChunkObject, chunks: List[ChunkObject]) -> None:
    srcs = [i for i, chunk in enumerate(chunks) if chunk.dst == index]
    chunk.srcs = srcs


for chunks in chunksArray:
    for i, chunk in enumerate(chunks):
        embedSrcsToChunk(i, chunk, chunks)


In [73]:
# 42. 係り元と係り先の文節の表示

def getMorphsConcatinated(chunk: ChunkObject) -> None:
    return ''.join([morph.surface for morph in chunk.morphs])

getMorphsConcatinated(chunksArray[1][0])

'対談で'

In [75]:
tab = "\t"

def printChunkWithSrcs(chunk: ChunkObject, chunks: List[ChunkObject]) -> None:
    print(f"{getMorphsConcatinated(chunk)}{tab}{tab.join([getMorphsConcatinated(chunks[src]) for src in chunk.srcs])}")

printChunkWithSrcs(chunksArray[0][0], chunksArray[0])

対談で	


In [82]:
chunksArrayStr = [(printChunkWithSrcs(chunk, chunks)
                   for chunk in chunks)for chunks in chunksArray]
