In [25]:
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [26]:
# Read the whole corpus and flatten it onto a single line:
# every newline character is turned into one space.
with open("nlp.txt", 'r') as f:
    nlp = f.read().replace('\n', ' ')

# 50. 文区切り
(. or ; or : or ? or !) → 空白文字 → 英大文字というパターンを文の区切りと見なし，入力された文書を1行1文の形式で出力せよ．

In [27]:
# Sentence boundary: one of . ; : ? ! followed by whitespace and then an
# uppercase ASCII letter. The lookbehind / lookahead keep the closing
# punctuation and the capital letter attached to their own sentences, so only
# the whitespace run between them is consumed by the split.
# Note: inside a character class "|" is a literal character, so it must not be
# used as a separator there (it would make "|" count as a sentence terminator).
p = r'(?<=[.;:?!])\s+(?=[A-Z])'
# strip() removes the leading/trailing blanks left over from joining the input
# lines with spaces; empty pieces (e.g. for an empty document) are discarded.
sentences = [s for s in re.split(p, nlp.strip()) if s]

In [28]:
# Dump the sentences to 50.txt, one per line (each followed by a newline).
with open('50.txt', 'w') as f:
    f.writelines(sentence + "\n" for sentence in sentences)

# 51. 単語の切り出し
空白を単語の区切りとみなし，50の出力を入力として受け取り，1行1単語の形式で出力せよ．ただし，文の終端では空行を出力せよ．

In [29]:
# One word per line; an empty line is written after the last word of each sentence.
with open('51.txt', 'w') as f:
    for sentence in sentences:
        # Splitting on a single space can yield empty strings (consecutive spaces); drop them.
        tokens = [tok for tok in sentence.split(' ') if tok]
        f.writelines(tok + '\n' for tok in tokens)
        f.write('\n')

# 52. ステミング

51の出力を入力として受け取り，Porterのステミングアルゴリズムを適用し，単語と語幹をタブ区切り形式で出力せよ． Pythonでは，Porterのステミングアルゴリズムの実装としてstemmingモジュールを利用するとよい．

In [30]:
from stemming.porter2 import stem
# Read the one-word-per-line file produced for task 51.
with open('51.txt', 'r') as f:
    words = f.readlines()
# Write "word<TAB>stem" for every word line.
with open('52.txt', 'w') as f:
    for w in words:
        # Remove only the line terminator; slicing off the last character
        # would eat a real letter if the final line had no trailing newline.
        word = w.rstrip('\n')
        if word:
            f.write(word + "\t" + stem(word) + "\n")
        else:
            # Blank line = sentence boundary in 51.txt; keep it blank
            # instead of emitting a line that contains only a tab.
            f.write("\n")

# 53. Tokenization
Stanford Core NLPを用い，入力テキストの解析結果をXML形式で得よ．また，このXMLファイルを読み込み，入力テキストを1行1単語の形式で出力せよ．

In [31]:
# Command line noted by the author (presumably what generated nlp.txt.xml):
# java -cp "*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file nlp.txt
import xml.etree.ElementTree as ET
# Parse the analysis result and keep both the tree and its root element.
tree = ET.ElementTree(file='nlp.txt.xml')
root = tree.getroot()

In [32]:
# Write the text of every <word> element in the XML, one per line.
with open('53.txt', 'w') as f:
    f.writelines(word_node.text + "\n" for word_node in root.iter('word'))

# 54. 品詞タグ付け
Stanford Core NLPの解析結果XMLを読み込み，単語，レンマ，品詞をタブ区切り形式で出力せよ．

In [33]:
# Author's note listing tag names (presumably the children of each <token>):
# ['word', 'lemma', 'CharacterOffsetBegin', 'CharacterOffsetEnd', 'POS', 'NER', 'Speaker']
# Emit "word<TAB>lemma<TAB>POS" for every token.
with open('54.txt', 'w') as f:
    for token in root.iter('token'):
        fields = [token.find(tag).text for tag in ('word', 'lemma', 'POS')]
        f.write("\t".join(fields) + "\n")

# 55. 固有表現抽出
入力文中の人名をすべて抜き出せ．

In [34]:
# Write the surface form of every token whose NER label is PERSON, one per line.
with open('55.txt', 'w') as f:
    person_tokens = (tok for tok in root.iter('token') if tok.find('NER').text == "PERSON")
    f.writelines(tok.find('word').text + "\n" for tok in person_tokens)

# 56. 共参照解析
Stanford Core NLPの共参照解析の結果に基づき，文中の参照表現（mention）を代表参照表現（representative mention）に置換せよ．ただし，置換するときは，「代表参照表現（参照表現）」のように，元の参照表現が分かるように配慮せよ．

In [35]:
# One list of <mention> elements per coreference chain.
# root.iter('coreference') visits every <coreference> element at any depth; in
# CoreNLP's XML the chains are nested inside an outer <coreference> wrapper.
# Only elements that directly contain <mention> children are real chains, so
# the wrapper (which would otherwise merge all chains into one) is skipped.
corefereneces = [
    chain.findall('mention')
    for chain in root.iter('coreference')
    if chain.find('mention') is not None
]
# Token surface forms, one list per sentence (rebinds `sentences` from task 50).
sentences = [
    [tok.find('word').text for tok in sent.iter('token')]
    for sent in root.find('document/sentences').iter('sentence')
]
for c in corefereneces:
    # Prefer the mention flagged representative="true"; fall back to the first
    # mention of the chain when no such flag is present.
    representative_mention = next(
        (m for m in c if m.get('representative') == 'true'), c[0])
    representative_text = representative_mention.find("text").text
    for mention in c:
        if mention is representative_mention:
            continue
        # The XML indices are shifted down by one to index the Python lists;
        # after the shift, `end_idx` is used as an exclusive slice bound.
        sentence_idx = int(mention.find("sentence").text) - 1
        start_idx = int(mention.find("start").text) - 1
        end_idx = int(mention.find("end").text) - 1
        # "representative(original mention)" goes into the first token slot ...
        sentences[sentence_idx][start_idx] = (
            representative_text + "(" + mention.find("text").text + ")")
        # ... and the remaining slots of the mention are blanked (same length,
        # so token indices of later mentions in the sentence stay valid).
        sentences[sentence_idx][start_idx + 1:end_idx] = [""] * (end_idx - start_idx - 1)
with open('56.txt', 'w') as f:
    # Blanked slots are dropped when the sentences are re-assembled.
    f.write("\n".join(" ".join(x for x in sentence if x != "") for sentence in sentences))

# 57. 係り受け解析
Stanford Core NLPの係り受け解析の結果（collapsed-dependencies）を有向グラフとして可視化せよ．可視化には，係り受け木をDOT言語に変換し，Graphvizを用いるとよい．また，Pythonから有向グラフを直接的に可視化するには，pydotを使うとよい．

In [36]:
import pydot
# Drawing every sentence takes a long time, so only the first few are rendered.
MAX_SENTENCES = 10
edges = []
# The collapsed-dependencies element of each sentence (also used for task 58).
collapsed_dependencies = [
    sentence.find('dependencies[@type="collapsed-dependencies"]')
    for sentence in root.find('document/sentences').iter('sentence')
]
# Slicing (rather than range(10)) avoids an IndexError on short documents.
for i, sentence_deps in enumerate(collapsed_dependencies[:MAX_SENTENCES]):
    for dep in sentence_deps.iter('dep'):
        governor_node = dep.find('governor')
        dependent_node = dep.find('dependent')
        # Node label = "<sentence index> <token idx> <word>", so identical
        # words in different positions/sentences stay distinct nodes.
        governor = str(i) + " " + governor_node.attrib['idx'] + " " + governor_node.text
        dependent = str(i) + " " + dependent_node.attrib['idx'] + " " + dependent_node.text
        # Each edge tuple is (dependent, governor).
        edges.append((dependent, governor))
g = pydot.graph_from_edges(edges, directed=True)
g.write_jpeg('57.jpg')

True

# 58. タプルの抽出
Stanford Core NLPの係り受け解析の結果（collapsed-dependencies）に基づき，「主語 述語 目的語」の組をタブ区切り形式で出力せよ．ただし，主語，述語，目的語の定義は以下を参考にせよ．

+ 述語: nsubj関係とdobj関係の子（dependent）を持つ単語
+ 主語: 述語からnsubj関係にある子（dependent）
+ 目的語: 述語からdobj関係にある子（dependent）

In [37]:
# Emit "subject<TAB>predicate<TAB>object" for every predicate that governs
# both an nsubj and a dobj dependent.
# Names such as `type` / `object` are deliberately avoided: at notebook top
# level they would rebind the builtins for every later cell.
with open('58.txt', 'w') as f:
    for collapsed_dependencie in collapsed_dependencies:
        deps = collapsed_dependencie.findall('dep')
        for subj_dep in deps:
            if subj_dep.attrib['type'] != 'nsubj':
                continue
            governor = subj_dep.find('governor')
            predicate = governor.text
            predicate_id = governor.attrib['idx']
            # Use the subject of *this* nsubj relation, so a predicate with
            # several subjects yields each of them rather than the first one
            # repeated.
            subject = subj_dep.find('dependent').text
            for obj_dep in deps:
                # Same governor is decided by token index, not by word text.
                if (obj_dep.attrib['type'] == 'dobj'
                        and obj_dep.find('governor').attrib['idx'] == predicate_id):
                    direct_object = obj_dep.find('dependent').text
                    f.write(subject + "\t" + predicate + "\t" + direct_object + "\n")

# 59. S式の解析
Stanford Core NLPの句構造解析の結果（S式）を読み込み，文中のすべての名詞句（NP）を表示せよ．入れ子になっている名詞句もすべて表示すること．

In [38]:
# S-expressions of all sentences (text of every <parse> element).
parses = [x.text for x in root.iter('parse')]
# Write the words of every NP, including nested ones, one NP per line.
with open('59.txt', 'w') as f:
    for parse in parses:
        parse_list = parse.split(" ")
        # Positions of every "(NP" token; each one starts a noun phrase.
        np_ids = [i for i, tok in enumerate(parse_list) if tok == "(NP"]
        for i in np_ids:
            # `phrase_words` (not `np`) so the numpy alias imported at the top
            # of the notebook is not overwritten by this cell.
            left, right, phrase_words = 0, 0, []
            for w in parse_list[i + 1:]:
                if "(" in w:
                    left += 1
                if ")" in w:
                    # A token like "word)))" closes several brackets at once.
                    right += w.count(")")
                    # Only tokens carrying ")" are collected as words; the
                    # brackets are stripped from them.
                    phrase_words.append(w.replace(")", ""))
                # More closes than opens: the bracket opened by "(NP" itself
                # has just been closed, so the phrase is complete.
                if left < right:
                    break
            f.write(" ".join(phrase_words) + "\n")