In [132]:
# 50. 文区切り
import re

sentences = []
with open("./nlp.txt", "r") as f:
    for line in f:
        if line == "\n":
            continue
        lines = re.sub(r"(?P<group1>[.;:?!])( +)(?P<group3>[A-Z])", r"\1\2\n\3",line)
        sentence_list = lines.split("\n")
        sentences += sentence_list[:len(sentence_list)-1]
        
for sentence in sentences[:5]:
    print(sentence)

Natural language processing
From Wikipedia, the free encyclopedia
Natural language processing (NLP) is a field of computer science, artificial intelligence, and linguistics concerned with the interactions between computers and human (natural) languages. 
As such, NLP is related to the area of humani-computer interaction. 
Many challenges in NLP involve natural language understanding, that is, enabling computers to derive meaning from human or natural language input, and others involve natural language generation.


In [64]:
# 51. 単語の切り出し
words = []
for sentence in sentences[:2]:
    word_list = sentence.split(" ") + ["\n"]
    words += word_list

for word in words:
    print(word)

Natural
language
processing


From
Wikipedia,
the
free
encyclopedia




In [69]:
# 52. ステミング
# stemming.porterはPython3で動かない
import stemming.porter2 as porter2

for word in words[:2]:
    print(word + "\t" + porter2.stem(word))

Natural	Natur
language	languag


In [93]:
# 53. Tokenization
# XMLファイルの生成
import subprocess

command = "java -cp \"/usr/local/lib/stanford-corenlp-full-2015-12-09/*\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file ./nlp.txt"
subprocess.check_output(command, shell=True)

b''

In [130]:
import xml.etree.ElementTree as ET

tree = ET.parse("./nlp.txt.out")
root = tree.getroot()
for sentence in root[0][0][:3]:
    for token in sentence[0]:
        print(token[0].text)

Natural
language
processing
From
Wikipedia
,
the
free
encyclopedia
Natural
language
processing
-LRB-
NLP
-RRB-
is
a
field
of
computer
science
,
artificial
intelligence
,
and
linguistics
concerned
with
the
interactions
between
computers
and
human
-LRB-
natural
-RRB-
languages
.
As
such
,
NLP
is
related
to
the
area
of
humani-computer
interaction
.
Many
challenges
in
NLP
involve
natural
language
understanding
,
that
is
,
enabling
computers
to
derive
meaning
from
human
or
natural
language
input
,
and
others
involve
natural
language
generation
.


In [131]:
# 54. 品詞タグ付け
for sentence in root[0][0][:3]:
    for token in sentence[0]:
        print("{}\t{}\t{}".format(token[0].text, token[1].text, token[4].text))


Natural	natural	JJ
language	language	NN
processing	processing	NN
From	from	IN
Wikipedia	Wikipedia	NNP
,	,	,
the	the	DT
free	free	JJ
encyclopedia	encyclopedia	NN
Natural	natural	JJ
language	language	NN
processing	processing	NN
-LRB-	-lrb-	-LRB-
NLP	nlp	NN
-RRB-	-rrb-	-RRB-
is	be	VBZ
a	a	DT
field	field	NN
of	of	IN
computer	computer	NN
science	science	NN
,	,	,
artificial	artificial	JJ
intelligence	intelligence	NN
,	,	,
and	and	CC
linguistics	linguistics	NNS
concerned	concern	VBN
with	with	IN
the	the	DT
interactions	interaction	NNS
between	between	IN
computers	computer	NNS
and	and	CC
human	human	JJ
-LRB-	-lrb-	-LRB-
natural	natural	JJ
-RRB-	-rrb-	-RRB-
languages	language	NNS
.	.	.
As	as	IN
such	such	JJ
,	,	,
NLP	nlp	NN
is	be	VBZ
related	relate	VBN
to	to	TO
the	the	DT
area	area	NN
of	of	IN
humani-computer	humani-computer	JJ
interaction	interaction	NN
.	.	.
Many	many	JJ
challenges	challenge	NNS
in	in	IN
NLP	nlp	NN
involve	involve	VBP
natural	natural	JJ
language	language	NN
understanding	unde

In [119]:
# 55. 固有表現抽出
for sentence in root[0][0]:
    for token in sentence[0]:
        if token[5].text == "PERSON":
            print(token[0].text)

Alan
Turing
Joseph
Weizenbaum
MARGIE
Schank
Wilensky
Meehan
Lehnert
Carbonell
Lehnert
Jabberwacky
Moore


In [118]:
# 56. 共参照解析

class Mention(object):
    def __init__(self, sentence, start, end, head, text, representative):
        self.sentence = sentence
        self.start = start
        self.end = end
        self.head = head
        self.text = text
        self.representative = representative

# {文番号: [Mention]}という辞書を作成する
# 文の番号から置換する対象を引いてくることができる
coreferences_dict = {}
for coreference in root[0][1]:
    represent = coreference.find('mention[@representative="true"]')
    for mention in coreference[:5]:
        if "representative" in mention.attrib:
            continue
        mention_obj = Mention(mention[0].text, mention[1].text, mention[2].text, mention[3].text, mention[4].text, represent.find("text").text)
        if not mention_obj.sentence in coreferences_dict:
            coreferences_dict[mention_obj.sentence] = [mention_obj]
        else:
            coreferences_dict[mention_obj.sentence].append(mention_obj)
            
for idx,sentence in enumerate(root[0][0][:5]):
    # coreference_dictの文番号に値が格納されている場合、その文に置換対象が存在する
    if str(idx+1) in coreferences_dict:
        mentions = coreferences_dict[str(idx+1)]
        words = []
        for token in sentence[0]:
            words.append(token[0].text)
        sent = " ".join(words)
        for mention in mentions:
            sent = sent.replace(mention.text, mention.representative + "(" + mention.text + ")")
        print(sent)
    else:
        words = []
        for token in sentence[0]:
            words.append(token[0].text)
        print(" ".join(words))

Natural language processing From Wikipedia , the free encyclopedia Natural language processing -LRB- NLP -RRB- is the free encyclopedia Natural language processing -LRB- NLP -RRB-(a field of computer science) , artificial intelligence , and linguistics concerned with the interactions between computers and human -LRB- natural -RRB- languages .
As such , NLP is related to the area of humani-computer interaction .
Many challenges in NLP involve natural language understanding , that is , enabling computers(computers) to derive meaning from human or natural language input , and others involve natural language generation .
History The history of NLP generally starts in the 1950s , although work can be found from earlier periods .
In 1950 , Alan Alan Turing(Turing) published an article titled `` Computing Machinery and Intelligence '' which proposed what is now called the Alan Turing(Turing) test as a criterion of intelligence .


In [127]:
# 57. 係り受け解析
import pydot

for idx, sentence in enumerate(root[0][0][:3]):
    collapsed_dependencies = sentence.findall("dependencies[@type=\"collapsed-dependencies\"]")
    # print(collapsed_dependencies[0][0])
    nodes = ['"{}" -> "{}" ;'.format(dep[1].text, dep[0].text) for dep in collapsed_dependencies[0]]
    sentence_dot = "digraph sentence{" + "".join(nodes) + "}"
    g = pydot.graph_from_dot_data(sentence_dot)
    g[0].write_jpeg("./jpg/sentence{}.jpg".format(idx))

In [129]:
# 58. タプルの抽出

for idx, sentence in enumerate(root[0][0][:3]):
    collapsed_dependencies = sentence.findall("dependencies[@type=\"collapsed-dependencies\"]")
    nsubj_list = [dep for dep in collapsed_dependencies[0] if dep.attrib["type"] == "nsubj"]
    dobj_list = [dep for dep in collapsed_dependencies[0] if dep.attrib["type"] == "dobj"]
    if len(nsubj_list) == 0 or len(dobj_list) == 0:
        continue
    for nsubj in nsubj_list:
        for dobj in dobj_list:
            if nsubj[0].attrib["idx"] == dobj[0].attrib["idx"]:
                print("{}\t{}\t{}".format(nsubj[1].text, nsubj[0].text, dobj[1].text))

understanding	enabling	computers
others	involve	generation


In [None]:
# 59. S式の解析
