# 자연어를 처리할 수 있도록 실제 파싱 객체를 만들기 

In [1]:
from spacy.en import English
parser = English()


In [12]:
parser

<spacy.en.English at 0x1070af5f8>

In [11]:
import spacy

nlp = spacy.load('en')

In [13]:
nlp

<spacy.en.English at 0x10f9ea860>

## 테스트 할 문장


In [None]:
# Test Data
multiSentence = "There is an art, it says, or rather, a knack to flying." \
                 "The knack lies in learning how to throw yourself at the ground and miss." \
                 "In the beginning the Universe was created. This has made a lot of people "\
                 "very angry and been widely regarded as a bad move."

In [2]:
print(multiSentence)

There is an art, it says, or rather, a knack to flying.The knack lies in learning how to throw yourself at the ground and miss.In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.


## Doc 클래스의 인스턴스 만들기 

In [3]:
parsedData = parser(multiSentence)

In [9]:
type(parsedData)

spacy.tokens.doc.Doc

In [14]:
nlp_doc = nlp.tokenizer(multiSentence)

In [15]:
type(nlp_doc)

spacy.tokens.doc.Doc

In [17]:
nlp_nlp = nlp(multiSentence)

In [18]:
type(nlp_nlp)

spacy.tokens.doc.Doc

In [20]:
from spacy.tokens.doc import Doc

doc_doc = Doc(nlp.vocab,multiSentence)

In [21]:
type(doc_doc)

spacy.tokens.doc.Doc

## Doc 클래스 내의 속성과 메소드 알아보기


In [7]:
for i in dir(parsedData) :
    if not i.startswith("_") :
        print(i)

count_by
doc
ents
from_array
from_bytes
has_vector
is_parsed
is_tagged
mem
merge
noun_chunks
noun_chunks_iterator
print_tree
read_bytes
sentiment
sents
similarity
string
tensor
text
text_with_ws
to_array
to_bytes
user_data
user_hooks
user_span_hooks
user_token_hooks
vector
vector_norm
vocab


In [4]:
for i, token in enumerate(parsedData):
    print("original:", token.orth, token.orth_)
    print("lowercased:", token.lower, token.lower_)
    print("lemma:", token.lemma, token.lemma_)
    print("shape:", token.shape, token.shape_)
    print("prefix:", token.prefix, token.prefix_)
    print("suffix:", token.suffix, token.suffix_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i > 1:
        break

original: 769 There
lowercased: 608 there
lemma: 608 there
shape: 684 Xxxxx
prefix: 568 T
suffix: 609 ere
log probability: -7.277902603149414
Brown cluster id: 1918
----------------------------------------
original: 513 is
lowercased: 513 is
lemma: 536 be
shape: 505 xx
prefix: 509 i
suffix: 513 is
log probability: -4.3297648429870605
Brown cluster id: 762
----------------------------------------
original: 591 an
lowercased: 591 an
lemma: 591 an
shape: 505 xx
prefix: 506 a
suffix: 591 an
log probability: -5.953293800354004
Brown cluster id: 3
----------------------------------------


## 문장을 확인해 보기 


In [22]:
sents = []
# the "sents" property returns spans
# spans have indices into the original string
# where each index value represents a token
for span in parsedData.sents:
    # go from the start to the end of each span, returning each token in the sentence
    # combine each token using join()
    sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip()
    sents.append(sent)

for sentence in sents:
    print(sentence)

There is an art, it says, or rather, a knack to flying.
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created.
This has made a lot of people very angry and been widely regarded as a bad move.


## 제너레이터를 통해 문장을 하나씩 처리가 가능하다


In [23]:
parsedData.sents

<generator at 0x147324c48>

In [26]:
a = next(parsedData.sents)

## doc 에서 하나의 문장을 찾으면 실제 Span 클래스의 인스턴스가 된다.


In [29]:
print(type(a))
print(a)

<class 'spacy.tokens.span.Span'>
There is an art, it says, or rather, a knack to flying.


## Span  내의 속성과 메소드를 확인한다. 


In [30]:
for i in dir(a) :
    if not i.startswith("_") :
        print(i)

doc
end
end_char
ent_id
ent_id_
has_vector
label
label_
lefts
lemma_
lower_
merge
noun_chunks
orth_
rights
root
sent
sentiment
similarity
start
start_char
string
subtree
text
text_with_ws
upper_
vector
vector_norm


## 실제 원본 문장을 가진 데이터를 확인


In [31]:
a.doc

There is an art, it says, or rather, a knack to flying.The knack lies in learning how to throw yourself at the ground and miss.In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.

## 하나의 문장을 가져왔으므로 이 시작과 끝에 대한 정보를 가져온다

In [32]:
a.start, a.end

(0, 16)

In [34]:
a.doc[a.start:a.end]

There is an art, it says, or rather, a knack to flying.

## 토큰화가 되어있기에 인덱스로 토큰을 볼 수 있다.

    실제 단어에 대한 출력을 한다. 


In [35]:
for i in range(a.start, a.end) :
    print(a[i].string)

There 
is 
an 
art
, 
it 
says
, 
or 
rather
, 
a 
knack 
to 
flying
.


## Doc로 되어있기 때문에 전체가 토근화 되어 있는 것을 알 수 있다. 

In [49]:
for i in range(10) :
    print(parsedData[i])

There
is
an
art
,
it
says
,
or
rather


## 토큰에 필요한 속성들을 조회해서 출력하기 ..



In [50]:
example = "The boy with the spotted dog quickly ran after the firetruck."
parsedEx = parser(example)
# shown as: original token, dependency tag, head word, left dependents, right dependents
for token in parsedEx:
    print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights])

The det boy [] []
boy nsubj ran ['The'] ['with']
with prep boy [] []
the det dog [] []
spotted amod dog [] []
dog nsubj ran ['the', 'spotted'] []
quickly advmod ran [] []
ran ROOT ran ['boy', 'dog', 'quickly'] ['after', '.']
after prep ran [] ['firetruck']
the det firetruck [] []
firetruck pobj after ['the'] []
. punct ran [] []


## 토큰을 처리해서 문장을 묶어서 다시 표기하기


In [47]:
sent_ = []
for span in parsedData.sents:
    sent_.append([ parsedData[i] for i in range(span.start, span.end)])
    
print(sent_) 

[[There, is, an, art, ,, it, says, ,, or, rather, ,, a, knack, to, flying, .], [The, knack, lies, in, learning, how, to, throw, yourself, at, the, ground, and, miss, .], [In, the, beginning, the, Universe, was, created, .], [This, has, made, a, lot, of, people, very, angry, and, been, widely, regarded, as, a, bad, move, .]]


## 토큰을 가지고 바로 품사를 붙이기 


In [48]:
sent = []
for span in parsedData.sents:
    sent = [parsedData[i] for i in range(span.start, span.end)]
    break

print(sent)
for token in sent:
    print(token.orth_," : ", token.pos_)

[There, is, an, art, ,, it, says, ,, or, rather, ,, a, knack, to, flying, .]
There  :  ADV
is  :  VERB
an  :  DET
art  :  NOUN
,  :  PUNCT
it  :  PRON
says  :  VERB
,  :  PUNCT
or  :  CCONJ
rather  :  ADV
,  :  PUNCT
a  :  DET
knack  :  NOUN
to  :  ADP
flying  :  NOUN
.  :  PUNCT


## 엔티티 에 대한 네임을 확인한다. 


In [55]:
# Let's look at the named entities of this example:
example = "Apple's stocks dropped dramatically after the death of Steve Jobs in October."
parsedEx = parser(example)
for token in parsedEx:
    print(token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)")

print("-------------- entities only ---------------")


Apple ORG
's (not an entity)
stocks (not an entity)
dropped (not an entity)
dramatically (not an entity)
after (not an entity)
the (not an entity)
death (not an entity)
of (not an entity)
Steve PERSON
Jobs PERSON
in (not an entity)
October DATE
. (not an entity)
-------------- entities only ---------------


In [54]:
# if you just want the entities and nothing else, you can do access the parsed examples "ents" property like this:
ents = list(parsedEx.ents)
for entity in ents:
    print(entity.label, entity.label_, ' '.join(t.orth_ for t in entity))

380 ORG Apple
377 PERSON Steve Jobs
387 DATE October


## 토큰화된 단어, 품사, 단어 기본형 

In [56]:
messyData = "lol that is rly funny :) This is gr8 i rate it 8/8!!!"
parsedData = parser(messyData)
for token in parsedData:
    print(token.orth_, token.pos_, token.lemma_)

lol NOUN lol
that ADJ that
is VERB be
rly ADV rly
funny ADJ funny
:) PUNCT :)
This DET this
is VERB be
gr8 VERB gr8
i PRON i
rate VERB rate
it PRON -PRON-
8/8 NUM 8/8
! PUNCT !
! PUNCT !
! PUNCT !
