# 자연어를 처리할 수 있도록 실제 파싱 객체를 만들기 

In [1]:
# Instantiate the English language class directly; `parser` is the object
# that is later called on the raw test text.
from spacy.en import English
parser = English()


In [12]:
# Evaluate the bare name so the notebook echoes the object's repr.
parser

<spacy.en.English at 0x1070af5f8>

In [11]:
import spacy

# Load the pipeline registered under the name 'en' and bind it to `nlp`.
nlp = spacy.load('en')

In [13]:
# Evaluate the bare name so the notebook echoes the loaded object's repr.
nlp

<spacy.en.English at 0x10f9ea860>

## 테스트 할 문장


In [None]:
# Sample text used by every later cell.
# The fragments are joined with an empty separator, so the first three
# sentences run together with no space after the period ("flying.The",
# "miss.In"); only the last sentence is preceded by a space.
multiSentence = "".join((
    "There is an art, it says, or rather, a knack to flying.",
    "The knack lies in learning how to throw yourself at the ground and miss.",
    "In the beginning the Universe was created.",
    " This has made a lot of people very angry",
    " and been widely regarded as a bad move.",
))

In [2]:
# Print the assembled test text to check how the literal pieces were joined.
print(multiSentence)

There is an art, it says, or rather, a knack to flying.The knack lies in learning how to throw yourself at the ground and miss.In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.


## Doc 클래스의 인스턴스 만들기 

In [3]:
# Call the English object on the raw string; the result is kept as
# `parsedData` and is the object inspected in the cells that follow.
parsedData = parser(multiSentence)

In [9]:
# Check which class the parser call returned.
type(parsedData)

spacy.tokens.doc.Doc

In [14]:
# Call only the tokenizer attribute of the loaded pipeline on the raw text.
nlp_doc = nlp.tokenizer(multiSentence)

In [15]:
# Check which class the tokenizer call returned.
type(nlp_doc)

spacy.tokens.doc.Doc

In [17]:
# Call the loaded pipeline object itself on the raw text.
nlp_nlp = nlp(multiSentence)

In [18]:
# Check which class the pipeline call returned.
type(nlp_nlp)

spacy.tokens.doc.Doc

In [20]:
from spacy.tokens.doc import Doc

# Build a Doc by hand from pre-split words.
# Doc's second parameter is `words`, a sequence of token strings, not raw
# text. It is iterated item by item, so passing the whole string would create
# one token per character instead of one per word.
# Whitespace splitting is a simple stand-in; use nlp.tokenizer(...) when
# spaCy's own tokenization rules are wanted.
doc_doc = Doc(nlp.vocab, words=multiSentence.split())

In [21]:
# Check which class the direct constructor call produced.
type(doc_doc)

spacy.tokens.doc.Doc

## Doc 클래스 내의 속성과 메소드 알아보기


In [7]:
# List the attribute and method names of the parsed object, leaving out
# every name that begins with an underscore.
public_names = [name for name in dir(parsedData) if not name.startswith("_")]
for name in public_names:
    print(name)

count_by
doc
ents
from_array
from_bytes
has_vector
is_parsed
is_tagged
mem
merge
noun_chunks
noun_chunks_iterator
print_tree
read_bytes
sentiment
sents
similarity
string
tensor
text
text_with_ws
to_array
to_bytes
user_data
user_hooks
user_span_hooks
user_token_hooks
vector
vector_norm
vocab


In [4]:
# Each entry pairs a printed label with an attribute name. Every such
# attribute is read twice per token: once by its plain name and once by the
# same name with a trailing underscore.
paired_fields = (
    ("original:", "orth"),
    ("lowercased:", "lower"),
    ("lemma:", "lemma"),
    ("shape:", "shape"),
    ("prefix:", "prefix"),
    ("suffix:", "suffix"),
)

for position, tok in enumerate(parsedData):
    for label, field in paired_fields:
        print(label, getattr(tok, field), getattr(tok, field + "_"))
    print("log probability:", tok.prob)
    print("Brown cluster id:", tok.cluster)
    print("----------------------------------------")
    # Stop once the first three tokens (positions 0, 1 and 2) are printed.
    if position >= 2:
        break

original: 769 There
lowercased: 608 there
lemma: 608 there
shape: 684 Xxxxx
prefix: 568 T
suffix: 609 ere
log probability: -7.277902603149414
Brown cluster id: 1918
----------------------------------------
original: 513 is
lowercased: 513 is
lemma: 536 be
shape: 505 xx
prefix: 509 i
suffix: 513 is
log probability: -4.3297648429870605
Brown cluster id: 762
----------------------------------------
original: 591 an
lowercased: 591 an
lemma: 591 an
shape: 505 xx
prefix: 506 a
suffix: 591 an
log probability: -5.953293800354004
Brown cluster id: 3
----------------------------------------


## 문장을 확인해 보기 


In [22]:
# Rebuild the text of every sentence in the parsed document.
# Each item yielded by `parsedData.sents` is iterated token by token; the
# tokens' `.string` values are concatenated and the surrounding whitespace
# is stripped from the result.
sents = [
    ''.join(tok.string for tok in sentence_span).strip()
    for sentence_span in parsedData.sents
]

# Print one sentence per line.
for text in sents:
    print(text)

There is an art, it says, or rather, a knack to flying.
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created.
This has made a lot of people very angry and been widely regarded as a bad move.


## 제너레이터를 통해 문장을 하나씩 처리할 수 있다


In [23]:
# Echo the `sents` attribute itself, without iterating it, to see what kind
# of object it is.
parsedData.sents

<generator at 0x147324c48>

In [26]:
# Take only the first item from `parsedData.sents` and keep it as `a`.
a = next(parsedData.sents)

## doc 에서 하나의 문장을 찾으면 실제 Span 클래스의 인스턴스가 된다.


In [29]:
# Show the class of the object taken from `sents`, then its printed form.
print(type(a))
print(a)

<class 'spacy.tokens.span.Span'>
There is an art, it says, or rather, a knack to flying.


In [30]:
# List the attribute and method names of the sentence object, leaving out
# every name that begins with an underscore.
visible_names = [name for name in dir(a) if not name.startswith("_")]
for name in visible_names:
    print(name)

doc
end
end_char
ent_id
ent_id_
has_vector
label
label_
lefts
lemma_
lower_
merge
noun_chunks
orth_
rights
root
sent
sentiment
similarity
start
start_char
string
subtree
text
text_with_ws
upper_
vector
vector_norm


## 실제 원본 문장을 가진 데이터를 확인


In [31]:
# Echo the `doc` attribute of the sentence object; the output shows the
# whole original text rather than just this sentence.
a.doc

There is an art, it says, or rather, a knack to flying.The knack lies in learning how to throw yourself at the ground and miss.In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.

## 하나의 문장을 가져왔으므로 그 문장의 시작과 끝 위치 정보를 가져온다

In [32]:
# Echo the sentence's start and end positions as a tuple.
a.start, a.end

(0, 16)

In [34]:
# Slice the parent document with the sentence's own bounds; the output shows
# this reproduces the sentence.
a.doc[a.start:a.end]

There is an art, it says, or rather, a knack to flying.

## 실제 단어에 대한 출력을 한다. 


In [35]:
# Print every token of the sentence, one per line.
# Iterate the span itself rather than indexing it with range(a.start, a.end).
# Integer indexing on a Span is relative to the span's own first token, so
# document-level positions only worked because this sentence starts at
# token 0. For any later sentence they would select the wrong tokens or
# raise an IndexError.
for token in a:
    # `.string` includes the token's trailing whitespace, if any.
    print(token.string)

There 
is 
an 
art
, 
it 
says
, 
or 
rather
, 
a 
knack 
to 
flying
.
