In [1]:
import spacy

In [6]:
# For Korean text, load 'ko_core_news_sm' instead.
nlp = spacy.load('en_core_web_sm')  # language pipeline object for English text

# Build a document by running the pipeline over one sentence
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

# Print every token with its part-of-speech tag and dependency label
for token in doc:
    print(f'{token.text} {token.pos_} {token.dep_}')

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [7]:
# Show the (name, component) pairs that make up the loaded pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x15bd873dd90>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x15bd873d970>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x15bdae5ba00>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x15bd7c69410>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x15bd861da50>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x15bdae5b990>)]

In [8]:
# Show only the names of the pipeline components, as a list of strings
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

토큰화

In [9]:
# Note the run of spaces after "isn't": the printed result lists it as its own SPACE token,
# and "isn't" itself is split into "is" and "n't".
doc2 = nlp(u"Tesla isn't   looking into startups anymore.")

# One line per token: text, part-of-speech tag, dependency label
for token in doc2:
    print(' '.join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE dep
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [10]:
# Displaying the Doc echoes the original text, extra whitespace included
doc2

Tesla isn't   looking into startups anymore.

In [11]:
# Integer indexing on a Doc picks out a single token (here the first one)
doc2[0]

Tesla

In [12]:
# Inspect the type of the object returned by nlp(...)
type(doc2)

spacy.tokens.doc.Doc

part of speech tagging (pos)

In [13]:
# Part-of-speech label of the first token, as a string
doc2[0].pos_

'PROPN'

Dependencies

In [14]:
# Dependency label of the first token, as a string
doc2[0].dep_

'nsubj'

In [15]:
# Look up a human-readable description of the 'PROPN' label
spacy.explain('PROPN')

'proper noun'

In [16]:
# Look up a human-readable description of the 'nsubj' label
spacy.explain('nsubj')

'nominal subject'

Additional Token Attributes

|Attribute|Description|Value for `doc2[0]`|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist only of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [17]:
# A lemma is the base form of a word; print the token text, then its lemma
verb = doc2[4]
print(verb.text, verb.lemma_, sep='\n')

looking
look


In [18]:
# Simple part-of-speech tag first, then the detailed tag together with its description
verb = doc2[4]
print(verb.pos_)
print(' / '.join((verb.tag_, spacy.explain(verb.tag_))))

VERB
VBG / verb, gerund or present participle


In [19]:
# Word shape: doc2[0] is 'Tesla', doc[5] (from the first document) is 'U.S.'
for tok in (doc2[0], doc[5]):
    print(': '.join((tok.text, tok.shape_)))

Tesla: Xxxxx
U.S.: X.X.


In [20]:
# doc2[0] is the token 'Tesla'
first_token = doc2[0]
print(first_token.is_alpha)  # does the token consist of alphabetic characters?
print(first_token.is_stop)   # is the token a stop word (on the language's stop list)? Not an end-of-sentence check.

True
False


span

In [21]:
# Longer passage; the following cell slices the quoted phrase out of it by token index.
# The backslashes at line ends continue the string literal, so the text is a single line.
# NOTE(review): "commmonly" is misspelled in the input text; left untouched because it is
# part of the string passed to nlp.
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [22]:
# Slicing a Doc by token index extracts part of the text;
# tokens 16 through 29 cover the quoted phrase, including both quotation marks
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [23]:
# Inspect the type of the object produced by slicing a Doc
type(life_quote)

spacy.tokens.span.Span

sentences

In [24]:
# Three short sentences in a single string, used to demonstrate sentence segmentation
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [25]:
# Iterate over the sentences of the Doc and print each one on its own line
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [26]:
# doc4[6] is 'This', the first token of the second sentence
doc4[6].is_sent_start # does this token start a sentence?

True