2.1 말뭉치, 토큰, 타입

In [1]:
# !pip install --upgrade spacy
# !python -m spacy download en_core_web_sm
# !pip install --upgrade tensorflow

In [2]:
# Tokenize text with spaCy
import spacy

nlp = spacy.load('en_core_web_sm')
text = "Mary, don't slap the green witch."

# Lowercase the sentence before parsing, then render each token as a plain
# string so the printed list shows text rather than spaCy Token objects.
lowercase_tokens = list(map(str, nlp(text.lower())))
print(lowercase_tokens)

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '.']


In [3]:
# Tokenize a tweet with NLTK's TweetTokenizer.
from nltk.tokenize import TweetTokenizer

tweet = u"Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()

# Lowercase first, then tokenize; the token list is printed as-is.
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


2.2 유니그램, 바이그램, 트라이그램, ∙∙∙, n-그램

In [4]:
# Build n-grams from text
def n_grams(text, n):
    '''
    Return every contiguous window of length ``n`` taken from ``text``.

    Args:
        text: any sliceable sequence with a length, e.g. a list of tokens
            (yields lists of tokens) or a string (yields substrings).
        n: window size; must be at least 1.

    Returns:
        A list of ``len(text) - n + 1`` slices in left-to-right order, or an
        empty list when ``text`` is shorter than ``n``.

    Raises:
        ValueError: if ``n`` is smaller than 1. Without this check, ``n == 0``
            would return a list of empty slices and a negative ``n`` would
            return truncated slices that are not n-grams at all.
    '''
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    # When text is shorter than n the count below is <= 0, so range() is empty.
    return [text[start:start + n] for start in range(len(text) - n + 1)]

# Token list used as the example input for trigram extraction.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
# Bare last expression: the notebook displays the returned list of 3-grams.
n_grams(cleaned, 3)

[['mary', ',', "n't"],
 [',', "n't", 'slap'],
 ["n't", 'slap', 'green'],
 ['slap', 'green', 'witch'],
 ['green', 'witch', '.']]

2.3 표제어와 어간

In [5]:
# Lemmatization: map each word to its lemma.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"he was running late")

# Build one "token --> lemma" line per token, then print them together.
lemma_lines = ['{} --> {}'.format(tok, tok.lemma_) for tok in doc]
print('\n'.join(lemma_lines))

he --> he
was --> be
running --> run
late --> late


In [6]:
# Stemming with NLTK's Snowball stemmer (English).
from nltk.stem.snowball import SnowballStemmer

snow_stemmer = SnowballStemmer(language='english')

words = ['cared','university','fairly','easily','singing','sings','sung','singer','sportingly']

# Stem every word, keeping the results aligned with `words`.
stem_words = [snow_stemmer.stem(word) for word in words]

# Show each original word next to its stem.
for original, stemmed in zip(words, stem_words):
    print('{} ----> {}'.format(original, stemmed))

cared ----> care
university ----> univers
fairly ----> fair
easily ----> easili
singing ----> sing
sings ----> sing
sung ----> sung
singer ----> singer
sportingly ----> sport


In [8]:
# Stemming with NLTK's Porter stemmer, on the same word list as the Snowball example.
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

words = ['cared','university','fairly','easily','singing','sings','sung','singer','sportingly']

# Apply the stemmer to each word in order.
stem_words = list(map(porter_stemmer.stem, words))

# Print "word ----> stem" pairs, one per line.
for original, stemmed in zip(words, stem_words):
    print(' ----> '.join((original, stemmed)))

cared ----> care
university ----> univers
fairly ----> fairli
easily ----> easili
singing ----> sing
sings ----> sing
sung ----> sung
singer ----> singer
sportingly ----> sportingli


2.5 단어 분류하기: 품사 태깅

In [9]:
# Part-of-speech tagging
import spacy

nlp = spacy.load('en_core_web_sm')

doc = nlp(u"Mary slapped the green witch.")

# Collect one "token --> POS tag" line per token and print them line by line.
pos_lines = ['{} --> {}'.format(tok, tok.pos_) for tok in doc]
print(*pos_lines, sep='\n')

Mary --> PROPN
slapped --> VERB
the --> DET
green --> ADJ
witch --> NOUN
. --> PUNCT


2.6 청크 나누기와 개체명 인식

In [10]:
# Noun phrase (NP) chunking (shallow parsing)
import spacy

nlp = spacy.load('en_core_web_sm')

doc = nlp(u"Mary slapped the green witch.")

# Print every noun chunk found in the sentence together with its label.
row_template = '{} --> {}'
for noun_chunk in doc.noun_chunks:
    print(row_template.format(noun_chunk, noun_chunk.label_))

Mary --> NP
the green witch --> NP
