# 어간 추출( Stemming )과 표제어 추출( Lemmatization )

## 1. 표제어 추출( Lemmatization )

In [1]:
from nltk.stem import WordNetLemmatizer

In [2]:
# Lemmatize a sample word list without passing a part of speech.
# WordNetLemmatizer treats words as nouns by default, which is why verb
# forms such as 'dies' and 'has' come out as 'dy' and 'ha' in the output.
words = [
    'policy', 'doing', 'organization', 'have', 'going', 'love',
    'lives', 'fly', 'dies', 'watched', 'has', 'starting',
]
n = WordNetLemmatizer()
print(list(map(n.lemmatize, words)))

['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [3]:
# Pass the part of speech explicitly ('v' = verb) so 'dies' maps to 'die'.
n.lemmatize('dies', pos='v')

'die'

In [4]:
# Pass the part of speech explicitly ('v' = verb) so 'watched' maps to 'watch'.
n.lemmatize('watched', pos='v')

'watch'

In [5]:
# Pass the part of speech explicitly ('v' = verb) so 'has' maps to 'have'.
n.lemmatize('has', pos='v')

'have'

## 2. 어간 추출( Stemming )

### - 어간 추출 알고리즘 중 포터 알고리즘( Porter Algorithm )

In [6]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [7]:
# Sample sentence to tokenize; the tokens are stemmed in the next cell.
text = (
    "This was not the map we found in Billy Bones's chest, but an accurate "
    "copy, complete in all things--names and heights and soundings--with "
    "the single exception of the red crosses and the written notes."
)
s = PorterStemmer()

# Split the sentence into word and punctuation tokens and show them.
words = word_tokenize(text)
print(words)

['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']


In [8]:
# Apply the Porter stemmer to every token; the stems are not guaranteed to
# be dictionary words (see 'thi', 'wa', 'copi' in the output).
print(list(map(s.stem, words)))

['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [9]:
# Stem a few suffixed words to show how the Porter stemmer strips endings.
words = ['formilize', 'allowance', 'electricical']
print(list(map(s.stem, words)))

['formil', 'allow', 'electric']


### 포터 알고리즘( Porter Algorithm )과 랭커스터 스테머 알고리즘( Lancaster Stemmer Algorithm ) 어간 추출 비교

In [13]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [14]:
# Word list shared by the Porter and Lancaster comparison cells.
# NOTE(review): 'polich' looks like a typo for 'policy' (the lemmatization
# example uses 'policy'). The recorded outputs were produced with 'polich',
# so re-run the comparison cells if this is corrected.
words = [
    'polich', 'doing', 'organization', 'have', 'going', 'love',
    'lives', 'fly', 'dies', 'watched', 'has', 'starting',
]

#### 포터 알고리즘

In [15]:
# Stem the shared word list with the Porter stemmer.
s = PorterStemmer()
print([*map(s.stem, words)])

['polich', 'do', 'organ', 'have', 'go', 'love', 'live', 'fli', 'die', 'watch', 'ha', 'start']


#### 랭커스터 스테머 알고리즘

In [17]:
# Stem the same word list with the Lancaster stemmer so the results can be
# compared with the Porter output.
l = LancasterStemmer()
print(list(map(l.stem, words)))

['polich', 'doing', 'org', 'hav', 'going', 'lov', 'liv', 'fly', 'die', 'watch', 'has', 'start']


## 3. 한국어에서의 어간 추출