# English Text Preprocessing

## Stemming

In [1]:
# NLTK (Natural Language Toolkit): a package for processing English text.
from nltk.stem.lancaster import LancasterStemmer

# Stemmer instance used by the demo cells that follow.
stemmer = LancasterStemmer()

In [2]:
# Stem a few inflected forms; each group is printed on its own line,
# with the stems separated by single spaces.
word_groups = (
    ("working", "works", "worked"),
    ("happier", "happiest"),
    ("books",),
)
for group in word_groups:
    print(*map(stemmer.stem, group))

work work work
happy happiest
book


Stemmer의 단점 : 규칙에 따라 기계적으로 어미를 잘라 내기 때문에, 실제로는 존재하지 않는 단어가 만들어질 때가 있음 (예: "this" → "thi").

In [3]:
# "this" is not an inflected form, yet the stemmer still strips its final "s".
stemmed_this = stemmer.stem("this")
print(stemmed_this)

thi


In [4]:
from nltk.stem.porter import PorterStemmer

# Rebind `stemmer` to a Porter stemmer so the same demo words can be
# compared against the Lancaster results.
stemmer = PorterStemmer()

In [5]:
# Same demo words as before, now run through the Porter stemmer; one output
# line per group, stems separated by single spaces.
word_groups = (
    ("working", "works", "worked"),
    ("happier", "happiest"),
    ("books",),
    ("this",),
)
for group in word_groups:
    print(*map(stemmer.stem, group))

work work work
happier happiest
book
thi


## Lemmatization

In [6]:
import nltk

# Download NLTK data packages into the local nltk_data directory.
# The last call is the cell's final expression, so its return value is what
# the cell displays.
nltk.download('punkt')    # tokenizer models (unzipped under tokenizers/)
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer; confirm
nltk.download('omw-1.4')  # NOTE(review): assumed to be needed alongside wordnet; confirm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatizer instance reused by the lemmatization examples below.
lemmatizer = WordNetLemmatizer()

"n" : 명사
"v" : 동사
"a" : 형용사
"r" : 부사
"s" : 위성 형용사

In [8]:
# Each entry pairs a part-of-speech tag with the words to lemmatize under it;
# one output line per entry, lemmas separated by single spaces.
lemma_cases = (
    ("v", ("working", "works", "worked")),
    ("v", ("am", "is", "are")),
    ("n", ("dance", "this")),
)
for pos_tag, words in lemma_cases:
    print(*(lemmatizer.lemmatize(word, pos_tag) for word in words))

work work work
be be be
dance this


## 정규 표현식(Regular Expressions)

In [9]:
# Sample English text for the regex cells. The blank lines between sentences
# and the four-newline padding at both ends are deliberate: they give the
# cleanup steps some whitespace to remove.
lyric_lines = [
    "I'm at a payphone, trying to call home.",
    "All of my change I spent on you.",
    "Where have the times gone?.",
    "Baby, it's all wrong.",
    "Where are the plans we made for two?",
]
edge_padding = "\n" * 4
eng_text_sample = edge_padding + "\n\n".join(lyric_lines) + edge_padding

eng_text_sample

"\n\n\n\nI'm at a payphone, trying to call home.\n\nAll of my change I spent on you.\n\nWhere have the times gone?.\n\nBaby, it's all wrong.\n\nWhere are the plans we made for two?\n\n\n\n"

In [10]:
import re

# Swap every newline for a single space (re.sub acts as a regex-based replace).
newline_pattern = re.compile("\n")
eng_sent_re = newline_pattern.sub(" ", eng_text_sample)
eng_sent_re

"    I'm at a payphone, trying to call home.  All of my change I spent on you.  Where have the times gone?.  Baby, it's all wrong.  Where are the plans we made for two?    "

In [11]:
# Collapse every run of two or more whitespace characters into one space.
# (A bounded quantifier such as {2,4} would instead match runs of 2 to 4.)
# The pattern is a raw string so that "\s" reaches the regex engine untouched
# instead of being an invalid string-literal escape, which Python warns about.
eng_sent_re = re.sub(r"\s{2,}", " ", eng_sent_re)
eng_sent_re

" I'm at a payphone, trying to call home. All of my change I spent on you. Where have the times gone?. Baby, it's all wrong. Where are the plans we made for two? "

In [12]:
# Replace each question mark or exclamation mark with a period.
# Inside a character class "|" is a literal pipe, not alternation, so it is
# left out of the set; with it included, any "|" in the text would be
# rewritten to "." as well.
eng_sent_re = re.sub(r"[!?]", ".", eng_sent_re)
eng_sent_re

" I'm at a payphone, trying to call home. All of my change I spent on you. Where have the times gone.. Baby, it's all wrong. Where are the plans we made for two. "

## spaCy 활용
- 영어를 비롯한 여러 언어에 대해 토큰화, 품사 태깅, 표제어 추출(lemmatization) 등을 손쉽게 수행해 주는 라이브러리

In [13]:
import spacy

# Load the small English pipeline and run it on the cleaned text after
# removing its leading and trailing whitespace.
nlp = spacy.load("en_core_web_sm")
trimmed_text = eng_sent_re.strip()
doc = nlp(trimmed_text)

doc.text

"I'm at a payphone, trying to call home. All of my change I spent on you. Where have the times gone.. Baby, it's all wrong. Where are the plans we made for two."

In [14]:
# Show each token together with its part-of-speech (POS) tag and its lemma.
token_row = "word : {}\t\tPOS : {}\t\tLemma : {}"
for tok in doc:
    print(token_row.format(tok.text, tok.pos_, tok.lemma_))

word : I		POS : PRON		Lemma : I
word : 'm		POS : AUX		Lemma : be
word : at		POS : ADP		Lemma : at
word : a		POS : DET		Lemma : a
word : payphone		POS : NOUN		Lemma : payphone
word : ,		POS : PUNCT		Lemma : ,
word : trying		POS : VERB		Lemma : try
word : to		POS : PART		Lemma : to
word : call		POS : VERB		Lemma : call
word : home		POS : NOUN		Lemma : home
word : .		POS : PUNCT		Lemma : .
word : All		POS : PRON		Lemma : all
word : of		POS : ADP		Lemma : of
word : my		POS : PRON		Lemma : my
word : change		POS : NOUN		Lemma : change
word : I		POS : PRON		Lemma : I
word : spent		POS : VERB		Lemma : spend
word : on		POS : ADP		Lemma : on
word : you		POS : PRON		Lemma : you
word : .		POS : PUNCT		Lemma : .
word : Where		POS : SCONJ		Lemma : where
word : have		POS : AUX		Lemma : have
word : the		POS : DET		Lemma : the
word : times		POS : NOUN		Lemma : time
word : gone		POS : VERB		Lemma : go
word : ..		POS : PUNCT		Lemma : ..
word : Baby		POS : PROPN		Lemma : Baby
word : ,		POS : PUNCT		Lemma 

한국어 전처리는 패키지 에러로 인해 이 노트북에서는 다루지 않고, 추후 별도로 진행할 예정.