# 텍스트 전처리
----
- 패키지 설치
    * NLTK : pip install nltk
    * KoNLPy : pip install konlpy

In [58]:
# NLTK 패키지 설치
!pip install nltk



In [59]:
!pip install konlpy



## [1] 토큰화(Tokenization)
---
- 문장/문서를 의미를 지닌 작은 단위로 나누는 것
- 나누어진 단어를 토큰(Token)이라 함
- 종류
    * 문장 토큰화
    * 단어 토큰화

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
import nltk

In [4]:
# Download the NLTK corpus datasets; the 'all' identifier requests the full
# collection, and quiet=True is passed to silence the download log.
nltk.download('all', quiet=True)

True

In [5]:
# Sample English text for the tokenization demos.
# NOTE: the trailing backslashes continue the string literal, so the leading
# spaces of every continuation line become part of the text itself.
# NOTE(review): "hen" looks like a truncated "When" — confirm before changing
# it, because the recorded outputs depend on the exact text.
raw_text1="hen tokenizing a Unicode string.\
           NLTK tokenizers can produce token-spans.\
           hen tokenizing a Unicode string."
raw_text2="This particular tokenizer requires the Punkt sentence tokenization.\
           which splits text on whitespace and punctuation."

In [6]:
# Word-level tokenization
result1=word_tokenize(raw_text1)

In [7]:
print(result1)

['hen', 'tokenizing', 'a', 'Unicode', 'string', '.', 'NLTK', 'tokenizers', 'can', 'produce', 'token-spans', '.', 'hen', 'tokenizing', 'a', 'Unicode', 'string', '.']


In [8]:
# Sentence-level tokenization: first join both samples into a single string.
# The two strings are concatenated with no separator between them.
raw_text= raw_text1+raw_text2

In [9]:
raw_text

'hen tokenizing a Unicode string.           NLTK tokenizers can produce token-spans.           hen tokenizing a Unicode string.This particular tokenizer requires the Punkt sentence tokenization.           which splits text on whitespace and punctuation.'

In [15]:
result=sent_tokenize(raw_text)

In [16]:
print(result, len(result))

['hen tokenizing a Unicode string.', 'NLTK tokenizers can produce token-spans.', 'hen tokenizing a Unicode string.This particular tokenizer requires the Punkt sentence tokenization.', 'which splits text on whitespace and punctuation.'] 4


In [17]:
result

['hen tokenizing a Unicode string.',
 'NLTK tokenizers can produce token-spans.',
 'hen tokenizing a Unicode string.This particular tokenizer requires the Punkt sentence tokenization.',
 'which splits text on whitespace and punctuation.']

### 여러 문장에서 토큰 추출
---

In [19]:
# Walk over the previously split sentences and word-tokenize each one.
for sent in result:
    # Run the sentence splitter on this single item
    sentResult=sent_tokenize(sent)

    # Sentences extracted from the item
    print(f'sent => {sentResult}')

    for ele in sentResult:
        print(f'ele => {ele}')
        # Split the sentence into word tokens
        wordResult=word_tokenize(ele)
        print(f'wordResult => {wordResult}')

sent => ['hen tokenizing a Unicode string.']
ele => hen tokenizing a Unicode string.
wordResult => ['hen', 'tokenizing', 'a', 'Unicode', 'string', '.']
sent => ['NLTK tokenizers can produce token-spans.']
ele => NLTK tokenizers can produce token-spans.
wordResult => ['NLTK', 'tokenizers', 'can', 'produce', 'token-spans', '.']
sent => ['hen tokenizing a Unicode string.This particular tokenizer requires the Punkt sentence tokenization.']
ele => hen tokenizing a Unicode string.This particular tokenizer requires the Punkt sentence tokenization.
wordResult => ['hen', 'tokenizing', 'a', 'Unicode', 'string.This', 'particular', 'tokenizer', 'requires', 'the', 'Punkt', 'sentence', 'tokenization', '.']
sent => ['which splits text on whitespace and punctuation.']
ele => which splits text on whitespace and punctuation.
wordResult => ['which', 'splits', 'text', 'on', 'whitespace', 'and', 'punctuation', '.']


#### 한글 
---

In [20]:
from konlpy.tag import Okt

# Morpheme analyzer object
okt=Okt()

In [21]:
# Split the sentence into morphemes
result=okt.morphs("오늘은 월요일입니다.")
print(result)

['오늘', '은', '월요일', '입니다', '.']


In [22]:
# Morpheme split followed by tagging => part of speech for each morpheme
result2=okt.pos("오늘은 월요일입니다.")

In [23]:
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('입니다', 'Adjective'), ('.', 'Punctuation')]


In [24]:
# Same tagging with stem=True; in the recorded output '입니다' is reported as '이다'
result2=okt.pos("오늘은 월요일입니다.", stem=True)

In [25]:
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('이다', 'Adjective'), ('.', 'Punctuation')]


### [2] 정제 & 정규화
---
- 불용어 제거 => 노이즈 제거
- 텍스트의 동일화 
    * 대문자 또는 소문자로 통일
    * 문장의 길이

### [2-1] 불용어 (Stopword)

In [26]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [27]:
len(en_stopwords)

179

In [28]:
en_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### [2-2] 어간 및 표제어 처리
---

In [29]:
from nltk.stem import LancasterStemmer

In [30]:
# Stemmer object used for stem extraction
lstem=LancasterStemmer()

In [31]:
lstem.stem('working'), lstem.stem('worked'), lstem.stem('worken')

('work', 'work', 'work')

In [32]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [33]:
lstem.stem('amuse'), lstem.stem('amused')

('amus', 'amus')

In [34]:
# 표제어(사전에 등록된 단어 추출)
from nltk.stem import WordNetLemmatizer

In [35]:
wlemma=WordNetLemmatizer()

In [36]:
wlemma.lemmatize('working', 'v'), wlemma.lemmatize('worked', 'v')

('work', 'work')

In [37]:
wlemma.lemmatize('amusing', 'v'), wlemma.lemmatize('amused', 'v')

('amuse', 'amuse')

### [3] 텍스트 벡터화
---
- 텍스트 => 수치화
- 희소벡터(OHE) : BOW 방식 -->  Count기반, TF-IDF 기반
- 밀집벡터 : Embedding 방식 , Word2Vec

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [39]:
corpus=[raw_text1, raw_text2]

In [40]:
ohe=CountVectorizer()

In [41]:
ohe.fit(corpus)

In [42]:
ret=ohe.transform(corpus)

In [43]:
print(type(ret), ret, sep='\n')

<class 'scipy.sparse._csr.csr_matrix'>
  (0, 1)	1
  (0, 2)	2
  (0, 3)	1
  (0, 6)	1
  (0, 11)	1
  (0, 13)	2
  (0, 17)	1
  (0, 20)	1
  (0, 21)	2
  (0, 22)	2
  (1, 0)	1
  (1, 4)	1
  (1, 5)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 10)	1
  (1, 12)	1
  (1, 14)	1
  (1, 15)	1
  (1, 16)	1
  (1, 18)	1
  (1, 19)	1
  (1, 23)	1
  (1, 24)	1


In [44]:
ret=ret.toarray()

In [45]:
print(ret.shape, ret, sep='\n')

(2, 25)
[[0 1 2 1 0 0 1 0 0 0 0 1 0 2 0 0 0 1 0 0 1 2 2 0 0]
 [1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 0 0 0 1 1]]


In [46]:
# TF-IDF based vectorizer
tfIdf=TfidfVectorizer()

In [47]:
tf_corpus=tfIdf.fit_transform(corpus)

In [48]:
type(tf_corpus)

scipy.sparse._csr.csr_matrix

In [49]:
tf_corpus= tf_corpus.toarray()

In [50]:
print(tf_corpus)

[[0.         0.21320072 0.42640143 0.21320072 0.         0.
  0.21320072 0.         0.         0.         0.         0.21320072
  0.         0.42640143 0.         0.         0.         0.21320072
  0.         0.         0.21320072 0.42640143 0.42640143 0.
  0.        ]
 [0.25819889 0.         0.         0.         0.25819889 0.25819889
  0.         0.25819889 0.25819889 0.25819889 0.25819889 0.
  0.25819889 0.         0.25819889 0.25819889 0.25819889 0.
  0.25819889 0.25819889 0.         0.         0.         0.25819889
  0.25819889]]


## 실습 --------------------------------------------------
---
- 단어 단위 토큰화
- 불용어 제거

In [51]:
# 불용어 추출
from nltk import corpus

In [52]:
en_stopwords=corpus.stopwords.words('english')

In [53]:
texts='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly.\
Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'

In [54]:
wordTokens=word_tokenize(texts)

In [55]:
len(wordTokens), type(wordTokens)

(132, list)

In [56]:
# Remove stopwords from the word tokens.
# The stopword list is turned into a set once, so each membership test is a
# hash lookup rather than a scan of the whole list for every token.
# NOTE(review): the comparison is case-sensitive, so a capitalised token is
# kept even when its lowercase form is a stopword — confirm this is intended.
stopword_set=set(en_stopwords)
wordTokens2=[]
for word in wordTokens:
    if word not in stopword_set:
        wordTokens2.append(word)

print(f'wordTokens2 : {len(wordTokens2)}')

wordTokens2 : 85


In [57]:
# Same stopword filtering, written as a list comprehension
wordTokens3=[ word for word in wordTokens if word not in en_stopwords ]

# Report the size of the list built in this cell
print(f'wordTokens3 : {len(wordTokens3)}')

wordTokens3 : 85


## Tokenizer 객체 생성
---

In [58]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [59]:
raw_text='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly.\
Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'

In [60]:
# Split the text into tokens
tokens=text_to_word_sequence(raw_text)

In [61]:
print(len(tokens), tokens)

128 ['wiki', 'is', 'in', 'ward', 'is', 'original', 'description', 'the', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work', 'wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'web', 'page', 'content', 'using', 'any', 'web', 'browser', 'wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly', 'wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself', 'like', 'many', 'simple', 'concepts', 'open', 'editing', 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'wiki', 'usage', 'allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit', 'any', 'page', 'in', 'a', 'web', 'site', 'is', 'exciting', '

In [62]:
print(tokens)

['wiki', 'is', 'in', 'ward', 'is', 'original', 'description', 'the', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work', 'wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'web', 'page', 'content', 'using', 'any', 'web', 'browser', 'wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly', 'wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself', 'like', 'many', 'simple', 'concepts', 'open', 'editing', 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'wiki', 'usage', 'allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit', 'any', 'page', 'in', 'a', 'web', 'site', 'is', 'exciting', 'in',

### Tokenizer 객체 --------------------------------------------------------------------
- 제공한 문서/문장에 대한 단어사전(voca)
- 단어사전(voca)에 존재하지 않는 단어 => Out Of Vocabulary : OOV

In [63]:
# Sample sentences used to build the Tokenizer vocabulary.
sentences = [
  'I love my dog',
  'I love my cat',
  'You love my dog!',
  'Do you think my dog is amazing?'
]
# Vocabulary ordered by frequency, numbered from 1 with no OOV entry reserved:
# {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6,
#  'do': 7, 'think': 8, 'is': 9, 'amazing': 10}

In [64]:
# Tokenizer configured with an out-of-vocabulary placeholder ('oov') and num_words=5
tokenizer = Tokenizer(oov_token='oov', num_words=5)

# Build the vocabulary: more frequent words receive lower integer indices
tokenizer.fit_on_texts(sentences)

In [65]:
# Vocabulary mapping: word -> integer index
print(tokenizer.word_index)

{'oov': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [66]:
# Number of occurrences of each word
print(tokenizer.word_counts)

OrderedDict([('i', 2), ('love', 3), ('my', 4), ('dog', 3), ('cat', 1), ('you', 2), ('do', 1), ('think', 1), ('is', 1), ('amazing', 1)])


In [67]:
# Convert a sentence to integers using the vocabulary built above
print(tokenizer.texts_to_sequences(['We think my dog is amazing?']))

[[1, 1, 2, 4, 1, 1]]


### 패딩(Padding)
---
- 길이가 모두 다른 문장들을 동일 길이로 맞추기 위한 과정
- 길이 기준 설정
- 긴 경우 => 앞/뒤 중 선택
- 짧은 경우 =>  앞/뒤 중 선택
- 값 => 패딩에 들어갈 값    

In [68]:
from tensorflow.keras.utils import pad_sequences

In [69]:
result=tokenizer.texts_to_sequences(sentences)

In [70]:
result

[[1, 3, 2, 4], [1, 3, 2, 1], [1, 3, 2, 4], [1, 1, 1, 2, 4, 1, 1]]

In [71]:
encoding=pad_sequences(result)
encoding

array([[0, 0, 0, 1, 3, 2, 4],
       [0, 0, 0, 1, 3, 2, 1],
       [0, 0, 0, 1, 3, 2, 4],
       [1, 1, 1, 2, 4, 1, 1]])

## One-Hot-Encoding 변환
---
- sklearn OneHotEncoder객체 생성
- keras 함수

In [72]:
from tensorflow.keras.utils import to_categorical

In [73]:
# Convert the sentences to integer sequences using the vocabulary built above
seq_voca=tokenizer.texts_to_sequences(sentences)
print(f'seq_voca : {len(seq_voca)}')
print(seq_voca)

seq_voca : 4
[[1, 3, 2, 4], [1, 3, 2, 1], [1, 3, 2, 4], [1, 1, 1, 2, 4, 1, 1]]


In [74]:
to_categorical(seq_voca[1])

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [75]:
tokenizer.texts_to_matrix(sentences)

array([[0., 1., 1., 1., 1.],
       [0., 1., 1., 1., 0.],
       [0., 1., 1., 1., 1.],
       [0., 1., 1., 0., 1.]])

In [76]:
tokenizer.texts_to_sequences(sentences)

[[1, 3, 2, 4], [1, 3, 2, 1], [1, 3, 2, 4], [1, 1, 1, 2, 4, 1, 1]]

In [77]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [78]:
encoded = tokenizer.texts_to_sequences(sentences)
print(encoded)

[[1, 3, 2, 4], [1, 3, 2, 1], [1, 3, 2, 4], [1, 1, 1, 2, 4, 1, 1]]


In [79]:
padded = pad_sequences(encoded)
padded

array([[0, 0, 0, 1, 3, 2, 4],
       [0, 0, 0, 1, 3, 2, 1],
       [0, 0, 0, 1, 3, 2, 4],
       [1, 1, 1, 2, 4, 1, 1]])

In [80]:
padded = pad_sequences(encoded, padding='post')
padded

array([[1, 3, 2, 4, 0, 0, 0],
       [1, 3, 2, 1, 0, 0, 0],
       [1, 3, 2, 4, 0, 0, 0],
       [1, 1, 1, 2, 4, 1, 1]])

In [81]:
padded = pad_sequences(encoded, padding='post', maxlen=5)
padded

array([[1, 3, 2, 4, 0],
       [1, 3, 2, 1, 0],
       [1, 3, 2, 4, 0],
       [1, 2, 4, 1, 1]])

## FILE 읽고 벡터화
---

#### [1] 데이터 준비

In [82]:
FILE='../data/example.txt'

In [83]:
# Read the whole file as text. The encoding is passed explicitly so the result
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) — confirm.
with open(FILE, mode='r', encoding='utf-8') as f:
    fileData=f.read()

In [84]:
print(len(fileData), type(fileData))

1534 <class 'str'>


In [85]:
fileData

'The main Henry Ford Museum building houses some of the classrooms for the Henry Ford Academy\n\n\nHenry Ford Academy is the first charter school in the United States to be developed jointly by a global corporation, public education, and a major nonprofit cultural institution. The school is sponsored by the Ford Motor Company, Wayne County Regional Educational Service Agency and The Henry Ford Museum and admits high school students. It is located in Dearborn, Michigan on the campus of the Henry Ford museum. Enrollment is taken from a lottery in the area and totaled 467 in 2010.[1]\nFreshman meet inside the main museum building in glass walled classrooms, while older students use a converted carousel building and Pullman cars on a siding of the Greenfield Village railroad. Classes are expected to include use of the museum artifacts, a tradition of the original Village Schools. When the Museum was established in 1929, it included a school which served grades kindergarten to college/trade

In [86]:
# String => list of sentence strings
from nltk import sent_tokenize

data_list=sent_tokenize(fileData)

In [87]:
print(f'data_list => {len(data_list)}')

data_list => 12


In [88]:
data_list

['The main Henry Ford Museum building houses some of the classrooms for the Henry Ford Academy\n\n\nHenry Ford Academy is the first charter school in the United States to be developed jointly by a global corporation, public education, and a major nonprofit cultural institution.',
 'The school is sponsored by the Ford Motor Company, Wayne County Regional Educational Service Agency and The Henry Ford Museum and admits high school students.',
 'It is located in Dearborn, Michigan on the campus of the Henry Ford museum.',
 'Enrollment is taken from a lottery in the area and totaled 467 in 2010.',
 '[1]\nFreshman meet inside the main museum building in glass walled classrooms, while older students use a converted carousel building and Pullman cars on a siding of the Greenfield Village railroad.',
 'Classes are expected to include use of the museum artifacts, a tradition of the original Village Schools.',
 'When the Museum was established in 1929, it included a school which served grades kin

##### [2] 토큰화 객체 생성

In [89]:
fileToken=Tokenizer()
# Build the vocabulary from the sentences in data_list
fileToken.fit_on_texts(data_list)

In [90]:
print(f'word_index : { len( fileToken.word_index)}개\n{ fileToken.word_index } ')

word_index : 137개
{'the': 1, 'in': 2, 'ford': 3, 'of': 4, 'henry': 5, 'school': 6, 'a': 7, 'and': 8, 'museum': 9, 'for': 10, 'is': 11, 'building': 12, 'academy': 13, 'to': 14, 'educational': 15, 'main': 16, 'classrooms': 17, 'charter': 18, 'by': 19, 'high': 20, 'students': 21, 'it': 22, 'on': 23, 'use': 24, 'village': 25, 'include': 26, 'original': 27, 'schools': 28, 'design': 29, 'international': 30, 'award': 31, 'facilities': 32, 'houses': 33, 'some': 34, 'first': 35, 'united': 36, 'states': 37, 'be': 38, 'developed': 39, 'jointly': 40, 'global': 41, 'corporation': 42, 'public': 43, 'education': 44, 'major': 45, 'nonprofit': 46, 'cultural': 47, 'institution': 48, 'sponsored': 49, 'motor': 50, 'company': 51, 'wayne': 52, 'county': 53, 'regional': 54, 'service': 55, 'agency': 56, 'admits': 57, 'located': 58, 'dearborn': 59, 'michigan': 60, 'campus': 61, 'enrollment': 62, 'taken': 63, 'from': 64, 'lottery': 65, 'area': 66, 'totaled': 67, '467': 68, '2010': 69, '1': 70, 'freshman': 71, '

##### [3] 문장 수치화 & 벡터화

In [91]:
seqData=fileToken.texts_to_sequences(data_list)

In [92]:
print(seqData[0])

[1, 16, 5, 3, 9, 12, 33, 34, 4, 1, 17, 10, 1, 5, 3, 13, 5, 3, 13, 11, 1, 35, 18, 6, 2, 1, 36, 37, 14, 38, 39, 40, 19, 7, 41, 42, 43, 44, 8, 7, 45, 46, 47, 48]


In [93]:
print(data_list[0])

The main Henry Ford Museum building houses some of the classrooms for the Henry Ford Academy


Henry Ford Academy is the first charter school in the United States to be developed jointly by a global corporation, public education, and a major nonprofit cultural institution.
