# 텍스트 전처리
---
- 패키지 설치
    * NLTK : pip install nltk
    * KoNLPy : pip install konlpy

In [1]:
#!pip install konlpy

Defaulting to user installation because normal site-packages is not writeable


In [2]:
#!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


## [1] 토큰화(Tokenization)
---
- 문장/문서를 의미를 지닌 작은 단위로 나누는 것
- 나누어진 단어를 토큰(Token)이라 함
- 종류
    * 문장 토큰화
    * 단어 토큰화

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

In [4]:
raw_text1='If called with no arguments, download() will display an interactive interface which can be used to download and install new packages. If Tkinter is available, then a graphical interface will be shown, otherwise a simple text interface will be provided.'
raw_text2='Individual packages can be downloaded by calling the download() function with a single argument, giving the package identifier for the package that should be downloaded:'

In [20]:
raw_text_long="If called with no arguments, download() will display an interactive interface which can be used to download and install new packages. If Tkinter is available, then a graphical interface will be shown, otherwise a simple text interface will be provided. Individual packages can be downloadedby calling the download() function with a single argument, giving the package identifier for the package that should be downloaded:"

In [6]:
# Word-level tokenization: split the first sample text into word and punctuation tokens
result1=word_tokenize(raw_text1)
print(result1)

['If', 'called', 'with', 'no', 'arguments', ',', 'download', '(', ')', 'will', 'display', 'an', 'interactive', 'interface', 'which', 'can', 'be', 'used', 'to', 'download', 'and', 'install', 'new', 'packages', '.', 'If', 'Tkinter', 'is', 'available', ',', 'then', 'a', 'graphical', 'interface', 'will', 'be', 'shown', ',', 'otherwise', 'a', 'simple', 'text', 'interface', 'will', 'be', 'provided', '.']


In [7]:
# Group the two sample texts into one list; no tokenization happens in this cell
raw_text=[raw_text1, raw_text2]

In [8]:
raw_text

['If called with no arguments, download() will display an interactive interface which can be used to download and install new packages. If Tkinter is available, then a graphical interface will be shown, otherwise a simple text interface will be provided.',
 'Individual packages can be downloaded by calling the download() function with a single argument, giving the package identifier for the package that should be downloaded:']

In [21]:
# Sentence-level tokenization of the combined sample text
sent_result=sent_tokenize(raw_text_long)

In [22]:
sent_result

['If called with no arguments, download() will display an interactive interface which can be used to download and install new packages.',
 'If Tkinter is available, then a graphical interface will be shown, otherwise a simple text interface will be provided.',
 'Individual packages can be downloadedby calling the download() function with a single argument, giving the package identifier for the package that should be downloaded:']

### 여러 문장의 토큰 추출
---

In [28]:
# Collect the word tokens of every text. The list is created once, before the
# loop, so tokens gathered from earlier texts are not thrown away.
total_token=[]

for sent in raw_text:
    # Show the text that is being processed
    print(f'sent => {sent}')
    
    # word_tokenize performs its own sentence splitting before tokenizing,
    # so a separate sent_tokenize pass over the text is not needed here
    sentToken=word_tokenize(sent)
    print(sentToken, '-----', sep='\n')
    print(f'sentToken => {sentToken}')
    
    # Add this text's tokens (not the raw string) to the overall token list
    total_token.extend(sentToken)
    


sent => If called with no arguments, download() will display an interactive interface which can be used to download and install new packages. If Tkinter is available, then a graphical interface will be shown, otherwise a simple text interface will be provided.
['If', 'called', 'with', 'no', 'arguments', ',', 'download', '(', ')', 'will', 'display', 'an', 'interactive', 'interface', 'which', 'can', 'be', 'used', 'to', 'download', 'and', 'install', 'new', 'packages', '.', 'If', 'Tkinter', 'is', 'available', ',', 'then', 'a', 'graphical', 'interface', 'will', 'be', 'shown', ',', 'otherwise', 'a', 'simple', 'text', 'interface', 'will', 'be', 'provided', '.']
-----
sentToken => ['If', 'called', 'with', 'no', 'arguments', ',', 'download', '(', ')', 'will', 'display', 'an', 'interactive', 'interface', 'which', 'can', 'be', 'used', 'to', 'download', 'and', 'install', 'new', 'packages', '.', 'If', 'Tkinter', 'is', 'available', ',', 'then', 'a', 'graphical', 'interface', 'will', 'be', 'shown', '

### 한글
---

In [2]:
from konlpy.tag import Okt

# Okt instance used by the following cells for morpheme analysis
okt=Okt()

In [31]:
# Split a Korean sentence into morphemes
result=okt.morphs('오늘은 월요일입니다.')
print(result)

['오늘', '은', '월요일', '입니다', '.']


In [32]:
# Morpheme segmentation plus tagging: yields (morpheme, part-of-speech) pairs
result2=okt.pos('오늘은 월요일입니다.')
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('입니다', 'Adjective'), ('.', 'Punctuation')]


In [34]:
# Same tagging with stem=True: the inflected ending comes back in its base form (compare with the previous output)
result3=okt.pos('오늘은 월요일입니다.', stem=True)
print(result3)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('이다', 'Adjective'), ('.', 'Punctuation')]


In [37]:
# Download the NLTK stopword corpus (read later through nltk.corpus.stopwords)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

### [2] 정제 & 정규화
---
- 불용어 제거 => 노이즈 제거
- 텍스트의 동일화
    * 대문자 또는 소문자로 통일
    * 문장의 길이

#### [2-1] 불용어(Stopword)

In [39]:
# Load the English stopword list and print how many entries it contains
en_stopwords=nltk.corpus.stopwords.words('english')
print(len(en_stopwords))

179


In [40]:
print(en_stopwords[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


#### [2-2] 어간 및 표제어 처리
---

In [3]:
from nltk.stem import LancasterStemmer
# A stem produced by this stemmer is not guaranteed to be a dictionary word
# e.g. amusing => amus

In [43]:
# Stemmer instance used for the stem-extraction examples below
lstem=LancasterStemmer()

In [44]:
lstem.stem('working'), lstem.stem('worked'), lstem.stem('worken')

('work', 'work', 'work')

In [46]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [47]:
lstem.stem('amuse'), lstem.stem('amused')

('amus', 'amus')

In [4]:
# Lemmatization (returns a word that is registered in the dictionary)   # e.g. amusing => amuse
from nltk.stem import WordNetLemmatizer

In [51]:
wlemma=WordNetLemmatizer()

In [5]:
import nltk
#nltk.download('wordnet')

In [156]:
wlemma.lemmatize('working', 'v'), wlemma.lemmatize('worked', 'v')

('work', 'work')

In [155]:
# NOTE(review): per the execution counters this cell ran (In [155]) before the
# lemmatize cell placed above it (In [156]). Presumably the lemmatizer needs this
# resource - confirm, and if so move this download above it for top-to-bottom runs.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


True

### [3] 텍스트 벡터화
---
- 텍스트 => 수치화
- 희소벡터(OneHotEncoding) : BOW 방식 --> Count 기반, TF-IDF 기반
- 밀집벡터 : Embedding 방식. Word2Vec

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [57]:
corpus=[raw_text1, raw_text2]

In [58]:
# Count-based bag-of-words vectorizer, fitted on the two-document corpus
ohe=CountVectorizer()
ohe.fit(corpus)

CountVectorizer()

In [59]:
# Encode each document as term counts; the result is a sparse matrix (see the printed type below)
result=ohe.transform(corpus)

In [61]:
print(type(result), result, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 0)	1
  (0, 1)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	3
  (0, 7)	1
  (0, 9)	1
  (0, 10)	1
  (0, 11)	2
  (0, 16)	1
  (0, 18)	2
  (0, 20)	1
  (0, 21)	1
  (0, 22)	3
  (0, 23)	1
  (0, 24)	1
  (0, 25)	1
  (0, 26)	1
  (0, 28)	1
  (0, 29)	1
  (0, 31)	1
  (0, 32)	1
  (0, 34)	1
  (0, 37)	1
  (0, 38)	1
  (0, 39)	1
  (0, 40)	1
  (0, 41)	1
  (0, 42)	3
  (0, 43)	1
  (1, 2)	1
  (1, 5)	2
  (1, 6)	1
  (1, 8)	1
  (1, 9)	1
  (1, 11)	1
  (1, 12)	2
  (1, 13)	1
  (1, 14)	1
  (1, 15)	1
  (1, 17)	1
  (1, 19)	1
  (1, 27)	2
  (1, 28)	1
  (1, 30)	1
  (1, 33)	1
  (1, 35)	1
  (1, 36)	3
  (1, 43)	1


In [62]:
ret=result.toarray()
print(ret)

[[1 1 0 1 1 3 0 1 0 1 1 2 0 0 0 0 1 0 2 0 1 1 3 1 1 1 1 0 1 1 0 1 1 0 1 0
  0 1 1 1 1 1 3 1]
 [0 0 1 0 0 2 1 0 1 1 0 1 2 1 1 1 0 1 0 1 0 0 0 0 0 0 0 2 1 0 1 0 0 1 0 1
  3 0 0 0 0 0 0 1]]


In [64]:
print(ret.shape, ret, sep='\n')

(2, 44)
[[1 1 0 1 1 3 0 1 0 1 1 2 0 0 0 0 1 0 2 0 1 1 3 1 1 1 1 0 1 1 0 1 1 0 1 0
  0 1 1 1 1 1 3 1]
 [0 0 1 0 0 2 1 0 1 1 0 1 2 1 1 1 0 1 0 1 0 0 0 0 0 0 0 2 1 0 1 0 0 1 0 1
  3 0 0 0 0 0 0 1]]


In [66]:
## TF-IDF based vectorizer
# NOTE(review): `tfldf` looks like a typo for `tfidf`; the name is kept because the next cell uses it
tfldf=TfidfVectorizer()

In [67]:
tf_corpus=tfldf.fit_transform(corpus)

In [68]:
type(tf_corpus)

scipy.sparse.csr.csr_matrix

In [69]:
# Convert the sparse TF-IDF matrix to a dense array so the weights can be printed
tf_corpus=tf_corpus.toarray()
print(tf_corpus)

[[0.13854203 0.13854203 0.         0.13854203 0.13854203 0.29572138
  0.         0.13854203 0.         0.09857379 0.13854203 0.19714759
  0.         0.         0.         0.         0.13854203 0.
  0.27708406 0.         0.13854203 0.13854203 0.41562608 0.13854203
  0.13854203 0.13854203 0.13854203 0.         0.09857379 0.13854203
  0.         0.13854203 0.13854203 0.         0.13854203 0.
  0.         0.13854203 0.13854203 0.13854203 0.13854203 0.13854203
  0.41562608 0.09857379]
 [0.         0.         0.17663888 0.         0.         0.25136004
  0.17663888 0.         0.17663888 0.12568002 0.         0.12568002
  0.35327777 0.17663888 0.17663888 0.17663888 0.         0.17663888
  0.         0.17663888 0.         0.         0.         0.
  0.         0.         0.         0.35327777 0.12568002 0.
  0.17663888 0.         0.         0.17663888 0.         0.17663888
  0.52991665 0.         0.         0.         0.         0.
  0.         0.12568002]]


In [72]:
# Sample paragraph used for the stopword-removal example.
# NOTE(review): the trailing backslashes join the physical lines with no space in
# between, so the text contains "work.Wiki" and "fly.Wiki" (they show up as single
# tokens in the output below); "itself.Like" is also missing a space inside the last line.
sent='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly.\
Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'

In [74]:
# Tokenize the text, then drop the English stopwords

tok=word_tokenize(sent)

# The membership test is case-sensitive: a capitalized token such as 'The'
# is kept, while its lowercase form is filtered out (see the output below)
result=[s for s in tok if s not in en_stopwords]

print(result)

['Wiki', 'Ward', 'original', 'description', ':', 'The', 'simplest', 'online', 'database', 'could', 'possibly', 'work.Wiki', 'piece', 'server', 'software', 'allows', 'users', 'freely', 'create', 'edit', 'Web', 'page', 'content', 'using', 'Web', 'browser', '.', 'Wiki', 'supports', 'hyperlinks', 'simple', 'text', 'syntax', 'creating', 'new', 'pages', 'crosslinks', 'internal', 'pages', 'fly.Wiki', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'allows', 'organization', 'contributions', 'edited', 'addition', 'content', 'itself.Like', 'many', 'simple', 'concepts', ',', '``', 'open', 'editing', "''", 'profound', 'subtle', 'effects', 'Wiki', 'usage', '.', 'Allowing', 'everyday', 'users', 'create', 'edit', 'page', 'Web', 'site', 'exciting', 'encourages', 'democratic', 'use', 'Web', 'promotes', 'content', 'composition', 'nontechnical', 'users', '.']


In [75]:
len(result)

85

## Tokenizer 객체 생성
---

In [7]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [81]:
raw_text='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly.\
Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'

In [84]:
# Split the text into word tokens (the output shows them lower-cased, with punctuation removed)
tokens=text_to_word_sequence(raw_text)
print(len(tokens), tokens, sep='\n')

128
['wiki', 'is', 'in', 'ward', 'is', 'original', 'description', 'the', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work', 'wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'web', 'page', 'content', 'using', 'any', 'web', 'browser', 'wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly', 'wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself', 'like', 'many', 'simple', 'concepts', 'open', 'editing', 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'wiki', 'usage', 'allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit', 'any', 'page', 'in', 'a', 'web', 'site', 'is', 'exciting', '

In [85]:
myToken=Tokenizer()

In [86]:
# Build the vocabulary from the token list; every token is passed in as its own one-word text
myToken.fit_on_texts(tokens)

In [88]:
print(myToken.word_index)

{'and': 1, 'wiki': 2, 'is': 3, 'in': 4, 'the': 5, 'that': 6, 'to': 7, 'web': 8, 'a': 9, 'of': 10, 'users': 11, 'content': 12, 'allows': 13, 'create': 14, 'edit': 15, 'page': 16, 'any': 17, 'has': 18, 'simple': 19, 'pages': 20, 'on': 21, 'it': 22, 'ward': 23, 'original': 24, 'description': 25, 'simplest': 26, 'online': 27, 'database': 28, 'could': 29, 'possibly': 30, 'work': 31, 'piece': 32, 'server': 33, 'software': 34, 'freely': 35, 'using': 36, 'browser': 37, 'supports': 38, 'hyperlinks': 39, 'text': 40, 'syntax': 41, 'for': 42, 'creating': 43, 'new': 44, 'crosslinks': 45, 'between': 46, 'internal': 47, 'fly': 48, 'unusual': 49, 'among': 50, 'group': 51, 'communication': 52, 'mechanisms': 53, 'organization': 54, 'contributions': 55, 'be': 56, 'edited': 57, 'addition': 58, 'itself': 59, 'like': 60, 'many': 61, 'concepts': 62, 'open': 63, 'editing': 64, 'some': 65, 'profound': 66, 'subtle': 67, 'effects': 68, 'usage': 69, 'allowing': 70, 'everyday': 71, 'site': 72, 'exciting': 73, 'enc

In [89]:
print(myToken.word_counts)

OrderedDict([('wiki', 5), ('is', 5), ('in', 5), ('ward', 1), ('original', 1), ('description', 1), ('the', 5), ('simplest', 1), ('online', 1), ('database', 1), ('that', 4), ('could', 1), ('possibly', 1), ('work', 1), ('a', 3), ('piece', 1), ('of', 3), ('server', 1), ('software', 1), ('allows', 2), ('users', 3), ('to', 4), ('freely', 1), ('create', 2), ('and', 6), ('edit', 2), ('web', 4), ('page', 2), ('content', 3), ('using', 1), ('any', 2), ('browser', 1), ('supports', 1), ('hyperlinks', 1), ('has', 2), ('simple', 2), ('text', 1), ('syntax', 1), ('for', 1), ('creating', 1), ('new', 1), ('pages', 2), ('crosslinks', 1), ('between', 1), ('internal', 1), ('on', 2), ('fly', 1), ('unusual', 1), ('among', 1), ('group', 1), ('communication', 1), ('mechanisms', 1), ('it', 2), ('organization', 1), ('contributions', 1), ('be', 1), ('edited', 1), ('addition', 1), ('itself', 1), ('like', 1), ('many', 1), ('concepts', 1), ('open', 1), ('editing', 1), ('some', 1), ('profound', 1), ('subtle', 1), ('ef

In [93]:
# NOTE(review): `raw_` is not defined in any visible cell - presumably left over from a
# deleted cell. Confirm the intended input before re-running; as written this raises NameError
# in a fresh kernel.
myToken.texts_to_sequences(raw_)

[[9]]

# Tokenizer 객체 -----------------------------
- 제공한 문서/문장에 대한 단어사전(voca)
- 단어사전(voca)에 존재하지 않는 단어 => Out Of Vocabulary : OOV

In [95]:
sentences = [
  'I love my dog',
  'I love my cat',
  'You love my dog!',
  'Do you think my dog is amazing?'
]

In [109]:
tokenizer=Tokenizer()

# Build the vocabulary: the more frequent a word, the smaller its integer index
tokenizer.fit_on_texts(sentences)

In [110]:
# Vocabulary mapping: word -> integer index
print(tokenizer.word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [111]:
# Number of occurrences of each word in the fitted texts
tokenizer.word_counts

OrderedDict([('i', 2),
             ('love', 3),
             ('my', 4),
             ('dog', 3),
             ('cat', 1),
             ('you', 2),
             ('do', 1),
             ('think', 1),
             ('is', 1),
             ('amazing', 1)])

In [112]:
# Convert each sentence into a sequence of integer indices using the fitted vocabulary
tokenizer.texts_to_sequences(sentences)

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

### 패딩(Padding)
---
- 길이가 모두 다른 문장들을 동일 길이로 맞추기 위한 과정
- 길이 기준 설정
- 긴 경우 => 앞/뒤 중 선택
- 짧은 경우 => 앞/뒤 중 선택
- 값 => 패딩에 들어갈 값

In [8]:
from tensorflow.keras.utils import pad_sequences

In [152]:
result=tokenizer.texts_to_sequences(sentences)

In [153]:
result

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

In [154]:
# Pad every sequence to the length of the longest one; with the default
# arguments the zeros are inserted at the front (see the output below)
encoding=pad_sequences(result)
encoding

array([[ 0,  0,  0,  4,  2,  1,  3],
       [ 0,  0,  0,  4,  2,  1,  6],
       [ 0,  0,  0,  5,  2,  1,  3],
       [ 7,  5,  8,  1,  3,  9, 10]])

## One-Hot-Encoding 변환
---
- sklearn OneHotEncoder 객체 생성
- keras 함수

In [9]:
from tensorflow.keras.utils import to_categorical

In [114]:
seq_voca=tokenizer.texts_to_sequences(sentences)
print(f'seq_voca : {len(seq_voca)}')
print(seq_voca)

seq_voca : 4
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [115]:
to_categorical(seq_voca[3])

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [116]:
tokenizer.texts_to_matrix(sentences)

array([[0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1.]])

# ====================================

In [10]:
# Read the whole sample document. The encoding is given explicitly so the
# result does not depend on the platform's default code page.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) - confirm.
with open('../file/example.txt', encoding='utf-8') as f:
    text=f.read()
    
text

'The main Henry Ford Museum building houses some of the classrooms for the Henry Ford Academy\n\n\nHenry Ford Academy is the first charter school in the United States to be developed jointly by a global corporation, public education, and a major nonprofit cultural institution. The school is sponsored by the Ford Motor Company, Wayne County Regional Educational Service Agency and The Henry Ford Museum and admits high school students. It is located in Dearborn, Michigan on the campus of the Henry Ford museum. Enrollment is taken from a lottery in the area and totaled 467 in 2010.[1]\nFreshman meet inside the main museum building in glass walled classrooms, while older students use a converted carousel building and Pullman cars on a siding of the Greenfield Village railroad. Classes are expected to include use of the museum artifacts, a tradition of the original Village Schools. When the Museum was established in 1929, it included a school which served grades kindergarten to college/trade

In [138]:
# Split the file contents into word tokens
tokens=text_to_word_sequence(text)
print(len(tokens), tokens, sep='\n')

246
['the', 'main', 'henry', 'ford', 'museum', 'building', 'houses', 'some', 'of', 'the', 'classrooms', 'for', 'the', 'henry', 'ford', 'academy', 'henry', 'ford', 'academy', 'is', 'the', 'first', 'charter', 'school', 'in', 'the', 'united', 'states', 'to', 'be', 'developed', 'jointly', 'by', 'a', 'global', 'corporation', 'public', 'education', 'and', 'a', 'major', 'nonprofit', 'cultural', 'institution', 'the', 'school', 'is', 'sponsored', 'by', 'the', 'ford', 'motor', 'company', 'wayne', 'county', 'regional', 'educational', 'service', 'agency', 'and', 'the', 'henry', 'ford', 'museum', 'and', 'admits', 'high', 'school', 'students', 'it', 'is', 'located', 'in', 'dearborn', 'michigan', 'on', 'the', 'campus', 'of', 'the', 'henry', 'ford', 'museum', 'enrollment', 'is', 'taken', 'from', 'a', 'lottery', 'in', 'the', 'area', 'and', 'totaled', '467', 'in', '2010', '1', 'freshman', 'meet', 'inside', 'the', 'main', 'museum', 'building', 'in', 'glass', 'walled', 'classrooms', 'while', 'older', 'stu

In [139]:
enToken=Tokenizer()

In [140]:
# Build the vocabulary from the token list; every token is passed in as its own one-word text
enToken.fit_on_texts(tokens)

In [141]:
# Vocabulary mapping: word -> integer index
print(enToken.word_index)

{'the': 1, 'in': 2, 'ford': 3, 'of': 4, 'henry': 5, 'school': 6, 'a': 7, 'and': 8, 'museum': 9, 'for': 10, 'is': 11, 'building': 12, 'academy': 13, 'to': 14, 'educational': 15, 'main': 16, 'classrooms': 17, 'charter': 18, 'by': 19, 'high': 20, 'students': 21, 'it': 22, 'on': 23, 'use': 24, 'village': 25, 'include': 26, 'original': 27, 'schools': 28, 'design': 29, 'international': 30, 'award': 31, 'facilities': 32, 'houses': 33, 'some': 34, 'first': 35, 'united': 36, 'states': 37, 'be': 38, 'developed': 39, 'jointly': 40, 'global': 41, 'corporation': 42, 'public': 43, 'education': 44, 'major': 45, 'nonprofit': 46, 'cultural': 47, 'institution': 48, 'sponsored': 49, 'motor': 50, 'company': 51, 'wayne': 52, 'county': 53, 'regional': 54, 'service': 55, 'agency': 56, 'admits': 57, 'located': 58, 'dearborn': 59, 'michigan': 60, 'campus': 61, 'enrollment': 62, 'taken': 63, 'from': 64, 'lottery': 65, 'area': 66, 'totaled': 67, '467': 68, '2010': 69, '1': 70, 'freshman': 71, 'meet': 72, 'inside

In [142]:
# Number of occurrences of each word in the fitted texts
enToken.word_counts

OrderedDict([('the', 25),
             ('main', 2),
             ('henry', 7),
             ('ford', 8),
             ('museum', 6),
             ('building', 4),
             ('houses', 1),
             ('some', 1),
             ('of', 8),
             ('classrooms', 2),
             ('for', 5),
             ('academy', 3),
             ('is', 5),
             ('first', 1),
             ('charter', 2),
             ('school', 7),
             ('in', 9),
             ('united', 1),
             ('states', 1),
             ('to', 3),
             ('be', 1),
             ('developed', 1),
             ('jointly', 1),
             ('by', 2),
             ('a', 7),
             ('global', 1),
             ('corporation', 1),
             ('public', 1),
             ('education', 1),
             ('and', 7),
             ('major', 1),
             ('nonprofit', 1),
             ('cultural', 1),
             ('institution', 1),
             ('sponsored', 1),
             ('motor', 1),
      

In [143]:
# Convert the tokens to vocabulary indices; because a list of single words is
# passed, each token becomes its own one-element sequence (see the output below)
enToken.texts_to_sequences(tokens)

[[1],
 [16],
 [5],
 [3],
 [9],
 [12],
 [33],
 [34],
 [4],
 [1],
 [17],
 [10],
 [1],
 [5],
 [3],
 [13],
 [5],
 [3],
 [13],
 [11],
 [1],
 [35],
 [18],
 [6],
 [2],
 [1],
 [36],
 [37],
 [14],
 [38],
 [39],
 [40],
 [19],
 [7],
 [41],
 [42],
 [43],
 [44],
 [8],
 [7],
 [45],
 [46],
 [47],
 [48],
 [1],
 [6],
 [11],
 [49],
 [19],
 [1],
 [3],
 [50],
 [51],
 [52],
 [53],
 [54],
 [15],
 [55],
 [56],
 [8],
 [1],
 [5],
 [3],
 [9],
 [8],
 [57],
 [20],
 [6],
 [21],
 [22],
 [11],
 [58],
 [2],
 [59],
 [60],
 [23],
 [1],
 [61],
 [4],
 [1],
 [5],
 [3],
 [9],
 [62],
 [11],
 [63],
 [64],
 [7],
 [65],
 [2],
 [1],
 [66],
 [8],
 [67],
 [68],
 [2],
 [69],
 [70],
 [71],
 [72],
 [73],
 [1],
 [16],
 [9],
 [12],
 [2],
 [74],
 [75],
 [17],
 [76],
 [77],
 [21],
 [24],
 [7],
 [78],
 [79],
 [12],
 [8],
 [80],
 [81],
 [23],
 [7],
 [82],
 [4],
 [1],
 [83],
 [25],
 [84],
 [85],
 [86],
 [87],
 [14],
 [26],
 [24],
 [4],
 [1],
 [9],
 [88],
 [7],
 [89],
 [4],
 [1],
 [27],
 [25],
 [28],
 [90],
 [1],
 [9],
 [91],
 [92],
 [2],
 

In [144]:
seq_voca=enToken.texts_to_sequences(tokens)
print(f'seq_voca : {len(seq_voca)}')
print(seq_voca)

seq_voca : 246
[[1], [16], [5], [3], [9], [12], [33], [34], [4], [1], [17], [10], [1], [5], [3], [13], [5], [3], [13], [11], [1], [35], [18], [6], [2], [1], [36], [37], [14], [38], [39], [40], [19], [7], [41], [42], [43], [44], [8], [7], [45], [46], [47], [48], [1], [6], [11], [49], [19], [1], [3], [50], [51], [52], [53], [54], [15], [55], [56], [8], [1], [5], [3], [9], [8], [57], [20], [6], [21], [22], [11], [58], [2], [59], [60], [23], [1], [61], [4], [1], [5], [3], [9], [62], [11], [63], [64], [7], [65], [2], [1], [66], [8], [67], [68], [2], [69], [70], [71], [72], [73], [1], [16], [9], [12], [2], [74], [75], [17], [76], [77], [21], [24], [7], [78], [79], [12], [8], [80], [81], [23], [7], [82], [4], [1], [83], [25], [84], [85], [86], [87], [14], [26], [24], [4], [1], [9], [88], [7], [89], [4], [1], [27], [25], [28], [90], [1], [9], [91], [92], [2], [93], [22], [94], [7], [6], [95], [96], [97], [98], [14], [99], [100], [6], [101], [1], [102], [103], [4], [1], [27], [6], [104], [2], [

In [145]:
seq_voca[0]

[1]

In [147]:
# One-hot encode the second token.
# NOTE(review): no num_classes is given, so the width follows the largest index in
# the input (17 columns for index 16 in the output below); vectors for different
# tokens will therefore differ in length. Consider
# num_classes=len(enToken.word_index)+1 for a fixed width.
to_categorical(seq_voca[1])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.]], dtype=float32)

In [148]:
enToken.texts_to_matrix(tokens)

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [12]:
# Split the file contents into sentences
token_sent=sent_tokenize(text)
token_sent

['The main Henry Ford Museum building houses some of the classrooms for the Henry Ford Academy\n\n\nHenry Ford Academy is the first charter school in the United States to be developed jointly by a global corporation, public education, and a major nonprofit cultural institution.',
 'The school is sponsored by the Ford Motor Company, Wayne County Regional Educational Service Agency and The Henry Ford Museum and admits high school students.',
 'It is located in Dearborn, Michigan on the campus of the Henry Ford museum.',
 'Enrollment is taken from a lottery in the area and totaled 467 in 2010.',
 '[1]\nFreshman meet inside the main museum building in glass walled classrooms, while older students use a converted carousel building and Pullman cars on a siding of the Greenfield Village railroad.',
 'Classes are expected to include use of the museum artifacts, a tradition of the original Village Schools.',
 'When the Museum was established in 1929, it included a school which served grades kin

In [13]:
len(token_sent)

12

In [23]:
fileToken=Tokenizer()

In [24]:
fileToken.fit_on_texts(token_sent)

In [27]:
seqData=fileToken.texts_to_sequences(token_sent)

In [28]:
seqData[0]

[1,
 16,
 5,
 3,
 9,
 12,
 33,
 34,
 4,
 1,
 17,
 10,
 1,
 5,
 3,
 13,
 5,
 3,
 13,
 11,
 1,
 35,
 18,
 6,
 2,
 1,
 36,
 37,
 14,
 38,
 39,
 40,
 19,
 7,
 41,
 42,
 43,
 44,
 8,
 7,
 45,
 46,
 47,
 48]