In [1]:
import tensorflow as tf

In [2]:
# 표제어 추출, 어간 추출
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/jeon-
[nltk_data]     yewon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']

print('표제어 추출 전 : ', words)
print('표제어 추출 후 : ', [lemmatizer.lemmatize(word) for word in words])

표제어 추출 전 :  ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
표제어 추출 후 :  ['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [4]:
lemmatizer.lemmatize('dies', 'v')

'die'

In [5]:
lemmatizer.lemmatize('lives', 'v')

'live'

In [6]:
lemmatizer.lemmatize('has', 'v')

'have'

In [7]:
# 어간 추출

In [8]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

sentence = "This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
tokenized_sentence = word_tokenize(sentence)

print('어간 추출 전 : ', tokenized_sentence)
print('어간 추출 후 : ', [stemmer.stem(word) for word in tokenized_sentence])


어간 추출 전 :  ['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']
어간 추출 후 :  ['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [9]:
# 불용어 제거
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
print('불용어 개수 : ', len(stop_words))
print('불용어 예시 : ', stop_words[:10])

불용어 개수 :  179
불용어 예시 :  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [10]:
example = "Family is not an important thing. It's everything."

word_tokens = word_tokenize(example)

result = []

for word in word_tokens:
    if word not in stop_words:
        result.append(word)

print('불용어 제거 전 : ', word_tokens)
print('불용어 제거 후 : ', result)

불용어 제거 전 :  ['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
불용어 제거 후 :  ['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']


In [11]:
import re

# . 임의의 문자 한 개
r = re.compile('a.c')
r.search('aaa')

In [12]:
print(r.search('abc'))
print(r.search('a-c'))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 3), match='a-c'>


In [13]:
# ? : 존재할 수도 있고, 존재하지 않을 수도 있을 경우
r = re.compile('ab?c')
r.search('aaa')

In [14]:
print(r.search('ac'))
print(r.search('abc'))

<re.Match object; span=(0, 2), match='ac'>
<re.Match object; span=(0, 3), match='abc'>


In [15]:
# * : (앞의 문자가) 0개 이상인 경우.
r = re.compile('ab*c')
r.search('aac')

<re.Match object; span=(1, 3), match='ac'>

In [16]:
print(r.search('ac'))
print(r.search('abc'))
print(r.search('abbbbbbbbbbc'))

<re.Match object; span=(0, 2), match='ac'>
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 12), match='abbbbbbbbbbc'>


In [17]:
# + : (앞의 문자가) 1개 이상인 경우.
r = re.compile('ab+c')
r.search('aac')

In [18]:
print(r.search('ac'))
print(r.search('abc'))
print(r.search('abbbbbbbbbc'))

None
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 11), match='abbbbbbbbbc'>


In [19]:
# ^ : 문자열의 시작
r = re.compile('^ab')

r.search('aac')

In [20]:
print(r.search('cab'))
print(r.search('aac'))
print(r.search('ababab'))
print(r.search('abbbbbbbbb'))
print(r.search('abc'))

None
None
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 2), match='ab'>


In [21]:
# (숫자) : (앞의 문자가) '숫자 개'인 경우
r = re.compile('ab{2}c')

r.search('aac')

In [22]:
print(r.search('ac'))
print(r.search('abc'))
print(r.search('abbc'))
print(r.search('abbbbbbbbbc'))
print(r.search('abccc'))

None
None
<re.Match object; span=(0, 4), match='abbc'>
None
None


In [23]:
# {숫자1, 숫자2} : (앞의 문자가) 숫자1 이상 숫자2 이하의 개수인 경우
r = re.compile('ab{2, 5}c')

r.search('abc')

In [24]:
print(r.search('abbc'))
print(r.search('abbbbbc'))
print(r.search('abbbbbbbbbbc'))

None
None
None


In [25]:
# {숫자, } : (앞의 문자가) 해당 숫자 이상의 개수인 경우
# {0, } == *, {1, } == +
r = re.compile('a{2,}bc')

r.search('abc')

In [26]:
print(r.search('aabc'))
print(r.search('aaaaaaaaaaabc'))

<re.Match object; span=(0, 4), match='aabc'>
<re.Match object; span=(0, 13), match='aaaaaaaaaaabc'>


In [27]:
# [] : 범위 내에 포함된 문자가 있는 경우
r = re.compile('[abc]')
r.search('a')

<re.Match object; span=(0, 1), match='a'>

In [28]:
print(r.search('aaaaaaaaaaaaaaa'))
print(r.search('z'))
print(r.search('abc'))
print(r.search('cbac'))
print(r.search('zaca'))

<re.Match object; span=(0, 1), match='a'>
None
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='c'>
<re.Match object; span=(1, 2), match='a'>


In [29]:
# [a-z][A-Z][0-9]
r = re.compile('[a-z]')

r.search('AAA')

In [30]:
print(r.search('1234'))
print(r.search('A12B12'))
print(r.search('ABc'))
print(r.search('a12b12'))

None
None
<re.Match object; span=(2, 3), match='c'>
<re.Match object; span=(0, 1), match='a'>


In [31]:
# [^문자] : ^ 뒤에 있는 문자들을 제외시킨다.
r = re.compile('[^abc]')

r.search('abc')

In [32]:
print(r.search('a12b12'))
print(r.search('A12B12'))
print(r.search('z'))
print(r.search('1234'))

<re.Match object; span=(1, 2), match='1'>
<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 1), match='z'>
<re.Match object; span=(0, 1), match='1'>


In [33]:
# re.match() : 시작, re.search() : 전체
r = re.compile('ab.')

r.search('aaaaabc')

<re.Match object; span=(4, 7), match='abc'>

In [34]:
print(r.search('abc'))
print(r.search('aaabbcc'))
print(r.search('cba'))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(2, 5), match='abb'>
None


In [35]:
text = '정규 표현식 또는 정규식은 특정한 규칙을 가진 문자열의 집합'

re.split(' ', text)

['정규', '표현식', '또는', '정규식은', '특정한', '규칙을', '가진', '문자열의', '집합']

In [36]:
text = '''정규
표현식
또는
정규식은
특정한
규칙을
가진
문자열의
집합'''

re.split('\n', text)

['정규', '표현식', '또는', '정규식은', '특정한', '규칙을', '가진', '문자열의', '집합']

In [37]:
text = '정규+표현식+또는+정규식은+특정한+규칙을+가진+문자열의+집합'

re.split('\+', text)

['정규', '표현식', '또는', '정규식은', '특정한', '규칙을', '가진', '문자열의', '집합']

In [38]:
text = '''이름 : 홍길동
나이 : 20
전화번호 : 010-1234-5678
학교 : 길동대학교
'''

re.findall('\d+', text)

['20', '010', '1234', '5678']

In [39]:
re.findall('\d+', '정규 표현식 또는 정규식')

[]

In [40]:
# re.sub() : 정규 표현식과 일치하는 문자열을 다른 문자열로 대체
text = "Regular expression : A regular expression, regex or regexp[1] (sometimes called a rational expression)[2][3] is, in theoretical computer science and formal language theory, a sequence of characters that define a search pattern."

# 알파벳 제거
re.sub('[a-zA-Z]', '', text)

'  :   ,   [1] (    )[2][3] ,        ,         .'

In [41]:
text = """100 John    PROF
101 James   STUD
102 Mac   STUD"""

# 텍스트 전처리 예제
re.split('\s+', text)

['100', 'John', 'PROF', '101', 'James', 'STUD', '102', 'Mac', 'STUD']

In [42]:
re.findall('\d+', text)

['100', '101', '102']

In [43]:
re.findall('[A-Z]', text)

['J', 'P', 'R', 'O', 'F', 'J', 'S', 'T', 'U', 'D', 'M', 'S', 'T', 'U', 'D']

In [44]:
re.findall('[A-Z]{4}', text) # 대문자가 연속으로 4개 발생한 경우

['PROF', 'STUD', 'STUD']

In [45]:
re.findall('[A-Z][a-z]+', text)

['John', 'James', 'Mac']

### 문자 표현
### 문자 인식

In [46]:
from nltk.tokenize import sent_tokenize
raw_text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

sentences = sent_tokenize(raw_text)
sentences

['A barber is a person.',
 'a barber is good person.',
 'a barber is huge person.',
 'he Knew A Secret!',
 'The Secret He Kept is huge secret.',
 'Huge secret.',
 'His barber kept his word.',
 'a barber kept his word.',
 'His barber kept his secret.',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy.',
 'the barber went up a huge mountain.']

In [47]:
vocab = {}
index = 0

for text in sentences:
    for word in text.split():
        if word not in vocab:
            vocab[word] = index
            index += 1

print(vocab)

{'A': 0, 'barber': 1, 'is': 2, 'a': 3, 'person.': 4, 'good': 5, 'huge': 6, 'he': 7, 'Knew': 8, 'Secret!': 9, 'The': 10, 'Secret': 11, 'He': 12, 'Kept': 13, 'secret.': 14, 'Huge': 15, 'His': 16, 'kept': 17, 'his': 18, 'word.': 19, 'But': 20, 'keeping': 21, 'and': 22, 'such': 23, 'secret': 24, 'to': 25, 'himself': 26, 'was': 27, 'driving': 28, 'the': 29, 'crazy.': 30, 'went': 31, 'up': 32, 'mountain.': 33}


In [48]:
from nltk.tokenize import word_tokenize

vocab = {}
preprocessed_sentences = []
stop_words = set(stopwords.words('english'))

for sentence in sentences:
    tokenized = word_tokenize(sentence)
    result = []

    for word in tokenized:
        word = word.lower()
        if word not in stop_words:
            if len(word) > 2:
                result.append(word)
                if word not in vocab:
                    vocab[word] = 0
                vocab[word] += 1

    preprocessed_sentences.append(result)
preprocessed_sentences

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

In [49]:
print(vocab)

{'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}


In [50]:
vocab['barber']

8

In [51]:
sorted_vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
print(sorted_vocab)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


In [52]:
word_to_index = {}
i = 0

for (word, frequency) in sorted_vocab:
    if frequency > 1:
        i = i+1
        word_to_index[word] = i

print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [53]:
# person까지만 딕셔너리에 넣어주고 싶다면

In [54]:
word_frequency = [word for word, index in word_to_index.items() if index >= 5 + 1]

for w in word_frequency:
    del word_to_index[w]

word_to_index

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}

In [55]:
word_to_index['NaN'] = len(word_to_index) + 1
word_to_index

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'NaN': 6}

In [56]:
encoded = []
for sentence in preprocessed_sentences:
    encoded_sentence = []
    for word in sentence:
        try:
            encoded_sentence.append(word_to_index[word])
        except KeyError:
            encoded_sentence.append(word_to_index['NaN'])

    encoded.append(encoded_sentence)

encoded

[[1, 5],
 [1, 6, 5],
 [1, 3, 5],
 [6, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [6, 6, 3, 2, 6, 1, 6],
 [1, 6, 3, 6]]

In [57]:
from collections import Counter
all_words = sum(preprocessed_sentences, [])
print(all_words)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']


In [58]:
vocab = Counter(all_words)
print(vocab)

Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})


In [59]:
vocab = vocab.most_common(5)
print(vocab)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]


In [60]:
word_to_index = {}
i = 0
for (word, frequency) in vocab:
    i = i+1
    word_to_index[word] = i

print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [61]:
from nltk import FreqDist
import numpy as np

vocab = FreqDist(np.hstack(preprocessed_sentences))
print(np.hstack(preprocessed_sentences))
vocab

['barber' 'person' 'barber' 'good' 'person' 'barber' 'huge' 'person'
 'knew' 'secret' 'secret' 'kept' 'huge' 'secret' 'huge' 'secret' 'barber'
 'kept' 'word' 'barber' 'kept' 'word' 'barber' 'kept' 'secret' 'keeping'
 'keeping' 'huge' 'secret' 'driving' 'barber' 'crazy' 'barber' 'went'
 'huge' 'mountain']


FreqDist({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, ...})

In [62]:
vocab = vocab.most_common(5)
print(vocab)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]


In [63]:
word_to_index = {word[0] : index + 1 for index, word in enumerate(vocab)}
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [64]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

tokenizer.fit_on_texts(preprocessed_sentences)

In [65]:
print(tokenizer.word_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}


In [66]:
print(tokenizer.word_counts)

OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])


In [67]:
print(tokenizer.texts_to_sequences(preprocessed_sentences))

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]


In [68]:
tokenizer = Tokenizer(num_words = 6) # 상위 5개 단어
tokenizer.fit_on_texts(preprocessed_sentences)

In [69]:
print(tokenizer.word_counts)
print(tokenizer.word_index)

OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}


In [70]:
print(tokenizer.texts_to_sequences(preprocessed_sentences))

[[1, 5], [1, 5], [1, 3, 5], [2], [2, 4, 3, 2], [3, 2], [1, 4], [1, 4], [1, 4, 2], [3, 2, 1], [1, 3]]


In [71]:
tokenizer = Tokenizer(num_words = 6, oov_token = 'OOV') # Out of Vocabulary

In [72]:
tokenizer.fit_on_texts(preprocessed_sentences)

In [73]:
print(tokenizer.word_index['OOV'])

1


In [74]:
encoded = print(tokenizer.texts_to_sequences(preprocessed_sentences))
encoded

[[2, 1], [2, 1, 1], [2, 4, 1], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]


In [75]:
# Example list to represent the 'encoded' variable
encoded = ['this', 'is', 'an', 'example', 'list']

# Ensure 'encoded' is not None and is iterable
if encoded is not None:
    max_len = max(len(item) for item in encoded)
    print(max_len)
else:
    print("The 'encoded' variable is None.")

7


In [76]:
def encode_sentences(sentences):
    # Example encoding process (could be tokenization, stemming, etc.)
    return [sentence for sentence in sentences if sentence]  # Filter out empty sentences

# Example sentences
sentences = ["This is a sentence.", "Another one.", "And yet another sentence."]

# Encode sentences
encoded = encode_sentences(sentences)

# Check if 'encoded' is not None before processing
if encoded is not None:
    max_len = max(len(item) for item in encoded)
    print(max_len)
else:
    print("The 'encoded' variable is None.")

25


## 패딩

In [115]:
import numpy as np
import pandas as pd

for sentence in encoded:
    while len(sentence) < max_len:
        sentence.append(0)

padding = np.array(encoded)
print(padding)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10961,) + inhomogeneous part.

In [116]:
padding = np.array(encoded)
padding

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10961,) + inhomogeneous part.

In [117]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

encoded = tokenizer.texts_to_sequences(preprocessed_sentences)
encoded

[[10880, 147],
 [10880, 10, 147],
 [10880, 1343, 147],
 [629, 753],
 [753, 764, 1343, 753],
 [1343, 753],
 [10880, 764, 237],
 [10880, 764, 237],
 [10880, 764, 753],
 [1204, 1204, 1343, 753, 971, 10880, 4053],
 [10880, 162, 1343, 3320]]

In [118]:
padding = pad_sequences(encoded)
padding

array([[    0,     0,     0,     0,     0, 10880,   147],
       [    0,     0,     0,     0, 10880,    10,   147],
       [    0,     0,     0,     0, 10880,  1343,   147],
       [    0,     0,     0,     0,     0,   629,   753],
       [    0,     0,     0,   753,   764,  1343,   753],
       [    0,     0,     0,     0,     0,  1343,   753],
       [    0,     0,     0,     0, 10880,   764,   237],
       [    0,     0,     0,     0, 10880,   764,   237],
       [    0,     0,     0,     0, 10880,   764,   753],
       [ 1204,  1204,  1343,   753,   971, 10880,  4053],
       [    0,     0,     0, 10880,   162,  1343,  3320]], dtype=int32)

In [119]:
padding = pad_sequences(encoded, padding='post', maxlen=5) # post : 앞에서부터 채우기, maxlen : 전체 길이
padding

array([[10880,   147,     0,     0,     0],
       [10880,    10,   147,     0,     0],
       [10880,  1343,   147,     0,     0],
       [  629,   753,     0,     0,     0],
       [  753,   764,  1343,   753,     0],
       [ 1343,   753,     0,     0,     0],
       [10880,   764,   237,     0,     0],
       [10880,   764,   237,     0,     0],
       [10880,   764,   753,     0,     0],
       [ 1343,   753,   971, 10880,  4053],
       [10880,   162,  1343,  3320,     0]], dtype=int32)

In [120]:
padding = pad_sequences(encoded, padding='post', truncating='post', maxlen=5) # truncating : 뒤에서부터 데이터 삭제
padding

array([[10880,   147,     0,     0,     0],
       [10880,    10,   147,     0,     0],
       [10880,  1343,   147,     0,     0],
       [  629,   753,     0,     0,     0],
       [  753,   764,  1343,   753,     0],
       [ 1343,   753,     0,     0,     0],
       [10880,   764,   237,     0,     0],
       [10880,   764,   237,     0,     0],
       [10880,   764,   753,     0,     0],
       [ 1204,  1204,  1343,   753,   971],
       [10880,   162,  1343,  3320,     0]], dtype=int32)

In [121]:
from tensorflow.keras.utils import to_categorical

text = "This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
print(tokenizer.word_index)

{'the': 1, 'and': 2, 'in': 3, 'this': 4, 'was': 5, 'not': 6, 'map': 7, 'we': 8, 'found': 9, 'billy': 10, "bones's": 11, 'chest': 12, 'but': 13, 'an': 14, 'accurate': 15, 'copy': 16, 'complete': 17, 'all': 18, 'things': 19, 'names': 20, 'heights': 21, 'soundings': 22, 'with': 23, 'single': 24, 'exception': 25, 'of': 26, 'red': 27, 'crosses': 28, 'written': 29, 'notes': 30}


In [122]:
sub_text = 'the map found copy in all written'

encoded = tokenizer.texts_to_sequences([sub_text])[0]
encoded

[1, 7, 9, 16, 3, 18, 29]

In [123]:
one_hot = to_categorical(encoded)
one_hot

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)

In [124]:
# Bad of Words : 단어들의 순서를 전혀 고려하지 않고 출현 빈도에만 초점을 맞춘 수치화 방법
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['you know I want yout love. because i love you.']
vector = CountVectorizer()

print('Bag of Words : ', vector.fit_transform(corpus).toarray())
print(vector.vocabulary_)

Bag of Words :  [[1 1 2 1 2 1]]
{'you': 4, 'know': 1, 'want': 3, 'yout': 5, 'love': 2, 'because': 0}


In [125]:
corpus = ['The cat sat on the mat', 'The dog ate my notebook', 'The cat chased the dog']

vector = CountVectorizer()

X = vector.fit_transform(corpus)
X

<3x10 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [126]:
print(vector.get_feature_names_out())
print(X.toarray())

['ate' 'cat' 'chased' 'dog' 'mat' 'my' 'notebook' 'on' 'sat' 'the']
[[0 1 0 0 1 0 0 1 1 2]
 [1 0 0 1 0 1 1 0 0 1]
 [0 1 1 1 0 0 0 0 0 2]]


In [127]:
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=['the', 'a', 'an', 'is', 'not'])
print('Bag of Words : ', vect.fit_transform(text).toarray())
print(vect.vocabulary_)

Bag of Words :  [[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [128]:
vect = CountVectorizer(stop_words='english')
print('Bag of Words : ', vect.fit_transform(text).toarray())
print(vect.vocabulary_)

Bag of Words :  [[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


In [129]:
stop_words = stopwords.words('english')
vect = CountVectorizer(stop_words = stop_words)
print('Bag of Words : ', vect.fit_transform(text).toarray())
print(vect.vocabulary_)

Bag of Words :  [[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


In [130]:
import pandas as pd
from math import log

docs = [
    '사과는 빨갛다',
    '사과는 맛있다',
    '바나나는 길다',
    '바나나는 맛있다',
    '기차는 길다'
]

vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()
vocab

['기차는', '길다', '맛있다', '바나나는', '빨갛다', '사과는']

In [131]:
N = len(docs)

def tf(t, d):
    return d.count(t)

def idf(t):
    df = 0
    for doc in docs:
        df += t in doc
    return log(N/(df+1))

def tfidf(t, d):
    return tf(t, d) * idf(t)

In [132]:
result = []

for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tf(t, d))

tf_ = pd.DataFrame(result, columns = vocab)
tf_

Unnamed: 0,기차는,길다,맛있다,바나나는,빨갛다,사과는
0,0,0,0,0,1,1
1,0,0,1,0,0,1
2,0,1,0,1,0,0
3,0,0,1,1,0,0
4,1,1,0,0,0,0


In [133]:
result = []

for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=['IDF'])
idf_

Unnamed: 0,IDF
기차는,0.916291
길다,0.510826
맛있다,0.510826
바나나는,0.510826
빨갛다,0.916291
사과는,0.510826


In [134]:
result = []

for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tfidf(t, d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,기차는,길다,맛있다,바나나는,빨갛다,사과는
0,0.0,0.0,0.0,0.0,0.916291,0.510826
1,0.0,0.0,0.510826,0.0,0.0,0.510826
2,0.0,0.510826,0.0,0.510826,0.0,0.0
3,0.0,0.0,0.510826,0.510826,0.0,0.0
4,0.916291,0.510826,0.0,0.0,0.0,0.0


In [135]:
from sklearn.feature_extraction.text import CountVectorizer

courpus = [
    'you know i want your love',
    'I like you',
    'what should i do'
]

vector = CountVectorizer()

print(vector.fit_transform(corpus).toarray())
print(vector.vocabulary_)

[[0 1 0 0 1 0 0 1 1 2]
 [1 0 0 1 0 1 1 0 0 1]
 [0 1 1 1 0 0 0 0 0 2]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'mat': 4, 'dog': 3, 'ate': 0, 'my': 5, 'notebook': 6, 'chased': 2}


In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfv = TfidfVectorizer().fit(corpus)

print(tfidfv.fit_transform(corpus).toarray())
print(tfidfv.vocabulary_)

[[0.         0.34101521 0.         0.         0.44839402 0.
  0.         0.44839402 0.44839402 0.52965746]
 [0.50461134 0.         0.         0.38376993 0.         0.50461134
  0.50461134 0.         0.         0.29803159]
 [0.         0.40352536 0.53058735 0.40352536 0.         0.
  0.         0.         0.         0.62674687]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'mat': 4, 'dog': 3, 'ate': 0, 'my': 5, 'notebook': 6, 'chased': 2}


In [137]:
sentences = ['The cat sat on the mat', 'The dog ate my notebook', 'The cat chased the dog']

tokenized_sentences = [sentence.lower().split() for sentence in sentences]
tokenized_sentences

[['the', 'cat', 'sat', 'on', 'the', 'mat'],
 ['the', 'dog', 'ate', 'my', 'notebook'],
 ['the', 'cat', 'chased', 'the', 'dog']]

In [138]:
from gensim.models import Word2Vec

model = Word2Vec(tokenized_sentences, vector_size=50, window=5, min_count=1, workers=4)

cat_vector = model.wv['cat']

cat_vector

array([-0.01723938,  0.00733148,  0.01037977,  0.01148388,  0.01493384,
       -0.01233535,  0.00221123,  0.01209456, -0.0056801 , -0.01234705,
       -0.00082045, -0.0167379 , -0.01120002,  0.01420908,  0.00670508,
        0.01445134,  0.01360049,  0.01506148, -0.00757831, -0.00112361,
        0.00469675, -0.00903806,  0.01677746, -0.01971633,  0.01352928,
        0.00582883, -0.00986566,  0.00879638, -0.00347915,  0.01342277,
        0.0199297 , -0.00872489, -0.00119868, -0.01139127,  0.00770164,
        0.00557325,  0.01378215,  0.01220219,  0.01907699,  0.01854683,
        0.01579614, -0.01397901, -0.01831173, -0.00071151, -0.00619968,
        0.01578863,  0.01187715, -0.00309133,  0.00302193,  0.00358008],
      dtype=float32)

In [139]:
similar_to_cat = model.wv.most_similar('cat')
similar_to_cat

[('notebook', 0.16563552618026733),
 ('on', 0.13940520584583282),
 ('the', 0.1267007440328598),
 ('ate', 0.08872982859611511),
 ('dog', 0.011071977205574512),
 ('my', -0.027841340750455856),
 ('sat', -0.03727477416396141),
 ('chased', -0.15515567362308502),
 ('mat', -0.2187293916940689)]

In [140]:
import nltk
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

dataset = fetch_20newsgroups(shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

print('총 샘플 수 : ',len(documents))

총 샘플 수 :  11314


In [141]:
news_df = pd.DataFrame({'document':documents})

news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ') # 특수문자 제거
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) # 길이가 3 이하인 단어 제거
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower()) # 소문자 변환

In [142]:
news_df.isnull().values.any()

False

In [143]:
news_df.replace('', float('NaN'), inplace=True)
news_df.isnull().values.any()

True

In [144]:
news_df.dropna(inplace=True)
print('총 샘플 수 : ', len(news_df))

총 샘플 수 :  11004


In [145]:
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x : x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
tokenized_doc = tokenized_doc.to_list()

In [146]:
# Indices of sentences with length 1 or less
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]

# Remove those sentences from tokenized_doc
tokenized_doc = [sentence for index, sentence in enumerate(tokenized_doc) if index not in drop_train]

# Print the total number of remaining samples
print('총 샘플 수 : ', len(tokenized_doc))

총 샘플 수 :  10961


In [147]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

word2idx = tokenizer.word_index
idx2word = {value : key for key, value in word2idx.items()}
encoded = tokenizer.texts_to_sequences(tokenized_doc)

print(encoded[:3])

[[913, 19, 8, 9872, 1017, 39306, 2032, 2033, 497, 890, 63435, 375, 24007, 84, 63436, 3940, 25, 7921, 1526, 462, 8790, 999, 381, 3317, 847, 19, 63437, 455, 1998, 748, 15889, 64, 13041, 4231, 1640, 515, 288, 24008, 109, 2033, 28, 2816], [1226, 42, 8791, 6911, 4335, 1309, 8328, 1581, 2865, 39307, 28, 83, 3186, 259, 11172, 2865, 17773, 317, 39308, 4473, 3187, 781, 24009, 1662, 63438, 2385, 2458, 1403, 726, 163, 4033, 5710, 347, 20477, 24010, 106, 1999, 28, 573, 706, 816, 132, 1309, 1431, 2227, 39307, 882], [40, 3679, 533, 843, 471, 5345, 3852, 871, 199, 7520, 37, 9873, 63439, 731, 328, 5711, 902, 24011, 109, 4591, 3941, 120, 1192, 279, 20478, 883, 7521, 286, 5, 5923, 106, 1357, 3941, 63440, 39309, 139, 63441, 616, 12, 39310, 2180, 63442, 139, 7522, 20, 213, 39311, 913, 286, 24012, 286, 139, 4127, 303, 5712, 3941, 129, 2, 498, 29726, 162, 24013, 437, 2965, 305, 63443, 8, 60, 7523, 11173, 163, 90, 29727, 25, 210, 13, 13042, 305, 63444, 411, 102, 14287, 63445, 25, 63446, 8, 20479, 914, 3, 142

In [148]:
print('단어 개수 : ', len(word2idx) + 1)

단어 개수 :  181839


In [149]:
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(sample, vocabulary_size=len(word2idx)+1, window_size=10) for sample in encoded[:10]]

In [150]:
paris, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print('({:s}) ({:d}), {:s} ({:d})) -> {:d}'.format(
        idx2word[paris[i][0]], paris[i][0],
        idx2word[paris[i][1]], paris[i][1],
        labels[i]))

(front) (462), late (890)) -> 1
(whatever) (515), embarassing.2nd.amendment (100673)) -> 0
(60s/) (63435), day. (1017)) -> 1
(know.) (847), addition, (1526)) -> 1
(addition,) (1526), model (455)) -> 1


In [151]:
print('전체 샘플 수 : ', len(skip_grams))

전체 샘플 수 :  10


In [152]:
print(len(paris), len(labels))

1460 1460


In [153]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

In [154]:
dim = 100

w_inputs = Input(shape=(1, ), dtype='int32')
word_embedding = Embedding(len(word2idx)+1, dim)(w_inputs)

c_inputs = Input(shape=(1, ), dtype='int32')
context_embedding = Embedding(len(word2idx)+1, dim)(c_inputs)

dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,), input_shape=(1, 1))(dot_product)
output = Activation('sigmoid')(dot_product)

model = Model(inputs=[w_inputs, c_inputs], outputs = output)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 1, 100)               1818390   ['input_3[0][0]']             
                                                          0                                       
                                                                                                  
 embedding_3 (Embedding)     (None, 1, 100)               1818390   ['input_4[0][0]']       

In [155]:
model.compile(loss='binary_crossentropy', optimizer='adam')

In [156]:
for epoch in range(1, 6):
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)

    print('Epoch : ', epoch, 'Loss : ', loss)

Epoch :  1 Loss :  6.931556463241577
Epoch :  2 Loss :  6.931489825248718
Epoch :  3 Loss :  6.93148010969162
Epoch :  4 Loss :  6.931474685668945
Epoch :  5 Loss :  6.931472718715668


In [157]:
import gensim

f = open('vectors.txt', 'w', encoding='utf-8')
f.write('{} {}\n'.format(len(word2idx), 100))
vectors = model.get_weights()[0]

for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [158]:
w2v.most_similar(positive=['engine'])

[('stereogram', 0.48528584837913513),
 ('superpowers,', 0.44478344917297363),
 ('n_0!=za#;ss', 0.41870638728141785),
 ('sanctioning', 0.41490331292152405),
 ("could've", 0.40699246525764465),
 ('flowing', 0.3929716646671295),
 ('spart.par', 0.3913535475730896),
 ('disclaimed', 0.3847351372241974),
 ('pinched', 0.3830842971801758),
 ('"effectively".', 0.3813590705394745)]