# Integer Encoding
문장을 구성하는 단어들에 대해 숫자 부여하기

보통 ABC순 또는 빈도가 높은순으로 구성

나중에 딥러닝할 때 빈도가 높은 순 사용

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split the raw lyrics (one multi-line string) into a list of sentence strings.
from nltk.tokenize import sent_tokenize
text = """Isn't she lovely.
Isn't she wonderful.
Isn't she precious.
Less than one minute old.
I never thought through love we'd be.
Making one as lovely as she.
But isn't she lovely made from love.
Isn't she pretty.
Truly the angel's best.
Boy, I'm so happy.
We have been heaven blessed.
I can't believe what God has done.
Through us he's given life to one.
But isn't she lovely made from love.
Isn't she lovely.
Life and love are the same.
Life is Aisha.
The meaning of her name.
Londie, it could have not been done.
Without you who conceived the one.
That's so very lovely made from love."""

# NOTE: `text` is rebound here from the raw string to the list of sentences
# returned by sent_tokenize; the word-tokenization cell iterates over that list.
text = sent_tokenize(text)
print(text)

["Isn't she lovely.", "Isn't she wonderful.", "Isn't she precious.", 'Less than one minute old.', "I never thought through love we'd be.", 'Making one as lovely as she.', "But isn't she lovely made from love.", "Isn't she pretty.", "Truly the angel's best.", "Boy, I'm so happy.", 'We have been heaven blessed.', "I can't believe what God has done.", "Through us he's given life to one.", "But isn't she lovely made from love.", "Isn't she lovely.", 'Life and love are the same.', 'Life is Aisha.', 'The meaning of her name.', 'Londie, it could have not been done.', 'Without you who conceived the one.', "That's so very lovely made from love."]


# Word Tokenization

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# The order of the steps matters: tokenize first, then clean
# (lowercase, drop stop words, drop very short tokens).
stop_words = set(stopwords.words('english'))  # NLTK English stop words

sentences = []
for line in text:
  # Word-tokenize the sentence and lowercase every token.
  lowered = [token.lower() for token in word_tokenize(line)]
  # Keep tokens that are not stop words and are longer than two characters.
  kept = [token for token in lowered if token not in stop_words and len(token) > 2]
  sentences.append(kept)
print(sentences)

[["n't", 'lovely'], ["n't", 'wonderful'], ["n't", 'precious'], ['less', 'one', 'minute', 'old'], ['never', 'thought', 'love'], ['making', 'one', 'lovely'], ["n't", 'lovely', 'made', 'love'], ["n't", 'pretty'], ['truly', 'angel', 'best'], ['boy', 'happy'], ['heaven', 'blessed'], ["n't", 'believe', 'god', 'done'], ['given', 'life', 'one'], ["n't", 'lovely', 'made', 'love'], ["n't", 'lovely'], ['life', 'love'], ['life', 'aisha'], ['meaning', 'name'], ['londie', 'could', 'done'], ['without', 'conceived', 'one'], ['lovely', 'made', 'love']]


In [None]:
# 불용어, 단어의 길이가 2이하인 것들은 다 사라짐

## 단어 집합 만들기(Python)
배열을 풀어야지 각 단어들마다 빈도수를 셀 수 있음

In [None]:
from collections import Counter  # counts the elements of an iterable into a dict-like mapping
# Flatten the 2-D list of tokenized sentences into one flat list of words so that
# each word can be counted.
# This yields the same list as sum(sentences, []) (start from [] and concatenate
# every inner list onto it), but sum() re-copies the growing list at each step
# (quadratic), whereas the comprehension visits each word once.
# A plain for loop would work just as well.
words = [word for sentence in sentences for word in sentence]
print(words)

["n't", 'lovely', "n't", 'wonderful', "n't", 'precious', 'less', 'one', 'minute', 'old', 'never', 'thought', 'love', 'making', 'one', 'lovely', "n't", 'lovely', 'made', 'love', "n't", 'pretty', 'truly', 'angel', 'best', 'boy', 'happy', 'heaven', 'blessed', "n't", 'believe', 'god', 'done', 'given', 'life', 'one', "n't", 'lovely', 'made', 'love', "n't", 'lovely', 'life', 'love', 'life', 'aisha', 'meaning', 'name', 'londie', 'could', 'done', 'without', 'conceived', 'one', 'lovely', 'made', 'love']


In [None]:
# 배열 안 쪽에 있는 걸 다 풀어서
# 오른쪽에 입력한 것 : []
# 안에 다 넣겠다.

In [None]:
# The Counter itself serves as the vocabulary:
# it maps each word to the number of times it occurs in `words`.
vocab = Counter(words)
print(vocab)

Counter({"n't": 8, 'lovely': 6, 'love': 5, 'one': 4, 'made': 3, 'life': 3, 'done': 2, 'wonderful': 1, 'precious': 1, 'less': 1, 'minute': 1, 'old': 1, 'never': 1, 'thought': 1, 'making': 1, 'pretty': 1, 'truly': 1, 'angel': 1, 'best': 1, 'boy': 1, 'happy': 1, 'heaven': 1, 'blessed': 1, 'believe': 1, 'god': 1, 'given': 1, 'aisha': 1, 'meaning': 1, 'name': 1, 'londie': 1, 'could': 1, 'without': 1, 'conceived': 1})


In [None]:
# 딕셔너리를 활용해서
# 단어가 몇 회씩 등장했는 지를 세줌

In [None]:
print(vocab["lovely"])

6


# Integer Encoding 수행

In [None]:
# Sort the (word, count) pairs by count, most frequent first.
# key picks the count (x[1]) out of each pair; reverse=True gives descending order.
vocab_sorted = sorted(vocab.items(), key=lambda x: x[1], reverse = True)
print(vocab_sorted)

[("n't", 8), ('lovely', 6), ('love', 5), ('one', 4), ('made', 3), ('life', 3), ('done', 2), ('wonderful', 1), ('precious', 1), ('less', 1), ('minute', 1), ('old', 1), ('never', 1), ('thought', 1), ('making', 1), ('pretty', 1), ('truly', 1), ('angel', 1), ('best', 1), ('boy', 1), ('happy', 1), ('heaven', 1), ('blessed', 1), ('believe', 1), ('god', 1), ('given', 1), ('aisha', 1), ('meaning', 1), ('name', 1), ('londie', 1), ('could', 1), ('without', 1), ('conceived', 1)]


In [None]:
# sorted(정렬대상(리스트 같은 것) - items(key, value를 튜플 형태로 만들어줌), 
#                  key : 정렬의 기준(x에 item 하나하나가 들어옴, x[1] : 횟수, x[0] : 단어 그 중 횟수), 
#                  reverse = True : 내림차순 정렬)

In [None]:
# Give more frequent words lower integer indices (vocab_sorted is already in
# descending order of count, so ranks follow that order, starting at 1).
# Frequency-based cleaning: words that occur only once receive no index.
frequent_words = [word for word, frequency in vocab_sorted if frequency > 1]
word2idx = {word: index for index, word in enumerate(frequent_words, start=1)}
print(word2idx)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5, 'life': 6, 'done': 7}


In [None]:
# 이게 정수 인코딩

In [None]:
# 패턴 익히기, 순서가 중요

# 이론은 쉬운데 코딩은 어려움
# 연습 필요

단어를 모두 사용하는 것이 아니라, **빈도수 상위 5개 단어**만 사용하고 싶다면?

In [None]:
vocab_size = 5
# Collect every word whose index lies beyond vocab_size ...
words_frequency = [word for word, index in word2idx.items() if index > vocab_size]
# ... and remove those entries from the mapping in place.
for rare_word in words_frequency:
  word2idx.pop(rare_word)
print(word2idx)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5}


실제 텍스트를 정수로 표현하기

In [None]:
# Add an 'UNK' token to represent out-of-vocabulary (OOV) words.
# Its index is derived from vocab_size (the kept words use 1..vocab_size) instead of
# a hard-coded 6, so it stays correct if vocab_size is changed, and re-running the
# cell keeps assigning the same value.
word2idx['UNK'] = vocab_size + 1
print(word2idx)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5, 'UNK': 6}


In [None]:
# 단어 집합에 없는 단어는 이제 전부 UNK 로 들어온다.

In [None]:
# Integer-encode every tokenized sentence: a word found in word2idx becomes its
# index, any other word becomes the index of 'UNK'.
encoded = [
  [word2idx[token] if token in word2idx else word2idx['UNK'] for token in sentence]
  for sentence in sentences
]
print("변환 전 : {}".format(sentences[:5]))
print("변환 후 : {}".format(encoded[:5]))

변환 전 : [["n't", 'lovely'], ["n't", 'wonderful'], ["n't", 'precious'], ['less', 'one', 'minute', 'old'], ['never', 'thought', 'love']]
변환 후 : [[1, 2], [1, 6], [1, 6], [6, 4, 6, 6], [6, 6, 3]]


# Vocab & Integer Encoding을 Tensorflow로

In [None]:
print(sentences)

[["n't", 'lovely'], ["n't", 'wonderful'], ["n't", 'precious'], ['less', 'one', 'minute', 'old'], ['never', 'thought', 'love'], ['making', 'one', 'lovely'], ["n't", 'lovely', 'made', 'love'], ["n't", 'pretty'], ['truly', 'angel', 'best'], ['boy', 'happy'], ['heaven', 'blessed'], ["n't", 'believe', 'god', 'done'], ['given', 'life', 'one'], ["n't", 'lovely', 'made', 'love'], ["n't", 'lovely'], ['life', 'love'], ['life', 'aisha'], ['meaning', 'name'], ['londie', 'could', 'done'], ['without', 'conceived', 'one'], ['lovely', 'made', 'love']]


keras.preprocessing.text.Tokenizer 제공
- fit_on_texts를 사용하면 입력된 텍스트로부터 **단어 빈도수**가 높은 순으로 낮은 정수 인덱스를 부여

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Fit a Keras Tokenizer on the already-tokenized sentences; word_index maps each
# word to an integer, with lower integers for more frequent words (see the output).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
print(tokenizer.word_index)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5, 'life': 6, 'done': 7, 'wonderful': 8, 'precious': 9, 'less': 10, 'minute': 11, 'old': 12, 'never': 13, 'thought': 14, 'making': 15, 'pretty': 16, 'truly': 17, 'angel': 18, 'best': 19, 'boy': 20, 'happy': 21, 'heaven': 22, 'blessed': 23, 'believe': 24, 'god': 25, 'given': 26, 'aisha': 27, 'meaning': 28, 'name': 29, 'londie': 30, 'could': 31, 'without': 32, 'conceived': 33}


In [None]:
# Check how many times each word occurred
print(tokenizer.word_counts)

OrderedDict([("n't", 8), ('lovely', 6), ('wonderful', 1), ('precious', 1), ('less', 1), ('one', 4), ('minute', 1), ('old', 1), ('never', 1), ('thought', 1), ('love', 5), ('making', 1), ('made', 3), ('pretty', 1), ('truly', 1), ('angel', 1), ('best', 1), ('boy', 1), ('happy', 1), ('heaven', 1), ('blessed', 1), ('believe', 1), ('god', 1), ('done', 2), ('given', 1), ('life', 3), ('aisha', 1), ('meaning', 1), ('name', 1), ('londie', 1), ('could', 1), ('without', 1), ('conceived', 1)])


In [None]:
# Perform the integer encoding
print(tokenizer.texts_to_sequences(sentences))  # pass in the original tokenized word lists

[[1, 2], [1, 8], [1, 9], [10, 4, 11, 12], [13, 14, 3], [15, 4, 2], [1, 2, 5, 3], [1, 16], [17, 18, 19], [20, 21], [22, 23], [1, 24, 25, 7], [26, 6, 4], [1, 2, 5, 3], [1, 2], [6, 3], [6, 27], [28, 29], [30, 31, 7], [32, 33, 4], [2, 5, 3]]


상위 n개의 단어만 사용하기

In [None]:
vocab_size = 5
# Keep only the 5 most frequent words. The +1 is needed because word indices start
# at 1 (index 0 is not assigned to any word), so num_words must cover 0..vocab_size.
tokenizer = Tokenizer(num_words = vocab_size + 1)  # use only the top 5 words
tokenizer.fit_on_texts(sentences)

In [None]:
print(tokenizer.word_index)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5, 'life': 6, 'done': 7, 'wonderful': 8, 'precious': 9, 'less': 10, 'minute': 11, 'old': 12, 'never': 13, 'thought': 14, 'making': 15, 'pretty': 16, 'truly': 17, 'angel': 18, 'best': 19, 'boy': 20, 'happy': 21, 'heaven': 22, 'blessed': 23, 'believe': 24, 'god': 25, 'given': 26, 'aisha': 27, 'meaning': 28, 'name': 29, 'londie': 30, 'could': 31, 'without': 32, 'conceived': 33}


In [None]:
print(tokenizer.texts_to_sequences(sentences))
# Only indices 1..vocab_size survive; words outside the top vocab_size are dropped
# (no OOV token is configured). The "+1" in num_words accounts for the padding index 0.

[[1, 2], [1], [1], [4], [3], [4, 2], [1, 2, 5, 3], [1], [], [], [], [1], [4], [1, 2, 5, 3], [1, 2], [3], [], [], [], [4], [2, 5, 3]]


In [None]:
# padding까지 고려하면 vocab_size + 1
# padding, oov까지 고려하면 vocab_size+2
# 근데 보통은 둘 다 고려하니까 보통 내가 사용할 vocabulary의 size + 2를 주로 사용한다.

In [None]:
# padding 은 sentence의 길이 중 제일 긴 걸 기준으로 해서
# 길이가 모자란건 0으로 채워주는 것

In [None]:
# Account for both OOV and PAD: vocab_size words + the OOV token + padding index 0
tokenizer = Tokenizer(num_words=vocab_size + 2,
                      oov_token='OOV')  # the OOV token string can be chosen freely, e.g. '<OOV>', 'OOV' or 'UNK'
tokenizer.fit_on_texts(sentences)
print(tokenizer.texts_to_sequences(sentences))

[[2, 3], [2, 1], [2, 1], [1, 5, 1, 1], [1, 1, 4], [1, 5, 3], [2, 3, 6, 4], [2, 1], [1, 1, 1], [1, 1], [1, 1], [2, 1, 1, 1], [1, 1, 5], [2, 3, 6, 4], [2, 3], [1, 4], [1, 1], [1, 1], [1, 1, 1], [1, 1, 5], [3, 6, 4]]


In [None]:
print("OOV의 인덱스 : {}".format(tokenizer.word_index['OOV']))

OOV의 인덱스 : 1


# 한국어 문장 토큰화 및 정수 인코딩 하기

In [65]:
!pip install konlpy



In [64]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 72 (delta 31), reused 20 (delta 5), pack-reused 0[K
Unpacking objects: 100% (72/72), done.
/content/Mecab-ko-for-Google-Colab/Mecab-ko-for-Google-Colab
Installing konlpy.....
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
--2020-11-12 01:06:07--  https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
Resolving bitbucket.org (bitbucket.org)... 104.192.141.1, 2406:da00:ff00::22c5:2ef4, 2406:da00:ff00::22c0:3470, ...
Connecting to bitbucket.org (bitbucket.org)|104.192.141.1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://bbuseruploads.s3.amazonaws.com/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar

In [None]:
import pandas as pd
import numpy as np
import urllib.request
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Download the NSMC ratings_test.txt file into the working directory.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7f66c7009710>)

In [None]:
# Load the downloaded file into a DataFrame and inspect column types / non-null counts.
# NOTE(review): the variable is called train_data but the file loaded is ratings_test.txt.
train_data = pd.read_table("ratings_test.txt")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [None]:
# document 에 NaN 값이 3개 있다 -> 삭제를 한다

In [None]:
train_data.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


1. nan값 확인 후 제거
2. 한글 정제(영어, 특수문자 등 제거) - soynlp 등을 활용하여 반복되는 문자들 정제 등등
3. 중복 데이터가 있는 지 검사 (선택사항)

In [None]:
# train_data['document']  # 이렇게 하면 시리즈
# train_data['document'].str.replace("정규식", "바꿀거")  # 이렇게 하면 됨

In [None]:
# Start by dropping rows whose review text duplicates an earlier row,
# then show how many distinct review texts remain.
train_data = train_data.drop_duplicates(subset=["document"])
train_data["document"].nunique()

49157

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49158 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        49158 non-null  int64 
 1   document  49157 non-null  object
 2   label     49158 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [None]:
# 원래 데이터 5만개였는 데
# 중복값이 사라진 것을 확인

In [None]:
# The document column still appears to contain a NaN value,
# so remove the null rows.
train_data = train_data.dropna(how="any")  # drop a row if any of its values is NaN

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49157 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        49157 non-null  int64 
 1   document  49157 non-null  object
 2   label     49157 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [None]:
# null값이 사라져서 모든 컬럼에 해당하는 non-null 데이터의 갯수가 동일해진 것 확인

In [None]:
# Remove every character that is not Hangul (jamo ㄱ-ㅎ / ㅏ-ㅣ or syllables 가-힣).
# regex=True must be passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, in which case nothing would be replaced.
# NOTE(review): the character class contains no space, so the spaces between words
# are stripped too (visible in the head() output) — confirm this is intended before
# running the morphological analyzer on the result.
train_data["document"] = train_data["document"].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]", "", regex=True)
train_data.head()

Unnamed: 0,id,document,label
0,6270596,굳ㅋ,1
1,9274899,,0
2,8544678,뭐야이평점들은나쁘진않지만점짜리는더더욱아니잖아,0
3,6825595,지루하지는않은데완전막장임돈주고보기에는,0
4,6723715,만아니었어도별다섯개줬을텐데왜로나와서제심기를불편하게하죠,0


In [None]:
# GDNTOPCLASSINTHECLUB 데이터가 한글이 아니라서 없어진 것 확인가능
# 하지만 그냥 빈칸이라서 nan값으로 채워준 다음에 그 행 자체를 없애주자.

In [None]:
# Turn reviews that became empty strings into NaN so that dropna() can remove them.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on train_data["document"]: that form is a chained
# in-place update, which newer pandas warns about and which does not modify the
# DataFrame when Copy-on-Write is enabled.
train_data["document"] = train_data["document"].replace("", np.nan)
# Total number of null values per column: shows how many documents were replaced
# with NaN (305 in the recorded output). They are only marked here, not removed yet.
print(train_data.isnull().sum())

id            0
document    305
label         0
dtype: int64


In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49157 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        49157 non-null  int64 
 1   document  48852 non-null  object
 2   label     49157 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [None]:
# Now drop every row that contains one of those NaN values
train_data = train_data.dropna(how="any")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48852 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        48852 non-null  int64 
 1   document  48852 non-null  object
 2   label     48852 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [None]:
train_data.head()

Unnamed: 0,id,document,label
0,6270596,굳ㅋ,1
2,8544678,뭐야이평점들은나쁘진않지만점짜리는더더욱아니잖아,0
3,6825595,지루하지는않은데완전막장임돈주고보기에는,0
4,6723715,만아니었어도별다섯개줬을텐데왜로나와서제심기를불편하게하죠,0
5,7898805,음악이주가된최고의음악영화,1


In [None]:
# Korean stop words to drop from the tokenized reviews.
# Stored as a set so the per-token membership test in the filtering loop is O(1).
# NOTE(review): this rebinds the name `stopwords`, which earlier in the notebook
# referred to the object imported from nltk.corpus.
stopwords = {'의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다'}

In [None]:
# Create the Okt morphological analyzer used for tokenizing before stop-word removal.
from konlpy.tag import Okt
okt = Okt()

In [None]:
# Tokenize every review into morphemes with Okt (stem=True; the recorded output
# shows dictionary forms such as '굳다', '나쁘다') and drop the stop words.
X_train = []
for sentence in train_data["document"]:
  tokens = okt.morphs(sentence, stem=True)
  X_train.append([word for word in tokens if word not in stopwords])  # stop-word removal

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a fresh Tokenizer on the morpheme-tokenized reviews to build their word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
X_train

[['굳다', 'ㅋ'],
 ['뭐', '야', '평점', '나쁘다', '않다', '만점', '짜다', '리', '더', '더욱', '아니다'],
 ['지루하다', '않다', '완전', '막장', '임돈', '주다', '보기', '에는'],
 ['만', '아니다', '별', '다섯', '개', '주다', '왜', '로나', '서다', '제', '심기', '불편하다'],
 ['음악', '주가', '되다', '최고', '음악', '영화'],
 ['진정하다', '쓰레기'],
 ['마치',
  '미국',
  '애니',
  '에서',
  '튀어나오다',
  '창의력',
  '없다',
  '로봇',
  '디자인',
  '부터가',
  '고개',
  '젖다',
  '다'],
 ['갈수록',
  '개판',
  '되다',
  '중국영화',
  '유치하다',
  '없다',
  '폼',
  '잡다',
  '끝나다',
  '안되다',
  '무기',
  '유치하다',
  '남무',
  '아',
  '그리다',
  '동사서독',
  '같다',
  '영화',
  '이건',
  '류',
  '아',
  '류작',
  '이다'],
 ['이별', '아픔', '뒤', '찾아오다', '새롭다', '인연', '기쁨', '모든', '사람', '그렇다', '않다'],
 ['괜찮다', '오랜', '만', '포켓몬스터', '잼밌', '어', '요'],
 ['한국', '독립영화', '한계', '그렇게', '아버지', '되다', '비교', '되다'],
 ['청춘',
  '아름답다',
  '다그',
  '아름답다',
  '움',
  '이성',
  '을',
  '흔들다',
  '찰나',
  '아름답다',
  '움',
  '을',
  '자다',
  '포착',
  '섬세하다',
  '아름답다',
  '수채화',
  '같다',
  '퀴어',
  '영화',
  '이다'],
 ['눈', '보이다', '반전', '이다', '영화', '흡인', '력', '사라지다', '않다'],
 ['스토리',
  '연',
  '출연',
 

# TF-IDF
- TF : Term Frequency
- IDF : Inverse Document Frequency

In [None]:
sample = "hello bye bye"
sample.count("bye")

2

In [104]:
def term_frequency(term, document):
  """Return how many times `term` occurs in a single `document`.

  The count comes from `document.count(term)`, so for a string document it is a
  non-overlapping substring count (e.g. '대한' is also counted inside '대한으로'),
  and for a list of tokens it is the number of equal elements.
  """
  occurrences = document.count(term)
  return occurrences
def document_frequency(term, documents):
  """Return the number of documents in `documents` that contain `term`.

  Each document contributes at most 1, no matter how often the term occurs
  inside it; containment is decided by `term in document`.
  """
  return sum(1 for document in documents if term in document)

def inverse_document_frequency(term, documents):
  """Return the smoothed inverse document frequency log(N / (1 + df)).

  N is the number of documents and df is the number of documents containing
  `term` (from document_frequency). Because of the +1 in the denominator the
  value is negative when the term occurs in every document.
  """
  import math

  total_documents = len(documents)
  containing = document_frequency(term, documents)

  return math.log(total_documents / (1 + containing))

def tf_idf(term, documents, idx):
  """Return the TF-IDF score of `term` for the document at position `idx`.

  The term frequency is measured in documents[idx] only, while the inverse
  document frequency is computed over the whole `documents` collection.
  """
  tf = term_frequency(term, documents[idx])
  idf = inverse_document_frequency(term, documents)
  return tf * idf

In [105]:
docs = [
  '동해 물과 백두산이 마르고 닳도록 하느님이 보우하사 우리나라 만세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 동해 가고 싶다',
  '남산 위에 저 소나무, 철갑을 두른 듯 바람 서리 불변함은 우리 기상일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 소나무 이쁘다',
  '가을 하늘 공활한데 높고 구름 없이 밝은 달은 우리 가슴 일편단심일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 가을 하늘 보고 싶다.',
  '이 기상과 이 마음으로 충성을 다하여 괴로우나 즐거우나 나라 사랑하세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 나라를 사랑하자'
] 

In [106]:
from konlpy.tag import Mecab
mecab = Mecab()

# Vocabulary: the distinct nouns Mecab extracts from all documents, in sorted order.
vocab = sorted({noun for doc in docs for noun in mecab.nouns(doc)})

In [107]:
print(vocab)

['가슴', '가을', '강산', '구름', '기상', '길', '나라', '남산', '달', '대한', '데', '동해', '듯', '마음', '만세', '무궁화', '물', '바람', '백두산', '보우', '보전', '불변', '사람', '사랑', '삼천리', '서리', '소나무', '우리', '위', '일편단심', '철갑', '충성', '하느님', '하늘', '화려']


In [108]:
# Build the DTM (document-term matrix) of raw term frequencies.
# Each document is one row; each vocabulary term is one column.
import pandas as pd

# Rows of the data frame: for every document, the term frequency of every vocab term.
result = [[term_frequency(term, doc) for term in vocab] for doc in docs]

tf_ = pd.DataFrame(result, columns = vocab)
tf_

Unnamed: 0,가슴,가을,강산,구름,기상,길,나라,남산,달,대한,데,동해,듯,마음,만세,무궁화,물,바람,백두산,보우,보전,불변,사람,사랑,삼천리,서리,소나무,우리,위,일편단심,철갑,충성,하느님,하늘,화려
0,0,0,1,0,0,1,1,0,0,2,0,2,0,0,1,1,1,0,1,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,1
1,0,0,1,0,1,1,0,1,0,2,0,0,1,0,0,1,0,1,0,0,1,1,1,0,1,1,2,1,1,0,1,0,0,0,1
2,1,2,1,1,0,1,0,0,1,2,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,2,1
3,0,0,1,0,1,1,2,0,0,2,0,0,0,1,0,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,0,1,0,0,1


In [109]:
# Build the IDF data frame: one inverse-document-frequency value per vocabulary
# term, computed over all documents.
result = [inverse_document_frequency(term, docs) for term in vocab]

idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])
idf_.head()

Unnamed: 0,IDF
가슴,0.693147
가을,0.693147
강산,-0.223144
구름,0.693147
기상,0.287682


In [110]:
# Build the TF-IDF matrix: one row per document, one column per vocabulary term.
# tf_idf takes the document's position, so the outer loop runs over indices.
N = len(docs)
result = [[tf_idf(term, docs, i) for term in vocab] for i in range(N)]

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,가슴,가을,강산,구름,기상,길,나라,남산,달,대한,데,동해,듯,마음,만세,무궁화,물,바람,백두산,보우,보전,불변,사람,사랑,삼천리,서리,소나무,우리,위,일편단심,철갑,충성,하느님,하늘,화려
0,0.0,0.0,-0.223144,0.0,0.0,-0.223144,0.287682,0.0,0.0,-0.446287,0.0,1.386294,0.0,0.0,0.693147,-0.223144,0.693147,0.0,0.693147,0.693147,-0.223144,0.0,-0.223144,0.0,-0.223144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,-0.223144
1,0.0,0.0,-0.223144,0.0,0.287682,-0.223144,0.0,0.693147,0.0,-0.446287,0.0,0.0,0.693147,0.0,0.0,-0.223144,0.0,0.693147,0.0,0.0,-0.223144,0.693147,-0.223144,0.0,-0.223144,0.693147,1.386294,0.0,0.693147,0.0,0.693147,0.0,0.0,0.0,-0.223144
2,0.693147,1.386294,-0.223144,0.693147,0.0,-0.223144,0.0,0.0,0.693147,-0.446287,0.693147,0.0,0.0,0.0,0.0,-0.223144,0.0,0.0,0.0,0.0,-0.223144,0.0,-0.223144,0.0,-0.223144,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,1.386294,-0.223144
3,0.0,0.0,-0.223144,0.0,0.287682,-0.223144,0.575364,0.0,0.0,-0.446287,0.0,0.0,0.0,0.693147,0.0,-0.223144,0.0,0.0,0.0,0.0,-0.223144,0.0,-0.223144,1.386294,-0.223144,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,-0.223144


In [111]:
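# A term that appears several times in one document gets a larger TF-IDF value there.
# E.g. '동해' appears twice in the first document, so its value in that row is large (1.386294).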

# Tensorflow로 BOW 구현하기

In [112]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

In [113]:
# Fit directly on the raw document strings (no morphological analysis), so the
# tokens keep their particles — e.g. '대한으로', '물과' in the printed index.
t = Tokenizer()
t.fit_on_texts(docs)
print(t.word_index)

{'무궁화': 1, '삼천리': 2, '화려': 3, '강산': 4, '대한': 5, '사람': 6, '대한으로': 7, '길이': 8, '보전하세': 9, '동해': 10, '싶다': 11, '소나무': 12, '우리': 13, '가을': 14, '하늘': 15, '이': 16, '물과': 17, '백두산이': 18, '마르고': 19, '닳도록': 20, '하느님이': 21, '보우하사': 22, '우리나라': 23, '만세': 24, '가고': 25, '남산': 26, '위에': 27, '저': 28, '철갑을': 29, '두른': 30, '듯': 31, '바람': 32, '서리': 33, '불변함은': 34, '기상일세': 35, '이쁘다': 36, '공활한데': 37, '높고': 38, '구름': 39, '없이': 40, '밝은': 41, '달은': 42, '가슴': 43, '일편단심일세': 44, '보고': 45, '기상과': 46, '마음으로': 47, '충성을': 48, '다하여': 49, '괴로우나': 50, '즐거우나': 51, '나라': 52, '사랑하세': 53, '나라를': 54, '사랑하자': 55}


In [114]:
# texts_to_matrix builds the matrix directly from the documents
print(t.texts_to_matrix(docs, mode="count"))  # DTM of raw counts, one row per document

[[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1.]]


In [115]:
# In the first row the 2 sits at column 10, i.e. '동해' (word_index 10), which occurs twice in the first document.

In [116]:
print(t.texts_to_matrix(docs, mode='tfidf')) # TF-IDF weighted document-term matrix

[[0.         0.58778666 0.58778666 0.58778666 0.58778666 0.58778666
  0.58778666 0.58778666 0.58778666 0.58778666 1.8601123  0.84729786
  0.         0.         0.         0.         0.         1.09861229
  1.09861229 1.09861229 1.09861229 1.09861229 1.09861229 1.09861229
  1.09861229 1.09861229 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.58778666 0.58778666 0.58778666 0.58778666 0.58778666
  0.58778666 0.58778666 0.58778666 0.58778666 0.         0.
  1.8601123  0.84729786 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         1.09861229 1.09861229 1.09861229 1.09861229
  1.09861229 1.09861229 1.09861229 1.09861229 1.09861229 1.09861229
  1.09861229 0.    

In [117]:
print(t.texts_to_matrix(docs, mode = 'freq').round(2))

[[0.   0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.1  0.05 0.   0.
  0.   0.   0.   0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.   0.   0.09 0.04
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.04 0.04
  0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.   0.04 0.   0.04
  0.08 0.08 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.04 0.04 0.04 0.04 0.04
  0.04 0.04 0.04 0.04 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.   0.   0.   0.
  0.   0.   0.1  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0

In [120]:
# freq mode: each word's count is divided by the total number of tokens in that document

In [118]:
print(t.texts_to_matrix(docs, mode = 'binary').round(2))

[[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1.]]


In [119]:
# binary는 있다/없다