## 문장의 표현(Sentence Representation)

## TDM(Term-Documnet Matrix)

In [2]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [3]:
# 띄어쓰기 토큰화
doc_ls = [doc.split() for doc in docs] #split하면 자동적으로 list 생성
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [4]:
# 각 고유 토큰에 인덱스를 지정
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc]
word2id

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 3,
             '오늘': 0,
             '원숭이를': 2,
             '원숭이에게': 5,
             '줬어': 7,
             '코끼리를': 4})

In [22]:
# DTM 계산
import numpy as np

DTM = np.zeros((len(doc_ls),len(word2id)),dtype=int)
for i,doc in enumerate(doc_ls):
  for token in doc:
    DTM[i,word2id[token]] += 1

DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [27]:
# TF 계산
def computeTF(DTM):
    doc_len = len(DTM)
    word_len = len(DTM[0])

    tf = np.zeros((doc_len,word_len))

    for doc_i in range(doc_len):
      for word_i in range(word_len):
        tf[doc_i, word_i] = DTM[doc_i, word_i]/DTM[doc_i].sum()
    
    return tf

tf = computeTF(DTM)
tf

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.2 , 0.2 , 0.  , 0.4 , 0.2 , 0.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.  , 0.  , 0.  , 0.2 , 0.4 , 0.2 ]])

In [39]:
# IDF 계산
import math

def computeIDF(DTM):
  doc_len = len(DTM)
  word_len = len(DTM[0])

  idf = np.zeros(word_len)
  for i in range(word_len):
    idf[i] = -math.log10(np.count_nonzero(DTM[:,i])/doc_len)

  return idf
idf = computeIDF(DTM)
idf

array([ 0.17609126, -0.        ,  0.47712125,  0.17609126,  0.47712125,
        0.47712125,  0.47712125,  0.47712125])

In [41]:
tf.shape

(3, 8)

In [44]:
# tf-idf 계산하기
def computeTFIDF(DTM):
  tf = computeTF(DTM)
  idf = computeIDF(DTM)

  tfidf = np.zeros(tf.shape)
  for doc_i in range(tf.shape[0]):
    for word_i in range(tf.shape[1]):
      tfidf[doc_i, word_i] = tf[doc_i, word_i] * idf[word_i]
  
  return tfidf

tfidf = computeTFIDF(DTM)
tfidf

array([[ 0.04402281, -0.        ,  0.11928031,  0.04402281,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.03521825, -0.        ,  0.        ,  0.0704365 ,  0.09542425,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.09542425,  0.1908485 ,  0.09542425]])

In [59]:
import pandas as pd

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
tfidf = computeTFIDF(DTM)
df = pd.DataFrame(tfidf, columns=vocab)
df['문서'] = ['문서0','문서1','문서2']
df.set_index('문서')

Unnamed: 0_level_0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
문서,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
문서0,0.044023,-0.0,0.11928,0.044023,0.0,0.0,0.0,0.0
문서1,0.035218,-0.0,0.0,0.070437,0.095424,0.0,0.0,0.0
문서2,0.0,-0.0,0.0,0.0,0.0,0.095424,0.190849,0.095424


## 유사도 계산

In [62]:
word_embedding_dic = {
'사과' : [1.0, 0.5],
'바나나' : [0.9, 1.2],
'원숭이' : [0.5, 1.5]
}

In [65]:
# 유클리디언 거리 - 두 벡터 사이의 직선 거리
import numpy as np
def euclidean_dist(x,y):
    x = np.array(x)
    y = np.array(y)
    return np.sqrt(np.sum(x-y)**2)

euclidean_dist(word_embedding_dic['사과'], word_embedding_dic['바나나'])

0.6

In [68]:
# 코사인 유사도 - 두 벡터 사이의 코사인을 측정
def cosine_similarity(x,y):
    nominator = np.dot(x,y) #분자
    denominator = np.linalg.norm(x)*np.linalg.norm(y) #분모
    return nominator/denominator

#사과와 바나나의 코사인 유사도
print(cosine_similarity(word_embedding_dic['사과'], word_embedding_dic['바나나']))
print(euclidean_dist(word_embedding_dic['사과'], word_embedding_dic['바나나']))

# 사과와 원숭이의 코사인 유사도
print(cosine_similarity(word_embedding_dic['사과'], word_embedding_dic['원숭이']))
print(euclidean_dist(word_embedding_dic['사과'], word_embedding_dic['원숭이']))

#바나나와 원숭이의 코사인 유사도
print(cosine_similarity(word_embedding_dic['바나나'], word_embedding_dic['원숭이']))
print(euclidean_dist(word_embedding_dic['바나나'], word_embedding_dic['원숭이']))

0.8944271909999159
0.6
0.7071067811865475
0.5
0.9486832980505138
0.09999999999999998


In [70]:
# 자카도 유사도
s1 = '대부분 원숭이는 바나나를 좋아합니다.'
s2 = '코주부 원숭이는 바나나를 싫어합니다.'

token_s1 = s1.split()
token_s2 = s2.split()
union = set(token_s1).union(set(token_s2))
intersection = set(token_s1).intersection(set(token_s2))
print(len(intersection)/len(union))

0.3333333333333333


## 나이브 베이즈 분류

In [71]:
training_set = [['me free lottery', 1],
 ['free get free you', 1],
 ['you free scholarship', 0],
 ['free to contact me', 0],
 ['you won award', 0],
 ['you ticket lottery', 1]]

In [79]:
from collections import defaultdict

# 범주에 속하는 토큰 수 세기 1(스팸), 0 정상
doccnt0 = 0
doccnt1 = 0

# 토큰별로 문서 내 빈도수 카운팅
wordfreq = defaultdict(lambda : [0, 0])
for doc, label in training_set:
    words = doc.split()
    for word in words:
      wordfreq[word][label] += 1
wordfreq

defaultdict(<function __main__.<lambda>>,
            {'award': [1, 0],
             'contact': [1, 0],
             'free': [2, 3],
             'get': [0, 1],
             'lottery': [0, 2],
             'me': [1, 1],
             'scholarship': [1, 0],
             'ticket': [0, 1],
             'to': [1, 0],
             'won': [1, 0],
             'you': [2, 2]})

In [80]:
for key, (cnt0, cnt1) in wordfreq.items():
    doccnt0 += cnt0
    doccnt1 += cnt1

print('doccnt0 : {}'.format(doccnt0))
print('doccnt1 : {}'.format(doccnt1))

doccnt0 : 10
doccnt1 : 10


In [83]:
# 토큰별 조건부 확률 계산
k = 0.5

wordprobs = defaultdict(lambda:[0,0])
for key,(cnt0, cnt1) in wordfreq.items():
  wordprobs[key][0] = (cnt0 + k) / (2 * k + doccnt0)
  wordprobs[key][1] = (cnt1 + k) / (2 * k + doccnt1)

wordprobs

defaultdict(<function __main__.<lambda>>,
            {'award': [0.13636363636363635, 0.045454545454545456],
             'contact': [0.13636363636363635, 0.045454545454545456],
             'free': [0.22727272727272727, 0.3181818181818182],
             'get': [0.045454545454545456, 0.13636363636363635],
             'lottery': [0.045454545454545456, 0.22727272727272727],
             'me': [0.13636363636363635, 0.13636363636363635],
             'scholarship': [0.13636363636363635, 0.045454545454545456],
             'ticket': [0.045454545454545456, 0.13636363636363635],
             'to': [0.13636363636363635, 0.045454545454545456],
             'won': [0.13636363636363635, 0.045454545454545456],
             'you': [0.22727272727272727, 0.22727272727272727]})

In [85]:
# 신규 텍스트가 주어졌을 때 확률 계산
import math
doc = 'free lottery'
tokens = doc.split()

log_prob1 = log_prob0 = 0.0

for word, (prob0, prob1) in wordprobs.items():
  if word in tokens:
    log_prob0 += math.log(prob0)
    log_prob1 += math.log(prob1)

log_prob0 += math.log(doccnt0/ (doccnt0 + doccnt1))
log_prob1 += math.log(doccnt1/ (doccnt0 + doccnt1))

prob0 = math.exp(log_prob0)
prob1 = math.exp(log_prob1)

print(prob0)
print(prob1)

print("정상확률 : {}".format( prob0 /(prob0 + prob1) * 100))
print("스팸확률 : {}".format( prob1 /(prob0 + prob1) * 100))

0.00516528925619835
0.03615702479338842
정상확률 : 12.500000000000009
스팸확률 : 87.49999999999999
