# 단어의 임베딩
- 빈도수 계산: 빈도 기반 - TF(Term Frequency, 상대빈도)
- TDM(Term-Document Matrix): 문서별 단어 빈도수를 행렬로 표현

In [2]:
# Sample text used for the word-frequency examples.
# Adjacent string literals are concatenated by the parser, so the sentence can
# be wrapped without a backslash continuation, which would embed the next
# line's indentation spaces into the string itself.
text = (
    "John likes to watch movies. Mary likes movies too. "
    "Mary also likes to watch football games."
)

In [3]:
# Remove every period by splitting on '.' and gluing the pieces back together,
# then tokenize the result on whitespace.
words = ''.join(text.split('.')).split()
words

['John',
 'likes',
 'to',
 'watch',
 'movies',
 'Mary',
 'likes',
 'movies',
 'too',
 'Mary',
 'also',
 'likes',
 'to',
 'watch',
 'football',
 'games']

In [4]:
import numpy as np

# With return_counts=True, np.unique yields a pair:
# (sorted unique words, number of occurrences of each word).
word_count = np.unique(np.asarray(words), return_counts=True)
word_count

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
        'to', 'too', 'watch'], dtype='<U8'),
 array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))

In [5]:
# Build a word -> frequency dictionary by pairing each unique word with its
# count (word_count holds the two parallel arrays).
word_to_cnt = dict(zip(*word_count))
word_to_cnt

{'John': 1,
 'Mary': 2,
 'also': 1,
 'football': 1,
 'games': 1,
 'likes': 3,
 'movies': 2,
 'to': 2,
 'too': 1,
 'watch': 2}

In [6]:
# Look up the number of occurrences recorded for the word 'movies'.
word_to_cnt['movies']

2

In [7]:
# Two-document corpus: each list element is treated as one document.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]

#  TDM

In [8]:
# Count how often each word occurs in each document and arrange the counts as
# a matrix (TDM), using scikit-learn.
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer()
# One row per document, one column per vocabulary term.
tdm_array = vector.fit_transform(corpus).toarray()
# vocabulary_ maps each term to its column index in the matrix.
tf_dic = vector.vocabulary_
print(tdm_array, tf_dic, sep="\n")


[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [9]:
import pandas as pd

# Order the vocabulary by column index so the labels line up with the
# columns of tdm_array.
tf_dic_sorted = {term: tf_dic[term] for term in sorted(tf_dic, key=tf_dic.get)}
tdm = pd.DataFrame(tdm_array, columns=list(tf_dic_sorted))
print(tdm)

   also  football  games  john  likes  mary  movies  to  too  watch
0     0         0      0     1      2     1       2   1    1      1
1     1         1      1     0      1     1       0   1    0      1


# TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF weights on the corpus; one row per document, one column per term.
tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
tfidf_dic = tfidf_vec.vocabulary_
# Order the vocabulary by column index so the labels line up with the
# columns of tfidf_array.
tfidf_dic_sorted = {
    term: tfidf_dic[term] for term in sorted(tfidf_dic, key=tfidf_dic.get)
}
tfidf_dtm = pd.DataFrame(tfidf_array, columns=list(tfidf_dic_sorted))

print(tfidf_dtm)

       also  football     games      john     likes      mary    movies  \
0  0.000000  0.000000  0.000000  0.323699  0.460629  0.230315  0.647398   
1  0.446101  0.446101  0.446101  0.000000  0.317404  0.317404  0.000000   

         to       too     watch  
0  0.230315  0.323699  0.230315  
1  0.317404  0.000000  0.317404  


In [12]:
from gensim.models import Word2Vec
# Two-document corpus for the Word2Vec example.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]
# Starts empty; will hold one token list per document.
word_list = list()


In [13]:
# Tokenize each document: drop the periods, then split on whitespace.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell does not duplicate every sentence in word_list.
word_list = [sentence.replace('.', '').split() for sentence in corpus]
word_list

[['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too'],
 ['Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']]

In [None]:
# sg=0: CBOW (Continuous Bag-of-Words) - predict the center word from its
# surrounding context words.
model = Word2Vec(word_list, sg=0, vector_size=100, window=3, min_count=1)
# most_similar takes (positive, negative) as its first two parameters, so a
# positional call most_similar('likes', 'movies') treats 'movies' as a
# negative example. The keywords make that explicit and use the documented
# list form.
# NOTE(review): if the goal was words similar to both terms, use
# positive=['likes', 'movies']; for the pairwise score use
# model.wv.similarity('likes', 'movies') - confirm the intent.
model.wv.most_similar(positive=['likes'], negative=['movies'])

[('John', 0.17164471745491028),
 ('also', 0.06594578176736832),
 ('Mary', 0.008838453330099583),
 ('watch', -0.06765829026699066),
 ('games', -0.08544928580522537),
 ('football', -0.08948154747486115),
 ('too', -0.11860241740942001),
 ('to', -0.13643866777420044)]

In [None]:
# sg=1: Skip-gram - predict the surrounding context words from the center word.
model = Word2Vec(word_list, sg=1, vector_size=100, window=3, min_count=1)
# most_similar takes (positive, negative) as its first two parameters, so a
# positional call most_similar('likes', 'movies') treats 'movies' as a
# negative example. The keywords make that explicit and use the documented
# list form.
# NOTE(review): if the goal was words similar to both terms, use
# positive=['likes', 'movies']; for the pairwise score use
# model.wv.similarity('likes', 'movies') - confirm the intent.
model.wv.most_similar(positive=['likes'], negative=['movies'])

[('John', 0.17164471745491028),
 ('also', 0.06594578176736832),
 ('Mary', 0.008853347972035408),
 ('watch', -0.06765829026699066),
 ('games', -0.08544928580522537),
 ('football', -0.08948154747486115),
 ('too', -0.11860241740942001),
 ('to', -0.13643862307071686)]