<a href="https://colab.research.google.com/github/wlgus9/DeepLearning_Study/blob/main/1%EC%9E%A5_%EC%9B%8C%EB%93%9C_%EC%9E%84%EB%B2%A0%EB%94%A9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF

Term Frequency  
문서에서 용어가 나타나는 총 횟수

In [None]:
# Sample text for the term-frequency demo. The two literals are joined by
# implicit concatenation; the run of spaces opening the second literal is part
# of the string value and is harmless to whitespace-based tokenizing.
text = (
    "John likes to watch movies. Mary likes movies too."
    "        Mary also likes to watch football games."
)

In [None]:
# Strip the periods first, then tokenize on runs of whitespace.
text_without_periods = text.replace('.', '')
words = text_without_periods.split()
print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [None]:
import numpy as np

# np.unique(..., return_counts=True) returns the sorted unique tokens together
# with how often each one occurs; keep both arrays as a single tuple.
unique_tokens, token_counts = np.unique(words, return_counts=True)
word_count = (unique_tokens, token_counts)
print(word_count)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
       'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))


In [None]:
# Pair every unique token with its count and collect the pairs into a dict.
word_to_cnt = dict(zip(*word_count))
print(word_to_cnt)

{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 3, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


# TDM

Term Document Matrix  
문서별로 단어의 빈도수 계산 후 행렬 생성

In [None]:
# Single-document corpus for the term-document-matrix demo. The two literals
# are joined by implicit concatenation; the run of spaces opening the second
# literal is part of the document string.
corpus = [
    "John likes to watch movies. Mary likes movies too."
    "           Mary also likes to watch football games."
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vectorizer on the corpus, then densify the resulting
# count matrix (one row per document).
vector = CountVectorizer()
tdm_sparse = vector.fit_transform(corpus)
tdm_array = tdm_sparse.toarray()

# vocabulary_ maps each learned term to its column index in tdm_array.
tf_dic = vector.vocabulary_

print(tdm_array)
print(tf_dic)

[[1 1 1 1 3 2 2 2 1 2]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [None]:
import pandas as pd

# Order the terms by their column index so that the labels line up with the
# columns of tdm_array, then rebuild the term -> index mapping in that order.
terms_by_index = sorted(tf_dic, key=tf_dic.get)
tf_dic_sorted = {term: tf_dic[term] for term in terms_by_index}

tdm = pd.DataFrame(tdm_array, columns=tf_dic_sorted.keys())
print(tdm)

   also  football  games  john  likes  mary  movies  to  too  watch
0     1         1      1     1      3     2       2   2    1      2


# TF-IDF

Term Frequency - Inverse Document Frequency  
* TF  : 문서에서 해당 용어가 발생한 횟수 / 총 용어 수  
* IDF : 전체 문서 중 해당 용어가 등장하는 문서가 적을수록 커지는 값(문서 빈도의 역수에 로그를 취한 값). 여러 문서에 흔하게 나타나는 용어의 가중치를 낮춰 균형을 맞추기 위해 사용 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Compute the TF-IDF weights for the corpus as a dense array
# (one row per document, one column per vocabulary term).
tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()

# vocabulary_ maps term -> column index, but the dict itself is ordered by
# first appearance in the text, not by column index. Sort by index so the
# terms can serve as column labels.
tfidf_dic = tfidf_vec.vocabulary_
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(),
                               key=lambda item: item[1]))

# Fix: label the columns with the index-sorted terms. Using tfidf_dic.keys()
# here attached the wrong term to every weight.
tfidf_tdm = pd.DataFrame(tfidf_array,
                         columns=tfidf_dic_sorted.keys())
print(tfidf_tdm)

       john     likes        to  ...      also  football     games
0  0.182574  0.182574  0.182574  ...  0.365148  0.182574  0.365148

[1 rows x 10 columns]


# Word2Vec

* 단어를 벡터 형식으로 변환하는 도구  
* 워드 임베딩 모델을 생성하는 데 사용되는 얕은 2계층 신경망

* CBOW : 주변 단어 임베딩 -> **중심 단어 예측**
    * 빠르고 적은 RAM 필요, 하지만 빈도수가 낮은 단어의 처리를 보장하지 않음

* Skip-gram : 중심 단어 임베딩 -> **주변 단어 예측**
    * 빈도수가 낮은 단어에 더 정확

# gensim

In [None]:
from gensim.models import Word2Vec

corpus = [
    "John likes to watch movies. Mary likes movies too."
    "           Mary also likes to watch football games."
]

# Word2Vec is fed tokenized sentences: strip the periods from each document
# and split it on whitespace.
word_list = [document.replace('.', '').split() for document in corpus]

# min_count=1 keeps every token of this tiny corpus in the vocabulary.
# NOTE(review): gensim 4.x renamed `size` to `vector_size`; confirm the
# installed gensim version before changing the keyword.
model = Word2Vec(
    word_list,
    sg=0,
    size=100,
    window=3,
    min_count=1,
)

print(model.wv.most_similar('likes'))
print(model.wv.similarity('movies', 'games'))

[('to', 0.10495354980230331), ('too', 0.05770162492990494), ('games', 0.02830357663333416), ('football', -0.03861209750175476), ('Mary', -0.048340849578380585), ('also', -0.08258099108934402), ('watch', -0.1305607557296753), ('movies', -0.14066770672798157), ('John', -0.14524194598197937)]
0.025130626


# Pre-trained 모델

대용량 코퍼스 데이터를 이용해서 사전에 학습된 모델

In [None]:
# Load a previously saved Word2Vec model from disk and list the words whose
# vectors are closest to the query word.
model = Word2Vec.load('ko.bin')
nearest_words = model.wv.most_similar('인공지능')
print(nearest_words)

[('컴퓨팅', 0.6520194411277771), ('가상현실', 0.6393702030181885), ('심리학', 0.63037109375), ('모델링', 0.625065267086029), ('신경망', 0.6200424432754517), ('로봇', 0.6109743118286133), ('시뮬레이션', 0.6101070642471313), ('지능', 0.6092983484268188), ('기술', 0.6087720990180969), ('기술인', 0.5957075953483582)]
