### 1. BoW 구현

In [1]:
from collections import Counter
bow = ["나는", "오늘", "파이썬", "을", "공부", "하고", "있습니다", "파이썬", "은", "정말", "재밌는", "공부", "입니다"]
bow_dic = dict(Counter(bow))
print(bow_dic)

{'나는': 1, '오늘': 1, '파이썬': 2, '을': 1, '공부': 2, '하고': 1, '있습니다': 1, '은': 1, '정말': 1, '재밌는': 1, '입니다': 1}


### 2. DTM 구현

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# 문서 리스트
documents = [
    "나는 영어를 좋아합니다",
    "영어, 수학을 매일 공부합니다",
    "과학 공부를 좋아하고 수학도 좋아합니다",
    "영어, 수학, 과학을 좋아하고 영어는 매일 공부합니다."
]

# CountVectorizer 초기화 및 DTM 생성
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(documents)

# DTM과 어휘 사전 출력
print("DTM:")
print(dtm.toarray())
print("\n어휘 사전:")
print(vectorizer.get_feature_names_out())


DTM:
[[0 0 0 0 1 0 0 0 0 0 0 1 0 1]
 [0 1 0 0 0 1 0 0 1 1 0 0 0 0]
 [1 0 1 0 0 0 0 1 0 0 0 0 1 1]
 [0 1 0 1 0 1 1 0 0 1 1 0 1 0]]

어휘 사전:
['공부를' '공부합니다' '과학' '과학을' '나는' '매일' '수학' '수학도' '수학을' '영어' '영어는' '영어를'
 '좋아하고' '좋아합니다']


### 3. TF-IDF 구현

In [3]:
# 필요한 라이브러리 임포트
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 문서 리스트
documents = [
    "나는 영어를 좋아합니다",
    "영어, 수학을 매일 공부합니다",
    "과학 공부를 좋아하고 수학도 좋아합니다",
    "영어, 수학, 과학을 좋아하고 영어는 매일 공부합니다."
]

# TF-IDF Vectorizer 초기화 및 학습
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# 어휘 사전과 TF-IDF 행렬을 데이터 프레임으로 변환
# 문서의 인덱스를 "문서1", "문서2", ...으로 설정
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), 
                        columns=tfidf_vectorizer.get_feature_names_out(), 
                        index=[f"문서{i+1}" for i in range(len(documents))])
dtm = vectorizer.fit_transform(documents)
print(dtm.toarray())
print("\n어휘 사전:")
print(vectorizer.get_feature_names_out())

[[0 0 0 0 1 0 0 0 0 0 0 1 0 1]
 [0 1 0 0 0 1 0 0 1 1 0 0 0 0]
 [1 0 1 0 0 0 0 1 0 0 0 0 1 1]
 [0 1 0 1 0 1 1 0 0 1 1 0 1 0]]

어휘 사전:
['공부를' '공부합니다' '과학' '과학을' '나는' '매일' '수학' '수학도' '수학을' '영어' '영어는' '영어를'
 '좋아하고' '좋아합니다']


In [35]:
df_tfidf

Unnamed: 0,공부를,공부합니다,과학,과학을,나는,매일,수학,수학도,수학을,영어,영어는,영어를,좋아하고,좋아합니다
문서1,0.0,0.0,0.0,0.0,0.617614,0.0,0.0,0.0,0.0,0.0,0.0,0.617614,0.0,0.486934
문서2,0.0,0.465809,0.0,0.0,0.0,0.465809,0.0,0.0,0.590819,0.465809,0.0,0.0,0.0,0.0
문서3,0.485461,0.0,0.485461,0.0,0.0,0.0,0.0,0.485461,0.0,0.0,0.0,0.0,0.382743,0.382743
문서4,0.0,0.336597,0.0,0.426931,0.0,0.336597,0.426931,0.0,0.0,0.336597,0.426931,0.0,0.336597,0.0


### 4. Word2Vec

In [4]:
# !pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp310-cp310-win_amd64.whl.metadata (8.5 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.1-py3-none-any.whl.metadata (23 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.16.0-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Downloading gensim-4.3.2-cp310-cp310-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
    --------------------------------------- 0.3/24.0 MB 9.3 MB/s eta 0:00:03
   - -------------------------------------- 1.0/24.0 MB 10.9 MB/s eta 0:00:03
   --- ------------------------------------ 2.2/24.0 MB 15.7 MB/s eta 0:00:02
   ------ --------------------------------- 3.8/24.0 MB 19.9 MB/s eta 0:00:02
   --------- ------------------------------ 5.6/24.0 MB 23.7 MB/s eta 0:00:01
   ------------- -------------------------- 8.1/24.0 MB 28.5 MB/s eta 0:00:01
   ------------------ --------------------- 10.8/24.0 MB 38.5 MB/s eta 0:00:01
   --

In [20]:
documents = [
    ["나는", "영어", "좋아합니다"],
    ["영어", "수학", "매일", "공부합니다"],
    ["과학", "공부", "좋아", "수학", "좋아합니다"],
    ["영어", "수학", "과학", "좋아", "영어", "매일", "공부합니다"]
]

In [21]:
from gensim.models import Word2Vec

# Word2Vec 모델 학습
model = Word2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)

# 학습된 모델 저장
model.save("word2vec.model")

In [22]:
model.wv.key_to_index

{'영어': 0,
 '수학': 1,
 '좋아': 2,
 '과학': 3,
 '공부합니다': 4,
 '매일': 5,
 '좋아합니다': 6,
 '공부': 7,
 '나는': 8}

In [27]:
# 단어 벡터 얻기
vector = model.wv['수학']

# 가장 유사한 단어 찾기
similar_words = model.wv.most_similar('영어')
for word in similar_words:
    print(word)

('공부', 0.09291721880435944)
('나는', 0.027057476341724396)
('좋아합니다', 0.016134683042764664)
('수학', -0.010839177295565605)
('공부합니다', -0.027737075462937355)
('좋아', -0.05237111821770668)
('매일', -0.059876296669244766)
('과학', -0.11167220771312714)


In [28]:
model.wv.most_similar('영어')[0]

('공부', 0.09291721880435944)

### 5. 코사인 유사도

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

### 문장2와 문장4의 유사도 계산
vector_1 = tfidf_matrix.toarray()[1]  # 2 번째 문서
vector_2 = tfidf_matrix.toarray()[3]  # 4 번째 문서

cos_sim_result = cosine_similarity([vector_1], [vector_2])
cos_sim_result

array([[0.47036942]])

### 6. 유클리드 거리

In [48]:
from scipy.spatial.distance import euclidean, jaccard
eucl_dist = euclidean(vector_1, vector_2)
eucl_dist

1.0292041393952769

### 7. 자카드 유사도

In [50]:
import numpy as np

vector_a_binary = np.where(vector_1 > 0, 1, 0)
vector_b_binary = np.where(vector_2 > 0, 1, 0)
jacc_sim = 1 - jaccard(vector_a_binary, vector_b_binary)  # 1에서 거리를 빼서 유사도를 계산
jacc_sim

0.375