## 安裝及載入gensim下之doc2vec模組

In [10]:
#Gensim是一個開源庫，使用現代統計機器學習來進行無監督的主題建模和自然語言處理
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Doc2Vec 的計算：資料已完成預處理（斷詞完成）

In [11]:
# Doc2Vec expects every training item as a pair of
# (already-segmented sentence, sentence tag), wrapped in a TaggedDocument.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Three toy sentences whose word segmentation has already been done.
sentences = [
    ["課程", "Python", "深度學習", "機器學習"],
    ["NLP", "情緒分析", "深度學習", "機器學習"],
    ["深度學習", "是", "機器學習", "一部分", "屬於", "人工智慧", "技術"],
]

# Tag each sentence with its position in the list.
documents = [
    TaggedDocument(words=tokens, tags=[tag])
    for tag, tokens in enumerate(sentences)
]
documents

[TaggedDocument(words=['課程', 'Python', '深度學習', '機器學習'], tags=[0]),
 TaggedDocument(words=['NLP', '情緒分析', '深度學習', '機器學習'], tags=[1]),
 TaggedDocument(words=['深度學習', '是', '機器學習', '一部分', '屬於', '人工智慧', '技術'], tags=[2])]

In [12]:
# Train Doc2Vec on the tagged documents.
#   documents     : the tagged documents to load
#   dm=1          : PV-DM mode (dm=0 selects PV-DBOW mode)
#   vector_size=5 : dimensionality of the vectors
#   window=2      : maximum distance between the predicted word and the
#                   context words in the document used to predict it
#   min_count=1   : minimum number of occurrences for a word to be added
#                   to the vocabulary
model = Doc2Vec(
    documents,
    dm=1,
    vector_size=5,
    window=2,
    min_count=1,
)



In [13]:
# Infer the vector of a new, already-segmented sentence with model.infer_vector.
new_sentence = ["深度學習", "是", "重要", "課程"]
vector = model.infer_vector(new_sentence)
print(vector)

[ 0.04226759  0.02966472  0.0922477  -0.06867964 -0.070176  ]


利用已完成斷詞之文字檔: 藺草測試檔.txt

In [14]:
# Load the pre-segmented corpus: one sentence per line, words separated by
# whitespace. The encoding is passed explicitly so the Chinese text is decoded
# the same way on every platform; "utf-8-sig" also drops a leading byte-order
# mark so it cannot end up glued to the first word.
with open("藺草測試檔.txt", "r", encoding="utf-8-sig") as file1:
  docs = file1.readlines()  # raw lines of the file, newline included

# str.split() without an argument splits on any run of whitespace, so the
# trailing newline is removed and repeated separators never produce empty
# tokens. Blank lines yield no tokens and are skipped, so no empty document
# (and no empty "word") reaches the model.
x_train = [tokens for tokens in (text.split() for text in docs) if tokens]

# Tag each remaining sentence with its position in x_train.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(x_train)]
#documents



In [15]:
# Train a second Doc2Vec model on the documents built from the corpus file.
#   documents      : the tagged documents to load
#   dm=0           : PV-DBOW mode (dm=1 would select PV-DM mode)
#   vector_size=10 : dimensionality of the vectors
#   window=2       : maximum distance between the predicted word and the
#                    context words in the document used to predict it
#   min_count=1    : minimum number of occurrences for a word to be added
#                    to the vocabulary
# NOTE(review): with dm=0 and dbow_words left at its default, window
# presumably has no effect on training — confirm against the gensim docs.
model1 = Doc2Vec(documents, dm=0, vector_size=10, window=2, min_count=1)



In [16]:
# Infer the vector of a new segmented sentence with model1; the bare
# expression on the last line shows the result as the cell output.
query_tokens = ["藺草", "的", "故鄉", "苑裡"]
vector = model1.infer_vector(query_tokens)
vector

array([-0.01066501,  0.0084921 , -0.01998399, -0.02657826, -0.00664044,
       -0.02192825, -0.03749179, -0.01515651,  0.02758062,  0.01014588],
      dtype=float32)

# 應用練習題
請利用已訓練完成之句子向量模型，來完成下列問與答之小實作。


In [17]:
import numpy as np

# The question, already segmented into words.
s1 = ["台南", "市長", "是", "哪個", "政黨"]

# Candidate answers, each already segmented into words.
answers = [
    ["今天", "天氣", "是", "晴天"],
    ["台南", "市長", "是", "民進黨"],
    ["我", "喜歡", "吃", "水果"],
    ["我", "愛", "我", "的", "故鄉"],
    ["我", "就讀", "中信"],
    ["藺草", "的", "故鄉", "是", "苑裡"],
]

# Infer the question's vector with model1 and show it as the cell output.
s1_vec = model1.infer_vector(s1)
s1_vec

array([-0.00412033, -0.02846085,  0.03618567,  0.01747146,  0.02051522,
       -0.02260793,  0.01856023,  0.03279513, -0.02419791,  0.00764019],
      dtype=float32)

In [18]:
# Score every candidate answer by its cosine similarity to the question
# vector s1_vec, print each score, then print the best-scoring answer.
def _cosine(u, v):
  """Return the cosine similarity of u and v, or 0.0 if either has zero norm."""
  denom = np.linalg.norm(u) * np.linalg.norm(v)
  # A zero-length vector has no direction; dividing by zero would give nan,
  # which loses every ">" comparison below, so report 0.0 instead.
  return float(np.dot(u, v) / denom) if denom else 0.0


max_sim = -2     # below the cosine range [-1, 1], so the first answer always wins
max_idx = None   # 1-based number of the best answer; stays None if answers is empty
for idx, ans in enumerate(answers, start=1):
  sim = _cosine(s1_vec, model1.infer_vector(ans))
  print("Ans#%d: %f" % (idx, sim))
  if sim > max_sim:
    max_idx, max_sim = idx, sim

if max_idx is None:
  # Without this guard an empty answers list would raise NameError, or reuse
  # a max_idx left over from an earlier run of the cell.
  print("No candidate answers to compare.")
else:
  print("Answer:%d" % max_idx)
  print(answers[max_idx-1])


Ans#1: 0.489488
Ans#2: 0.370947
Ans#3: 0.491270
Ans#4: -0.117635
Ans#5: -0.332222
Ans#6: -0.472314
Answer:3
['我', '喜歡', '吃', '水果']


# 加分練習題：請自設問與答的句子，利用自己斷好詞的文本訓練文本向量模型，寫出問與答的程式，讓電腦幫你找出正確的答案。