In [None]:
# 安裝 gensim
!pip install gensim

In [2]:
# 匯入套件
from gensim.models import Word2Vec
import jieba

In [12]:
# Load the corpus: the text of each document is the first tab-separated field
# of a line, with double quotes removed.
documents = []
with open("./cases/colab_test/reviews.txt", "r", encoding="utf-8") as file:
    for line in file:
        # Drop the line terminator before splitting; otherwise a line without
        # a tab keeps its trailing "\n" and that newline ends up in the text
        # handed to the tokeniser.
        document = line.rstrip("\r\n").split("\t")[0].replace('"', '')
        # Blank lines carry no text, so leave them out of the corpus.
        if document:
            documents.append(document)

# Tokenise every document into a list of words with jieba.
docs = [jieba.lcut(document) for document in documents]

# ---- Training hyperparameters ----

# Training algorithm selector: 1 -> skip-gram, 0 -> CBOW.
sg = 0

# Dimensionality of the word vectors.
vector_size = 100

# How many words to look at before and after the current word.
window_size = 5

# Minimum count threshold passed to Word2Vec as `min_count`.
min_count = 1

# Number of passes over the corpus.
epochs = 5

# Number of CPU cores (worker threads) used for training.
workers = 4

# Random seed.
# NOTE(review): gensim documents that a fixed seed is only fully reproducible
# with workers=1 — confirm whether exact reproducibility is required here.
seed = 42

# Build and train the Word2Vec model on the tokenised documents.
model = Word2Vec(
    sentences=docs,
    sg=sg,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    epochs=epochs,
    workers=workers,
    seed=seed,
)

In [None]:
# Fetch the trained embedding for the word "房間".
vector = model.wv.get_vector('房間')

# Print the raw vector values.
print(vector)

In [14]:
# Save the trained model to 'word2vec.model' (a relative path).
model.save('word2vec.model')

In [15]:
# Load the model back from the file written by the save step.
loaded_model = Word2Vec.load("word2vec.model")

In [None]:
# Find the 10 words most similar to "房間"; the result is the cell's output.
loaded_model.wv.most_similar('房間', topn=10)

In [None]:
# Compute the similarity score between "房間" and "房子"; the result is the cell's output.
loaded_model.wv.similarity('房間', '房子')

# 如何找到合適的 epoch 數量？

In [None]:
from gensim.models import Word2Vec
import jieba
import matplotlib.pyplot as plt
import matplotlib
# Select the font matplotlib uses, so the Chinese plot title can be rendered.
# NOTE(review): 'Microsoft JhengHei' is presumably only available on Windows —
# confirm a fallback font for other platforms (e.g. Colab/Linux).
matplotlib.rc('font', family='Microsoft JhengHei')

# Load the corpus: the text of each document is the first tab-separated field
# of a line, with double quotes removed.
documents = []
with open("./cases/colab_test/reviews.txt", "r", encoding="utf-8") as file:
    for line in file:
        # Drop the line terminator before splitting; otherwise a line without
        # a tab keeps its trailing "\n" and that newline ends up in the text
        # handed to the tokeniser.
        document = line.rstrip("\r\n").split("\t")[0].replace('"', '')
        # Blank lines carry no text, so leave them out of the corpus.
        if document:
            documents.append(document)

# Tokenise every document into a list of words with jieba.
docs = [jieba.lcut(document) for document in documents]

# Create an untrained model with loss computation enabled.
model = Word2Vec(
    vector_size=100,
    window=5,
    sg=0,
    min_count=1,
    workers=4,
    seed=42,
    compute_loss=True
)

# Build the vocabulary from the tokenised documents.
model.build_vocab(docs)

# Number of epochs to train for.
num_epochs = 50

# Loss reported after each epoch.
losses = []

# Every train() call runs its own learning-rate schedule, by default from
# model.alpha down to model.min_alpha. Calling train(epochs=1) in a loop would
# therefore restart the learning rate at its initial value on every epoch.
# Capture the end points of the schedule once and hand each call its own slice
# so the rate decays linearly across all `num_epochs` epochs.
# (train() overwrites model.alpha / model.min_alpha when start_alpha /
# end_alpha are supplied, hence the copies taken before the loop.)
initial_alpha = model.alpha
final_alpha = model.min_alpha
alpha_step = (initial_alpha - final_alpha) / num_epochs

for epoch in range(num_epochs):
    print("=" * 50)
    print(f'訓練第 {epoch+1} 個 epoch')

    # Learning-rate slice for this epoch.
    epoch_start_alpha = initial_alpha - alpha_step * epoch
    epoch_end_alpha = initial_alpha - alpha_step * (epoch + 1)

    model.train(
        docs,
        total_examples=model.corpus_count,
        epochs=1,
        start_alpha=epoch_start_alpha,
        end_alpha=epoch_end_alpha,
        compute_loss=True
    )

    # Loss accumulated during this train() call.
    loss = model.get_latest_training_loss()

    # Record it for the plot.
    losses.append(loss)
    print('當前總損失值:', loss)

    # Reset the running loss so the next epoch is measured from zero.
    model.running_training_loss = 0

# Draw the recorded loss for each epoch (x axis is the 1-based epoch number).
epoch_numbers = range(1, num_epochs + 1)
fig, ax = plt.subplots()
ax.plot(epoch_numbers, losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Word2Vec 訓練損失函數圖表')
plt.show()