使用fasttext中的无监督学习来训练word2vec词向量模型

In [1]:
import jieba
import pandas as pd
import fasttext
from gensim.models import Word2Vec

In [8]:
# 数据预处理
with open(r'F:\NLP算法课程\正式课\0319\语言模型及词向量相关知识\cn_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

with open(r'F:\NLP算法课程\正式课\0319\语言模型及词向量相关知识\浪潮之巅.txt', 'r', encoding='utf-8') as f:
    content = f.read()

words = jieba.lcut(content) # 得到分完词的list
# 添加停用词和中文过滤
filtered = [w for w in words 
            if w not in stopwords 
            and len(w) > 1  # 新增长度过滤
            and '\u4e00' <= w <= '\u9fff']  # 仅保留中文

# 保存分词结果
with open('corpus.txt', 'w', encoding='utf-8') as f:
    for sentence in filtered:
        f.write(' '.join(sentence) + '\n')  # 将分词结果转换为空格分隔格式


In [15]:
# fasttext
model_fasttext = fasttext.train_unsupervised(
    input='corpus.txt',        # 预处理后的文本文件（分词后的语料）
    model='skipgram',          # 选择模式：skipgram 或 cbow
    dim=300,                   # 词向量维度
    ws=5,                      # 上下文窗口大小
    minn=3,                    # 最小字符n-gram
    maxn=6,                    # 最大字符n-gram
    epoch=50                   # 训练轮次
)
# 保存模型
model_fasttext.save_model('fasttext.bin')

model_w2v = Word2Vec(filtered, vector_size=100, window=5, min_count=1, workers=4)
model_w2v.save('word2vec.bin')

print('fasttext:',model_fasttext.get_word_vector("人工智能")) 
print('fasttext：',model_fasttext.get_nearest_neighbors("人工智能")) 

# print('Word2Vec向量:', model_w2v.wv['数学'])  # 获取词向量，如果是未登录词会报错
# print('Word2Vec最近邻:', model_w2v.wv.most_similar('数学', topn=5))  # 取前5个相似词

fasttext: [-3.81984748e-04 -5.31082449e-04  1.20665114e-04  3.79996723e-04
  2.20582824e-05 -9.33811476e-04 -2.10810380e-04  5.34110295e-04
  3.36948375e-04  4.43096360e-04  5.20702451e-04  3.91902373e-04
  5.15083957e-04 -5.00595430e-04  9.17334837e-05 -6.30763301e-04
 -1.35641210e-04  3.56432982e-04 -3.04077199e-04  8.48774507e-04
 -2.28332196e-04  1.78923394e-04 -1.54820955e-04 -2.44773109e-04
  7.36900314e-04  1.29717009e-04 -5.59824926e-04  5.01744275e-04
 -8.06300144e-04  3.06621718e-04 -9.73400074e-06 -4.56485441e-06
  9.63647653e-06 -2.41475645e-04 -1.03469545e-04 -4.18703072e-04
 -1.13324357e-04  9.82562895e-04 -8.52955054e-05 -1.25719653e-03
  4.72889769e-05  6.64324034e-05 -5.75779879e-04 -6.49228969e-05
  6.99132041e-04 -2.24352159e-04 -7.07470346e-04  3.67770059e-04
  9.29063477e-04  1.78462156e-04 -1.02151673e-04 -1.93125481e-04
  5.65471419e-04  5.32329665e-04 -1.84248674e-05 -2.52920290e-04
  1.05179125e-03  1.72848464e-04 -9.63172584e-04 -3.04652436e-04
  9.87702748e-0

In [3]:
import numpy as np
from tensorboard.plugins import projector

# 加载模型并获取目标词信息
model = fasttext.load_model("fasttext.bin")
target_word = "数学"

# 构建专属数据集
related_words = [target_word]
vectors = np.array([model.get_word_vector(target_word)])  # 单向量处理

# 修改metadata生成
with open('metadata.tsv', 'w', encoding='utf-8') as f:
    f.write("Word\n") 
    f.write(f"{target_word}\n")  # 仅写入目标词

# 保存向量和配置
np.save('vectors.npy', vectors)
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'vectors.npy'
embedding.metadata_path = 'metadata.tsv'


import os
os.makedirs('word_vectors', exist_ok=True)  # 确保目录存在
# 在调用 visualize_embeddings 之前添加文件同步（重要）
import shutil
shutil.copy('vectors.npy', 'word_vectors/') 
shutil.copy('metadata.tsv', 'word_vectors/')

# 原代码保持不变
projector.visualize_embeddings('word_vectors', config)

# 启动TensorBoard
projector.visualize_embeddings('word_vectors', config)
