### fasttext训练《剑来》文本

#### 文档预处理

In [None]:
import jieba

# Segment the raw novel with jieba and write one line of space-separated
# tokens per input line, so the result can be fed to fastText as plain text.
#
# The input is streamed line by line rather than loaded with f.read(), so
# the whole book is never held in memory, and no spurious empty record is
# written for the newline that terminates the file.
with open('./data/jianlai.txt', 'r', encoding='utf-8') as src, \
        open('./data/jl_sprase_c.txt', 'w', encoding='utf-8') as dst:
    for line in src:
        # Strip only the line terminator; jieba receives the text as-is.
        tokens = jieba.cut(line.rstrip('\n'), cut_all=False)
        dst.write(' '.join(tokens) + '\n')

#### 模型训练

In [33]:
import fasttext

# Train skip-gram word vectors on the jieba-segmented corpus.
model = fasttext.train_unsupervised(
    './data/jl_sprase_c.txt',
    model='skipgram',
    epoch=25,          # passes over the corpus
    lr=0.1,            # learning rate
    dim=100,           # size of each word vector
    ws=5,              # context window size
    minCount=1,        # keep every word, however rare
    minn=2,            # shortest character n-gram
    maxn=6,            # longest character n-gram
    neg=5,             # negatives sampled per example
    thread=4,          # worker threads
    t=1e-4,            # sampling threshold
    lrUpdateRate=100,  # rate of learning-rate updates
)

# NOTE(review): the model is written to the current working directory,
# while a later cell loads './model/jl_lw.bin' -- confirm whether the file
# is moved by hand or the save path should point into ./model/.
model.save_model("jl_lw.bin")

In [73]:
# Reload a previously trained model from disk, then list the ten
# vocabulary entries whose vectors are closest to the query word.
model = fasttext.load_model("./model/jl.bin")

model.get_nearest_neighbors('齐静春', k=10)

[(0.763984739780426, '瀺'),
 (0.6732342839241028, '齐'),
 (0.6533849239349365, '茅小冬'),
 (0.6208360195159912, '周密'),
 (0.6111604571342468, '春字'),
 (0.594889223575592, '马瞻'),
 (0.5906897187232971, '赵繇'),
 (0.5896353125572205, '圣人'),
 (0.588931143283844, '老秀才'),
 (0.5858778953552246, '老头子')]

#### 计算词汇向量相关度

In [121]:
from fasttext import FastText
import numpy as np
import torch
from torch import nn


In [122]:

# Load the trained fastText model and copy its word vectors into a
# PyTorch embedding table.
model = FastText.load_model('./model/jl_lw.bin')

vocab_size = len(model.words)
embedding_dim = model.get_dimension()

# One row per vocabulary word, in the order of model.words. The similarity
# code further down indexes this table with model.get_word_id(), which
# assumes both orderings agree.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for i, word in enumerate(model.words):
    # get_word_vector returns a NumPy vector for any word (see the
    # fastText Python API), so no None check is needed here.
    embedding_matrix[i] = model.get_word_vector(word)

# as_tensor with an explicit dtype replaces the legacy torch.FloatTensor
# constructor; the float64 matrix is converted to float32 exactly as before.
embedding = nn.Embedding.from_pretrained(
    torch.as_tensor(embedding_matrix, dtype=torch.float32)
)



In [123]:
# Check whether a word is in the model's vocabulary by looking up its id.
# Per the fastText Python API, get_word_id returns -1 for unknown words.
# Other words tried in this cell: '骊珠', '齐静春'.
word = '平安'
word_id = model.get_word_id(word)
word_id


13

In [124]:
# Word-to-word similarity on top of the embedding table.

def cosine_similarity(embedding, word1, word2):
    """Return the cosine similarity between two rows of an embedding table.

    Args:
        embedding: Callable embedding layer that maps a LongTensor of
            indices to their vectors (e.g. ``nn.Embedding``).
        word1: Integer row index of the first word.
        word2: Integer row index of the second word.

    Returns:
        The similarity as a Python float.
    """
    first_vec, second_vec = (
        embedding(torch.LongTensor([idx])) for idx in (word1, word2)
    )
    score = nn.functional.cosine_similarity(first_vec, second_vec)
    return score.item()

# Resolve both words to their vocabulary ids, then print how similar
# their vectors are.
word1, word2 = (model.get_word_id(token) for token in ('齐静春', '平安'))

similarity = cosine_similarity(embedding, word1, word2)
print(similarity)



0.41191810369491577


### TensorBoard词向量可视化

In [125]:
from torch.utils.tensorboard import SummaryWriter

# Export the first 100 word vectors, labelled with their words, for the
# TensorBoard embedding projector.
#
# The sample size is capped at the vocabulary size so a small vocabulary
# no longer raises IndexError, and so the number of labels always matches
# the number of rows passed to add_embedding.
sample_size = min(100, len(model.words))
meta = list(model.words[:sample_size])

# The context manager closes the writer even if add_embedding raises.
with SummaryWriter() as writer:
    writer.add_embedding(embedding_matrix[:sample_size], metadata=meta)

In [126]:
# tensorboard --logdir=runs