## 处理数据
数据格式：label word word word（以空格作为词与词的分隔）。数据中同时存在很多繁体字，需要先进行处理，转成简体字。

### 将繁体字转成简体

In [1]:
from opencc import OpenCC
def t2s(file_path,output_file_path,encoding="utf-8"):
    """Convert a Traditional Chinese text file to Simplified Chinese.

    Args:
        file_path: path of the source file to read.
        output_file_path: path the converted (Simplified) text is written to.
        encoding: text encoding used for both reading and writing. Defaults
            to UTF-8 so the result does not depend on the platform locale.
    """
    # Context managers guarantee both handles are closed, even if the
    # conversion or the write raises.
    with open(file_path, encoding=encoding) as src:
        text = src.read()
    converted = OpenCC("t2s").convert(text)
    with open(output_file_path, "w", encoding=encoding) as dst:
        dst.write(converted)
# t2s("./data/train.txt","./data/train_zh.txt")
# t2s("./data/test.txt","./data/test_zh.txt")
# t2s("./data/validation.txt","./data/validation_zh.txt")

### 构建训练集的词汇表。要加入\<unk\>（因为测试集可能存在未知（unknown）词汇）和\<pad\>（用于填充）

In [2]:
from collections import Counter
def build_word_vocab(train_file_path, keep_ratio=0.999, encoding="utf-8"):
    """Build the vocabulary of the training set.

    Every line is split on whitespace; the first token of each line is
    skipped and the remaining tokens are counted as words.

    Args:
        train_file_path: path of the training file.
        keep_ratio: fraction of the distinct words to keep, most frequent
            first. The number kept is ``int(distinct_words * keep_ratio)``.
        encoding: text encoding of the training file. Defaults to UTF-8 so
            reading does not depend on the platform locale.

    Returns:
        A ``(word2idx, idx2word)`` tuple of dicts. Index 0 is ``"<pad>"``,
        index 1 is ``"<unk>"``, and kept words are numbered from 2 in
        descending frequency order.
    """
    counter = Counter()
    with open(train_file_path, encoding=encoding) as f:
        # Stream the file line by line instead of materialising every token.
        for line in f:
            counter.update(line.split()[1:])

    # Drop the rarest tail of the vocabulary.
    kept = counter.most_common(int(len(counter) * keep_ratio))  # [(word, count), ...]
    words = [word for word, _ in kept]

    # Real words start at 2; the special tokens are inserted last so the
    # dict's key order stays "words first, then <pad>, <unk>".
    word2idx = {word: index for index, word in enumerate(words, start=2)}
    word2idx["<pad>"] = 0
    word2idx["<unk>"] = 1

    # Exact inverse mapping, in the same insertion order.
    idx2word = {index: word for word, index in word2idx.items()}

    return word2idx, idx2word

# Build both lookup tables (word -> index and index -> word) from the
# Simplified Chinese training file.
word2idx,idx2word = build_word_vocab("./data/train_zh.txt")

## 加载预训练的word2vec模型
进行预处理并保存


In [2]:
from gensim.models import keyedvectors
import torch
# Load the pretrained word2vec vectors (binary word2vec format).
w2v=keyedvectors.load_word2vec_format("./data/wiki_word2vec_50.bin",binary=True)
vocab_size = len(word2idx)  # number of entries in the vocabulary
embedding_dim = w2v.vector_size  # dimensionality of each embedding vector
# Start from an all-zero matrix; rows for words that the pretrained model
# does not contain are left as zeros.
embedding_weight = torch.zeros(vocab_size,embedding_dim)
# `idx` rather than `id`, so the builtin is not shadowed at module scope.
for idx,word in idx2word.items():
    # Copy the pretrained vector when the word exists in the pretrained model.
    if word in w2v.key_to_index:
        embedding_weight[idx] = torch.from_numpy(w2v[word])

# NOTE: despite the ".h5" suffix, the file is written by torch.save.
torch.save(embedding_weight,"./output/embedding_weight.h5")

In [4]:
import numpy as np
# Persist the word -> index dict. NOTE(review): np.save is given a dict, not
# an array, so loading it back presumably needs
# np.load(..., allow_pickle=True).item() — confirm.
np.save("./output/word2idx.npy",word2idx)

In [5]:
# Show the last ten vocabulary keys in insertion order.
list(word2idx.keys())[-10:]

['交流会', '粗放', '街坊邻居', '某年', '归咎于', '尊有', '两条腿', '刚巧', '<pad>', '<unk>']