In [2]:
import time
import torch
import numpy as np
from train_eval import train, init_network
from importlib import import_module
import argparse
dataset = 'THUCNews'  # dataset name

# Embedding choices: Sogou News -> embedding_SougouNews.npz, Tencent -> embedding_Tencent.npz, random initialisation -> random
embedding = 'embedding_SougouNews.npz'

model_name = 'TextRCNN'  # one of: TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer

from utils import build_dataset, build_iterator, get_time_dif

# Import the module `models.<model_name>` dynamically so the model is selected by the string above.
x = import_module('models.' + model_name)

In [3]:
# Build the configuration object for the selected dataset and embedding.
config = x.Config(dataset, embedding)

# Seed NumPy and PyTorch (CPU and every CUDA device) with one shared value and
# request deterministic cuDNN kernels; the intent is identical results on every run.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
# Announce the load, then build the vocabulary and the train/dev/test splits.
print("Loading data...")
# NOTE(review): the second positional argument is passed as False; its meaning is
# defined by utils.build_dataset, which is not visible here -- confirm before changing.
vocab, train_data, dev_data, test_data = build_dataset(config, False)

Loading data...
Vocab size: 4762


180000it [00:01, 114935.30it/s]
10000it [00:00, 131912.53it/s]
10000it [00:00, 51791.44it/s]


In [None]:
# Build an embedding matrix for `vocab` from the pretrained vectors in `pretrain_dir`.
emb_dim = 300  # number of vector components read for each token
# Every row starts out random; rows for tokens found in the pretrained file are
# overwritten below, so tokens missing from that file keep their random values.
embeddings = np.random.rand(len(vocab), emb_dim)
pretrain_dir = "THUCNews/data/sgns.sogou.char"
# Stream the file line by line inside a context manager: the whole file is never
# held in memory, and the handle is closed even if a line fails to parse.
with open(pretrain_dir, "r", encoding='UTF-8') as f:
    for line in f:
        lin = line.strip().split(" ")
        token = lin[0]
        if token not in vocab:
            continue
        values = lin[1:emb_dim + 1]
        if len(values) != emb_dim:
            # Not a full vector (for example a header line). Skipping it prevents
            # a single value from being broadcast silently across the whole row.
            continue
        embeddings[vocab[token]] = np.asarray([float(v) for v in values], dtype='float32')

In [1]:
import bz2
import os
import shutil

def decompress_bz2(file_path):
    """Decompress a bzip2 file into the same directory and print the result path.

    The output name is the input name without its ``.bz2`` extension (matched
    case-insensitively). When the input name has no ``.bz2`` extension, ``.out``
    is appended instead, mirroring the ``bunzip2`` command-line tool, rather
    than chopping four arbitrary characters off the name.

    Args:
        file_path: Path to the bzip2-compressed file.

    Returns:
        None. The decompressed data is written to disk and a confirmation
        message is printed.

    Raises:
        OSError: If the input cannot be read, is not valid bzip2 data, or the
            output cannot be written.
    """
    base_path, filename = os.path.split(file_path)
    stem, ext = os.path.splitext(filename)
    output_name = stem if ext.lower() == '.bz2' else filename + '.out'
    output_file = os.path.join(base_path, output_name)

    # Copy in chunks so a multi-gigabyte archive is never held in memory at once.
    with bz2.open(file_path, 'rb') as f_in, open(output_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    print(f"File '{file_path}' has been successfully decompressed to '{output_file}'.")

# Path of the .bz2 archive to decompress
bz2_file_path = 'THUCNews/data/sgns.sogou.char.bz2'

# Call the function to decompress the archive
decompress_bz2(bz2_file_path)

File 'THUCNews/data/sgns.sogou.char.bz2' has been successfully decompressed to 'THUCNews/data/sgns.sogou.char'.


In [2]:
from utils import build_dataset, build_iterator, get_time_dif, build_vocab
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta
MAX_VOCAB_SIZE = 10000  # upper bound on the vocabulary size
UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols

# Extract the pretrained vectors for the vocabulary and save a trimmed matrix.
# Change the directories and file names below as needed.
train_dir = "Newtrain/data/train.txt"
vocab_dir = "Newtrain/data/vocab.pkl"
pretrain_dir = "THUCNews/data/sgns.sogou.char"
emb_dim = 300
filename_trimmed_dir = "Newtrain/data/embedding_Newtrain"

if os.path.exists(vocab_dir):
    # NOTE: unpickling can execute arbitrary code; only load a vocab.pkl that
    # was produced locally by this notebook.
    with open(vocab_dir, 'rb') as vocab_file:
        word_to_id = pkl.load(vocab_file)
else:
    # tokenizer = lambda x: x.split(' ')  # word-level vocabulary (words separated by spaces in the dataset)
    tokenizer = lambda x: [y for y in x]  # character-level vocabulary
    word_to_id = build_vocab(train_dir, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
    # The context manager guarantees the pickle is flushed and the handle closed.
    with open(vocab_dir, 'wb') as vocab_file:
        pkl.dump(word_to_id, vocab_file)

# Every row starts out random; rows for tokens found in the pretrained file are
# overwritten below.
# NOTE(review): no seed is set in this cell, so unless the RNG was seeded
# elsewhere the rows for tokens missing from the pretrained file differ per run.
embeddings = np.random.rand(len(word_to_id), emb_dim)

# Stream the file line by line inside a context manager: the whole file is never
# held in memory, and the handle is closed even if a line fails to parse.
with open(pretrain_dir, "r", encoding='UTF-8') as f:
    for line in f:
        lin = line.strip().split(" ")
        token = lin[0]
        if token not in word_to_id:
            continue
        values = lin[1:emb_dim + 1]
        if len(values) != emb_dim:
            # Not a full vector (for example a header line). Skipping it prevents
            # a single value from being broadcast silently across the whole row.
            continue
        embeddings[word_to_id[token]] = np.asarray([float(v) for v in values], dtype='float32')

np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)

In [22]:
# Scratch check: map the first item of each (word, count) pair to its position in the list.
{word_count[0]: idx for idx, word_count in enumerate([('a', 3), ('b', 2), ('c', 1)])}

{'a': 0, 'b': 1, 'c': 2}

In [21]:
# Scratch check of the result-line template: the loss is right-aligned in a
# 5-character field with precision 2, the accuracy is shown as a percentage
# with two decimals.
msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>5.2%}'
sample_loss, sample_acc = 0.458, 0.89
print(msg.format(sample_loss, sample_acc))

Test Loss:  0.46,  Test Acc: 89.00%
