### 1.定义列表读取数据文件

In [1]:
academy_titles = [] 
job_titles = []
with open("academy_titles.txt", encoding="utf-8", mode="r") as f:
    for line in f:
        academy_titles.append(line.strip())

with open("job_titles.txt", encoding="utf-8", mode="r") as f:
    for line in f:
        job_titles.append(line.strip())

In [2]:
academy_titles[:5]

['北师教育学，你我一起努力，让胜利酣畅淋漓。',
 '考博英语词汇',
 '出售人大新闻学院2015年考研权威资料',
 '【脑科院 郭桃梅课题组】科研助理招聘',
 '管理学院的同学帮帮忙呐～']

### 2.统计数据集中出现的字符数量

In [3]:
char_set = set() #定义set集合用于字符去重
for title in academy_titles:
    for ch in title:
        char_set.add(ch)

for title in job_titles:
    for ch in title:
        char_set.add(ch)

print(len(char_set))

1570


### 3.词嵌入表示标题数据

In [4]:
import torch
char_list = list(char_set)
n_chars = len(char_list) + 1 #<UNK>

In [5]:
def title_to_tensor(title):
    tensor = torch.zeros(len(title), dtype=torch.long)
    for li, ch in enumerate(title):
        try:
            ind = char_list.index(ch)
        except ValueError:
            ind = n_chars - 1
        tensor[li] = ind
    return tensor

In [6]:
embedding = torch.nn.Embedding(n_chars, 100)

In [7]:
print(job_titles[1])

招聘兼职/ 笔试考务 /200-300 每人


In [8]:
print(title_to_tensor(job_titles[1]))

tensor([ 979, 1094, 1287,  184,  675, 1218, 1152,  932,  662,  188, 1218,  675,
         357,  145,  145,  804,  251,  145,  145, 1218,  110, 1304])


In [9]:
#print(title_to_tensor(job_titles[1])[1].item())

### 4.字符级别RNN模型

In [10]:
import torch.nn as nn

In [11]:
#word_count 词表大小
#embedding_size 词嵌入维度
#hidden_size 隐藏层维度
#output_size 输出维度

class RNN(nn.Module): #定义模型继承自nn.Module
    def __init__(self, word_count, embedding_size, hidden_size, output_size): #词表容量， 词嵌入维度， 隐藏层大小， 输出大小
        super(RNN, self).__init__() #调用父类的构造函数
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(num_embeddings = word_count, embedding_dim = embedding_size)
        self.i2h = nn.Linear(embedding_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(embedding_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1) #softmax层
        
    def forward(self, input_tensor, hidden):
        #print("input_tensor: ", input_tensor)
        word_vector = self.embedding(input_tensor)
        #print("word_vector: ", word_vector)
        combine = torch.cat([word_vector, hidden], 1)
        hidden = self.i2h(combine) #隐藏层输出
        output = self.i2o(combine) #输出

        output = self.softmax(output)
        return output, hidden
    
    def initHidden(self): #初始化隐藏层的数据
        return torch.zeros(1, self.hidden_size)

### 5.简单运行模型测试

In [12]:
embedding_size = 200
n_hidden = 128
n_categorizes = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rnn = RNN(n_chars, embedding_size, n_hidden, n_categorizes)

In [13]:
academy_titles[0]

'北师教育学，你我一起努力，让胜利酣畅淋漓。'

In [14]:
input_tensor = title_to_tensor(academy_titles[0])

In [15]:
print("input_tensor:\n", input_tensor)
print(input_tensor[0]) #.unsqueeze(dim=0))
print(input_tensor[0].unsqueeze(dim=0))
print(input_tensor.size())

input_tensor:
 tensor([1464, 1285,  496, 1171, 1260, 1268,  453, 1272, 1537, 1025, 1362,  533,
        1268,  135,  313, 1251, 1392,  111,  427,  741, 1391])
tensor(1464)
tensor([1464])
torch.Size([21])


In [16]:
hidden = rnn.initHidden()
output, hidden = rnn(input_tensor[0].unsqueeze(dim=0), hidden)  #RNN中每次输入一个字符

print("output:\n", output)
print("hidden:\n", hidden)
print("size of hidden:\n", hidden.size())

output:
 tensor([[-0.3105, -1.3208]], grad_fn=<LogSoftmaxBackward>)
hidden:
 tensor([[ 0.2142,  0.7169,  0.2226,  0.1686,  0.5198,  0.6397,  0.0507, -0.1436,
         -0.7081, -0.0714,  0.2588,  0.7722,  0.3533,  0.3136, -0.8421, -0.2427,
         -0.5971, -0.0629, -0.9905, -0.1314,  0.3367,  0.4176,  0.7333, -0.1087,
         -1.0817,  0.0948, -1.0033,  0.9521,  0.1898, -0.0182, -0.3684, -0.0378,
         -0.2235,  0.0752,  0.1389, -0.2527,  0.2792, -0.3506, -0.9622, -0.0330,
          0.2309, -0.2594, -0.2283,  0.0776,  1.0759,  0.4103, -0.4651, -0.7203,
          0.5789, -0.6905,  0.4042, -0.2684, -0.2452, -0.2323, -0.0233, -0.0608,
          0.4656, -0.6943,  0.0180,  1.0367,  0.2002,  0.0664, -0.0664, -0.0419,
         -0.2325,  0.3486, -0.7654,  0.0544,  0.0057,  0.3908, -0.2414,  0.4918,
         -0.2002, -0.5746,  0.0143, -0.5583, -0.6304, -0.7846, -0.2631, -0.2168,
         -0.7741,  0.9337, -0.0883,  0.6030,  0.3038,  0.3621,  0.3514, -0.6306,
          0.6345,  1.1057,  0.05

### 6.数据预处理

In [17]:
#合并数据添加标签

all_data = []
categories = ["考研考博", "招聘信息"]
for l in academy_titles:
    all_data.append((title_to_tensor(l), torch.tensor([0], dtype = torch.long)))  #考研考博 0
for l in job_titles:
    all_data.append((title_to_tensor(l), torch.tensor([1], dtype=torch.long)))  #招聘信息 1

In [18]:
#划分训练集和验证集

import random
random.shuffle(all_data)
data_len = len(all_data)
split_ratio = 0.7

train_data = all_data[:int(data_len * split_ratio)]
test_data = all_data[int(data_len * split_ratio):]

print("size of train_data: ", len(train_data))
print("size of test_data: ", len(test_data))

size of train_data:  4975
size of test_data:  2133


In [19]:
#对于一句话循环使用字符级别的RNN模型

def run_RNN(rnn, input_tensor):
    hidden = rnn.initHidden()
    for i in range(input_tensor.size()[0]):  #tensor.size() -> torch.Size([num]), 添加索引[0]获取数值
        #hidden = hidden.to(device)
        output, hidden = rnn(input_tensor[i].unsqueeze(dim=0), hidden)
    return output

### 7.训练与评估设置

In [20]:
#模型的训练

def train(rnn, criterion,input_tensor, category_tensor):
    rnn.zero_grad()
    output = run_RNN(rnn, input_tensor)
    loss = criterion(output, category_tensor)
    loss.backward()
    
    #根据梯度更新参数
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha = -learning_rate)
    
    return output, loss.item()

In [21]:
#模型的评估

def evaluate(rnn, input_tensor):
    with torch.no_grad():
        hidden = rnn.initHidden()
        output = run_RNN(rnn, input_tensor)
        
        return output

### 8.训练模型

In [22]:
from tqdm import tqdm
epoch = 1
embedding_size = 200
n_hidden = 10
n_categories = 2
learning_rate = 0.005
rnn = RNN(n_chars, embedding_size, n_hidden, n_categories)
criterion = nn.NLLLoss()
loss_sum = 0
all_losses = []
plot_every = 100
for e in range(epoch):
    for ind, (title_tensor, label) in enumerate(tqdm(train_data)):
        output, loss = train(rnn, criterion, title_tensor, label)
        loss_sum += loss
        if ind % plot_every == 0:
            all_losses.append(loss_sum / plot_every)
            loss_sum = 0
    c = 0
    for title, category in tqdm(test_data):
        output = evaluate(rnn, title)
        topn, topi = output.topk(1)
        if topi.item() == category[0].item():
            c += 1
    print('accuracy', c / len(test_data))

100%|██████████████████████████████████████████████████████████████████████████████| 4975/4975 [02:12<00:00, 37.46it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2133/2133 [00:02<00:00, 757.01it/s]

accuracy 0.9854664791373652





### 9.模型的保存与加载

In [23]:
#模型参数的保存
# torch.save(rnn.state_dict(), "rnn_parameter.pkl")
torch.save(rnn, "rnn_model.pkl")

In [24]:
#模型参数的加载(需要重新定义模型)
# embedding_size = 200
# n_hidden = 128
# n_categories = 2
# rnn = RNN(n_chars, embedding_size, n_hidden, n_categories)
# rnn.load_state_dict(torch.load("rnn_parameter.pkl"))
rnn = torch.load("rnn_model.pkl")

In [25]:
#保存词表
import json

with open("char_list.json", mode="w") as f:
    json.dump(char_list, f)

#加载词表
with open("char_list.json", mode="r") as f:
    char_list = json.load(f)

### 10.测试应用

In [26]:
def get_category(title):
    title = title_to_tensor(title)
    output = evaluate(rnn, title)
    topn, topi = output.topk(1)
    return categories[topi.item()]

In [27]:
def print_test(title):
    print((title, get_category(title)))

In [28]:
print_test("考研心得")
print_test("北大实验室博士")
print_test("校外博士招考")
print_test("急求自然语言处理工程师")
print_test("校招offer比较")
print_test("工作还是考研")
print_test("工作吧")
print_test("招聘人员")

('考研心得', '考研考博')
('北大实验室博士', '考研考博')
('校外博士招考', '招聘信息')
('急求自然语言处理工程师', '招聘信息')
('校招offer比较', '招聘信息')
('工作还是考研', '考研考博')
('工作吧', '考研考博')
('招聘人员', '招聘信息')


### 完成