In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import torch
import torch.nn as nn
import torch.nn.functional as F
# Device: use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

/kaggle/input/chinese-couplets/couplet/vocabs
/kaggle/input/chinese-couplets/couplet/test/out.txt
/kaggle/input/chinese-couplets/couplet/test/in.txt
/kaggle/input/chinese-couplets/couplet/test/.in.txt.swp
/kaggle/input/chinese-couplets/couplet/test/.out.txt.swp
/kaggle/input/chinese-couplets/couplet/train/out.txt
/kaggle/input/chinese-couplets/couplet/train/in.txt
cuda


In [2]:
# Read a text file and split every line on single spaces.
def load_lines(filename='vocabs', base_dir='/kaggle/input/chinese-couplets/couplet'):
    """Load a space-separated text file as a list of token lists.

    Args:
        filename: Path of the file relative to ``base_dir``.
        base_dir: Directory that contains the dataset files. Defaults to the
            Kaggle location of the couplet dataset.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        strings obtained by stripping the line and splitting it on ``' '``.
        A blank line therefore yields ``['']``.

    Side effects:
        Prints ``filename`` and the number of lines read.
    """
    # The files hold Chinese text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(f'{base_dir}/{filename}', 'r', encoding='utf-8') as f:
        lines = [line.strip().split(' ') for line in f]
    print(filename, len(lines))
    return lines

from torch.utils.data import DataLoader
# Vocabulary: the first field of every line of the vocab file is a token,
# and a token's id is the index of its line.
all_tokens = [fields[0] for fields in load_lines()]
vocabs = dict(zip(all_tokens, range(len(all_tokens))))

# Special token ids and run configuration.
BOS = 0                  # id fed to the decoders as their first input
EOS = 1                  # id appended to every encoded sequence
batch_size = 32
MAX_LENGTH = 10          # number of decoding steps used by DecoderRNN
train_data_size = 10000  # number of training pairs kept from the corpus

def collate(batch):
    """Turn a list of (source_ids, target_ids) pairs into two padded tensors.

    Args:
        batch: List of ``(x, y)`` pairs, each a list of integer token ids.

    Returns:
        ``(enc_input, dec_output)``: two tensors on the global ``device``,
        each of shape (len(batch), longest sequence in the batch). Shorter
        sequences are right-padded with 0, the default padding value of
        ``pad_sequence``.
    """
    # Pad on the host first and move each padded batch to the device once,
    # instead of issuing one host-to-device copy per sample.
    sources = [torch.tensor(x) for x, _ in batch]
    targets = [torch.tensor(y) for _, y in batch]
    enc_input = torch.nn.utils.rnn.pad_sequence(sources, batch_first=True).to(device)
    dec_output = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True).to(device)
    return enc_input, dec_output
    

# Training set: the first `train_data_size` couplets, each side mapped to
# vocabulary ids with EOS appended.
# NOTE: `vocabs.get` yields None for a token that is not in the vocabulary.
train_sources = load_lines('train/in.txt')[:train_data_size]
train_targets = load_lines('train/out.txt')[:train_data_size]
data = [
    ([vocabs.get(tok) for tok in src] + [EOS], [vocabs.get(tok) for tok in tgt] + [EOS])
    for src, tgt in zip(train_sources, train_targets)
]
print(data[-1])
dl = DataLoader(data, batch_size=batch_size, shuffle=True, collate_fn=collate)

# Test set: every couplet of the test split, encoded the same way.
test_sources = load_lines('test/in.txt')
test_targets = load_lines('test/out.txt')
test_data = [
    ([vocabs.get(tok) for tok in src] + [EOS], [vocabs.get(tok) for tok in tgt] + [EOS])
    for src, tgt in zip(test_sources, test_targets)
]
print(test_data[-1])
test_dl = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate)


vocabs 9130
train/in.txt 770491
train/out.txt 770491
([1014, 17, 909, 191, 403, 1], [584, 174, 431, 85, 284, 1])
test/in.txt 4000
test/out.txt 4000
([21, 380, 463, 11, 3, 309, 2643, 446, 40, 46, 44, 630, 1], [14, 600, 869, 244, 3, 524, 494, 1337, 54, 188, 588, 401, 1])


In [3]:
class EncoderRNN(nn.Module):
    """Encoder: token embedding, dropout, then a single-layer batch-first GRU."""

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """Build the encoder.

        Args:
            input_size: Number of rows in the embedding table.
            hidden_size: Embedding width and GRU hidden width.
            dropout_p: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: Integer tensor of token ids, batch dimension first.

        Returns:
            ``(output, hidden)`` exactly as returned by the GRU: the per-step
            outputs and the final hidden state.
        """
        token_vectors = self.embedding(input)
        return self.gru(self.dropout(token_vectors))

class DecoderRNN(nn.Module):
    """Plain GRU decoder (no attention) that emits one token per step."""

    def __init__(self, hidden_size, output_size):
        """Build the decoder.

        Args:
            hidden_size: Embedding width and GRU hidden width.
            output_size: Number of output classes (vocabulary size).
        """
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a batch, with teacher forcing when a target is given.

        Args:
            encoder_outputs: Encoder outputs; only the batch size (dim 0) and
                the device are read from it.
            encoder_hidden: Initial hidden state for the GRU.
            target_tensor: Optional (batch, steps) tensor of target ids. When
                given, step ``i + 1`` is fed ``target_tensor[:, i]`` and the
                decoder runs for exactly ``target_tensor.size(1)`` steps.
                When omitted, the decoder feeds back its own argmax
                prediction for ``MAX_LENGTH`` steps.

        Returns:
            ``(log_probs, hidden, None)`` where ``log_probs`` has shape
            (batch, steps, output_size). The trailing ``None`` keeps the
            return shape in line with the attention decoder.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate the start token on the encoder's device rather than on a
        # module-level global, so the decoder works wherever its inputs live.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device
        ).fill_(BOS)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        # With teacher forcing the step count must follow the padded target
        # width: a fixed MAX_LENGTH indexes past the end of shorter batches
        # and yields outputs that do not line up with longer ones.
        if target_tensor is not None:
            n_steps = target_tensor.size(1)
        else:
            n_steps = MAX_LENGTH

        for i in range(n_steps):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None  # `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        """Run one decoding step.

        Args:
            input: (batch, 1) tensor of token ids.
            hidden: Current GRU hidden state.

        Returns:
            ``(logits, hidden)``: unnormalised scores over the vocabulary for
            this step and the updated hidden state.
        """
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        """Create the three linear projections used to score query/key pairs."""
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over ``keys`` with ``query``.

        Args:
            query: 3-D tensor whose projection broadcasts against the keys'.
            keys: 3-D tensor, (batch, steps, hidden); also used as values.

        Returns:
            ``(context, weights)``: the weighted sum of ``keys`` and the
            attention weights, which are softmax-normalised over the steps.
        """
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energies = self.Va(torch.tanh(projected_query + projected_keys))

        # (batch, steps, 1) -> (batch, 1, steps) so softmax runs over steps.
        scores = energies.transpose(1, 2)
        weights = F.softmax(scores, dim=-1)

        return torch.bmm(weights, keys), weights

class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """Build the decoder.

        Args:
            hidden_size: Embedding width and GRU hidden width.
            output_size: Number of output classes (vocabulary size).
            dropout_p: Dropout probability applied to the input embeddings.
        """
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # The GRU consumes the embedded token concatenated with the context.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a batch, with teacher forcing when a target is given.

        Args:
            encoder_outputs: (batch, source_steps, hidden) encoder outputs
                that are attended over.
            encoder_hidden: Initial hidden state for the GRU.
            target_tensor: Optional (batch, steps) tensor of target ids. When
                given, step ``i + 1`` is fed ``target_tensor[:, i]`` and the
                decoder runs for ``target_tensor.size(1)`` steps. When
                omitted, the decoder feeds back its own argmax prediction and
                runs for as many steps as the encoder produced.

        Returns:
            ``(log_probs, hidden, attentions)`` with shapes
            (batch, steps, output_size), the GRU hidden state, and
            (batch, steps, source_steps).
        """
        batch_size = encoder_outputs.size(0)
        # Follow the target's width under teacher forcing; using the source
        # width there fails or truncates whenever the two widths differ.
        if target_tensor is not None:
            step_size = target_tensor.size(1)
        else:
            step_size = encoder_outputs.size(1)
        # Allocate the start token on the encoder's device rather than on a
        # module-level global, so the decoder works wherever its inputs live.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device
        ).fill_(BOS)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(step_size):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one attention-augmented decoding step.

        Args:
            input: (batch, 1) tensor of token ids.
            hidden: Current GRU hidden state, (layers, batch, hidden).
            encoder_outputs: Encoder outputs used as attention keys/values.

        Returns:
            ``(logits, hidden, attn_weights)`` for this step.
        """
        embedded = self.dropout(self.embedding(input))

        # (layers, batch, hidden) -> (batch, layers, hidden) to act as query.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [4]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
import time
import math

def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Report elapsed time and the projected time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero
            because the elapsed time is divided by it.

    Returns:
        A string ``'<elapsed> (- <remaining>)'`` with both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '{} (- {})'.format(asMinutes(elapsed), asMinutes(remaining))

def showPlot(points):
    """Plot a sequence of values on a new figure.

    Args:
        points: Sequence of y-values; they are plotted against their index.

    The y-axis gets a major tick every 0.2. Only one figure is created per
    call: the previous extra ``plt.figure()`` left an additional empty figure
    open on every invocation.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one teacher-forced training pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(input_tensor, target_tensor)`` batches
            that also supports ``len()``.
        encoder: Module called as ``encoder(input_tensor)`` and returning
            ``(outputs, hidden)``.
        decoder: Module called as
            ``decoder(outputs, hidden, target_tensor)`` and returning a
            3-tuple whose first element holds per-step class scores.
        encoder_optimizer: Optimizer over the encoder's parameters.
        decoder_optimizer: Optimizer over the decoder's parameters.
        criterion: Loss taking (N, classes) scores and (N,) targets.

    Returns:
        The mean of the per-batch losses as a Python float.
    """
    # Training must run with dropout enabled. Without this, re-running the
    # training cell after the models were switched to eval() silently trains
    # in inference mode.
    encoder.train()
    decoder.train()

    total_loss = 0
    for input_tensor, target_tensor in dataloader:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten (batch, steps, classes) against (batch, steps); reshape also
        # copes with non-contiguous tensors, which view would reject.
        # NOTE(review): every target position is scored, padding included if
        # the batches are padded - confirm that this is intended.
        loss = criterion(
            decoder_outputs.reshape(-1, decoder_outputs.size(-1)),
            target_tensor.reshape(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
          print_every=100, plot_every=100):
    """Train the encoder/decoder pair for ``n_epochs`` epochs.

    Args:
        train_dataloader: Batches handed to ``train_epoch`` every epoch.
        encoder: Encoder module.
        decoder: Decoder module.
        n_epochs: Number of epochs to run.
        learning_rate: Learning rate of both Adam optimizers.
        print_every: Print the averaged loss every this many epochs.
        plot_every: Record an averaged loss for the plot every this many epochs.

    Prints a progress line per reporting interval and plots the recorded
    losses once training finishes. Returns nothing.
    """
    started_at = time.time()
    plot_losses = []
    running_print_loss = 0  # accumulates between two progress prints
    running_plot_loss = 0  # accumulates between two plot points

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    epoch = 1
    while epoch <= n_epochs:
        epoch_loss = train_epoch(
            train_dataloader, encoder, decoder,
            encoder_optimizer, decoder_optimizer, criterion,
        )
        running_print_loss += epoch_loss
        running_plot_loss += epoch_loss

        if epoch % print_every == 0:
            mean_print_loss = running_print_loss / print_every
            running_print_loss = 0
            progress = epoch / n_epochs
            print('%s (%d %d%%) %.4f' % (
                timeSince(started_at, progress), epoch, progress * 100, mean_print_loss
            ))

        if epoch % plot_every == 0:
            plot_losses.append(running_plot_loss / plot_every)
            running_plot_loss = 0

        epoch += 1

    showPlot(plot_losses)

In [5]:
import random

def evaluate(encoder, decoder, input_tensor):
    """Greedily decode one input sequence into tokens.

    Args:
        encoder: Module called as ``encoder(input_tensor)`` and returning
            ``(outputs, hidden)``.
        decoder: Module called as ``decoder(outputs, hidden)`` and returning
            ``(scores, hidden, attention)`` with scores of shape
            (batch, steps, classes).
        input_tensor: (1, steps) tensor of token ids. Only the first sequence
            of the batch is decoded.

    Returns:
        ``(decoded_words, decoder_attn)``: the predicted tokens up to and
        including the first ``'</s>'`` (if any), and the attention object
        returned by the decoder.

    Note:
        Gradients are disabled here, but the modules' train/eval mode is left
        as the caller set it.
    """
    with torch.no_grad():

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        # Index explicitly instead of a bare squeeze(): with a single decoding
        # step squeeze() collapses to a 0-d tensor that cannot be iterated.
        decoded_ids = topi[0, :, 0].tolist()

        decoded_words = []
        for token_id in decoded_ids:
            if token_id == EOS:
                decoded_words.append('</s>')
                break
            decoded_words.append(all_tokens[token_id])
    return decoded_words, decoder_attn

def evaluateRandomly(encoder, decoder, n=20):
    """Print ``n`` randomly chosen test couplets next to the model's output.

    For every sample three lines are printed: ``>`` the input tokens, ``=``
    the reference tokens and ``<`` the decoded tokens, followed by an empty
    line.
    """
    for _ in range(n):
        source_ids, reference_ids = random.choice(test_data)
        print('>', ' '.join(all_tokens[token_id] for token_id in source_ids))
        print('=', ' '.join(all_tokens[token_id] for token_id in reference_ids))

        # Shape the ids as a batch containing a single sequence.
        model_input = torch.tensor(source_ids, device=device).unsqueeze(0)
        predicted_words, _ = evaluate(encoder, decoder, model_input)
        print('<', ' '.join(predicted_words))
        print('')

In [6]:
# Model width shared by the embeddings and the GRUs.
hidden_size = 128
vocab_size = len(vocabs)

# Encoder and attention decoder share one vocabulary for input and output.
encoder = EncoderRNN(vocab_size, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, vocab_size).to(device)

# 80 epochs, reporting and recording the averaged loss every 5 epochs.
train(dl, encoder, decoder, n_epochs=80, print_every=5, plot_every=5)

1m 15s (- 18m 45s) (5 6%) 2.7797
2m 25s (- 16m 59s) (10 12%) 2.2753
3m 36s (- 15m 36s) (15 18%) 1.9477
4m 46s (- 14m 19s) (20 25%) 1.7189
5m 58s (- 13m 9s) (25 31%) 1.5348
7m 10s (- 11m 57s) (30 37%) 1.3829
8m 20s (- 10m 43s) (35 43%) 1.2606
9m 32s (- 9m 32s) (40 50%) 1.1575
10m 43s (- 8m 20s) (45 56%) 1.0695
11m 54s (- 7m 8s) (50 62%) 0.9956
13m 4s (- 5m 56s) (55 68%) 0.9256
14m 14s (- 4m 44s) (60 75%) 0.8681
15m 25s (- 3m 33s) (65 81%) 0.8151
16m 35s (- 2m 22s) (70 87%) 0.7667
17m 45s (- 1m 11s) (75 93%) 0.7249
18m 56s (- 0m 0s) (80 100%) 0.6884


In [7]:
# Put both networks into inference mode (dropout off) before sampling.
for network in (encoder, decoder):
    network.eval()
evaluateRandomly(encoder, decoder)

> 春 融 北 国 千 山 雪 </s>
= 秋 染 南 阳 万 树 枫 </s>
< 水 改 江 城 人 面 花 </s>

> 于 夫 下 海 网 张 也 </s>
= 姜 女 攻 城 墙 倒 乎 </s>
< 或 者 冲 开 眼 界 宽 </s>

> 空 庭 草 色 和 烟 暖 </s>
= 午 夜 书 声 带 月 寒 </s>
< 雁 字 荷 香 好 壁 风 </s>

> 鞭 尘 辞 虎 岁 </s>
= 玉 兔 贺 新 春 </s>
< 人 间 问 国 新 </s>

> 杯 小 但 容 天 地 事 </s>
= 舟 轻 可 载 古 今 愁 </s>
< 学 过 不 用 大 波 心 </s>

> 屈 平 槁 头 张 恨 水 </s>
= 子 期 焦 尾 谢 冰 心 </s>
< 悬 壸 济 世 英 雄 风 </s>

> 男 女 情 深 成 一 对 </s>
= 夫 妻 义 重 敬 双 亲 </s>
< 亲 句 句 题 始 启 三 分

> 山 门 外 三 脚 驴 子 </s>
= 蒲 团 上 一 块 兜 楼 </s>
< 木 铜 独 三 春 来 </s>

> 望 断 残 亭 归 雁 远 </s>
= 吟 成 落 日 白 帆 斜 </s>
< 遥 云 漫 岭 幽 幽 </s>

> 正 能 量 沸 腾 神 州 大 地 </s>
= 中 国 梦 温 馨 枞 邑 人 家 </s>
< 齐 衰 扫 王 庆 海 天 </s>

> 佳 人 纤 手 心 如 玉 </s>
= 秀 士 巧 舌 口 若 河 </s>
< 古 今 古 老 小 康 花 </s>

> 凤 舞 千 家 瑞 </s>
= 梅 开 四 海 香 </s>
< 人 间 一 点 春 </s>

> 一 树 新 枝 依 老 节 </s>
= 满 山 翠 竹 指 高 天 </s>
< 三 围 再 涌 自 黄 昏 </s>

> 擦 拳 磨 掌 ， 拍 案 击 节 研 三 国 </s>
= 两 小 无 猜 ， 花 前 月 下 读 西 厢 </s>
< 以 和 谐 雪 耻 ， 碧 血 乌 江 打 千 秋

> 最 是 无 缘 ， 情 归 尽 处 天 垂 泪 </s>
= 莫 非 有 约 ？ 梦 醒 来 时 帘 卷 风 </s>
< 总 牵 知 己 ， 勤 到 其 中 众 遍 间 </s>

> 开 窗 