# Dataset

In [1]:
# parser = argparse.ArgumentParser(description='Flickr8k')
# parser.add_argument('-f', default='', type=str)

# parser.add_argument('--embed_size', type=int, default=256, metavar='N', help='embedding size (default: 256)')
# parser.add_argument('--hidden_size', type=int, default=256, metavar='N', help='hidden size (default: 256)')
# parser.add_argument('--batch_size', type=int, default=24, metavar='N', help='batch size (default: 24)')
# parser.add_argument('--lr', type=float, default=2e-3, help='initial learning rate (default: 2e-3)')
# parser.add_argument('--optim', type=str, default='Adam', help='optimizer to use (default: Adam)')
# parser.add_argument('--num_epochs', type=int, default=50, help='number of epochs (default: 50)')
# parser.add_argument('--when_decay', type=int, default=30, help='when to decay learning rate (default: 30)')
# parser.add_argument('--seed', type=int, default=2024, help='random seed')
# parser.add_argument('--num_layers', type=int, default=6, help='number of Transformer decoder layers (default: 6)')
# parser.add_argument('--nhead', type=int, default=8, help='number of heads in the Transformer decoder (default: 8)')
# parser.add_argument('--dim_feedforward', type=int, default=2048, help='dimension of the feedforward network in Transformer (default: 2048)')

import argparse

args = {
    'embed_size': 256,
    'hidden_size': 256,
    'batch_size': 24,
    'lr': 2e-3,
    'optim': 'Adam',
    'num_epochs': 50,
    'when_decay': 30,
    'seed': 2024,
    'num_layers': 6,
    'nhead': 8,
    'dim_feedforward': 2048
}

args = argparse.Namespace(**args)

In [43]:
# dataset.py

import numpy as np
import torch
import os
import pandas as pd
import spacy
from collections import Counter
from torch.utils.data import Dataset
from PIL import Image
from torchvision.transforms import transforms

# 确保先下载 Spacy 模型
# 执行: python -m spacy download en_core_web_sm
spacy_eng = spacy.load("en_core_web_sm")


class Vocabulary:
    def __init__(self, freq_threshold, glove_vocab):
        """
        Args:
            freq_threshold (int): 词汇表中词汇的最低频率
            glove_vocab (set): GloVe词汇表的集合
        """
        # 初始化词汇表，包括特殊标记
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.glove_vocab = glove_vocab

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentences):
        idx = 4  # 从索引4开始，因为0-3已经被占用
        frequency = Counter()

        for sentence in sentences:
            for word in self.tokenizer_eng(sentence):
                if word in self.glove_vocab:
                    frequency[word] += 1
                    if frequency[word] == self.freq_threshold:
                        self.itos[idx] = word
                        self.stoi[word] = idx
                        idx += 1

    def numericalize(self, sentence):
        tokenized_text = self.tokenizer_eng(sentence)
        numericalized = []
        for word in tokenized_text:
            if word in self.stoi:
                numericalized.append(self.stoi[word])
            else:
                # 词不在词汇表中，则忽略该词
                continue
        return numericalized


class FlickrDataset(Dataset):
    def __init__(self, 
                 root_dir="/root/mmml-proj/flickr8k/Images", 
                 caption_path="/root/mmml-proj/flickr8k/captions.txt", 
                 glove_path="/root/mmml-proj/glove/glove.6B.300d.txt",
                 freq_threshold=5, 
                 transform=None, 
                 max_length=50):
        """
        Args:
            root_dir (string): 图片所在的目录
            caption_path (string): 包含 captions 的文件路径（应为已划分的train或val的csv）
            glove_path (string): GloVe文件路径
            freq_threshold (int): 词汇表中词汇的最低频率
            transform (callable, optional): 可选的转换函数
            max_length (int): captions 的固定长度
        """
        self.freq_threshold = freq_threshold
        self.transform = transform
        self.root_dir = root_dir
        self.max_length = max_length

        # 读取 captions 文件
        self.df = pd.read_csv(caption_path)

        self.captions = self.df['caption']
        self.images = self.df['image']

        # 加载GloVe词汇表
        self.glove_vocab = self.load_glove_vocab(glove_path)

        # 构建词汇表
        self.vocab = Vocabulary(freq_threshold, self.glove_vocab)
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        caption = self.captions[index]
        image = self.images[index]
        img_path = os.path.join(self.root_dir, image)
        img = Image.open(img_path).convert("RGB")

        if self.transform:
            img = self.transform(img)

        # 数值化 caption，并移除不在词汇表中的词
        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        # 截断或填充到固定长度
        if len(numericalized_caption) < self.max_length:
            numericalized_caption += [self.vocab.stoi["<PAD>"]] * (self.max_length - len(numericalized_caption))
        else:
            numericalized_caption = numericalized_caption[:self.max_length]

        return img, torch.tensor(numericalized_caption)

    @staticmethod
    def load_glove_vocab(glove_path):
        """
        加载GloVe词汇表。
        
        Args:
            glove_path (str): GloVe文件路径
        
        Returns:
            set: GloVe词汇表的集合
        """
        print("Loading GloVe vocabulary...")
        glove_vocab = set()
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                word = parts[0]
                glove_vocab.add(word.lower())
        print(f"Loaded {len(glove_vocab)} words from GloVe.")
        return glove_vocab


In [44]:
import torch
import argparse
from torch.utils.data import DataLoader
from torchvision.transforms import transforms
from src.models import CNN2Transformer  # 修改为导入 CNN2Transformer
from src.dataset import FlickrDataset
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

class Collate:
    def __init__(self, pad_value):
        self.pad_value = pad_value
    
    def __call__(self, batch):
        imgs = torch.stack([item[0] for item in batch], dim=0)  # [batch_size, C, H, W]
        captions = torch.stack([item[1] for item in batch], dim=1).permute(1, 0)  # [batch_size, max_length]
        return imgs, captions
    
def get_loader(root_dir="/root/mmml-proj/flickr8k/Images", 
               caption_path="/root/mmml-proj/flickr8k/captions.txt", 
               transform=None, 
               batch_size=48, 
               freq_threshold = 5, 
               num_workers=8, 
               shuffle=True, 
               pin_memory=True, 
               selecting_samples = 10000):
    dataset = FlickrDataset(root_dir=root_dir,caption_path=caption_path, transform=transform, freq_threshold = freq_threshold, selecting_samples = selecting_samples)
    pad_value = dataset.vocab.stoi["<PAD>"]
    loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True, pin_memory=True, collate_fn=Collate(pad_value), generator=torch.Generator(device='cpu'))
    return loader, dataset

transform = transforms.Compose(
            [
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                        [0.229, 0.224, 0.225]) 
            ]
        )

train_loader, train_set = get_loader(caption_path="/root/mmml-proj/flickr8k/train_captions.csv", transform=transform, freq_threshold=4, selecting_samples='all', batch_size=args.batch_size)
val_loader, val_set = get_loader(caption_path="/root/mmml-proj/flickr8k/val_captions.csv", transform=transform, freq_threshold=4, selecting_samples='all', batch_size=args.batch_size)
loader, dataset = get_loader(transform=transform, freq_threshold=4, selecting_samples='all', batch_size=args.batch_size)

In [54]:
dataset.vocab.stoi

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '<UNK>': 3,
 'a': 4,
 '.': 5,
 'girl': 6,
 'dog': 7,
 'in': 8,
 'the': 9,
 'each': 10,
 'other': 11,
 'little': 12,
 'of': 13,
 'with': 14,
 'on': 15,
 'front': 16,
 'rainbow': 17,
 'is': 18,
 'white': 19,
 'and': 20,
 'black': 21,
 'man': 22,
 'bench': 23,
 'sitting': 24,
 'hat': 25,
 'glasses': 26,
 'an': 27,
 'climbing': 28,
 'at': 29,
 'child': 30,
 'grass': 31,
 'running': 32,
 'wooden': 33,
 'red': 34,
 'to': 35,
 'orange': 36,
 'ball': 37,
 'near': 38,
 'street': 39,
 'boy': 40,
 'him': 41,
 'brown': 42,
 'through': 43,
 'snow': 44,
 'over': 45,
 'wearing': 46,
 'next': 47,
 'wall': 48,
 'rock': 49,
 'are': 50,
 'while': 51,
 'water': 52,
 'playing': 53,
 'catch': 54,
 'yellow': 55,
 'field': 56,
 'toy': 57,
 'large': 58,
 'it': 59,
 'young': 60,
 'green': 61,
 'people': 62,
 'edge': 63,
 ',': 64,
 'by': 65,
 'tree': 66,
 'lake': 67,
 'couple': 68,
 'person': 69,
 'two': 70,
 'ice': 71,
 'dogs': 72,
 'beach': 73,
 'blue': 74,
 'frozen': 75

In [45]:
i=44
train_set.__getitem__(i)[0].shape, train_set.__getitem__(i)[1].shape

(torch.Size([3, 224, 224]), torch.Size([50]))

In [46]:
print(len(next(enumerate(train_loader))[1]))
img_batch = next(enumerate(train_loader))[1][0]
print(img_batch.shape)
caption_batch = next(enumerate(train_loader))[1][1]
print(caption_batch.shape)

2
torch.Size([24, 3, 224, 224])
torch.Size([24, 50])


# Model

In [53]:
import torch
import torch.nn as nn
import torchvision.models as models
import math

class ImageEncoder(nn.Module):
    """使用ResNet18对图像进行编码"""

    def __init__(self, embed_size):
        super(ImageEncoder, self).__init__()
        resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        modules = list(resnet.children())[:-1]  # 移除最后的全连接层
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """
        Args:
            images: Tensor, shape [batch_size, 3, H, W]
        Returns:
            features: Tensor, shape [batch_size, embed_size]
        """
        with torch.no_grad():
            features = self.resnet(images)  # [batch_size, 512, 1, 1]
        features = features.reshape(features.size(0), -1)  # [batch_size, 512]
        features = self.linear(features)
        # print(features.size())
        features = self.bn(features)  # [batch_size, embed_size]
        return features

img_encoder = ImageEncoder(args.embed_size)
print(img_encoder(img_batch).shape)


torch.Size([24, 256])
torch.Size([24, 256])


In [73]:
class PositionalEncoding(nn.Module):
    """为Transformer添加位置信息的模块"""
    def __init__(self, embed_size, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # [1, max_len, embed_size]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embed_size]
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

class TransformerDecoder(nn.Module):
    """使用Transformer解码器生成文本描述"""

    def __init__(self, embed_size, vocab_size, num_layers, nhead, dim_feedforward, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size)
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_size, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.embed_size = embed_size

    def forward(self, captions, memory):
        """
        Args:
            captions: Tensor, shape [batch_size, seq_len]
            memory: Tensor, shape [batch_size, embed_size]
        Returns:
            outputs: Tensor, shape [batch_size, seq_len, vocab_size]
        """
        caption_emb = self.embed(captions) * math.sqrt(self.embed_size)  # [batch_size, seq_len, embed_size]
        caption_emb = self.positional_encoding(caption_emb)  # [batch_size, seq_len, embed_size]
        caption_emb = caption_emb.permute(1, 0, 2)  # [seq_len, batch_size, embed_size]
        # 将memory扩展为与Transformer期望的形状一致 [1, batch_size, embed_size]
        memory = memory.unsqueeze(0)

        tgt_mask = self.generate_square_subsequent_mask(caption_emb.size(0)).to(caption_emb.device)
        # (caption_emb.size(), memory.size(), tgt_mask.size())
        outputs = self.transformer_decoder(caption_emb, memory, tgt_mask=tgt_mask)  # [seq_len, batch_size, embed_size]
        outputs = outputs.permute(1, 0, 2)  # [batch_size, seq_len, embed_size]
        outputs = self.fc_out(outputs)  # [batch_size, seq_len, vocab_size]
        return outputs

    def generate_square_subsequent_mask(self, sz):
        """生成自回归的掩码"""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

# 测试 TransformerDecoder
decoder = TransformerDecoder(args.embed_size, len(dataset.vocab), args.num_layers, args.nhead, args.dim_feedforward)
outputs = decoder(caption_batch, img_encoder(img_batch))
print(outputs.size())
outputs = outputs.reshape(-1, outputs.size(2))  # [(batch_size * (seq_len-1)), vocab_size]
print(f'output:{[dataset.vocab.itos[idx.item()] for idx in outputs.argmax(dim=1)]}')

torch.Size([24, 256])
torch.Size([24, 50, 3432])
output:['stop', 'awning', 'hood', 'hood', 'cowboys', 'hood', 'watery', 'watery', 'hood', 'hood', 'watery', 'hood', 'hood', 'watery', 'watery', 'collar', 'hood', 'hood', 'collar', 'collar', 'collar', 'teaches', 'hood', 'collar', 'teaches', 'collar', 'collar', 'collar', 'hood', 'collar', 'collar', 'hood', 'restaurant', 'forward', 'hood', 'watery', 'hood', 'hood', 'watery', 'hood', 'restaurant', 'restaurant', 'hood', 'collar', 'hood', 'watery', 'watery', 'collar', 'collar', 'collar', 'glass', 'jogging', 'open', 'glass', 'blindfolds', 'glass', 'glass', 'glass', 'glass', 'open', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'enter', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'spring', 

In [49]:
class CNN2Transformer(nn.Module):
    """结合ImageEncoder和TransformerDecoder的完整模型"""

    def __init__(self, embed_size, vocab_size, hidden_size, num_layers, nhead, dim_feedforward, dropout=True):
        super(CNN2Transformer, self).__init__()
        self.encoder = ImageEncoder(embed_size)
        self.decoder = TransformerDecoder(embed_size, vocab_size, num_layers, nhead, dim_feedforward, dropout=0.1 if dropout else 0.0)
        self.embed_size = embed_size
        self.vocab_size = vocab_size

    def forward(self, images, captions):
        """
        前向传播
        Args:
            images: Tensor, shape [batch_size, 3, H, W]
            captions: Tensor, shape [batch_size, seq_len]
        Returns:
            outputs: Tensor, shape [batch_size, seq_len, vocab_size]
        """
        features = self.encoder(images)  # [batch_size, embed_size]
        outputs = self.decoder(captions, features)  # [batch_size, seq_len, vocab_size]
        return outputs

    def captionImage(self, image, vocabulary, max_length=50):
        """生成单张图片的描述"""
        device = image.device
        result = []

        # 编码图像
        with torch.no_grad():
            features = self.encoder(image)  # [1, embed_size]

        # 初始化输入为 <SOS>
        inputs = torch.tensor([vocabulary.stoi["<SOS>"]], dtype=torch.long).to(device)  # [1]
        inputs = inputs.unsqueeze(0)  # [1, 1]

        for _ in range(max_length):
            outputs = self.decoder(inputs, features)  # [1, seq_len, vocab_size]
            outputs = outputs[:, -1, :]  # [1, vocab_size]
            _, predicted = outputs.max(1)  # [1]

            predicted_word = vocabulary.itos[predicted.item()]
            if predicted_word == "<EOS>":
                break
            result.append(predicted_word)
            inputs = torch.cat([inputs, predicted.unsqueeze(0)], dim=1)  # [1, seq_len+1]

        return result

    def captionBatch(self, images, vocabulary, max_length=50):
        """生成批量图片的描述"""
        device = images.device
        batch_size = images.size(0)
        results = [[] for _ in range(batch_size)]

        # 编码图像
        with torch.no_grad():
            features = self.encoder(images)  # [batch_size, embed_size]

        # 初始化输入为 <SOS>
        inputs = torch.tensor([vocabulary.stoi["<SOS>"]] * batch_size, dtype=torch.long).to(device)  # [batch_size]
        inputs = inputs.unsqueeze(1)  # [batch_size, 1]

        for _ in range(max_length):
            outputs = self.decoder(inputs, features)  # [batch_size, seq_len, vocab_size]
            outputs = outputs[:, -1, :]  # [batch_size, vocab_size]
            _, predicted = outputs.max(1)  # [batch_size]

            predicted_words = [vocabulary.itos[p.item()] for p in predicted]
            for i, word in enumerate(predicted_words):
                if word != "<EOS>":
                    results[i].append(word)
            inputs = torch.cat([inputs, predicted.unsqueeze(1)], dim=1)  # [batch_size, seq_len+1]

        return results
    
# 测试模型
print(len(dataset.vocab))
model = CNN2Transformer(args.embed_size, len(dataset.vocab), args.hidden_size, args.num_layers, args.nhead, args.dim_feedforward)
outputs = model(img_batch, caption_batch)
print(outputs.size())

3432
torch.Size([24, 50, 3432])


# Training

In [74]:
def train_and_eval(args, model, 
                  train_loader, val_loader,
                  optimizer, scheduler, filename, vocab):
    criterion = torch.nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])  # 忽略填充项的损失
    best_loss = 1e5
    best_bleu = 0.0
    for epoch in range(args.num_epochs):
        model.train()
        _loss = 0
        _total = 0

        flickr_loader = tqdm(train_loader, desc="Training Epoch {}".format(epoch), total=len(train_loader))
        for imgs, captions in flickr_loader:
            imgs = imgs.to(args.device)
            captions = captions.to(args.device)

            # 前向传播，注意我们传入 captions 的前一部分
            outputs = model(imgs, captions[:, :-1])  # [batch_size, seq_len-1, vocab_size]
            optimizer.zero_grad()

            # 目标是 captions 的后一部分
            targets = captions[:, 1:]  # [batch_size, seq_len-1]

            # 计算损失
            outputs = outputs.reshape(-1, outputs.size(2))  # [(batch_size * (seq_len-1)), vocab_size]
            targets = targets.reshape(-1)  # [(batch_size * (seq_len-1))]
            print(f'targets: {targets}')
            print(f'output:{[dataset.vocab.itos[idx.item()] for idx in outputs.argmax(dim=1)]}')

            loss = criterion(outputs, targets)
            loss.backward()

            _loss += loss.item() * targets.size(0)
            _total += targets.size(0)
            optimizer.step()
            flickr_loader.set_postfix(loss=_loss / _total, lr=optimizer.param_groups[0]['lr'])
        scheduler.step()
        _loss /= _total

        model.eval()
        smoothing_fn = SmoothingFunction().method1
        references, hypotheses = [], []

        with torch.no_grad():
            eval_loader = tqdm(val_loader, desc="Evaluation", total=len(val_loader))
            for imgs, captions in eval_loader:
                imgs = imgs.to(args.device)
                captions = captions.to(args.device)

                generated_captions = model.captionBatch(imgs, vocab)
                for i in range(len(imgs)):
                    ref = captions[i].tolist()
                    references.append([[vocab.itos[idx] for idx in ref if idx not in {vocab.stoi["<SOS>"], vocab.stoi["<EOS>"], vocab.stoi["<PAD>"]}]])
                    hypotheses.append(generated_captions[i])

        # 计算 BLEU 分数
        avg_bleu = corpus_bleu(references, hypotheses, smoothing_function=smoothing_fn)
        print(f"Validation Set Average BLEU Score: {avg_bleu:.4f}")

        print(f"Loss for epoch {epoch}: {_loss}")
        if _loss < best_loss:
            best_loss = _loss
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            }
            save_checkpoint(checkpoint, filename)

def save_checkpoint(state, filename):
    print("saving checkpoint!")
    torch.save(state, filename)

def load_checkpoint(name, model):
    print("loading checkpoint!")
    checkpoint = torch.load(name, map_location=model.device if hasattr(model, 'device') else 'cpu')
    model.load_state_dict(checkpoint["state_dict"])

args.vocab_size = len(dataset.vocab)
print("VocabSize:", args.vocab_size)
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CNN2Transformer(
        embed_size=args.embed_size,
        vocab_size=args.vocab_size,
        hidden_size=args.embed_size,
        num_layers=args.num_layers,
        nhead=args.nhead,
        dim_feedforward=args.dim_feedforward,
        dropout=True
    ).to(device=args.device)

if args.optim.lower() == 'adamw':
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
else:
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.when_decay, 0.1)
filename = "caption_model_lr{}_decay{}_bsize{}.pth.tar".format(args.lr, args.when_decay, args.batch_size)
train_and_eval(args, model, loader, val_loader, optimizer, scheduler, filename, dataset.vocab)

VocabSize: 3432


Training Epoch 0:   0%|          | 1/1686 [00:02<1:06:25,  2.37s/it, loss=8.35, lr=0.002]

torch.Size([24, 256])
targets: tensor([ 9, 21,  7,  ...,  0,  0,  0], device='cuda:0')
output:['?', '?', '?', '?', 'down', '?', '?', 'earth', '?', '?', '?', '?', '?', '?', 'earth', '?', '?', '?', '?', 'to', 'earth', 'raises', '?', 'earth', 'earth', 'earth', 'earth', '?', 'earth', 'earth', '?', 'earth', 'earth', '?', 'earth', 'earth', 'earth', 'earth', '?', '?', 'earth', 'earth', '?', '?', '?', '?', '?', 'earth', 'earth', 'instructor', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'festival', 'streaked', 'camels', 'magazine', 'streaked', 'festival', 'festival', 'streaked', 'streaked', 'festival', 'festival', 'festival', 'festival', 'streaked', 'festival', 'festival', 'balck', 'festival', 'streaked', 'streaked', 'streaked', 'festival', 'festival', 'festival', 'festival', 'streaked', 'bungee', 'streaked', 'festival', 'festival', 'streaked', 'streaked', 'streaked', 'streaked', 'streaked', 'fes

Training Epoch 0:   0%|          | 3/1686 [00:02<18:59,  1.48it/s, loss=6.98, lr=0.002]  

targets: tensor([  4, 530, 301,  ...,   0,   0,   0], device='cuda:0')
output:['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '

Training Epoch 0:   0%|          | 7/1686 [00:02<06:37,  4.23it/s, loss=6.47, lr=0.002]

targets: tensor([174, 121,  32,  ...,   0,   0,   0], device='cuda:0')
output:['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '

Training Epoch 0:   1%|          | 9/1686 [00:02<04:52,  5.73it/s, loss=6.14, lr=0.002]

output:['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '<EOS>', 'a', 'a', 'a', '.', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a'

Training Epoch 0:   1%|          | 11/1686 [00:03<03:55,  7.12it/s, loss=5.99, lr=0.002]

targets: tensor([  4, 112, 780,  ...,   0,   0,   0], device='cuda:0')
output:['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '<EOS>', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '<EOS>', 'a', 'a', '<EOS>', '<EOS>', 'a', 'a', 'a', 'a', 'the', '<EOS>', '<EOS>', '<EOS>', '<EOS>', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'in', 'a', 'a', 'a', 'a', 'a', '<EOS>', 'a', 'a', 'a', 'a', '<EOS>', 'a', 'a', 'a', 'a', '<EOS>', 'a', '<EOS>', 'a', 'in', 'a', '<EOS>', '<EOS>', 'a', 'a', '<EOS>', 'in', 'a', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>', 'a', '<EOS>', 'in', '<EOS>', '<EOS>', 'a', 'a', 'a', '<EOS>', 'in', 'a',

Training Epoch 0:   1%|          | 12/1686 [00:03<08:10,  3.41it/s, loss=5.99, lr=0.002]


KeyboardInterrupt: 