In [2]:
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

In [4]:
class TransformerModel(nn.Module):

    #Transｆormerモデルの概要
    def __init__(self,
                 ntoken: int,
                 d_model: int,
                 nhead: int,
                 d_hid: int,
                 nlayers: int,
                 dropout: float = 0.5):

        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        self.emb = nn.Embedding(ntoken, d_model, padding_idx=0)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.classifer = nn.Linear(d_model, 3)
        #self.logsoftmax = nn.LogSoftmax()

        self.init_weights()


    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.classifer.bias.data.zero_()
        self.classifer.weight.data.uniform_(-initrange, initrange)

    #データの流れ
    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        
        embedded = self.emb(src) * math.sqrt(self.d_model)
        pos = self.pos_encoder(embedded)
        encoder_out = self.transformer_encoder(pos, src_mask)
        x = encoder_out.mean(dim=1)
        output = self.classifer(x)
        #output = self.logsoftmax(output)
        return output
        

In [None]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [None]:
class PositionalEncoding(nn.Module):

    def __init__(self,
                 d_model: int,
                 dropout: float = 0.1,
                 max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [8]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd

train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])


def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

# train_iter was "consumed" by the process of building the vocab,
# so we have to create it again
train_iter, val_iter, test_iter = WikiText2()
train_data = data_process(train_iter)
val_data = data_process(val_iter)
test_data = data_process(test_iter)


'\n# train_iter was "consumed" by the process of building the vocab,\n# so we have to create it again\ntrain_iter, val_iter, test_iter = WikiText2()\ntrain_data = data_process(train_iter)\nval_data = data_process(val_iter)\ntest_data = data_process(test_iter)\n'

In [30]:
from torchtext import data, datasets
from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd

tokenizer = get_tokenizer('basic_english')

TEXT  = data.Field(tokenize=tokenizer)
LABEL = data.Field(sequential=False, use_vocab=False)

# CSVファイルを読み込み、TabularDatasetオブジェクトの作成
train, test = data.TabularDataset.splits(path='tweet-of-btc',
                                         train='2021-01_tlist.csv',
                                         test='2021-07_tlist.csv',
                                         format='csv',
                                         fields=[('Tweet', TEXT), ('Favorite', LABEL)])


print('訓練データの数', len(train))
print('1つ目の訓練データ', vars(train[0]))
print(vars(train[0])['Tweet'])
print(vars(train[0])['Favorite'])

訓練データの数 1571375
1つ目の訓練データ {'Tweet': ['tweet'], 'Favorite': 'Favorite'}
['tweet']
Favorite


In [19]:
s= 'I I , a pen.'
s=torch.tensor(vocab(tokenizer(s)), dtype=torch.long)
print(s)
#print(torch.cat(tuple(filter(lambda t: t.numel() > 0, s))))

tensor([   64,    64,     2,     8, 10633,     3])


In [34]:
print(vars(train[1])['Tweet'])

['price', 'update', '1', '#bitcoin', '=', '$28', ',', '972', '.', '94', '📉', '#cryptocurrency', '$btc', 'source', 'https', '//t', '.', 'co/rhm8c4mrrz']
