In [2]:
import math
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torchtext import data
from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import pandas as pd
import random
from tqdm import tqdm
import time
import datetime
import copy
import gc

In [3]:
# vacab作成
# テキストを単語で分割
v_start = time.time()
tokenizer = get_tokenizer('basic_english')

# data field定義
TEXT  = data.Field(sequential=True,
                     lower=True,
                     batch_first=True, 
                     tokenize=tokenizer,
                     init_token='<cls>')

print("データ読み込み中")
# CSVファイルを読み込み、TabularDatasetオブジェクトの作成
vocab_data = data.TabularDataset(path ='tweet-transformer/1d/2021-17.csv',
                                       format='csv',
                                       skip_header = True,
                                       fields=[('tweet', TEXT)])
print("データ読み込み完了")

# 単語辞書の作成
TEXT.build_vocab(vocab_data, min_freq=3)
vocab = TEXT.vocab
print(f'{len(vocab)=}')
'''
TEXT.build_vocab(vocab_data, min_freq=10)
vocab = TEXT.vocab
print(f'{len(vocab)=}')
TEXT.build_vocab(vocab_data, min_freq=5)
vocab = TEXT.vocab
print(f'{len(vocab)=}')
TEXT.build_vocab(vocab_data, min_freq=2)
vocab = TEXT.vocab
print(f'{len(vocab)=}')
TEXT.build_vocab(vocab_data, min_freq=1)
vocab = TEXT.vocab
print(f'{len(vocab)=}')
'''

print('辞書作成完了')
print(f'{time.time() - v_start:5.2f} s')

# メモリ開放
del v_start, vocab_data
gc.collect()

データ読み込み中
データ読み込み完了
len(vocab)=230011
len(vocab)=404095
len(vocab)=646245
len(vocab)=1019658
len(vocab)=3415818
辞書作成完了
511.6372814178467 s


3711

In [3]:
'''
# データの前処理
def data_process(timespan):
    train_iter_list = []
    val_iter_list = []
    test_iter_list = []

    klist = {'1d' :211+1,
             '12h':423+1,
             '4h' :1271+1,
             '1h' :5087+1, 
             '30m':10175+1,
             '15m':20351+1,
             '5m' :61055+1}
    num_sections = klist[timespan]
    log_interval = math.ceil(num_sections/100)*10
    counter = 0
    start_time = time.time()
    
    # テキストを単語で分割
    #tokenizer = get_tokenizer('basic_english')

    # data field定義
    SECTION = data.Field(sequential=False,
                         use_vocab=False)


    # CSVファイルを読み込み、TabularDatasetオブジェクトの作成
    k = klist[timespan]
    for i in range(0,k):
        all_dataset = data.TabularDataset(path =f'tweet-transformer/{timespan}/mini-batches/section_{i}.csv',
                                               format='csv',
                                               skip_header = True,
                                               fields=[('tweet', TEXT),
                                                       ('section', SECTION)])

        # trainとtestで分割
        train_val_dataset, test_dataset = all_dataset.split(split_ratio=6/7)
        train_dataset, val_dataset = train_val_dataset.split(split_ratio=5/6, random_state=random.seed(1234))

        # テキストを数値ベクトル化、バッチに分割
        batch_size = 128
        train_iter,val_iter,test_iter = data.BucketIterator.splits((train_dataset,val_dataset,test_dataset),
                                                                   batch_sizes=(batch_size,batch_size,batch_size),
                                                                   sort = False,
                                                                   shuffle=True)
        
        train_iter_list.append(train_iter)
        val_iter_list.append(val_iter)
        test_iter_list.append(test_iter)
        
        # メモリ解放
        del all_dataset,train_dataset,train_iter,val_dataset,val_iter,test_dataset,test_iter
        gc.collect()
        
        counter += 1
        if counter % log_interval == 0 or counter == num_sections:
            print(f'|{counter:5d}/{num_sections:5d} sections | ')
        
    print(f'読み込み完了　　{timespan} : {time.time()-start_time} s')
    
    return train_iter_list, val_iter_list, test_iter_list

In [7]:
# Datasetの定義
class CreateDataset(Dataset):
    def __init__(self, x, y, tokenizer, vacab, max_len):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):  # len(Dataset)で返す値を指定
        return len(self.y)

    def __getitem__(self, index):  # Dataset[index]で返す値を指定
        text = self.x[index]
        text = self.tokenizer(text)
        ids = torch.tensor([self.vocab[word] for word in text], dtype=torch.long) # [seq_len]
        ids = F.pad(ids, (0 ,max_len), "constant", 0) # [max_len]
        mask = [ids==0]

        section = self.y[index]

        return {'ids'   : ids,
                'mask'  : mask,
                'section': torch.Tensor([section])}


In [8]:
# Datasetの作成
def dataset(vocab,timespan):
    max_len = 128
    tokenizer = get_tokenizer('basic_english')

    # CSVファイルを読み込み、データセットを作成
    df = pd.read_csv(f'tweet-transformer/{timespan}/2021-17.csv')
    dataset = CreateDataset(df['tweet(n)'],  df['section'],  tokenizer, vocab, max_len)

    print('dataset作成完了')
    print(f'{len(dataset)=}')
    
    return dataset

In [None]:
# セクションごとにバッチ化 
def section_batch(dataset):
    

In [None]:
# nごとにバッチ化 
def n_batch(sbatches):
    

In [4]:
# parametator for Net
ntokens = len(vocab)  # size of vocabulary
d_model = 512  # embedding dimension
nhead   = 8    # number of heads in nn.MultiheadAttention
d_hid   = 2048  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6    # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.2  # dropout probability

In [5]:
# Transformerモデルの概要
class Net(nn.Module):

    def __init__(self,
                 ntoken: int,
                 d_model: int,
                 nhead: int,
                 d_hid: int,
                 nlayers: int,
                 dropout: float = 0.5):

        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        self.emb = nn.Embedding(ntoken,
                                d_model,
                                padding_idx=0)
        self.pos_encoder = PositionalEncoding(d_model,
                                              dropout)
        encoder_layers = TransformerEncoderLayer(d_model,
                                                 nhead,
                                                 d_hid,
                                                 dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers,
                                                      nlayers)
        self.classifer = nn.Linear(d_model,3)
        #self.softmax = nn.Softmax()
        
        self.input_dim = lstm_input_dim
        self.hidden_dim = lstm_hidden_dim
        self.lstm = nn.LSTM(input_size=lstm_input_dim, 
                            hidden_size=lstm_hidden_dim,
                            num_layers=1,
                            batch_first=True
                            )
        
        

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.emb.weight.data.uniform_(-initrange, initrange)
        self.classifer.bias.data.zero_()
        self.classifer.weight.data.uniform_(-initrange, initrange)

    #データの流れ
    def forward(self, src: Tensor) -> Tensor:
        '''
        Args:
            src: Tensor, shape [batch_size, seq_len]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [batch_size, nclass:3]
        '''
# src_key_padding_mask = src_mask
        
#        for i in range(1,num_batches+1)
            embedded = self.emb(src) * math.sqrt(self.d_model)
            pos = self.pos_encoder(embedded)
            encoder_out = self.transformer_encoder(pos)
            x = encoder_out.mean(dim=1)
            output = self.classifer(x)
            
        
        #output = self.softmax(output)
        return output
        

In [6]:
# PositionalEncodingの概要
class PositionalEncoding(nn.Module):

    def __init__(self,
                 d_model: int,
                 dropout: float = 0.1,
                 max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        '''
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        '''
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [7]:
# paramator for training & evaluation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(ntokens, d_model, nhead, d_hid, nlayers, dropout).to(device)
lr = 1e-3
softmax = nn.Softmax(dim=1)
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
torch.manual_seed(0)

<torch._C.Generator at 0x245622117b0>

In [8]:
# training
def train(model: nn.Module, train_iter: Tensor):
    train_start_time = time.time()
    model.train()
    num_batches = len(train_iter)
    log_interval = math.ceil(num_batches/30)*10
    batch_counter = 0
    train_loss = 0
    train_correct = 0
    train_count = 0
    
    for idx, batch in enumerate(iter(train_iter)):
        predictions = model(batch.tweet_n.to(device))
        prob = softmax(predictions)
        labels = batch.label.to(device)

        loss = criterion(predictions, labels)
        
        correct = prob.argmax(axis=1) == labels
        acc = correct.sum().item() / correct.size(0)

        train_correct += correct.sum().item()
        train_count += correct.size(0)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        batch_counter += 1
        
        if batch_counter % log_interval == 0 or batch_counter == num_batches:
            lr = scheduler.get_last_lr()[0]
            s_per_batch = (time.time() - train_start_time) / log_interval
            cur_loss = train_loss / log_interval
            cur_acc = train_correct / train_count
            print(f'| epoch {epoch:3d} | {batch_counter:5d}/{num_batches:5d} batches | '
                  f'lr {lr:1.5f} | s/batch {s_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | accuracy {cur_acc:8.2f}')
            total_loss = 0
            train_start_time = time.time()

In [9]:
# evaluation (val, test)
def evaluate(model: nn.Module, eval_iter: Tensor):
    model.eval()
    eval_loss = 0
    eval_correct = 0
    eval_count = 0

    with torch.no_grad():
        for idx, batch in enumerate(iter(eval_iter)):
            predictions = model(batch.tweet_n.to(device))
            prob = softmax(predictions)
            labels = batch.label.to(device)

            loss = criterion(predictions, labels)

            correct = prob.argmax(axis=1) == labels
            acc = correct.sum().item() / correct.size(0)

            eval_correct += correct.sum().item()
            eval_count += correct.size(0)
            eval_loss += loss.item()
        
    print(f'| loss {eval_loss}| accuracy {eval_correct / ecal_count} ')
        
    return eval_loss, eval_correct / eval_count

In [None]:
# main
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']
aculist = {}
for timespan in tlist:
    print(f'{timespan=} データ読み込み中')
    train_iter_list, val_iter_list, test_iter_list = data_process(timespan)
    
    for n in nlist:
        print(f'{n=}')

        best_val_loss = float('inf')
        epochs = 1
        best_model = None

        dt_start = datetime.datetime.now()
        print(datetime.datetime.now())
        print('学習開始')
        print('-' * 95)

        # training & validation roop
        for epoch in range(1, epochs + 1):
            epoch_start_time = time.time()
            train(model, train_iter_list)
            val_loss, val_acc = evaluate(model, val_iter_list)
            elapsed = time.time() - epoch_start_time
            print('-' * 95)
            print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
                  f'valid loss {val_loss:5.2f} | valid accuracy {val_acc:8.2f}')
            print('-' * 95)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = copy.deepcopy(model)

            scheduler.step()

            del epoch_start_time, elapsed, vall_loss, vall_acc
            gc.collect()

        dt_end = datetime.datetime.now()
        print(datetime.datetime.now())    
        print(f'経過時間：{dt_end - dt_start}')
        print('学習終了')
        
        # test
        test_loss, test_acc = evaluate(best_model, test_iter_list)
        print('=' * 89)
        print(f'| End of training | test loss {test_loss:5.2f} | '
              f'test accuracy {test_acc:8.2f}')
        print('=' * 89)
        
        del best_val_loss,epochs,best_model,test_loss,test_acc,dt_start,dt_end
        gc.collect()
        
    del train_iter_list, val_iter_list, test_iter_list
    gc.collect()


In [33]:
gc.collect()

30

In [20]:
train_ = next(iter(train_iter))
tweet = train_.tweet_n
print(train_)
print(tweet)


[torchtext.legacy.data.batch.Batch of size 128]
	[.tweet_n]:[torch.LongTensor of size 128x61]
	[.label]:[torch.LongTensor of size 128]
tensor([[   2,  821,   58,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,  270,  244,  ...,    1,    1,    1],
        ...,
        [   2,   11,   23,  ...,    1,    1,    1],
        [   2, 1709, 1591,  ...,    1,    1,    1],
        [   2,   18,    6,  ...,    1,    1,    1]])


In [None]:
i=0
#l=len(train_iter)
for idx, batch in enumerate(iter(train_iter)):
    i+=1
    print(idx)
#    print(batch.label)
    print(batch.tweet_n)
    print(batch.tweet_n.size())
    if i==3: break
print("end")

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
train_iter_list, val_iter_list, test_iter_list = data_process('1d')
print(len(train_iter_list))
print(len(val_iter_list))
print(len(test_iter_list))

In [None]:

for i in range(1,10):
    print('-'*90)
    for batch in iter(test_iter_list[i]):
        print(batch)
        print(batch.section)
        print(batch.tweet)
        print(batch.tweet.size())
    if i == 3: break
print("end")


In [25]:
print(len(df_dataset))
print(df_dataset[13140000])

NameError: name 'df_dataset' is not defined

In [37]:
print(len(vocab))
print(vocab['<cls>'])

1019658
2


In [26]:
# テスト用
max_len = 128
tokenizer = get_tokenizer('basic_english')

# CSVファイルを読み込み、データセットを作成
df = pd.read_csv(f'tweet-transformer/1d/2021-17.csv')
print(f'{len(df)=}')
dataset = CreateDataset(df['tweet(n)'],  df['section'],  tokenizer, vocab, max_len)

print('dataset作成完了')
print(f'{len(dataset)=}')

len(df)=13146832
dataset作成完了
len(dataset)=13146832


In [27]:
gc.collect()

112

In [23]:
x=torch.zeros(1,128)
y=torch.ones(1,128)
z=torch.cat([x,y,x,y],0)
print(z)
print(z.size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [37]:
for i in range(0,10):
#    if i == 0:
    z = dataset[i]['ids']
    print(z.size())
#    else:
#        z = torch.cat([z,dataset[i]['ids']],0)
print(z)
print(z.size())
print(z.ndim)

torch.Size([140])
torch.Size([157])
torch.Size([146])
torch.Size([172])
torch.Size([153])
torch.Size([158])
torch.Size([173])
torch.Size([162])
torch.Size([141])
torch.Size([158])
tensor([  1333,   1345,   3124,   8816,   5406,   8672,      3,  85290,  19615,
             3,  82409,  25114,      3, 105019,  31599,      3, 203741,   5406,
           973,      3, 112693,   5741,      3,  79954,   1209,      3,  97827,
           809,      3,   3494,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,     

In [36]:
z.view(-1, 128)

RuntimeError: shape '[-1, 128]' is invalid for input of size 1560