In [1]:
import math
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torchtext import data
from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import pandas as pd
import random
from tqdm import tqdm
import time
import datetime
import copy
import gc

In [2]:
# Build the vocabulary from the tweet CSV.
started_at = time.time()

# Tokenizer that splits raw text into words.
word_tokenizer = get_tokenizer('basic_english')

# Field describing how the tweet column is preprocessed.
text_field = data.Field(tokenize=word_tokenizer,
                        sequential=True,
                        lower=True,
                        batch_first=True,
                        init_token='<cls>')

print("Reading...")
# Load the CSV into a TabularDataset (header row skipped).
corpus = data.TabularDataset(fields=[('tweet', text_field)],
                             path='tweet-transformer/1d/2021-17.csv',
                             format='csv',
                             skip_header=True)
print("Creating vocab...")

# Build the word dictionary, keeping words seen at least 3 times.
text_field.build_vocab(corpus, min_freq=3)
vocab = text_field.vocab
print(f'{len(vocab)=}')

print('Finish!!')
elapsed = time.time() - started_at
print(f'{elapsed:5.2f} s')

# Release everything except `vocab`, which later cells use.
del started_at, elapsed, corpus, word_tokenizer, text_field
gc.collect()

Reading...
Creating vocab...
len(vocab)=646245
Finish!!
336.00 s


2604

In [6]:
# Dataset1: one item per tweet
class CreateDataset1(Dataset):
    '''
    Per-tweet dataset that tokenizes a text and returns fixed-length ids.

    Args:
        x: indexable collection of raw texts
        y: indexable collection of section ids, same length as ``x``
        tokenizer: callable mapping a text to a list of tokens
        vocab: mapping-like object; ``vocab[token]`` gives the token id
        max_len: fixed length of the returned id / mask tensors

    Each item is a dict with:
        'ids'    : LongTensor [max_len], token ids right-padded with PAD_ID
        'mask'   : BoolTensor [max_len], True exactly at padded positions
        'section': FloatTensor [1] holding the section id
    '''

    # Value written into padded positions.
    # NOTE(review): with a torchtext Field vocabulary, id 0 is presumably also
    # what out-of-vocabulary words map to (default special-token order) --
    # confirm. Because of that, the mask below is derived from the token
    # count, never from `ids == 0`, so real (unknown) words are not masked.
    PAD_ID = 0

    def __init__(self, x, y, tokenizer, vocab, max_len):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_len = max_len

    # Value returned by len(dataset)
    def __len__(self):
        return len(self.y)

    # Value returned by dataset[index]
    def __getitem__(self, index):
        # Truncate explicitly instead of relying on negative padding.
        tokens = self.tokenizer(self.x[index])[:self.max_len]
        n_tokens = len(tokens)

        ids = torch.tensor([self.vocab[word] for word in tokens], dtype=torch.long)  # [n_tokens]
        ids = F.pad(ids, (0, self.max_len - n_tokens), "constant", self.PAD_ID)     # [max_len]

        # True only for positions that were appended as padding.
        mask = torch.arange(self.max_len) >= n_tokens

        section = self.y[index]

        return {'ids'   : ids,
                'mask'  : mask,
                'section': torch.Tensor([section])}

In [7]:
# Split Dataset1 into one list per section
def separate_section(dataset, max_index=3000000):
    '''
    Group consecutive items of ``dataset`` that share the same section id.

    Args:
        dataset: indexable collection whose items are dicts with the keys
            'ids', 'mask' and 'section'
        max_index: last index that is processed (inclusive); ``None``
            processes the whole dataset. The default keeps the cap that was
            previously hard-coded.

    Returns:
        (ids_list, mask_list): two parallel lists with one inner list per
        run of equal section ids, in dataset order.
    '''
    ids_list  = []
    mask_list = []
    start_time = time.time()
    current_section = None

    for k in range(len(dataset)):
        # Fetch the item once; every dataset[k] access recomputes the item.
        item = dataset[k]
        ids  = item['ids'].squeeze()
        mask = item['mask'].squeeze()

        section = item['section']
        if hasattr(section, 'item'):
            section = section.item()  # one-element tensor -> Python number

        # Start a new group whenever the section id changes. This does not
        # require ids to start at 0 or to increase by exactly 1.
        if not ids_list or section != current_section:
            ids_list.append([ids])
            mask_list.append([mask])
            current_section = section
        else:
            ids_list[-1].append(ids)
            mask_list[-1].append(mask)

        if k%1000000==0 and k!=0:
            print(f'| 現在 {k:8d}件 終了 | 経過時間 {time.time()-start_time:6.2f} s |')
        if max_index is not None and k >= max_index:
            break

    print(f'{len(ids_list)=}')
    print(f'{len(mask_list)=}')

    gc.collect()

    return ids_list, mask_list

In [8]:
# Dataset2: the items of a single section
class CreateDataset2(Dataset):
    '''
    Pairs two parallel indexable collections.

    Args:
        x: collection returned under the key 'ids'
        y: collection returned under the key 'mask'
    '''

    def __init__(self, x, y):
        self.x, self.y = x, y

    # Value returned by len(dataset): the length of ``x``
    def __len__(self):
        return len(self.x)

    # Value returned by dataset[index]
    def __getitem__(self, index):
        return dict(ids=self.x[index], mask=self.y[index])

In [9]:
# Build the tweet datasets
# 1. CreateDataset1
# 2. separate_section
# 3. CreateDataset2
def data_process(vocab,timespan):
    '''
    Read the tweet CSV of one timespan and build one dataset per section.

    Args:
        vocab: token -> id mapping handed to CreateDataset1
        timespan: directory name below 'tweet-transformer/' (e.g. '5m')

    Returns:
        list of CreateDataset2, one per section returned by separate_section
        (empty when no section was found)
    '''
    max_len = 128  # fixed token length of every tweet
    tokenizer = get_tokenizer('basic_english')

    print('Reading...')
    df = pd.read_csv(f'tweet-transformer/{timespan}/2021-17.csv')
    # Drop incomplete rows, then re-index so positional lookups stay contiguous.
    df = df.dropna(how='any').reset_index(drop=True)
    print('Creating Dataset1...')
    dataset = CreateDataset1(df['tweet(n)'],  df['section'],  tokenizer, vocab, max_len)
    print('Separating Section...')
    ids_list, mask_list = separate_section(dataset)

    print('Creating Dataset2...')
    dataset_list = [CreateDataset2(ids, masks)
                    for ids, masks in zip(ids_list, mask_list)]

    print('Finish!!')
    print(f'{len(dataset_list)=}')

    # Only delete names that are always bound; deleting loop variables
    # raised UnboundLocalError when no section was produced.
    del df, dataset, ids_list, mask_list
    gc.collect()

    return dataset_list

In [4]:
# Hyperparameters for Net (passed positionally to Net(...) in the main cell).
# Requires `vocab` from the vocabulary cell to exist in the kernel.
ntokens = len(vocab)  # size of vocabulary
d_model = 512  # embedding dimension
nhead   = 8    # number of heads in nn.MultiheadAttention
d_hid   = 2048  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6    # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.2  # dropout probability

In [5]:
# Transformer + LSTM model
class Net(nn.Module):
    '''
    Window classifier: a Transformer encoder scores every tweet, the scores
    are summed per section, and an LSTM reads the per-section sums.

    Args:
        ntoken: vocabulary size
        d_model: embedding dimension
        nhead: number of attention heads
        d_hid: feedforward dimension inside each encoder layer
        nlayers: number of encoder layers
        dropout: dropout probability
        lstm_hidden_dim: hidden size of the LSTM.
            NOTE(review): the original code read this from an undefined
            name; the default here is a placeholder -- tune it.
        tweet_batch_size: number of tweets encoded at once inside forward
    '''

    N_CLASS = 3  # [neg, neu, pos]

    def __init__(self,
                 ntoken: int,
                 d_model: int,
                 nhead: int,
                 d_hid: int,
                 nlayers: int,
                 dropout: float = 0.5,
                 lstm_hidden_dim: int = 64,
                 tweet_batch_size: int = 1024):

        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        self.tweet_batch_size = tweet_batch_size
        self.embedding = nn.Embedding(ntoken,
                                d_model,
                                padding_idx=0)
        self.pos_encoder = PositionalEncoding(d_model,
                                              dropout)
        # Default layout of the encoder is sequence-first: [seq, batch, dim].
        encoder_layers = TransformerEncoderLayer(d_model,
                                                 nhead,
                                                 d_hid,
                                                 dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers,
                                                      nlayers)
        self.dense1 = nn.Linear(d_model, self.N_CLASS)
        self.softmax = nn.Softmax(dim=1)

        # The LSTM consumes one N_CLASS-sized vector per section.
        self.input_dim = self.N_CLASS
        self.hidden_dim = lstm_hidden_dim
        self.lstm = nn.LSTM(input_size=self.N_CLASS,
                            hidden_size=lstm_hidden_dim,
                            num_layers=1,
                            batch_first=True)
        self.dense2 = nn.Linear(lstm_hidden_dim, self.N_CLASS)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # uniform_ overwrote the padding row that nn.Embedding had zeroed.
        self.embedding.weight.data[self.embedding.padding_idx].zero_()
        for dense in (self.dense1, self.dense2):
            dense.bias.data.zero_()
            dense.weight.data.uniform_(-initrange, initrange)

    def _score_tweets(self, ids: Tensor, mask: Tensor) -> Tensor:
        '''
        Args:
            ids: LongTensor, shape [batch_size, seq_len]
            mask: BoolTensor, shape [batch_size, seq_len], True at padding

        Returns:
            Tensor of shape [batch_size, 3] with class probabilities
        '''
        # A row whose keys are all masked yields NaN attention weights, and
        # one NaN would poison the whole section sum; keep position 0 open.
        mask = mask.clone()
        mask[:, 0] = False

        x = self.embedding(ids) * math.sqrt(self.d_model)  # [batch, seq, dim]
        x = x.transpose(0, 1)                              # [seq, batch, dim]
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x, src_key_padding_mask=mask)
        x = x.mean(dim=0)                                  # [batch, dim]
        x = self.dense1(x)
        return self.softmax(x)

    # Data flow
    def forward(self, train_list_n, ) -> Tensor:
        '''
        Args:
            train_list_n: non-empty sequence of section datasets; every item
                of a section is a dict with 'ids' [seq_len] and 'mask'
                [seq_len]

        Returns:
            output Tensor of shape [1, nclass:3] (unnormalized logits)
        '''
        if len(train_list_n) == 0:
            raise ValueError('train_list_n must contain at least one section')

        # Follow the module's own device instead of a notebook global.
        device = self.embedding.weight.device
        p_section = []

        # out : list of tensor[neg,neu,pos], one per section
        for section in train_list_n:
            # Order is irrelevant because the scores are summed.
            batches = DataLoader(section, batch_size=self.tweet_batch_size, shuffle=False)

            # out : list of tensor[batch_size,3]
            p_batch = [self._score_tweets(batch['ids'].to(device),
                                          batch['mask'].to(device))
                       for batch in batches]

            if p_batch:
                p_section.append(torch.cat(p_batch, dim=0).sum(dim=0))
            else:
                # A section without tweets contributes nothing.
                p_section.append(torch.zeros(self.N_CLASS, device=device))

        inlist = torch.stack(p_section, dim=0).unsqueeze(0)  # [1, n_sections, 3]
        _, (h_n, _) = self.lstm(inlist)                      # h_n: [1, 1, hidden]
        output = self.dense2(h_n[-1])                        # [1, 3]
        return output
        

In [6]:
# Sinusoidal positional encoding
class PositionalEncoding(nn.Module):
    '''
    Adds fixed sine/cosine position signals to a sequence-first input and
    applies dropout.

    Args:
        d_model: embedding dimension (even or odd)
        dropout: dropout probability applied to the sum
        max_len: largest sequence length that can be encoded
    '''

    def __init__(self,
                 d_model: int,
                 dropout: float = 0.1,
                 max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one fewer odd column than even ones.
        pe[:, 0, 1::2] = torch.cos(position * div_term[:d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        '''
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
               (sequence-first: the table is sliced by x.size(0) and
               broadcast over dim 1)
        '''
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
# Shared objects for training & evaluation.
# NOTE(review): `model` and `lr` are not defined by any cell above this one,
# so this cell raises NameError on a fresh kernel. The main cell builds a new
# model for every (timespan, n); an optimizer created here only holds the
# parameters of whatever `model` existed when this cell ran.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
softmax = nn.Softmax(dim=1)
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
# NOTE(review): seeding here does not affect anything created earlier.
torch.manual_seed(0)

In [8]:
# training
def train(model, train_list, n, labels=None):
    '''
    Run one training pass over every window of ``n`` consecutive sections.

    Uses the module-level ``optimizer``, ``scheduler`` and ``criterion``;
    the progress line also reads the module-level ``epoch``.

    Args:
        model: module called as ``model(window)``
            (assumes it returns logits of shape [1, n_class] -- TODO confirm)
        train_list: sequence of section datasets
        n: number of consecutive sections per window
        labels: sequence of class indices, ``labels[i]`` being the target of
            the window ``train_list[i:i+n]``.
            NOTE(review): no cell of this notebook loads targets, so they
            must be supplied by the caller.

    Raises:
        ValueError: if ``labels`` is missing or shorter than the number of
            windows.
    '''
    if labels is None:
        raise ValueError('train() needs `labels`: one class index per window of n sections')
    num_batches = max(len(train_list) - n + 1, 0)
    if len(labels) < num_batches:
        raise ValueError(f'expected at least {num_batches} labels, got {len(labels)}')

    model.train()
    log_interval = max(math.ceil(num_batches/30)*10, 1)
    train_correct = 0
    train_count = 0
    # Statistics of the current logging interval only.
    interval_start = time.time()
    interval_loss = 0.0
    interval_steps = 0

    for i in range(num_batches):
        train_list_n = train_list[i:i+n]

        predictions = model(train_list_n)
        target = torch.as_tensor(labels[i], dtype=torch.long,
                                 device=predictions.device).reshape(-1)

        loss = criterion(predictions, target)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # argmax over logits equals argmax over softmax probabilities.
        correct = predictions.argmax(dim=1) == target
        train_correct += correct.sum().item()
        train_count += correct.size(0)
        interval_loss += loss.item()
        interval_steps += 1

        batch_counter = i + 1
        if batch_counter % log_interval == 0 or batch_counter == num_batches:
            lr = scheduler.get_last_lr()[0]
            # Divide by the steps actually taken; the last interval may be short.
            s_per_batch = (time.time() - interval_start) / interval_steps
            cur_loss = interval_loss / interval_steps
            cur_acc = train_correct / train_count
            print(f'| epoch {epoch:3d} | {batch_counter:5d}/{num_batches:5d} batches | '
                  f'lr {lr:1.5f} | s/batch {s_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | accuracy {cur_acc:8.2f}')
            interval_loss = 0.0
            interval_steps = 0
            interval_start = time.time()

    gc.collect()

In [9]:
# evaluation (val, test)
def evaluate(model, eval_list, n, labels=None):
    '''
    Evaluate ``model`` on every window of ``n`` consecutive sections.

    Uses the module-level ``criterion``.

    Args:
        model: module called as ``model(window)``
            (assumes it returns logits of shape [1, n_class] -- TODO confirm)
        eval_list: sequence of section datasets
        n: number of consecutive sections per window
        labels: sequence of class indices, ``labels[i]`` being the target of
            the window ``eval_list[i:i+n]``.
            NOTE(review): no cell of this notebook loads targets, so they
            must be supplied by the caller.

    Returns:
        (eval_loss, accuracy): loss summed over all windows and the fraction
        of correct predictions; (0.0, 0.0) when there is no window.

    Raises:
        ValueError: if ``labels`` is missing or shorter than the number of
            windows.
    '''
    if labels is None:
        raise ValueError('evaluate() needs `labels`: one class index per window of n sections')
    num_windows = max(len(eval_list) - n + 1, 0)
    if len(labels) < num_windows:
        raise ValueError(f'expected at least {num_windows} labels, got {len(labels)}')

    model.eval()
    eval_loss = 0.0
    eval_correct = 0
    eval_count = 0

    with torch.no_grad():
        for i in range(num_windows):
            predictions = model(eval_list[i:i+n])
            target = torch.as_tensor(labels[i], dtype=torch.long,
                                     device=predictions.device).reshape(-1)

            loss = criterion(predictions, target)

            # argmax over logits equals argmax over softmax probabilities.
            correct = predictions.argmax(dim=1) == target

            eval_correct += correct.sum().item()
            eval_count += correct.size(0)
            eval_loss += loss.item()

    # Avoid dividing by zero when eval_list is shorter than n.
    eval_acc = eval_correct / eval_count if eval_count else 0.0
    print(f'| loss {eval_loss}| accuracy {eval_acc} ')

    return eval_loss, eval_acc

In [None]:
# main: for every timespan and window size n, train a fresh model and test it
# NOTE(review): no target labels are loaded anywhere in this notebook;
# train()/evaluate() need them before this loop can run end to end.
# NOTE(review): test_list is used both for model selection and for the final
# score, so the reported test accuracy is optimistic.
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']
batch_size= {'1d':16,'12h':32,'4h':64,'1h':256,'30m':512,'15m':1024,'5m':2048}
aculist = {}
for timespan in tlist:
    print(f'{timespan=}')
    dataset_list = data_process(vocab,timespan)
    # shuffle=False keeps chronological order: the tail becomes the test set.
    train_list, test_list = train_test_split(dataset_list, test_size = 1/7, shuffle=False)
    print(f'{len(train_list)=}')
    print(f'{len(test_list)=}')

    del dataset_list
    gc.collect()

    for n in nlist:
        print(f'{n=}')

        model = Net(ntokens, d_model, nhead, d_hid, nlayers, dropout).to(device)
        lr = 1e-3
        # The optimizer and scheduler must be bound to THIS model's
        # parameters; train() reads them as module-level names.
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)
        best_val_loss = float('inf')
        epochs = 1
        best_model = None

        dt_start = datetime.datetime.now()
        print(dt_start)
        print('***training start***')
        print('-' * 95)

        # training & validation loop
        for epoch in range(1, epochs + 1):
            epoch_start_time = time.time()
            train(model, train_list, n)
            val_loss, val_acc = evaluate(model, test_list, n)
            print('-' * 95)
            print(f'| end of epoch {epoch:3d} | time: {time.time()-epoch_start_time:5.2f}s | '
                  f'val loss：{val_loss:5.3f} | val accuracy：{val_acc:8.3f}')
            print('-' * 95)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = copy.deepcopy(model)

            scheduler.step()

            del epoch_start_time, val_loss, val_acc
            gc.collect()

        dt_end = datetime.datetime.now()
        print(dt_end)
        # A timedelta does not accept a float format spec; format its seconds.
        print(f'***Finish! training time：{(dt_end - dt_start).total_seconds():5.2f}s***')

        # No epoch improved on inf (e.g. NaN loss): fall back to the last model.
        if best_model is None:
            best_model = model

        # test
        test_loss, test_acc = evaluate(best_model, test_list, n)
        print('=' * 89)
        print(f'| End of training | test loss：{test_loss:5.3f} | '
              f'test accuracy：{test_acc:8.3f}')
        print('=' * 89)

        del best_val_loss,epochs,model,best_model,test_loss,test_acc,dt_start,dt_end
        del optimizer, scheduler
        gc.collect()

    del train_list, test_list
    gc.collect()



In [5]:
# Force a garbage-collection pass; the cell output is the value it returns.
gc.collect()

44

In [11]:
# Pick the default device and name the two GPUs explicitly.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
device0, device1 = torch.device("cuda:0"), torch.device("cuda:1")
print(has_cuda)
print(device)

True
cuda


In [15]:
# Scratch check: move a tensor between the two GPUs.
# Needs both cuda:0 and cuda:1 to be present.
x = torch.randn(3,3).to(device0)
print(x)
x=x.to(device1)
print(x)

# Scratch check: CreateDataset2 returns its stored tensors unchanged, each
# still on the device it was created on ('ids' from the first list, 'mask'
# from the second).
x1 = torch.randn(3,3).to(device0)
x2 = torch.randn(3,3).to(device0)
y1 = torch.randn(3,3).to(device1)
y2 = torch.randn(3,3).to(device1)
print(x1,x2,y1,y2)
l=CreateDataset2([x1,x2],[y1,y2])
print(len(l))
print(l[0])
print(l[1])

tensor([[ 0.6144, -0.9741,  0.0622],
        [-0.1999,  1.0654, -0.1868],
        [ 0.5121,  1.4630, -0.4545]], device='cuda:0')
tensor([[ 0.6144, -0.9741,  0.0622],
        [-0.1999,  1.0654, -0.1868],
        [ 0.5121,  1.4630, -0.4545]], device='cuda:1')
tensor([[ 1.8308,  1.8875, -0.0255],
        [ 0.4251, -1.1523, -1.1985],
        [-0.8290, -0.2899,  0.4874]], device='cuda:0') tensor([[-0.8052, -1.1367, -1.3727],
        [-0.9224,  0.6209,  0.3498],
        [ 0.2493,  1.5550,  0.3182]], device='cuda:0') tensor([[-0.2901, -0.4591, -0.0732],
        [-1.1127, -1.2470, -0.4931],
        [ 1.7144, -1.0427,  0.4638]], device='cuda:1') tensor([[-0.0341, -1.9631, -0.6365],
        [ 0.4210, -0.8261, -0.4116],
        [-0.2698, -0.7567, -0.7895]], device='cuda:1')
2
{'ids': tensor([[ 1.8308,  1.8875, -0.0255],
        [ 0.4251, -1.1523, -1.1985],
        [-0.8290, -0.2899,  0.4874]], device='cuda:0'), 'mask': tensor([[-0.2901, -0.4591, -0.0732],
        [-1.1127, -1.2470, -0.4931],
    

In [12]:
# Build the per-section datasets for the 5-minute timespan.
# Requires `vocab` from the vocabulary cell.
timespan='5m'
dataset_list = data_process(vocab,timespan)


Reading...
Creating Dataset1...
Separating Section...
| 現在  1000000件 終了 | 経過時間 121.30 s |
| 現在  2000000件 終了 | 経過時間 242.78 s |
| 現在  3000000件 終了 | 経過時間 361.34 s |
len(ids_list)=15454
len(mask_list)=15454
Creating Dataset2...
Finish!!
len(dataset)=13136901


In [13]:
# Inspect how many sections exist and how many items a few of them hold.
print(f'{len(dataset_list)=}')
print(f'{len(dataset_list[0])=}')
print(f'{len(dataset_list[1])=}')
print(f'{len(dataset_list[2])=}')
print(f'{len(dataset_list[1000])=}')  # assumes more than 1000 sections

# Unshuffled split: the last 1/7 of the sections become the test set.
train_list, test_list = train_test_split(dataset_list, test_size = 1/7, shuffle=False)
print(f'{len(train_list)=}')
print(f'{len(test_list)=}')

len(dataset_list)=15454
len(dataset_list[0])=220
len(dataset_list[1])=164
len(dataset_list[2])=139
len(dataset_list[1000])=196
len(train_list)=13246
len(test_list)=2208


In [16]:
# Preview the first batch produced for the first training section.
print(train_list[0])
dataloader_train = DataLoader(train_list[0], batch_size=1024, shuffle=True)
# The loop variable is named `batch`: calling it `data` would rebind the
# imported `data` module and break a re-run of the vocabulary cell.
for batch in dataloader_train:
    print(batch['ids'])
    print(batch['mask'])
    print(batch['ids'].size())
    print(batch['mask'].size())
    print('\n')
    break  # only the first batch is shown

<__main__.CreateDataset2 object at 0x7f08eef24d00>
tensor([[  952,    92,  1622,  ...,     0,     0,     0],
        [  231, 25997,   595,  ...,     0,     0,     0],
        [    6,  7765,   562,  ...,     0,     0,     0],
        ...,
        [  116,   350,    81,  ...,     0,     0,     0],
        [ 1099,   134,     6,  ...,     0,     0,     0],
        [  126,     5,    37,  ...,     0,     0,     0]])
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])
torch.Size([220, 128])
torch.Size([220, 128])




In [22]:
# Scratch check: a window of n items starting at index i.
l = list(range(10))
i, n = 3, 3
tl = l[i:i + n]
print(tl)

[3, 4, 5]


In [35]:
# Scratch check: concatenating [64, 3] batches and summing over dim 0
# leaves a single vector of size 3.
x1, x2, x3 = (torch.randn(64, 3) for _ in range(3))
l = [x1, x2, x3]
out = torch.cat(l, dim=0).sum(dim=0)
print(out.size())
print(out)
print(l[1][0, 2])

torch.Size([3])
tensor([-14.0435,  -3.9850,   3.4772])
tensor(0.6961)
