In [58]:
import math
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torchtext import data
from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import pandas as pd
import random
from tqdm import tqdm
import time
import datetime
import copy
import gc

In [7]:
# vacab作成
# テキストを単語で分割
v_start = time.time()
tokenizer = get_tokenizer('basic_english')

# data field定義
TEXT  = data.Field(sequential=True,
                     lower=True,
                     batch_first=True, 
                     tokenize=tokenizer,
                     init_token='<cls>')

print("データ読み込み中")
# CSVファイルを読み込み、TabularDatasetオブジェクトの作成
vocab_data = data.TabularDataset(path ='tweet-transformer/1d/2021-17.csv',
                                       format='csv',
                                       skip_header = True,
                                       fields=[('tweet', TEXT)])
print("データ読み込み完了")

# 単語辞書の作成
TEXT.build_vocab(vocab_data, min_freq=3)
vocab = TEXT.vocab
print(f'{len(vocab)=}')

print('辞書作成完了')
print(f'{time.time() - v_start:5.2f} s')

# メモリ開放
del v_start, vocab_data, tokenizer, TEXT
gc.collect()

データ読み込み中
データ読み込み完了
len(vocab)=646245
辞書作成完了
334.53 s


3687

In [92]:
# Datasetの定義
class CreateDataset1(Dataset):
    def __init__(self, x, y, tokenizer, vocab, max_len):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_len = max_len

    # len(Dataset)で返す値を指定    
    def __len__(self):
        return len(self.y)

    # Dataset[index]で返す値を指定
    def __getitem__(self, index):
        text = self.x[index]
        text = self.tokenizer(text)
        ids  = torch.tensor((self.vocab[word] for word in text), dtype=torch.long) # [seq_len]
        ids  = F.pad(ids, (0 ,max_len-len(text)), "constant", 0) # [max_len]
        mask = (ids==0)

        section = self.y[index]

        return {'ids'   : ids,
                'mask'  : mask,
                'section': torch.Tensor([section])}

In [60]:
def separate_section(dataset, device):
    ids_list  = []
    mask_list = []
    start_time = time.time()
    counter = -1
    
    for k in range(0, len(dataset)):
        i = dataset[k]['ids'].squeeze().to(device)
        m = dataset[k]['mask'].squeeze().to(device)
        if counter != dataset[k]['section']:
            ids_list.append([i])
            mask_list.append([m])
            counter += 1
        else:
            ids_list[counter].append(i)
            mask_list[counter].append(m)
            
        if k%3000000==0 and k!=0:
            print(f'現在{k}件終了')
            print(f'経過時間{start_time - time.time()}s')
    
    print(f'{len(ids_list)=}')
    print(f'{len(mask_list)=}')
    
    del i, m, k, start, counter
    gc.collect()
    
    return ids_list, mask_list

In [61]:
# Datasetの定義
class CreateDataset2(Dataset):
    def __init__(self, x, y, device):
        self.x = x
        self.y = y
        
    # len(Dataset)で返す値を指定
    def __len__(self):
        return len(self.x)

    # Dataset[index]で返す値を指定
    def __getitem__(self, index):
        ids  = self.x[index]
        mask = self.y[index]

        return {'ids'   : ids,
                'mask'  : mask}

In [62]:
# Datasetの作成 (ツイート)
# 1. CreateDataset1
# 2. separate_section
# 3. CreateDataset2
def data_process(vocab,timespan,device):
    max_len = 128
    tokenizer = get_tokenizer('basic_english')

    df = pd.read_csv(f'tweet-transformer/{timespan}/2021-17.csv')
    df = df.dropna(how='any')
    df = df.reset_index(drop=True)
    dataset = CreateDataset1(df['tweet(n)'],  df['section'],  tokenizer, vocab, max_len)
    ids_list, mask_list = separate_section(dataset, device)
    
    dataset_list = []
    for i in range(0, len(ids_list)):
        x = CreateDataset2(ids_list[i], mask_list[i], device)
        dataset_list.append(x)

    print('dataset作成完了')
    print(f'{len(dataset)=}')
    
    del max_len, tokenizer, df, ids_list, mask_list, dataset, x, i
    gc.collect()
    
    return dataset_list

In [None]:
# nごとにバッチ化 
def n_batch(sbatches):
    

In [4]:
# parametator for Net
ntokens = len(vocab)  # size of vocabulary
d_model = 512  # embedding dimension
nhead   = 8    # number of heads in nn.MultiheadAttention
d_hid   = 2048  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6    # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.2  # dropout probability

In [5]:
# Transformerモデルの概要
class Net(nn.Module):

    def __init__(self,
                 ntoken: int,
                 d_model: int,
                 nhead: int,
                 d_hid: int,
                 nlayers: int,
                 dropout: float = 0.5):

        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        self.emb = nn.Embedding(ntoken,
                                d_model,
                                padding_idx=0)
        self.pos_encoder = PositionalEncoding(d_model,
                                              dropout)
        encoder_layers = TransformerEncoderLayer(d_model,
                                                 nhead,
                                                 d_hid,
                                                 dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers,
                                                      nlayers)
        self.classifer = nn.Linear(d_model,3)
        #self.softmax = nn.Softmax()
        
        self.input_dim = lstm_input_dim
        self.hidden_dim = lstm_hidden_dim
        self.lstm = nn.LSTM(input_size=lstm_input_dim, 
                            hidden_size=lstm_hidden_dim,
                            num_layers=1,
                            batch_first=True
                            )
        
        

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.emb.weight.data.uniform_(-initrange, initrange)
        self.classifer.bias.data.zero_()
        self.classifer.weight.data.uniform_(-initrange, initrange)

    #データの流れ
    def forward(self, src: Tensor) -> Tensor:
        '''
        Args:
            src: Tensor, shape [batch_size, seq_len]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [batch_size, nclass:3]
        '''
# src_key_padding_mask = src_mask
        
#        for i in range(1,num_batches+1)
            embedded = self.emb(src) * math.sqrt(self.d_model)
            pos = self.pos_encoder(embedded)
            encoder_out = self.transformer_encoder(pos)
            x = encoder_out.mean(dim=1)
            output = self.classifer(x)
            
        
        #output = self.softmax(output)
        return output
        

In [6]:
# PositionalEncodingの概要
class PositionalEncoding(nn.Module):

    def __init__(self,
                 d_model: int,
                 dropout: float = 0.1,
                 max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        '''
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        '''
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
# paramator for training & evaluation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(ntokens, d_model, nhead, d_hid, nlayers, dropout).to(device)
lr = 1e-3
softmax = nn.Softmax(dim=1)
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
torch.manual_seed(0)

In [8]:
# training
def train(model: nn.Module, train_iter: Tensor):
    train_start_time = time.time()
    model.train()
    num_batches = len(train_iter)
    log_interval = math.ceil(num_batches/30)*10
    batch_counter = 0
    train_loss = 0
    train_correct = 0
    train_count = 0
    
    for idx, batch in enumerate(iter(train_iter)):
        predictions = model(batch.tweet_n.to(device))
        prob = softmax(predictions)
        labels = batch.label.to(device)

        loss = criterion(predictions, labels)
        
        correct = prob.argmax(axis=1) == labels
        acc = correct.sum().item() / correct.size(0)

        train_correct += correct.sum().item()
        train_count += correct.size(0)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        batch_counter += 1
        
        if batch_counter % log_interval == 0 or batch_counter == num_batches:
            lr = scheduler.get_last_lr()[0]
            s_per_batch = (time.time() - train_start_time) / log_interval
            cur_loss = train_loss / log_interval
            cur_acc = train_correct / train_count
            print(f'| epoch {epoch:3d} | {batch_counter:5d}/{num_batches:5d} batches | '
                  f'lr {lr:1.5f} | s/batch {s_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | accuracy {cur_acc:8.2f}')
            total_loss = 0
            train_start_time = time.time()

In [9]:
# evaluation (val, test)
def evaluate(model: nn.Module, eval_iter: Tensor):
    model.eval()
    eval_loss = 0
    eval_correct = 0
    eval_count = 0

    with torch.no_grad():
        for idx, batch in enumerate(iter(eval_iter)):
            predictions = model(batch.tweet_n.to(device))
            prob = softmax(predictions)
            labels = batch.label.to(device)

            loss = criterion(predictions, labels)

            correct = prob.argmax(axis=1) == labels
            acc = correct.sum().item() / correct.size(0)

            eval_correct += correct.sum().item()
            eval_count += correct.size(0)
            eval_loss += loss.item()
        
    print(f'| loss {eval_loss}| accuracy {eval_correct / ecal_count} ')
        
    return eval_loss, eval_correct / eval_count

In [None]:
# main
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']
aculist = {}
for timespan in tlist:
    print(f'{timespan=} データ読み込み中')
    train_iter_list, val_iter_list, test_iter_list = data_process(timespan)
    
    for n in nlist:
        print(f'{n=}')

        best_val_loss = float('inf')
        epochs = 1
        best_model = None

        dt_start = datetime.datetime.now()
        print(datetime.datetime.now())
        print('学習開始')
        print('-' * 95)

        # training & validation roop
        for epoch in range(1, epochs + 1):
            epoch_start_time = time.time()
            train(model, train_iter_list)
            val_loss, val_acc = evaluate(model, val_iter_list)
            elapsed = time.time() - epoch_start_time
            print('-' * 95)
            print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
                  f'valid loss {val_loss:5.2f} | valid accuracy {val_acc:8.2f}')
            print('-' * 95)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = copy.deepcopy(model)

            scheduler.step()

            del epoch_start_time, elapsed, vall_loss, vall_acc
            gc.collect()

        dt_end = datetime.datetime.now()
        print(datetime.datetime.now())    
        print(f'経過時間：{dt_end - dt_start}')
        print('学習終了')
        
        # test
        test_loss, test_acc = evaluate(best_model, test_iter_list)
        print('=' * 89)
        print(f'| End of training | test loss {test_loss:5.2f} | '
              f'test accuracy {test_acc:8.2f}')
        print('=' * 89)
        
        del best_val_loss,epochs,best_model,test_loss,test_acc,dt_start,dt_end
        gc.collect()
        
    del train_iter_list, val_iter_list, test_iter_list
    gc.collect()


In [72]:
gc.collect()

72

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.is_available())
print(device)

True
cuda


In [93]:
# テスト用
max_len = 128
tokenizer = get_tokenizer('basic_english')

# CSVファイルを読み込み、データセットを作成
df = pd.read_csv(f'tweet-transformer/1d/2021-17.csv')
print(f'{len(df)=}')
df = df.dropna(how='any')
df = df.reset_index(drop=True)
print(f'{len(df)=}')
dataset = CreateDataset(df['tweet(n)'],  df['section'],  tokenizer, vocab, max_len)

print('dataset作成完了')
print(f'{len(dataset)=}')

len(df)=13146832
len(df)=13136901
dataset作成完了
len(dataset)=13136901


In [94]:
gc.collect()

81

In [36]:
x=torch.tensor([[1,2,3],
                [2,3,1],
                [3,1,2],
                [1,2,3],
                [2,3,1]],dtype=torch.float)
softmax = nn.Softmax(dim=1)
prob = softmax(x)
labels = torch.tensor([1,1,1,1,1])
correct = prob.argmax(axis=1) == labels
acc = correct.sum().item() / correct.size(0)
print(correct)
print(acc)
print(prob.size())
print(labels.size())

tensor([False,  True, False, False,  True])
0.4
torch.Size([5, 3])
torch.Size([5])


In [100]:
ids_list  = []
mask_list = []
start_time = time.time()
counter = -1

for k in range(0, len(dataset)):
    i = dataset[k]['ids'].squeeze().to(device)
    m = dataset[k]['mask'].squeeze().to(device)
    if counter != dataset[k]['section']:
        ids_list.append([i])
        mask_list.append([m])
        counter += 1
    else:
        ids_list[counter].append(i)
        mask_list[counter].append(m)

    if k%300000==0 and k!=0:
        print(f'現在{k}件終了')
        print(f'経過時間{start_time - time.time()}s')
    if k==600000: break

print(f'{len(ids_list)=}')
print(f'{len(mask_list)=}')

現在300000件終了
経過時間-48.86335802078247s
現在600000件終了
経過時間-98.41774201393127s
len(ids_list)=11
len(mask_list)=11


In [101]:
dataset_list=[]
for i in range(0, len(ids_list)):
    x = CreateDataset2(ids_list[i], mask_list[i], device)
    dataset_list.append(x)
dataloader_train = DataLoader(dataset_list[0], batch_size=1024, shuffle=True)

In [102]:
for data in dataloader_train:
    print(data['ids'])
    print(data['ids'].size())
    print(data['mask'].size())
    print('\n')
    break

tensor([[  1169,     16,    247,  ...,      0,      0,      0],
        [120121,    520,    137,  ...,      0,      0,      0],
        [   213,   1556,   3311,  ...,      0,      0,      0],
        ...,
        [  4773,   1329,     18,  ...,      0,      0,      0],
        [     5,   3534,     74,  ...,      0,      0,      0],
        [   107,     40,    827,  ...,      0,      0,      0]],
       device='cuda:0')
torch.Size([1024, 128])
torch.Size([1024, 128])




In [97]:
print(dataset[0]['mask'])
print(dataset[1]['ids'].size())

tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  