In [1]:
import math
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torchtext import data
from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from tqdm import tqdm
import pandas as pd
import itertools
import datetime
import shutil
import pickle
import random
import time
import copy
import sys
import gc
import os

In [2]:
# Write an object to a pickle file (with progress logging)
def write_pickle(filepath, data):
    """Pickle ``data`` to ``filepath`` and print how long the write took.

    Args:
        filepath: Destination path; opened in binary write mode.
        data: Any picklable object.
    """
    began = time.time()
    print(f'writing pickle to "{filepath}" ...')

    with open(filepath, 'wb') as handle:
        pickle.dump(data, handle)

    elapsed = time.time() - began
    print(f'end of writeing {elapsed:6.2f} s')

    # A garbage-collection pass is run after every write.
    gc.collect()

In [3]:
# Write an object to a pickle file
# (silent version: no log output)
def write_pickle_quickly(filepath, data):
    """Pickle ``data`` to ``filepath`` without printing anything."""
    with open(filepath, 'wb') as sink:
        pickle.dump(data, sink)

In [4]:
# Read an object from a pickle file (with progress logging)
def read_pickle(filepath):
    """Load and return the object pickled at ``filepath``, printing the elapsed time.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    began = time.time()
    print(f'reading pickle from "{filepath}" ...')

    with open(filepath, 'rb') as handle:
        loaded = pickle.load(handle)

    print(f'end of reading {time.time()-began:6.2f} s')

    # A garbage-collection pass is run after every read.
    gc.collect()

    return loaded

In [5]:
# Read an object from a pickle file
# (silent version: no log output)
def read_pickle_quickly(filepath):
    """Load and return the object pickled at ``filepath`` without printing anything.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filepath, 'rb') as source:
        return pickle.load(source)

In [6]:
# Load the vocabulary from its pickle.
# (The disabled code kept in the string literal below shows how it was built:
#  the tweet text is tokenized into words and a vocab is built with min_freq=3.)

# v_start is only used to time the load and is deleted afterwards.
v_start = time.time()
print("Reading...")
vocab = read_pickle('../../external_drive/pickle/vocab.pickle')
print('Finish!!')
print(f'{time.time() - v_start:5.2f} s')
del v_start
gc.collect()

'''
v_start = time.time()
tokenizer = get_tokenizer('basic_english')

# data field定義
TEXT  = data.Field(sequential=True,
                     lower=True,
                     batch_first=True, 
                     tokenize=tokenizer,
                     init_token='<cls>')

# CSVファイルを読み込み、TabularDatasetオブジェクトの作成
print("Reading...")
vocab_data = data.TabularDataset(path ='tweet-transformer/1d/2021-17_t.csv',
                                       format='csv',
                                       skip_header = True,
                                       fields=[('tweet', TEXT)])

# 単語辞書の作成
print("Creating vocab...")
TEXT.build_vocab(vocab_data, min_freq=3)
vocab = TEXT.vocab
print(f'{len(vocab)=}')

print('Finish!!')
print(f'{time.time() - v_start:5.2f} s')

# メモリ開放
del v_start, vocab_data, tokenizer, TEXT
gc.collect()
'''

Reading...
reading pickle from "../../external_drive/pickle/vocab.pickle" ...
end of reading   1.90 s
Finish!!
 1.98 s


'\nv_start = time.time()\ntokenizer = get_tokenizer(\'basic_english\')\n\n# data field定義\nTEXT  = data.Field(sequential=True,\n                     lower=True,\n                     batch_first=True, \n                     tokenize=tokenizer,\n                     init_token=\'<cls>\')\n\n# CSVファイルを読み込み、TabularDatasetオブジェクトの作成\nprint("Reading...")\nvocab_data = data.TabularDataset(path =\'tweet-transformer/1d/2021-17_t.csv\',\n                                       format=\'csv\',\n                                       skip_header = True,\n                                       fields=[(\'tweet\', TEXT)])\n\n# 単語辞書の作成\nprint("Creating vocab...")\nTEXT.build_vocab(vocab_data, min_freq=3)\nvocab = TEXT.vocab\nprint(f\'{len(vocab)=}\')\n\nprint(\'Finish!!\')\nprint(f\'{time.time() - v_start:5.2f} s\')\n\n# メモリ開放\ndel v_start, vocab_data, tokenizer, TEXT\ngc.collect()\n'

In [7]:
# Definition of Dataset1
# args  : tdf['ids'], tdf['mask']
# return: dataset{ids, mask}

class CreateDataset1(Dataset):
    """Two index-aligned sequences exposed as ``{'ids', 'mask'}`` samples."""

    def __init__(self, x, y):
        self.x, self.y = x, y  # tdf['ids'], tdf['mask']

    # Value returned by len(dataset)
    def __len__(self):
        """Number of samples, taken from the ``ids`` sequence."""
        return len(self.x)

    # Value returned by dataset[index]
    def __getitem__(self, index):
        """Return the ``index``-th ids/mask pair as a dict."""
        return dict(ids=self.x[index], mask=self.y[index])

In [8]:
# Definition of Dataset3
# args  : tensor of section, tensor of price, tensor of trend(n+1)
# return: dataset3{section, src, target}
class CreateDataset3(Dataset):
    """Three index-aligned sequences exposed as ``{'section', 'src', 'target'}`` samples."""

    def __init__(self, x, y, z):
        # Attribute names are kept as x / y / z; pickled instances depend on them.
        self.x, self.y, self.z = x, y, z  # section, price, trend(n+1)

    # Value returned by len(dataset)
    def __len__(self):
        """Number of samples, taken from the section sequence."""
        return len(self.x)

    # Value returned by dataset[index]
    def __getitem__(self, index):
        """Return the ``index``-th section/src/target triple as a dict."""
        return dict(section=self.x[index],
                    src=self.y[index],
                    target=self.z[index])

In [9]:
# Helpers used by tensor_df()
max_len = 128  # target row width used by text_to_ids() when padding
tokenizer = get_tokenizer('basic_english')
def tokenize(text):
    """Split ``text`` into tokens with the module-level ``tokenizer``."""
    return tokenizer(text)

def text_to_ids(tokenized_text):
    """Map tokens to vocabulary ids and pad the row on the right with zeros.

    Reads the module-level ``vocab`` and ``max_len``.
    Returns a ``torch.long`` tensor intended to have shape [1, max_len].
    """
    # NOTE(review): 0 is used as the padding id here and in ids_to_mask().
    # Confirm that index 0 of `vocab` really is the pad token; a torchtext
    # legacy Field normally puts <unk> at 0 and <pad> at 1.
    token_ids = [vocab[word] for word in tokenized_text]
    row = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0) # [1,seq_len]
    pad_width = max_len - len(tokenized_text)
    return F.pad(row, (0, pad_width), "constant", 0) # [1,max_len]

def ids_to_mask(ids):
    """Return a boolean mask that is True wherever ``ids`` equals 0."""
    return ids == 0

In [10]:
# return df[section, ids, mask]
def tensor_df(df):
    """Tokenize the ``tweet(n)`` column and derive ``ids`` / ``mask`` tensor columns.

    The passed frame is modified in place: ``tweet(n)`` is overwritten with its
    tokens and the ``ids`` / ``mask`` columns are added. The returned frame is
    that data without the ``tweet(n)`` column.
    """
    began = time.time()

    print('1.text to ids...')
    tokens = df['tweet(n)'].apply(tokenize)
    df['tweet(n)'] = tokens
    df['ids'] = tokens.apply(text_to_ids)
    print('2.ids to mask...')
    df['mask'] = df['ids'].apply(ids_to_mask)
    without_text = df.drop(columns=['tweet(n)'])

    print(f'end of df to tensor {time.time()-began:6.2f} s')

    return without_text

In [11]:
# return : df[section, ids(n), mask(n)]
# Concatenate the per-tweet tensors of each section into one tensor per section
train_num_section_list = {'1d':181,'12h':363,'4h':1090,'1h':4361,'30m':8722,'15m':17444,'5m':52333}
test_num_section_list  = {'1d':31, '12h':61, '4h':182, '1h':727, '30m':1454,'15m':2908, '5m':8723}

def separate_section(df, num_section=None):
    """Merge the rows of each section into a single ``ids`` and ``mask`` tensor.

    Args:
        df: Frame with a ``section`` column and tensor-valued ``ids`` / ``mask``
            columns (one tensor per row).
        num_section: Number of sections to emit (sections ``0 .. num_section-1``).
            When omitted, it is looked up as train + test section counts for the
            module-level ``timespan`` variable, which is the original behaviour.

    Returns:
        A frame with columns ``section``, ``ids``, ``mask`` and one row per
        section; each tensor is the ``dim=0`` concatenation of that section's rows.

    Raises:
        ValueError: if a section in the requested range has no rows.
    """
    if num_section is None:
        # Relies on a module-level `timespan` being set by the caller's cell.
        num_section = train_num_section_list[timespan] + test_num_section_list[timespan]

    # Group once instead of re-scanning the whole frame twice per section.
    # Row order inside each group is the original row order.
    rows_by_section = {key: grp for key, grp in df.groupby('section', sort=False)}

    records = []
    for i in range(num_section):
        grp = rows_by_section.get(i)
        if grp is None:
            raise ValueError(f'section {i} has no rows; cannot build its ids/mask tensors')
        ids = torch.cat(grp['ids'].tolist(), dim=0)
        mask = torch.cat(grp['mask'].tolist(), dim=0)
        records.append((i, ids, mask))

    return pd.DataFrame(records, columns = ['section','ids','mask'])

In [12]:
# Build the dataset (tweets)
# 1. df to tensor
# 2. separate_section
def data_process1(timespan):
    """Build the per-section tweet tensors for ``timespan`` and split them.

    Reads ``tweet-transformer/{timespan}/2021-17_t.csv``, drops rows with any
    missing value, converts the tweets to id/mask tensors, merges them per
    section, and splits the result without shuffling (last 1/7 is the test part).

    Returns:
        (train_tdf, test_tdf): the two frames produced by ``train_test_split``.
        Their indexes are not reset by this function.
    """
    print('-'*5 + 'Create dataset_tlist start!!' + '-'*5)

    print('Reading...')
    # The section CSV (2021-17_s.csv) was loaded here but never used; that
    # read has been removed.
    df = pd.read_csv(f'tweet-transformer/{timespan}/2021-17_t.csv')
    df = df.dropna(how='any').reset_index(drop=True)

    print('df to tensor...')
    df = tensor_df(df)

    print('Separating Section...')
    # NOTE(review): separate_section() reads the module-level `timespan`, not
    # this function's argument. It only agrees when the caller's loop variable
    # is also named `timespan`.
    df = separate_section(df)

    train_tdf, test_tdf = train_test_split(df, test_size = 1/7, shuffle=False)

    print('Finish!!')
    print(f'{len(train_tdf)=}')
    print(f'{len(test_tdf)=}')

    return train_tdf, test_tdf

In [13]:
# Build the dataset (prices)
# 1. csv -> 3 tensors
# 2. CreateDataset3
def _frame_to_dataset3(frame):
    """Wrap a merged price frame as CreateDataset3(section, price, target)."""
    section = torch.tensor(frame['section'].values)
    target  = torch.tensor(frame['trend(n+1)'].values)
    # Every column except the target and the section id is a price feature.
    price   = torch.tensor(frame.drop(columns=['trend(n+1)','section']).values)
    return CreateDataset3(section, price, target)


def data_process2(timespan,n):
    """Build the price datasets for ``timespan`` using ``n`` steps of history.

    Reads ``2021-17_b.csv`` (uses its ``section``, ``trend(n)`` and
    ``open_price(n)`` columns) and ``2021-17_s.csv`` (merged on ``section``)
    from ``tweet-transformer/{timespan}/``.

    Feature columns are created in this order, which create_tensor() slices in
    pairs: ``trend(n)``, ``end_price(n)``, then ``trend(n-i)``, ``end_price(n-i)``
    for ``i = 1 .. n-1``. The target is ``trend(n+1)``.

    Sections left with any missing value after the merge get ``section = -1``
    and ``-1`` in every missing cell, so they can be skipped later.

    Returns:
        (train_plist, test_plist): CreateDataset3 objects built from an
        unshuffled split (last 1/7 is the test part).
    """
    print('-'*5 + 'Create dataset_plist start!!' + '-'*5)

    print('Reading...')
    df  = pd.read_csv(f'tweet-transformer/{timespan}/2021-17_b.csv')
    dfs = pd.read_csv(f'tweet-transformer/{timespan}/2021-17_s.csv')

    # Explanatory variables and target variable
    df['trend(n+1)'] = df['trend(n)'].shift(-1)
    df['end_price(n)'] = df['open_price(n)'].shift(-1)
    # range(1, n) is empty for n < 2, so no separate guard is needed.
    for i in range(1, n):
        df[f'trend(n-{i})'] = df['trend(n)'].shift(i)
        df[f'end_price(n-{i})'] = df['end_price(n)'].shift(i)
    df = df.drop(columns=['open_price(n)'])
    df = df.dropna(how='any').reset_index(drop=True)

    # Merge and mark rows that contain missing values
    df = pd.merge(dfs, df, on="section", how = 'left').reset_index(drop=True)
    has_missing = df.isnull().any(axis=1)
    # One .loc assignment replaces the row-by-row chained assignment
    # df['section'][i] = -1, which pandas warns about and which has no effect
    # under copy-on-write.
    df.loc[has_missing, 'section'] = -1
    df = df.fillna(-1)
    print(f'{len(df)=}')

    train_df, test_df = train_test_split(df, test_size = 1/7, shuffle=False)

    # Build the three tensors for each split
    print('Creating Dataset3...')
    train_plist = _frame_to_dataset3(train_df)
    test_plist = _frame_to_dataset3(test_df)


    print('Finish!!')
    print(f'{len(train_plist)=}')
    print(f'{len(test_plist)=}')

    del df,dfs,train_df,test_df
    gc.collect()

    return train_plist, test_plist

In [14]:
# データセット作成 & csv, pickleに保存
# ツイート
'''
tlist = ['1d','12h','4h','1h','30m','15m','5m']
train_section_list     = {'1d':181,'12h':363,'4h':1090,'1h':4361,'30m':8722,'15m':17444,'5m':52333}
test_section_list      = {'1d':31, '12h':61, '4h':182, '1h':727, '30m':1454,'15m':2908, '5m':8723}

for timespan in tlist:
    print('-'*50 + f'{timespan=}' + '-'*50)
    train_tdf, test_tdf = data_process1(timespan)
    print(f'{sys.getsizeof(train_tdf)=}')
    print(f'{sys.getsizeof(test_tdf)=}')
    train_tdf.to_csv(f'../../external_drive/pandas/{timespan}/train_tdf.csv', index=False)
    test_tdf.to_csv(f'../../external_drive/pandas/{timespan}/test_tdf.csv', index=False) 
    write_pickle(f'../../external_drive/pickle/{timespan}/train_tdf.pickle',train_tdf)
    write_pickle(f'../../external_drive/pickle/{timespan}/test_tdf.pickle',test_tdf)
'''

"\ntlist = ['1d','12h','4h','1h','30m','15m','5m']\ntrain_section_list     = {'1d':181,'12h':363,'4h':1090,'1h':4361,'30m':8722,'15m':17444,'5m':52333}\ntest_section_list      = {'1d':31, '12h':61, '4h':182, '1h':727, '30m':1454,'15m':2908, '5m':8723}\n\nfor timespan in tlist:\n    print('-'*50 + f'{timespan=}' + '-'*50)\n    train_tdf, test_tdf = data_process1(timespan)\n    print(f'{sys.getsizeof(train_tdf)=}')\n    print(f'{sys.getsizeof(test_tdf)=}')\n    train_tdf.to_csv(f'../../external_drive/pandas/{timespan}/train_tdf.csv', index=False)\n    test_tdf.to_csv(f'../../external_drive/pandas/{timespan}/test_tdf.csv', index=False) \n    write_pickle(f'../../external_drive/pickle/{timespan}/train_tdf.pickle',train_tdf)\n    write_pickle(f'../../external_drive/pickle/{timespan}/test_tdf.pickle',test_tdf)\n"

In [15]:
# データセット作成 & pickleに保存
# 価格
'''
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']

for timespan, n in itertools.product(tlist, nlist):   

    print('-'*50 + f'{timespan=} / {n=}' + '-'*50)
    train_plist,test_plist = data_process2(timespan,n)
    write_pickle(f'../../external_drive/pickle/{timespan}/train_plist_{n}.pickle',train_plist)
    write_pickle(f'../../external_drive/pickle/{timespan}/test_plist_{n}.pickle',test_plist)
    
    del train_plist, test_plist
    gc.collect()
'''

"\nnlist = [1,2,3,4,5,6,7,8,9,10]\ntlist = ['1d','12h','4h','1h','30m','15m','5m']\n\nfor timespan, n in itertools.product(tlist, nlist):   \n\n    print('-'*50 + f'{timespan=} / {n=}' + '-'*50)\n    train_plist,test_plist = data_process2(timespan,n)\n    write_pickle(f'../../external_drive/pickle/{timespan}/train_plist_{n}.pickle',train_plist)\n    write_pickle(f'../../external_drive/pickle/{timespan}/test_plist_{n}.pickle',test_plist)\n    \n    del train_plist, test_plist\n    gc.collect()\n"

In [9]:
# parameters for the networks
ntokens = len(vocab)  # size of vocabulary
d_model = 512   # embedding dimension
nhead   = 8     # number of heads in nn.MultiheadAttention
d_hid   = 2048  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6     # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.2   # dropout probability
lstm_input_dim  = 5   # features per step: create_tensor() builds rows of width 5 (2 price + 3 probability values)
lstm_hidden_dim = 16  # hidden size passed to nn.LSTM

In [10]:
# Transformer part of the Transformer-LSTM model: 3-class text classifier
class Transformer(nn.Module):
    """Embedding -> positional encoding -> TransformerEncoder -> mean over dim 1 -> 3-way linear head."""

    def __init__(self,
                 ntoken: int,
                 d_model: int,
                 nhead: int,
                 d_hid: int,
                 nlayers: int,
                 dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model

        # Sub-modules are created in the same order as before so that a seeded
        # run draws the same initial weights.
        self.embedding = nn.Embedding(ntoken, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        # NOTE(review): the encoder layer is built without batch_first=True, so
        # nn.TransformerEncoder treats dim 0 of its input as the sequence axis.
        # The shape comments in forward() describe a batch-first tensor, and
        # text_classifer() passes the mask transposed to [seq_len, batch_size].
        # Confirm this layout is intended.
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead, d_hid, dropout),
            nlayers)
        self.dense = nn.Linear(d_model, 3)

        self.init_weights()

    def init_weights(self):
        """Uniform(-0.1, 0.1) for the embedding and head weights; zero head bias."""
        bound = 0.1
        self.embedding.weight.data.uniform_(-bound, bound)
        self.dense.bias.data.zero_()
        self.dense.weight.data.uniform_(-bound, bound)

    # Data flow
    def forward(self, ids, mask):
        """Return unnormalised 3-class scores for each row of ``ids``.

        ``mask`` is forwarded unchanged as ``src_key_padding_mask``.
        """
        scale = math.sqrt(self.d_model)
        embedded = self.embedding(ids) * scale # [batch_size, seq_len, d_model]
        embedded = self.pos_encoder(embedded) # [batch_size, seq_len, d_model]
        encoded = self.transformer_encoder(src=embedded, src_key_padding_mask=mask) # [batch_size, seq_len, d_model]
        # NaN is the only value that differs from itself; replace it with 0.
        encoded = encoded.masked_fill(encoded != encoded, 0)
        pooled = encoded.mean(dim=1) # [batch_size, d_model]
        return self.dense(pooled) # [batch_size, 3]

In [11]:
# LSTM part of the Transformer-LSTM model: 3-class classifier over a feature sequence
class LSTM(nn.Module):
    """Single-layer batch-first LSTM followed by a 3-way linear head."""

    def __init__(self,
                 lstm_input_dim: int,
                 lstm_hidden_dim: int):
        super().__init__()
        self.input_dim = lstm_input_dim
        self.hidden_dim = lstm_hidden_dim
        self.lstm = nn.LSTM(input_size=lstm_input_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=1,
                            batch_first=True)
        self.dense = nn.Linear(lstm_hidden_dim, 3)

        self.init_weights()

    def init_weights(self):
        """Zero the head bias and draw the head weights from Uniform(-0.1, 0.1)."""
        bound = 0.1
        self.dense.bias.data.zero_()
        self.dense.weight.data.uniform_(-bound, bound)

    # Data flow
    def forward(self, src):
        """Classify each sequence in ``src`` from the LSTM's final hidden state.

        ``src`` is [batch, steps, lstm_input_dim]; the result is [batch, 3].
        """
        # 3-class classification from text + price features
        _, (h_n, _c_n) = self.lstm(src)
        final_hidden = h_n.view(src.size(0), -1)
        return self.dense(final_hidden)

In [12]:
# Sinusoidal positional encoding for batch-first inputs
class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine position encodings to a batch-first tensor, then apply dropout."""

    def __init__(self,
                 d_model: int,
                 dropout: float = 0.1,
                 max_len: int = 5000):
        """
        Args:
            d_model: embedding dimension of the inputs.
            dropout: dropout probability applied after adding the encoding.
            max_len: number of positions in the precomputed table.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # The buffer keeps its [max_len, 1, d_model] shape so existing
        # state_dicts remain loadable.
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 0, 1::2] = torch.cos(position * div_term)[:, :d_model // 2]
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        '''
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        '''
        # Index the table by sequence position (dim 1). It was previously
        # sliced by x.size(0), which encoded the batch index instead.
        x = x + self.pe[:x.size(1)].transpose(0, 1)  # [1, seq_len, d_model] broadcast over the batch
        return self.dropout(x)

In [13]:
# parameters for training & evaluation
# These module-level objects are read by text_classifer(), train() and evaluate().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
softmax = nn.Softmax(dim=1)
criterion = nn.CrossEntropyLoss()
torch.manual_seed(0)

<torch._C.Generator at 0x7f3e60064fd0>

In [14]:
# 3-class text classification with the Transformer
tbatch_size_list = {'4h':512, '30m':512, '5m':128}

def text_classifer(t_start,t_end, tdf):
    """Run model1 on a sample of tweet batches for sections ``t_start .. t_end-1``.

    For every section, the tweets are batched (shuffled) and every ``r``-th
    batch is classified. Its softmax output is written to
    ``../../external_drive/pickle/temp/{section}_{batch_index}.pickle`` for
    create_tensor() to read back.

    Reads the module-level ``timespan``, ``model1``, ``softmax`` and ``device``.

    Returns:
        A list with, for each section in order, the number of batches that were
        classified and written.
    """
    tbatch_size = tbatch_size_list[timespan]

    # NOTE(review): the original test was `timespan == '5m' or '30m'`, which is
    # always true, so the stride has always been 20 (the `else: r = 50` branch
    # was unreachable). It is kept at 20 because create_tensor() rebuilds the
    # file names from the same stride; change both functions together.
    r = 20

    idx_counter_list = []
    for j in range(t_start, t_end):
        # Positional lookup: the test split produced by train_test_split keeps
        # its original index labels, so tdf['ids'][j] would look up the label j
        # rather than the j-th row.
        tlist = CreateDataset1(tdf['ids'].iloc[j], tdf['mask'].iloc[j])
        tbatches = DataLoader(tlist, batch_size=tbatch_size, shuffle=True)
        idx_counter = 0

        for idx, batch in enumerate(tbatches):
            if idx % r != 0:
                continue
            idx_counter += 1
            ids  = batch['ids'].to(device)  # [batch_size, seq_len]
            mask = torch.t(batch['mask']).to(device) # [seq_len, batch_size]
            # The result is pickled and reloaded later, so no autograd graph
            # survives; skip building one.
            # NOTE(review): as a consequence model1 does not appear to receive
            # any gradient from train(); confirm whether it is meant to be trained.
            with torch.no_grad():
                text_class = model1(ids, mask)  # [batch_size, 3]
                prob = softmax(text_class) # [batch_size, 3]
            write_pickle_quickly(f'../../external_drive/pickle/temp/{j}_{idx}.pickle',prob.to('cpu'))
            del ids, mask, text_class, prob
            torch.cuda.empty_cache()
            gc.collect()

        idx_counter_list.append(idx_counter)

    return idx_counter_list # list of int: batches written per section

In [15]:
# Combine the price tensors in plist[j] with the per-section probability tensors
def create_tensor(t_start,t_end,idx_counter_list, plist):
    """Build the LSTM input and target tensors for sections ``t_start .. t_end-1``.

    Reads back (and deletes) the probability pickles written by
    text_classifer(), sums them into one [3] vector per section, and pairs the
    last ``n`` of those vectors with the 2n price features of each section.
    Sections whose ``section`` value is -1 are skipped. Reads the module-level ``n``.

    Returns:
        src: float tensor [num_samples, n, 5]
        target: long tensor [num_samples]
    """
    # NOTE(review): the original test was `timespan == '5m' or '30m'`, which is
    # always true, so the stride has always been 20. It must match the stride
    # text_classifer() used when naming the files.
    r = 20

    prob_section = []
    for offset, j in enumerate(range(t_start, t_end)):
        p_tbatch = []
        for idx in range(idx_counter_list[offset]):
            path = f'../../external_drive/pickle/temp/{j}_{idx*r}.pickle'
            p_tbatch.append(read_pickle_quickly(path)) # [batch_size, 3]
            os.remove(path)
        stacked = torch.cat(p_tbatch, dim=0) # [batch_size*num_tbatches, 3]
        prob_section.append(stacked.sum(dim=0)) # [3]

    src    = []
    target = []

    first = t_start + n - 1  # first section that has n sections of history in this window
    for j in range(first, t_end):
        sample = plist[j]
        if sample['section'] == -1:
            continue
        x = sample['src'] #[2n]
        w = j - first
        y = torch.cat(prob_section[w:w+n], dim=-1) # [3n], oldest section first
        # Step k takes the k-th price pair and the k-th probability triple
        # counted from the END of y.
        rows = [torch.cat((x[k*2:(k+1)*2], y[3*(n-k-1):3*(n-k)]), dim=-1).unsqueeze(0) # [1,5]
                for k in range(n)]
        src.append(torch.cat(rows, dim=0).unsqueeze(0)) # [1,n,5]
        target.append(sample['target'].unsqueeze(0))
    src = torch.cat(src, dim=0).to(torch.float) # [batch_size, n, 5]
    target = torch.cat(target, dim=-1).to(torch.long) # [batch_size]

    return src,target

In [16]:
# training
def train(model1,model2, timespan, n,train_tdf, train_plist):
    """Run one training epoch over ``train_plist`` in windows of ``batch_size`` sections.

    Reads the module-level ``train_num_batches``, ``batch_size``, ``optimizer1``,
    ``optimizer2``, ``scheduler2``, ``epoch``, ``device``, ``softmax`` and
    ``criterion``.

    NOTE(review): text_classifer() and create_tensor() read the module-level
    ``timespan`` / ``n`` / ``model1`` rather than the arguments passed here.

    Each window starts ``n-1`` sections early (clamped at 0) so the first sample
    of the window has ``n`` sections of history. The last window runs to the end
    of ``train_plist``.
    """
    model1.train()
    model2.train()

    log_interval = math.ceil(train_num_batches/30)*10
    last_batch = train_num_batches - 1
    batch_counter = 0
    interval_loss = 0.0     # loss summed since the last log line
    interval_batches = 0    # batches processed since the last log line
    train_correct = 0
    train_count = 0

    batch_start_time = time.time()

    for i in range(0, train_num_batches):
        step_start = time.time()
        t_start = max(i*batch_size-n+1, 0)
        t_end   = len(train_plist) if i == last_batch else i*batch_size+batch_size

        idx_counter_list = text_classifer(t_start, t_end, train_tdf)
        src, target = create_tensor(t_start, t_end, idx_counter_list, train_plist)
        predictions  = model2(src.to(device))
        prob = softmax(predictions)
        targets = target.to(device)
        loss = criterion(predictions, targets)

        correct = prob.argmax(axis=1) == targets

        train_correct += correct.sum().item()
        train_count += correct.size(0)
        interval_loss += loss.item()
        interval_batches += 1

        optimizer1.zero_grad()
        optimizer2.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model1.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(model2.parameters(), 0.5)
        optimizer1.step()
        optimizer2.step()

        batch_counter += 1

        print(f'{i=} / {time.time()-step_start}')

        if batch_counter % log_interval == 0 or batch_counter == train_num_batches:
            lr = scheduler2.get_last_lr()[0]
            # Average over the batches actually seen since the last log line.
            # The accumulator used to be divided by log_interval and was never
            # reset, so later lines reported an ever-growing loss.
            s_per_batch = (time.time() - batch_start_time) / interval_batches
            cur_loss = interval_loss / interval_batches
            cur_acc = train_correct / train_count  # running accuracy since the start of the epoch
            print(f'| epoch {epoch:3d} | {batch_counter:5d}/{train_num_batches:5d} batches | '
                  f'lr {lr:1.5f} | s/batch {s_per_batch:5.2f} | '
                  f'loss {cur_loss:5.3f} | accuracy {cur_acc:8.3f}')
            interval_loss = 0.0
            interval_batches = 0
            batch_start_time = time.time()

In [17]:
# evaluation (val, test)
def evaluate(model1, model2, timespan, n, test_tdf, test_plist):
    """Evaluate both models on ``test_plist`` and return metrics for the whole set.

    Reads the module-level ``test_num_batches``, ``batch_size``, ``device``,
    ``softmax`` and ``criterion``.

    NOTE(review): text_classifer() and create_tensor() read the module-level
    ``timespan`` / ``n`` / ``model1`` rather than the arguments passed here.

    Returns:
        eval_loss: sum of the per-window losses (not averaged).
        eval_acc: accuracy over all evaluated samples.
        eval_cm: confusion matrix (rows = true class, columns = predicted class).
        eval_cr: classification report as a dict.
    """
    model1.eval()
    model2.eval()

    eval_loss = 0
    last_batch = test_num_batches - 1
    predicted_chunks = []
    target_chunks = []

    with torch.no_grad():
        for i in range(0, test_num_batches):
            # Same windowing as train(): start n-1 sections early, clamped at 0.
            t_start = max(i*batch_size-n+1, 0)
            t_end   = len(test_plist) if i == last_batch else i*batch_size+batch_size

            idx_counter_list = text_classifer(t_start, t_end, test_tdf)
            src, target = create_tensor(t_start, t_end, idx_counter_list, test_plist)
            predictions  = model2(src.to(device))
            prob = softmax(predictions)
            targets = target.to(device)
            loss = criterion(predictions, targets)

            eval_loss += loss.item()
            # Collect on the CPU: scikit-learn cannot consume CUDA tensors.
            predicted_chunks.append(prob.argmax(axis=1).cpu())
            target_chunks.append(targets.cpu())

    # Metrics are computed once over every window. They used to be recomputed
    # inside the loop, so only the last window was reported.
    y_pred = torch.cat(predicted_chunks).numpy()
    y_true = torch.cat(target_chunks).numpy()
    # Fixed label set so the three target names always line up, even when a
    # class is absent from the data.
    labels = [0, 1, 2]
    # scikit-learn's argument order is (y_true, y_pred).
    eval_acc = accuracy_score(y_true, y_pred)
    eval_cm  = confusion_matrix(y_true, y_pred, labels=labels)
    eval_cr  = classification_report(y_true, y_pred, labels=labels,
                                     target_names=['down','stay','up'], output_dict=True)

    print(f'| loss {eval_loss:5.3f}| accuracy {eval_acc:8.3f} ')

    return eval_loss, eval_acc, eval_cm, eval_cr

In [None]:
# main
nlist = [2,3,4]
tlist = ['4h','30m','5m']
batch_size_list        = {'4h':64, '30m':512,  '5m':1024}
train_num_section_list = {'4h':1090,'30m':8722,'5m':52333}
test_num_section_list  = {'4h':182, '30m':1454,'5m':8723}

#batch_size_list = {'4h':8, '30m':8,  '5m':8}
#nlist = [2]
#tlist = ['4h']


def _append_log_row(csv_path, values, columns):
    """Append one row to the CSV log at ``csv_path`` (read, concatenate, rewrite)."""
    df_log = pd.read_csv(csv_path)
    row = pd.Series(values, index=columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
    df_log = pd.concat([df_log, row.to_frame().T], ignore_index=True)
    df_log.to_csv(csv_path, index=False)


# The loop variables and the names assigned below (n, timespan, batch_size,
# train_num_batches, test_num_batches, model1, optimizer1/2, scheduler2, epoch)
# are read as module-level globals by train(), evaluate(), text_classifer()
# and create_tensor().
for n, timespan in itertools.product(nlist,tlist):

    print('-'*50 + f'{timespan=} / {n=}' + '-'*50)
    train_num_section = train_num_section_list[timespan]
    test_num_section  = test_num_section_list[timespan]
    batch_size=batch_size_list[timespan]
    # Ceiling division: `// batch_size + 1` adds an empty extra batch whenever
    # the section count is an exact multiple of batch_size.
    train_num_batches = math.ceil(train_num_section / batch_size)
    test_num_batches  = math.ceil(test_num_section / batch_size)

    lr = 1e-3
    model1 = Transformer(ntokens, d_model, nhead, d_hid, nlayers, dropout).to(device)
    optimizer1 = torch.optim.Adam(model1.parameters(), lr=lr)
    scheduler1 = torch.optim.lr_scheduler.StepLR(optimizer1, 1.0, gamma=0.95)
    model2 = LSTM(lstm_input_dim, lstm_hidden_dim).to(device)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=lr)
    scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer2, 1.0, gamma=0.95)
    best_val_loss = float('inf')
    epochs = 50
    # Reset per configuration so a run never reuses the best models of the
    # previous (n, timespan) combination.
    best_model1 = None
    best_model2 = None

    train_tdf   = read_pickle_quickly(f'../../external_drive/pickle/{timespan}/train_tdf.pickle')
    test_tdf    = read_pickle_quickly(f'../../external_drive/pickle/{timespan}/test_tdf.pickle')
    train_plist = read_pickle_quickly(f'../../external_drive/pickle/{timespan}/train_plist_{n}.pickle')
    test_plist  = read_pickle_quickly(f'../../external_drive/pickle/{timespan}/test_plist_{n}.pickle')

    dt_start = datetime.datetime.now()
    print(datetime.datetime.now())
    print('*'*45 + 'training start' + '*'*45)

    # training & validation loop
    # NOTE(review): the same test split is used for per-epoch model selection
    # and for the final test score below; confirm that this is intended.
    for epoch in range(1, epochs + 1):
        epoch_start = time.time()

        train(model1, model2, timespan, n, train_tdf, train_plist)
        val_loss, val_acc, _, _ = evaluate(model1, model2, timespan, n, test_tdf, test_plist)

        print('-' * 95)
        print(f'| end of epoch {epoch:3d} | time: {time.time()-epoch_start:5.2f}s | '
              f'val loss：{val_loss:5.3f} | val accuracy：{val_acc:8.3f}')
        print('-' * 95)

        # Save the per-epoch result to CSV
        # (fixes: `vall_acc` was an undefined name, and a missing comma fused
        #  'range_of_data' and 'epoch' into one index label).
        _append_log_row('tweet-transformer/loss_log.csv',
                        [timespan,
                         n,
                         epoch,
                         val_loss,
                         val_acc],
                        ['timespan',
                         'range_of_data',
                         'epoch',
                         'val_loss',
                         'val_accuracy'])

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model1 = copy.deepcopy(model1)
            best_model2 = copy.deepcopy(model2)

        scheduler1.step()
        scheduler2.step()

    dt_end = datetime.datetime.now()
    elapsed = dt_end - dt_start
    print(datetime.datetime.now())
    # A timedelta does not support the ':8.2f' format spec; format its seconds instead.
    print('*'*30 + f'Finish! training time：{elapsed.total_seconds():8.2f}s' + '*'*30)

    # If the validation loss never improved (e.g. it was NaN every epoch),
    # fall back to the final weights.
    if best_model1 is None:
        best_model1, best_model2 = model1, model2

    # test
    test_loss, test_acc, test_cm, test_cr = evaluate(best_model1, best_model2, timespan, n, test_tdf, test_plist)
    print('=' * 89)
    print(f'| End of training | test loss：{test_loss:5.3f} | '
          f'test accuracy：{test_acc:8.3f}')
    print('=' * 89)

    # Save the final result to CSV
    _append_log_row('tweet-transformer/transformer_log.csv',
                    [dt_start,
                     dt_end,
                     timespan,
                     n,
                     test_loss,
                     test_acc,
                     test_cm,
                     test_cr],
                    ['start_time',
                     'end_time',
                     'timespan',
                     'range_of_data',
                     'test_loss',
                     'test_accuracy',
                     'confusion_matrix',
                     'score_report'])

In [None]:
# Scratch cell: show the selected device (commented lines are manual overrides).
#device = 'cpu'
#device = torch.device("cuda:1")
print(device)
#torch.cuda.is_initialized()
#torch.cuda.ipc_collect()
#torch.cuda.empty_cache()
#gc.collect()
#print(f'{sys.getsizeof(l)=}')

#batch_size_list        = {'1d':16, '12h':32, '4h':64,  '1h':16, '30m':512, '15m':1024, '5m':1024}
#train_num_section_list = {'1d':181,'12h':363,'4h':1090,'1h':4361,'30m':8722,'15m':17444,'5m':52333}
#test_num_section_list  = {'1d':31, '12h':61, '4h':182, '1h':727, '30m':1454,'15m':2908, '5m':8723}

In [10]:
#log リセット
# test用
'''
df = pd.DataFrame(columns=['start_time',
                           'end_time',
                           'timespan',
                           'range_of_data',
                           'test_loss',
                           'test_accuracy',
                           'confusion_matrix',
                           'score_report'])
df.to_csv('tweet-transformer/transformer_log.csv',index=False)
# epoch毎のvalidation用
df = pd.DataFrame(columns=['timespan',
                           'range_of_data',
                           'epoch',
                           'val_loss',
                           'val_accuracy'])
df.to_csv('tweet-transformer/loss_log.csv',index=False)
'''