In [1]:
import math
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
from torchtext import data
from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import random
from tqdm import tqdm
import time
import datetime
import itertools
import pickle
import sys
import copy
import gc

In [2]:
# Write an object to a pickle file
def write_pickle(filepath, data):
    """Serialize ``data`` to ``filepath`` with pickle and report the elapsed time.

    Args:
        filepath: Destination path; the file is opened in binary write mode
            and overwritten if it already exists.
        data: Any picklable object.

    Returns:
        None. Progress messages are printed to stdout.
    """
    began = time.time()
    print(f'writing pickle to "{filepath}" ...')

    with open(filepath, 'wb') as sink:
        pickle.dump(data, sink)

    took = time.time() - began
    print(f'end of writeing {took:6.2f} s')

    # Encourage prompt release of memory held by large temporaries.
    gc.collect()

In [3]:
# Read an object back from a pickle file
def read_pickle(filepath):
    """Load and return the object pickled at ``filepath``, reporting the elapsed time.

    Args:
        filepath: Path of an existing pickle file; opened in binary read mode.

    Returns:
        The unpickled object.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on files
        produced by this project.
    """
    began = time.time()
    print(f'reading pickle from "{filepath}" ...')

    with open(filepath, 'rb') as source:
        loaded = pickle.load(source)

    took = time.time() - began
    print(f'end of reading {took:6.2f} s')

    # Encourage prompt release of memory held by large temporaries.
    gc.collect()

    return loaded

In [4]:
# Create the vocabulary
# (split the text into words)
# The prebuilt vocabulary is loaded from a pickle here; the code that builds it
# from the tweet CSV is kept, disabled, in the string literal that follows.

v_start = time.time()
print("Reading...")
vocab = read_pickle('../../external_drive/pickle/vocab.pickle')
print('Finish!!')
print(f'{time.time() - v_start:5.2f} s')
del v_start
gc.collect()

'''
v_start = time.time()
tokenizer = get_tokenizer('basic_english')

# data field定義
TEXT  = data.Field(sequential=True,
                     lower=True,
                     batch_first=True, 
                     tokenize=tokenizer,
                     init_token='<cls>')

# CSVファイルを読み込み、TabularDatasetオブジェクトの作成
print("Reading...")
vocab_data = data.TabularDataset(path ='tweet-transformer/1d/2021-17_t.csv',
                                       format='csv',
                                       skip_header = True,
                                       fields=[('tweet', TEXT)])

# 単語辞書の作成
print("Creating vocab...")
TEXT.build_vocab(vocab_data, min_freq=3)
vocab = TEXT.vocab
print(f'{len(vocab)=}')

print('Finish!!')
print(f'{time.time() - v_start:5.2f} s')

# メモリ開放
del v_start, vocab_data, tokenizer, TEXT
gc.collect()
'''

Reading...
reading pickle from "../../external_drive/pickle/vocab.pickle" ...
end of reading   1.85 s
Finish!!
 1.93 s


'\nv_start = time.time()\ntokenizer = get_tokenizer(\'basic_english\')\n\n# data field定義\nTEXT  = data.Field(sequential=True,\n                     lower=True,\n                     batch_first=True, \n                     tokenize=tokenizer,\n                     init_token=\'<cls>\')\n\n# CSVファイルを読み込み、TabularDatasetオブジェクトの作成\nprint("Reading...")\nvocab_data = data.TabularDataset(path =\'tweet-transformer/1d/2021-17_t.csv\',\n                                       format=\'csv\',\n                                       skip_header = True,\n                                       fields=[(\'tweet\', TEXT)])\n\n# 単語辞書の作成\nprint("Creating vocab...")\nTEXT.build_vocab(vocab_data, min_freq=3)\nvocab = TEXT.vocab\nprint(f\'{len(vocab)=}\')\n\nprint(\'Finish!!\')\nprint(f\'{time.time() - v_start:5.2f} s\')\n\n# メモリ開放\ndel v_start, vocab_data, tokenizer, TEXT\ngc.collect()\n'

In [None]:
# Definition of Dataset1 (disabled: kept only as a string literal, never executed)
# args  : df[tweet(n)], df[section]
# return: dataset1{ids,mask,section}
# The closing ''' at the end of this cell was missing, which made the whole cell
# a SyntaxError (unterminated triple-quoted string).
'''
class CreateDataset1(Dataset):
    def __init__(self, x, y, tokenizer, vocab, max_len):
        self.x = x # df['tweet(n)']
        self.y = y # df['section']
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_len = max_len

    # len(Dataset)で返す値を指定    
    def __len__(self):
        return len(self.y)

    # Dataset[index]で返す値を指定
    def __getitem__(self, index):
        text = self.x[index]
        text = self.tokenizer(text)
        ids  = torch.tensor([self.vocab[word] for word in text], dtype=torch.long) # [seq_len]
        ids  = F.pad(ids, (0 ,self.max_len-len(text)), "constant", 0) # [max_len]
        mask = (ids==0)

        section = self.y[index]

        return {'ids'   : ids,
                'mask'  : mask,
                'section': torch.Tensor([section])}
'''

In [None]:
# Split Dataset1 into per-section lists (disabled: kept only as a string literal)
# args  : dataset1
# return: ids_list, mask_list
# The closing ''' at the end of this cell was missing, which made the whole cell
# a SyntaxError (unterminated triple-quoted string).
'''
def separate_section(dataset):
    ids_list  = []
    mask_list = []
    start_time = time.time()
    counter = -1
    
    ('-' * 95)
    for k in range(0, len(dataset)):
        i = dataset[k]['ids'].squeeze()
        m = dataset[k]['mask'].squeeze()
            
        if counter != dataset[k]['section']:
            ids_list.append([i])
            mask_list.append([m])
            counter += 1
        else:
            ids_list[counter].append(i)
            mask_list[counter].append(m)
            
        if k%1000000==0 and k!=0:
            elapsed = time.time()-start_time
            print(f'| 現在 {k:8d}件 終了 | 経過時間 {elapsed:8.2f} s |')
    ('-' * 95)
    
    print(f'{len(ids_list)=}')
    print(f'{len(mask_list)=}')
    
    del i, m, k, start_time, counter
    gc.collect()
    
    return ids_list, mask_list
'''

In [6]:
# Definition of Dataset2
# args  : tdf['ids'], tdf['mask']
# return: dataset{ids,mask}

class CreateDataset2(Dataset):
    """Dataset pairing two index-aligned containers as ``ids`` and ``mask``."""

    def __init__(self, x, y):
        # x: tdf['ids'], y: tdf['mask']
        self.x, self.y = x, y

    def __len__(self):
        """Value returned by ``len(dataset)``: the length of ``x``."""
        return len(self.x)

    def __getitem__(self, index):
        """Value returned by ``dataset[index]``: a dict with keys 'ids' and 'mask'."""
        return dict(ids=self.x[index], mask=self.y[index])

In [None]:
# Build the tweet Dataset (disabled: kept only as a string literal)
# 1. CreateDataset1
# 2. separate_section
# 3. CreateDataset2
# The closing ''' at the end of this cell was missing, which made the whole cell
# a SyntaxError (unterminated triple-quoted string).
'''
def data_process1(vocab,timespan):
    print('-'*5 + 'Create dataset_tlist start!!' + '-'*5)
    max_len = 128
    tokenizer = get_tokenizer('basic_english')

    print('Reading...')
    df = pd.read_csv(f'tweet-transformer/{timespan}/2021-17_t.csv')
    df = df.dropna(how='any')
    df = df.reset_index(drop=True)
    
    print('Creating Dataset1...')
    dataset = CreateDataset1(df['tweet(n)'],  df['section'],  tokenizer, vocab, max_len)
    
    print('Separating Section...')
    ids_list, mask_list = separate_section(dataset)
    
    print('Creating Dataset2...')
    dataset_tlist = []
    for i in range(0, len(ids_list)):
        x = CreateDataset2(ids_list[i], mask_list[i])
        dataset_tlist.append(x)
    
    print('Finish!!')
    print(f'{len(dataset_tlist)=}')
    
    del max_len, tokenizer, df, ids_list, mask_list, dataset, x, i
    gc.collect()
    
    return dataset_tlist
'''

In [7]:
# Definition of Dataset3
# args  : tensor of section, tensor of price, tensor of trend(n+1)
# return: dataset3{section, src, target}
class CreateDataset3(Dataset):
    """Dataset exposing three index-aligned containers as section / src / target."""

    def __init__(self, x, y, z):
        # x: tensor of section, y: tensor of price, z: tensor of trend(n+1)
        self.x, self.y, self.z = x, y, z

    def __len__(self):
        """Value returned by ``len(dataset)``: the length of ``x``."""
        return len(self.x)

    def __getitem__(self, index):
        """Value returned by ``dataset[index]``: dict with 'section', 'src', 'target'."""
        fields = (('section', self.x), ('src', self.y), ('target', self.z))
        return {key: column[index] for key, column in fields}

In [8]:
# Definition of Dataset4
# args  : tlist
# return: dataset4 (a dataset whose items are themselves datasets)
class CreateDataset4(Dataset):
    """Thin Dataset wrapper that returns the wrapped container's items unchanged."""

    def __init__(self, x):
        self.x = x  # wrapped container

    def __len__(self):
        """Value returned by ``len(dataset)``."""
        return len(self.x)

    def __getitem__(self, index):
        """Value returned by ``dataset[index]``: the wrapped item itself."""
        return self.x[index]

In [9]:
# Helper functions used inside tensor_df()
# max_len is the padded length read by txt_to_ids; tokenizer is torchtext's
# 'basic_english' tokenizer.
max_len = 128
tokenizer = get_tokenizer('basic_english')
def tokenize(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``."""
    return tokenizer(text)

def txt_to_ids(tokenized_text):
    """Map tokens to vocabulary ids and right-pad with 0 up to ``max_len``.

    Reads the notebook-level globals ``vocab`` and ``max_len``.

    Args:
        tokenized_text: Sequence of tokens.

    Returns:
        LongTensor with a leading singleton dimension added by ``unsqueeze(0)``.
    """
    indices = [vocab[token] for token in tokenized_text]
    row = torch.tensor(indices, dtype=torch.long).unsqueeze(0)  # [1, seq_len]
    right_pad = max_len - len(tokenized_text)
    return F.pad(row, (0, right_pad), "constant", 0)  # [1, max_len]

def ids_to_mask(ids):
    """Return the element-wise comparison ``ids == 0`` (True where the id is 0)."""
    return ids == 0

In [10]:
# return: df[section, ids, mask]
def tensor_df(df):
    """Replace the raw tweet text column with ``ids`` and ``mask`` columns.

    The input frame is modified in place (its 'tweet(n)' column is overwritten
    with tokens and 'ids' / 'mask' columns are added); the returned frame is
    that frame with 'tweet(n)' dropped.

    Args:
        df: DataFrame with a 'tweet(n)' column of text.

    Returns:
        DataFrame without 'tweet(n)' and with the added 'ids' and 'mask' columns.
    """
    print('1.text to ids...')
    tokenized = df['tweet(n)'].apply(tokenize)
    df['tweet(n)'] = tokenized
    df['ids'] = tokenized.apply(txt_to_ids)

    print('2.ids to mask...')
    df['mask'] = df['ids'].apply(ids_to_mask)

    return df.drop(columns=['tweet(n)'])

In [11]:
# return : df[section, ids, mask]
# Concatenate the per-tweet tensors of each section into one tensor per section
def separate_section(df, num_sections=None):
    """Collapse a per-tweet frame into one row per section.

    Args:
        df: DataFrame with a 'section' column and 'ids' / 'mask' columns whose
            cells are tensors that can be concatenated along dim 0.
        num_sections: Number of sections to emit (sections
            ``0 .. num_sections-1``). Defaults to the notebook-level global
            ``num_section`` so existing one-argument calls keep working.

    Returns:
        DataFrame with columns ['section', 'ids', 'mask'], one row per section,
        where each cell is the concatenation of that section's tensors.

    Raises:
        ValueError: if a section in the requested range has no rows.
    """
    if num_sections is None:
        num_sections = num_section

    # Group once instead of re-filtering the whole frame for every section.
    rows_by_section = {key: rows for key, rows in df.groupby('section', sort=False)}

    section_list = []
    ids_list = []
    mask_list = []
    for i in range(0, num_sections):
        rows = rows_by_section.get(i)
        if rows is None:
            raise ValueError(f'section {i} has no rows; nothing to concatenate')

        section_list.append(i)
        ids_list.append(torch.cat(rows['ids'].tolist(), dim=0))
        # The mask must be built from the 'mask' column; the previous version
        # re-used the ids here, so every stored mask was a copy of the ids.
        mask_list.append(torch.cat(rows['mask'].tolist(), dim=0))

    return pd.DataFrame(list(zip(section_list, ids_list, mask_list)),
                        columns=['section', 'ids', 'mask'])

In [12]:
# Build the tweet dataset
# 1. df to tensor
# 2. separate_section
def data_process1(timespan,n):
    """Read the tweet CSV for ``timespan`` and return per-section train/test frames.

    Args:
        timespan: Name of the sub-directory under 'tweet-transformer/' to read
            from (e.g. '1d').
        n: Not used by this function.

    Returns:
        (train_tdf, test_tdf): the per-section frame produced by
        ``separate_section`` split without shuffling, with the last 1/7 of the
        rows as the test part.
    """
    print('-'*5 + 'Create dataset_tlist start!!' + '-'*5)

    print('Reading...')
    # Only the tweet CSV is needed here; the '2021-17_s.csv' read that used to
    # sit next to it was never used and has been removed.
    tweets = (
        pd.read_csv(f'tweet-transformer/{timespan}/2021-17_t.csv')
        .dropna(how='any')
        .reset_index(drop=True)
    )

    print('df to tensor...')
    tweets = tensor_df(tweets)

    print('Separating Section...')
    sections = separate_section(tweets)

    train_tdf, test_tdf = train_test_split(sections, test_size = 1/7, shuffle=False)

    print('Finish!!')
    print(f'{len(train_tdf)=}')
    print(f'{len(test_tdf)=}')

    return train_tdf, test_tdf

In [13]:
# Build the price dataset
# 1. csv -> 3 tensors
# 2. CreateDataset3
def data_process2(timespan,n):
    """Read the price CSVs for ``timespan`` and build train/test ``CreateDataset3`` objects.

    Args:
        timespan: Name of the sub-directory under 'tweet-transformer/' to read
            from (e.g. '1d').
        n: Number of consecutive intervals used as features; for ``n >= 2`` the
            lagged columns 'trend(n-i)' and 'end_price(n-i)' (i = 1..n-1) are added.

    Returns:
        (train_plist, test_plist): datasets over (section, price features,
        'trend(n+1)' target), split without shuffling with the last 1/7 of the
        rows as the test part.
    """
    print('-'*5 + 'Create dataset_plist start!!' + '-'*5)

    print('Reading...')
    df  = pd.read_csv(f'tweet-transformer/{timespan}/2021-17_b.csv')
    dfs = pd.read_csv(f'tweet-transformer/{timespan}/2021-17_s.csv')

    # Explanatory variables and target variable
    df['trend(n+1)'] = df['trend(n)'].shift(-1)
    df['end_price(n)'] = df['open_price(n)'].shift(-1)
    for i in range(1, n):  # empty for n == 1
        df[f'trend(n-{i})'] = df['trend(n)'].shift(i)
        df[f'end_price(n-{i})'] = df['end_price(n)'].shift(i)
    df = (
        df.drop(columns=['open_price(n)'])
        .dropna(how='any')
        .reset_index(drop=True)
    )

    # Merge onto the section list and mark rows that have missing values
    df = pd.merge(dfs, df, on="section", how = 'left').reset_index(drop=True)
    incomplete = df.isnull().any(axis=1)
    # Single label-based write instead of the row-by-row chained assignment
    # df['section'][i] = -1, which is slow and is not guaranteed to write
    # through to ``df``.
    df.loc[incomplete, 'section'] = -1
    df = df.fillna(-1)
    print(f'{len(df)=}')

    train_df, test_df = train_test_split(df, test_size = 1/7, shuffle=False)

    def _to_dataset(frame):
        """Turn one split into a CreateDataset3 of (section, price, target) tensors."""
        section = torch.tensor(frame['section'].values)
        target  = torch.tensor(frame['trend(n+1)'].values)
        price   = torch.tensor(frame.drop(columns=['trend(n+1)','section']).values)
        return CreateDataset3(section, price, target)

    # Build the three tensors for each split
    print('Creating Dataset3...')
    train_plist = _to_dataset(train_df)
    test_plist = _to_dataset(test_df)

    print('Finish!!')
    print(f'{len(train_plist)=}')
    print(f'{len(test_plist)=}')

    del df, dfs, train_df, test_df
    gc.collect()

    return train_plist, test_plist

In [None]:
# Build the datasets & save them as CSV
# Tweets
tlist = ['1d','12h','4h','1h','30m','15m','5m']
nlist = [1,2,3,4,5,6,7,8,9,10]
train_section_list     = {'1d':181,'12h':363,'4h':1090,'1h':4361,'30m':8722,'15m':17444,'5m':52333}
test_section_list      = {'1d':31, '12h':61, '4h':182, '1h':727, '30m':1454,'15m':2908, '5m':8723}

for timespan, n in itertools.product(tlist, nlist):
    print('-'*50 + f'{timespan=} / {n=}' + '-'*50)
    # separate_section reads this notebook-level global.
    num_section = train_section_list[timespan] + test_section_list[timespan]
    train_tdf, test_tdf = data_process1(timespan,n)
    # Use the frames returned above; the previous code referred to
    # train_df / test_df, which this cell never assigns.
    print(f'{sys.getsizeof(train_tdf)=}')
    print(f'{sys.getsizeof(test_tdf)=}')
    # NOTE(review): to_csv stores the tensor cells as their text repr, so the
    # tensors cannot be read back from these files as-is — confirm the format.
    train_tdf.to_csv(f'../../external_drive/pandas/{timespan}/train_tdf_{n}.csv', index=False)
    test_tdf.to_csv(f'../../external_drive/pandas/{timespan}/test_tdf_{n}.csv', index=False)

In [None]:
# Build the datasets & save them as pickle (disabled: kept only as a string literal)
# Prices
# The closing ''' at the end of this cell was missing, which made the whole cell
# a SyntaxError (unterminated triple-quoted string).
'''
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']

for timespan, n in itertools.product(tlist, nlist):   

    print('-'*50 + f'{timespan=} / {n=}' + '-'*50)
    train_plist,test_plist = data_process2(timespan,n)
    write_pickle(f'../../external_drive/pickle/{timespan}/train_plist_{n}.pickle',train_plist)
    write_pickle(f'../../external_drive/pickle/{timespan}/test_plist_{n}.pickle',test_plist)
    
    del train_plist, test_plist
    gc.collect()
'''


In [None]:
# Split the per-section tweet lists (train / test) into per-batch pickles
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']
batch_size_list        = {'1d':4,  '12h':8,  '4h':16,  '1h':32,  '30m':64,  '15m':256,  '5m':512}
train_section_list     = {'1d':181,'12h':363,'4h':1090,'1h':4361,'30m':8722,'15m':17444,'5m':52333}
test_section_list      = {'1d':31, '12h':61, '4h':182, '1h':727, '30m':1454,'15m':2908, '5m':8723}
train_num_batches_list = {'1d':46, '12h':46, '4h':69,  '1h':137, '30m':137, '15m':69,   '5m':103}
test_num_batches_list  = {'1d':8,  '12h':8,  '4h':12,  '1h':23,  '30m':23,  '15m':12,   '5m':18}

for timespan in tlist:

    train_tlist = read_pickle(f'../../external_drive/pickle/{timespan}/train_tlist.pickle')
    test_tlist  = read_pickle(f'../../external_drive/pickle/{timespan}/test_tlist.pickle')
    train_num_batches = train_num_batches_list[timespan]
    test_num_batches = test_num_batches_list[timespan]
    batch_size = batch_size_list[timespan]

    # NOTE(review): the output file names do not contain n, so every value of n
    # overwrites the files written for the previous one — confirm this is intended.
    for n in nlist:
        for i in range(0,train_num_batches):
            start = i*batch_size-n+1
            # Batches whose n-1 leading sections would fall before index 0 are skipped.
            if start >= 0:
                if i != (train_num_batches-1):
                    train_tlist_b = train_tlist[start: i*batch_size+batch_size]
                else:
                    # Last batch: take everything that is left. The previous end
                    # index was the number of batches, not a section index.
                    train_tlist_b = train_tlist[start:]
                write_pickle(f'../../external_drive/pickle/{timespan}/train_batch_{i}.pickle', train_tlist_b)

        for i in range(0,test_num_batches):
            start = i*batch_size-n+1
            if start >= 0:
                if i != (test_num_batches-1):
                    test_tlist_b = test_tlist[start: i*batch_size+batch_size]
                else:
                    # Last batch: take everything that is left (see above).
                    test_tlist_b = test_tlist[start:]
                write_pickle(f'../../external_drive/pickle/{timespan}/test_batch_{i}.pickle', test_tlist_b)

In [11]:
# parameters for Net
ntokens = len(vocab)  # size of vocabulary
d_model = 512   # embedding dimension
nhead   = 8     # number of heads in nn.MultiheadAttention
d_hid   = 2048  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6     # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.2   # dropout probability
lstm_input_dim  = 16  # input_size passed to nn.LSTM by the LSTM model
lstm_hidden_dim = 1   # hidden_size passed to nn.LSTM by the LSTM model

In [12]:
# Transformer part of the Transformer-LSTM model
class Transformer(nn.Module):
    """Transformer encoder that classifies each token sequence into 3 classes.

    Args:
        ntoken: Vocabulary size for the embedding table.
        d_model: Embedding / model dimension.
        nhead: Number of attention heads.
        d_hid: Hidden size of the encoder feed-forward sub-layer.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability.
    """

    def __init__(self,
                 ntoken: int,
                 d_model: int,
                 nhead: int,
                 d_hid: int,
                 nlayers: int,
                 dropout: float = 0.5):

        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        self.embedding = nn.Embedding(ntoken,
                                d_model)
        self.pos_encoder = PositionalEncoding(d_model,
                                              dropout)
        # Built without batch_first, so the encoder expects [seq_len, batch, d_model].
        encoder_layers = TransformerEncoderLayer(d_model,
                                                 nhead,
                                                 d_hid,
                                                 dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers,
                                                      nlayers)
        self.dense = nn.Linear(d_model,3)

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialise the embedding and output layer; zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.dense.bias.data.zero_()
        self.dense.weight.data.uniform_(-initrange, initrange)

    # Data flow
    def forward(self, ids, mask):
        """Classify each sequence of ``ids`` into 3 classes.

        Args:
            ids: LongTensor [batch_size, seq_len] of token ids.
            mask: Bool tensor [seq_len, batch_size], True at padded positions
                (this is the orientation the existing caller passes, i.e. the
                transpose of the per-sample mask).

        Returns:
            Tensor [batch_size, 3] of unnormalised class scores.
        """
        # 3-class text classification with the Transformer
        x = self.embedding(ids) * math.sqrt(self.d_model) # [batch_size, seq_len, d_model]
        x = self.pos_encoder(x) # [batch_size, seq_len, d_model]

        # The encoder is sequence-first: feed [seq_len, batch, d_model] together
        # with a [batch, seq_len] key-padding mask. Passing the batch-first
        # tensor directly made attention run across the samples of a batch
        # instead of across the tokens of each sample.
        x = self.transformer_encoder(src=x.transpose(0, 1),
                                     src_key_padding_mask=mask.transpose(0, 1)) # [seq_len, batch_size, d_model]
        x = x.transpose(0, 1) # [batch_size, seq_len, d_model]

        x = x.mean(dim=1) # [batch_size, d_model]
        x = self.dense(x) # [batch_size, 3]

        return x

In [13]:
# LSTM part of the Transformer-LSTM model
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear layer producing 3 class scores.

    Args:
        lstm_input_dim: Feature size of each time step.
        lstm_hidden_dim: Hidden state size of the LSTM.
    """

    def __init__(self,
                 lstm_input_dim: int,
                 lstm_hidden_dim: int):

        super().__init__()
        self.input_dim = lstm_input_dim
        self.hidden_dim = lstm_hidden_dim
        self.lstm = nn.LSTM(input_size=lstm_input_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=1,
                            batch_first=True)
        self.dense = nn.Linear(lstm_hidden_dim,3)

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialise the output layer weights and zero its bias."""
        initrange = 0.1
        self.dense.bias.data.zero_()
        self.dense.weight.data.uniform_(-initrange, initrange)

    # Data flow
    def forward(self, train_tlist_b, train_plist_b=None, device=None):
        """3-class classification of the combined text + price features.

        Args:
            train_tlist_b: Input tensor [batch_size, seq_len, lstm_input_dim].
                The parameter keeps its historical name; the training and
                evaluation code calls the model with this single tensor.
            train_plist_b: Unused; kept for backward compatibility.
            device: Unused; kept for backward compatibility.

        Returns:
            Tensor [batch_size, 3] of unnormalised class scores.
        """
        # The previous body read the undefined names ``x`` and ``inlist``.
        inputs = train_tlist_b
        _, (h_n, _) = self.lstm(inputs)  # h_n: [num_layers=1, batch_size, hidden_dim]
        return self.dense(h_n[-1])       # [batch_size, 3]

In [14]:
# Positional encoding
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to batch-first embeddings, then dropout.

    Args:
        d_model: Embedding dimension.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Longest sequence length for which encodings are precomputed.
    """

    def __init__(self,
                 d_model: int,
                 dropout: float = 0.1,
                 max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # The buffer keeps its [max_len, 1, d_model] layout.
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        '''
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]

        Returns:
            Tensor of the same shape with the encoding of each sequence
            position added, followed by dropout.
        '''
        # Index the table by sequence length (dim 1 of the batch-first input)
        # and broadcast over the batch. The previous code sliced by
        # x.size(0), i.e. by batch size, so every token of a sample received
        # the same encoding.
        x = x + self.pe[:x.size(1)].transpose(0, 1)  # [1, seq_len, d_model]
        return self.dropout(x)

In [15]:
# parameters for training & evaluation
# These notebook-level names are read as globals by the training / evaluation functions.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
softmax = nn.Softmax(dim=1)  # softmax over dim 1
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)
torch.manual_seed(0)  # fix the PyTorch RNG seed

<torch._C.Generator at 0x7fd29c0a7fd0>

In [16]:
# 3-class text classification with the Transformer
def text_classifer(t_start,t_end):
    """Return one summed class-probability vector per section in ``[t_start, t_end)``.

    Reads the notebook-level globals ``train_plist``, ``train_tdf``, ``model1``,
    ``softmax`` and ``device``.

    Args:
        t_start: Index of the first section (inclusive).
        t_end: Index one past the last section (exclusive).

    Returns:
        List with one tensor of shape [3] per section: the softmax
        probabilities of every tweet in the section, summed over the tweets.
        The list is empty when any section in the window is flagged as
        missing (``section == -1``), so the caller can skip that batch.
    """
    # Skip batches that contain a missing row.
    # NOTE(review): the original flag logic was unfinished (misspelled flag
    # name, flag never reset); skipping the whole window is the reading of its
    # comment adopted here — confirm.
    for j in range(t_start, t_end):
        if train_plist[j]['section'] == -1:
            return []

    prob_section = []

    # out : list of tensor[neg, neu, pos], one per section.
    # The former outer loop over k only repeated this work for every section
    # and appended duplicate results, so it has been removed.
    for j in range(t_start, t_end):
        tlist = CreateDataset2(train_tdf['ids(n)'][j], train_tdf['mask(n)'][j])
        tbatches = DataLoader(tlist, batch_size=128, shuffle=True)

        # out : list of tensor[batch_size, 3]
        p_tbatch = []
        for batch in tbatches:
            ids  = batch['ids'].to(device)              # [batch_size, seq_len]
            mask = torch.t(batch['mask']).to(device)    # [seq_len, batch_size]
            text_class = model1(ids, mask)
            prob = softmax(text_class)                  # [batch_size, 3]
            p_tbatch.append(prob.to('cpu'))

        x = torch.cat(p_tbatch, dim=0)  # [num_tweets, 3]
        x = x.sum(dim=0)                # [3]
        prob_section.append(x)

    return prob_section

In [18]:
def create_tensor(p_section):
    """Join the per-section text probabilities with the price rows of the current batch.

    Reads the notebook-level globals ``train_plist_b``, ``batch_size``, ``n``
    and ``device``.

    Args:
        p_section: List of per-section probability tensors; row ``j`` uses the
            ``n`` consecutive entries ``p_section[j:j+n]``.

    Returns:
        (x, y): the concatenated inputs and the concatenated targets of every
        row whose section is not flagged missing (-1), both moved to ``device``.
    """
    # Concatenate the tensor of train_plist_b[j] with the p_section tensors
    src    = []
    target = []
    for j in range(0, batch_size):
        row = train_plist_b[j]
        if row['section'] != -1:
            probs = torch.cat(p_section[j:j+n], dim=-1)
            src.append(torch.cat((probs, row['src']), dim=-1))
            # Targets belong in ``target``; they were previously appended to
            # ``src``, leaving ``target`` empty.
            target.append(row['target'])

    # NOTE(review): concatenating along dim 0 merges the per-row vectors into
    # one long vector — confirm whether one row per sample (torch.stack) is intended.
    x = torch.cat(src, dim=0).to(device)
    # reshape(-1) lets zero-dimensional targets be concatenated as well.
    y = torch.cat([t.reshape(-1) for t in target], dim=-1).to(device)

    return x, y

In [24]:
# training
# NOTE(review): this function is unfinished and does not run as written; the
# notes below list the issues visible in this notebook.
def train(model1,model2, timespan, n):
    model1.train()
    model2.train()

    train_tdf = pd.read_csv(f'../../external_drive/pandas/{timespan}/train_tdf_{n}.csv')
    # NOTE(review): this path has no '.pickle' suffix, unlike the other read_pickle
    # paths in this notebook — confirm the file name.
    train_plist = read_pickle(f'../../external_drive/pickle/{timespan}/train_plist_{n}')
    
    # train_num_batches and batch_size are read as notebook-level globals.
    log_interval = math.ceil(train_num_batches/30)*10
    batch_counter = 0
    train_loss = 0
    train_correct = 0
    train_count = 0
    
    batch_start_time = time.time()
    
    for i in range(0, train_num_batches):
        # Batches whose n-1 leading sections would fall before index 0 are skipped.
        if i*batch_size-n+1 >= 0:
            if i != (train_num_batches-1):
                t_start = i*batch_size-n+1
                t_end   = i*batch_size+batch_size
            else:
                t_start = i*batch_size-n+1
                # NOTE(review): the end index is a batch count, not a section index — confirm.
                t_end   = train_num_batches
            
                # NOTE(review): everything from here to optimizer2.step() is nested under
                # the `else:` above, so it only runs for the last batch.
                # NOTE(review): text_classifer is defined in this notebook with the
                # parameters (t_start, t_end); train_tlist_b / train_plist_b are not
                # assigned in this function.
                prob_section = text_classifer(model1, train_tlist_b, train_plist_b)
                src, targets = create_tensor(prob_section)
                predictions  = model2(src.to(device))
                prob = softmax(predictions)
                loss = criterion(predictions, targets)

                correct = prob.argmax(axis=1) == targets
                acc = correct.sum().item() / correct.size(0)

                train_correct += correct.sum().item()
                train_count += correct.size(0)
                train_loss += loss.item()

                # optimizer1 / optimizer2 are read as notebook-level globals.
                optimizer1.zero_grad()
                optimizer2.zero_grad()
                loss.backward()
                # NOTE(review): `model` is not defined anywhere in the visible notebook.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer1.step()
                optimizer2.step()

        batch_counter += 1

        # NOTE(review): `num_batches` and `scheduler` are not defined in the visible
        # notebook (the main cell defines scheduler1 / scheduler2); `epoch` is the
        # loop variable of the main cell.
        if batch_counter % log_interval == 0 or batch_counter == num_batches:
            lr = scheduler.get_last_lr()[0]
            s_per_batch = (time.time() - batch_start_time) / log_interval
            cur_loss = train_loss / log_interval
            cur_acc = train_correct / train_count
            print(f'| epoch {epoch:3d} | {batch_counter:5d}/{num_batches:5d} batches | '
                  f'lr {lr:1.5f} | s/batch {s_per_batch:5.2f} | '
                  f'loss {cur_loss:5.3f} | accuracy {cur_acc:8.3f}')
            # NOTE(review): total_loss is assigned but never read; train_loss is not reset.
            total_loss = 0
            # NOTE(review): the remaining lines are indented deeper than the statement
            # above without opening a block, which is an IndentationError.
                    batch_start_time = time.time()

                del train_tlist_b,train_plist_b
                gc.collect()

In [20]:
# evaluation (val, test)
# NOTE(review): this function is unfinished and does not run as written; the
# notes below list the issues visible in this notebook.
def evaluate(model1, model2, timespan, n):
    model1.eval()
    model2.eval()
    test_tlist = read_pickle(f'../../external_drive/pickle/{timespan}/test_tlist.pickle')
    test_plist = read_pickle(f'../../external_drive/pickle/{timespan}/test_plist_{n}.pickle')    
    
    eval_loss = 0
    eval_correct = 0
    eval_count = 0

    with torch.no_grad():
        # test_num_batches and batch_size are read as notebook-level globals.
        for i in range(0, test_num_batches):
            test_tlist_b = test_tlist[i: i+batch_size+n-1]
            # NOTE(review): `num_batches` and `test_section` are not defined in the
            # visible notebook.
            if i != (num_batches-1):
                test_plist_b = test_plist[i*batch_size: (i+1)*batch_size]
            else:
                test_plist_b = test_plist[i*batch_size: test_section]
            
            
                
            # NOTE(review): the test slices built above are not used; the call passes
            # train_tlist_b / train_plist_b instead. text_classifer is defined in this
            # notebook with the parameters (t_start, t_end) and returns a single list,
            # which is unpacked into two names here.
            src, targets = text_classifer(model1, train_tlist_b, train_plist_b)
            predictions  = model2(src.to(device))

            prob = softmax(predictions)
            loss = criterion(predictions, targets)

            correct = prob.argmax(axis=1) == targets
            eval_acc = correct.sum().item() / correct.size(0)

            eval_correct += correct.sum().item()
            eval_count += correct.size(0)
            eval_loss += loss.item()
            
            del test_plist_b, test_tlist_b
            gc.collect()
        
    # NOTE(review): eval_acc is the accuracy of the last batch only; eval_correct and
    # eval_count are accumulated but not used for the reported value. eval_loss is the
    # sum of the per-batch losses.
    print(f'| loss {eval_loss:5.3f}| accuracy {eval_acc:8.3f} ')
     
    return eval_loss, eval_acc

In [None]:
# main
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']
batch_size_list        = {'1d':4,  '12h':8,  '4h':16,  '1h':32,  '30m':64,  '15m':256,  '5m':512}
train_num_section_list = {'1d':181,'12h':363,'4h':1090,'1h':4361,'30m':8722,'15m':17444,'5m':52333}
test_num_section_list  = {'1d':31, '12h':61, '4h':182, '1h':727, '30m':1454,'15m':2908, '5m':8723}
train_num_batches_list = {'1d':46, '12h':46, '4h':69,  '1h':137, '30m':137, '15m':69,   '5m':103}
test_num_batches_list  = {'1d':8,  '12h':8,  '4h':12,  '1h':23,  '30m':23,  '15m':12,   '5m':18}

aculist = {}

for timespan, n in itertools.product(tlist, nlist):

    print('-'*50 + f'{timespan=} / {n=}' + '-'*50)
    # These notebook-level names are read as globals by train() / evaluate().
    train_num_section = train_num_section_list[timespan]
    test_num_section  = test_num_section_list[timespan]
    train_num_batches = train_num_batches_list[timespan]
    test_num_batches  = test_num_batches_list[timespan]
    batch_size=batch_size_list[timespan]

    lr = 1e-3
    model1 = Transformer(ntokens, d_model, nhead, d_hid, nlayers, dropout).to(device)
    optimizer1 = torch.optim.Adam(model1.parameters(), lr=lr)
    scheduler1 = torch.optim.lr_scheduler.StepLR(optimizer1, 1.0, gamma=0.95)
    model2 = LSTM(lstm_input_dim, lstm_hidden_dim).to(device)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=lr)
    scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer2, 1.0, gamma=0.95)
    best_val_loss = float('inf')
    epochs = 1
    # Reset per configuration so a previous run's best models are never reused.
    best_model1 = None
    best_model2 = None

    dt_start = datetime.datetime.now()
    print(datetime.datetime.now())
    print('*'*45 + 'training start' + '*'*45)

    # training & test loop
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()

        train(model1, model2, timespan, n)
        val_loss, val_acc = evaluate(model1, model2, timespan, n)

        print('-' * 95)
        print(f'| end of epoch {epoch:3d} | time: {time.time()-epoch_start_time:5.2f}s | '
              f'val loss：{val_loss:5.3f} | val accuracy：{val_acc:8.3f}')
        print('-' * 95)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model1 = copy.deepcopy(model1)
            best_model2 = copy.deepcopy(model2)

        scheduler1.step()
        scheduler2.step()

        del epoch_start_time, val_loss, val_acc
        gc.collect()

    dt_end = datetime.datetime.now()
    elapsed = dt_end - dt_start
    print(datetime.datetime.now())
    # A timedelta does not accept the ':8.2f' format spec; format its seconds instead.
    print('*'*30 + f'Finish! training time：{elapsed.total_seconds():8.2f}s' + '*'*30)

    # If the validation loss never improved (e.g. it was NaN), fall back to the
    # final models instead of failing on an unset name.
    if best_model1 is None or best_model2 is None:
        best_model1, best_model2 = model1, model2

    # test
    test_loss, test_acc = evaluate(best_model1, best_model2, timespan, n)
    print('=' * 89)
    print(f'| End of training | test loss：{test_loss:5.3f} | '
          f'test accuracy：{test_acc:8.3f}')
    print('=' * 89)

In [18]:
#device = 'cpu'
# Scratch cell: rebind the notebook-level device to "cuda:1" and release cached CUDA memory.
device = torch.device("cuda:1")
torch.cuda.is_initialized()
torch.cuda.ipc_collect()
torch.cuda.empty_cache()
gc.collect()
# NOTE(review): `l` is assigned in a different cell of this notebook, so this line
# depends on that cell having been run first.
print(f'{sys.getsizeof(l)=}')

22

In [47]:
# Scratch cell: rebind `device` (CUDA if available, else CPU), create handles for
# "cuda:0" and "cuda:1", and print the CUDA availability and the chosen device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device0 = torch.device("cuda:0")
device1 = torch.device("cuda:1")
print(torch.cuda.is_available())
print(device)

True
cuda


In [12]:
# Scratch cell: print sys.getsizeof() of an iterator over a small list.
l=[1,2,3,4,5,6,7,8,9]

print(f'{sys.getsizeof(iter(l))=}')

sys.getsizeof(l)=128
sys.getsizeof(iter(l))=48


In [None]:
    #        for k in range(0,len(train_tlist_b)):
    #            tbatches = DataLoader(train_tlist_b[k], batch_size=128, shuffle=True)
    #            tbatches = iter(tbatches)
    #            l.append(tbatches)
    #        l = iter(l)

In [None]:
# Scratch cell: wrap a random [256, 128] tensor in a DataLoader.
a  = torch.randn(256,128)
print(a)
print(a.size())
# `a` is rebound from the tensor to the DataLoader built on it.
a = DataLoader(a, batch_size=128, shuffle=True)
print(a)

In [None]:
# Scratch cell: iterate over `a` (bound in another cell) and print each batch and its size.
for batch in a:
    print(batch)
    print(batch.size())

In [14]:
# Scratch cell: load the '1d' train_tlist pickle and print sys.getsizeof() of the loaded object.
timespan = '1d'
train_tlist = read_pickle(f'../../external_drive/pickle/{timespan}/train_tlist.pickle')    
print(f'{sys.getsizeof(train_tlist)=}')

reading pickle from "../../external_drive/pickle/1d/train_tlist.pickle" ...
end of reading 130.92 s
sys.getsizeof(train_tlist)=48


In [15]:
# Scratch cell: inspect train_df.
# NOTE(review): train_df is not assigned in any cell above this one in the visible notebook.
print(f'{sys.getsizeof(train_df)=}')
print(train_df.head())

sys.getsizeof(train_df)=20416
                                                 ids  \
0  [[tensor(15), tensor(219), tensor(18), tensor(...   
1  [[tensor(15), tensor(4339), tensor(4315), tens...   
2  [[tensor(431), tensor(1425), tensor(5), tensor...   
3  [[tensor(24), tensor(141004), tensor(4821), te...   
4  [[tensor(1238), tensor(121), tensor(2631), ten...   

                                                mask  
0  [[tensor(15), tensor(219), tensor(18), tensor(...  
1  [[tensor(15), tensor(4339), tensor(4315), tens...  
2  [[tensor(431), tensor(1425), tensor(5), tensor...  
3  [[tensor(24), tensor(141004), tensor(4821), te...  
4  [[tensor(1238), tensor(121), tensor(2631), ten...  


In [16]:
# Scratch cell: inspect test_df.
# NOTE(review): test_df is not assigned in any cell above this one in the visible notebook.
print(f'{sys.getsizeof(test_df)=}')
print(test_df.head())

sys.getsizeof(test_df)=3616
                                                 ids  \
0  [[tensor(32652), tensor(11), tensor(18963), te...   
1  [[tensor(32652), tensor(11), tensor(18963), te...   
2  [[tensor(129), tensor(1733), tensor(151925), t...   
3  [[tensor(5110), tensor(15), tensor(279606), te...   
4  [[tensor(175809), tensor(7), tensor(7), tensor...   

                                                mask  
0  [[tensor(32652), tensor(11), tensor(18963), te...  
1  [[tensor(32652), tensor(11), tensor(18963), te...  
2  [[tensor(129), tensor(1733), tensor(151925), t...  
3  [[tensor(5110), tensor(15), tensor(279606), te...  
4  [[tensor(175809), tensor(7), tensor(7), tensor...  


In [17]:
# Scratch cell: save train_df / test_df as pickles.
write_pickle(f'../../external_drive/pandas/train_df.pickle', train_df)
write_pickle(f'../../external_drive/pandas/test_df.pickle', test_df)

writing pickle to "../../external_drive/pandas/train_df.pickle" ...
end of writeing  66.08 s
writing pickle to "../../external_drive/pandas/test_df.pickle" ...
end of writeing  34.67 s


In [18]:
# Scratch cell: save train_df / test_df as CSV (without the index column).
train_df.to_csv(f'../../external_drive/pandas/train_df.csv', index=False)
test_df.to_csv(f'../../external_drive/pandas/test_df.csv', index=False)

In [20]:
# Scratch cell: time how long reloading train_df from its pickle takes
# (the CSV variant is commented out).
start=time.time()
#train_df = pd.read_csv(f'../../external_drive/pandas/train_df.csv')
train_df =read_pickle(f'../../external_drive/pandas/train_df.pickle')    
print(time.time()-start)

reading pickle from "../../external_drive/pandas/train_df.pickle" ...
end of reading  11.12 s
11.199097394943237


In [23]:
# Scratch cell: wrap the 'ids' / 'mask' columns of train_df in a CreateDataset2 and pickle it.
train_tlist = CreateDataset2(train_df['ids'],train_df['mask'])
write_pickle(f'../../external_drive/pandas/train_tlist.pickle', train_tlist)

writing pickle to "../../external_drive/pandas/train_tlist.pickle" ...
end of writeing  73.27 s


In [28]:
# Scratch cell: build a shuffled DataLoader over row 20 of train_df.
# Note: `tlist` is also the name of the list of timespans in other cells; this rebinds it.
j = 20
tlist = CreateDataset2(train_df['ids'][j], train_df['mask'][j])
tbatches = DataLoader(tlist, batch_size=128, shuffle=True)

In [32]:
# Scratch cell: print the 'mask' entry of the first batch only (the loop breaks once
# i reaches 2), together with a mask recomputed from it as (== 0).
# The captured output shows integer values rather than booleans under 'mask'.
i=1
for batch in iter(tbatches):
    print(batch['mask'])
    print(batch['mask'].size())
    mask = (batch['mask']==0)
    print(mask)
    i+=1
    if i==2:break

tensor([[  932, 27084,  4634,  ...,     0,     0,     0],
        [  553,    37,   336,  ...,     0,     0,     0],
        [  105,  3392,   277,  ...,     0,     0,     0],
        ...,
        [  585,    15,   219,  ...,     0,     0,     0],
        [   75,    44,    57,  ...,     0,     0,     0],
        [16977,  4355,  1502,  ...,     0,     0,     0]])
torch.Size([128, 128])
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]])
