## Library & Module import

In [1]:
import torch
import torch.nn as nn
from argparse import Namespace
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import re
import string
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Setting environment variable

In [3]:
args = Namespace(
    perc_train = 0.5,
    perc_val = 0.2,
    perc_test = 0.3,
    perc_vocab = 1,
    dataset = "../deep_learning_data/pr_1MB.cstate",
    dataset_csv = "pr_1MB.csv",
    # dataset_csv = "bert_pf_before.csv",
    seed = 1337,
    lr = 5e-4,
    batch_size = 64,
    num_epoch = 200,
    embedding_size = 64,
    encoding_size = 32,
    cut_off = 1,
    max_len = 128,
    cuda = True,
    device = 'cuda'
)

In [4]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)
    
# 재현성을 위해 시드 설정
set_seed_everywhere(args.seed, args.cuda)

## Recreate Train Dataset

In [5]:
input_list = list()
output_list = list()
with open(args.dataset, 'r') as dataset:
    line = dataset.readline().split()
    while line:
        strline = [str(dstr) for dstr in line]
        input_list.append(" ".join(strline[:-1]))
        output_list.append(strline[-1])
        line = dataset.readline().split()

In [6]:
final_data = pd.DataFrame(input_list, columns=['pa'])
final_data['label'] = output_list
final_data

Unnamed: 0,pa,label
0,31854774143 31854774142 31854774141 3185477414...,42318532544
1,42318532591 42318532590 42318532589 4231853258...,48286355124
2,48286355135 48286355134 48286355133 4828635513...,23732197760
3,23732197823 23732197822 23732197821 2373219782...,7432544128
4,7432544179 7432544180 7432544178 7432544177 74...,65249514352
...,...,...
4036778,23732197811 23732197809 23732197806 2373219780...,23732197819
4036779,23732197819 23732197811 23732197809 2373219780...,23732197821
4036780,23732197821 23732197819 23732197811 2373219780...,7432544128
4036781,7432544130 7432544129 7432544128 23732197821 2...,7432544135


In [7]:
final_data.to_csv(args.dataset_csv, index=False)

## Load Dataset

In [8]:
dataset_df = pd.read_csv(args.dataset_csv)

In [9]:
dataset_df

Unnamed: 0,pa,label
0,31854774143 31854774142 31854774141 3185477414...,42318532544
1,42318532591 42318532590 42318532589 4231853258...,48286355124
2,48286355135 48286355134 48286355133 4828635513...,23732197760
3,23732197823 23732197822 23732197821 2373219782...,7432544128
4,7432544179 7432544180 7432544178 7432544177 74...,65249514352
...,...,...
4036778,23732197811 23732197809 23732197806 2373219780...,23732197819
4036779,23732197819 23732197811 23732197809 2373219780...,23732197821
4036780,23732197821 23732197819 23732197811 2373219780...,7432544128
4036781,7432544130 7432544129 7432544128 23732197821 2...,7432544135


In [10]:
dataset_df['pa'] = dataset_df['pa'].shift(periods=2, axis=0)
dataset_df = dataset_df.dropna(axis=0)

In [11]:
dataset_df = dataset_df.iloc[:1600000]
dataset_df

Unnamed: 0,pa,label
2,31854774143 31854774142 31854774141 3185477414...,23732197760
3,42318532591 42318532590 42318532589 4231853258...,7432544128
4,48286355135 48286355134 48286355133 4828635513...,65249514352
5,23732197823 23732197822 23732197821 2373219782...,58858693312
6,7432544179 7432544180 7432544178 7432544177 74...,5016497408
...,...,...
1599997,5016497451 5016497441 5016497433 5016497432 50...,40010374578
1599998,40010374563 5016497451 5016497441 5016497433 5...,17690378394
1599999,40010374570 40010374563 5016497451 5016497441 ...,17690378407
1600000,40010374578 40010374570 40010374563 5016497451...,17690378423


In [12]:
dataset_df['label'] = dataset_df['label'].astype(str)

In [13]:
dataset_df = dataset_df.reset_index(drop=True)
dataset_df

Unnamed: 0,pa,label
0,31854774143 31854774142 31854774141 3185477414...,23732197760
1,42318532591 42318532590 42318532589 4231853258...,7432544128
2,48286355135 48286355134 48286355133 4828635513...,65249514352
3,23732197823 23732197822 23732197821 2373219782...,58858693312
4,7432544179 7432544180 7432544178 7432544177 74...,5016497408
...,...,...
1599995,5016497451 5016497441 5016497433 5016497432 50...,40010374578
1599996,40010374563 5016497451 5016497441 5016497433 5...,17690378394
1599997,40010374570 40010374563 5016497451 5016497441 ...,17690378407
1599998,40010374578 40010374570 40010374563 5016497451...,17690378423


### Datatset / DataLoader / Vectorizer / Vocabulary / Model의 흐름에 대한 구성
순서 : Vocabulary -> Vectorizer -> Dataset -> DataLoader -> Model

Raw Data
- 현 시점 cache에 존재하는 주소 목록 
- ex: 125, 158, 154, 134, 145, 341, 133, 136
- 주소를 숫자가 아닌 문자열로 보아야 함 (physical addr는 숫자의 의미보다 문자열의 의미가 더 강하다고 판단)
- 주소를 하나의 토큰으로

Vocabulary 
- 각 주소(토큰)를 정수로 매핑
- 입력 및 출력의 대상이 되는 주소만을 가짐 (+-10정도 커버?)

** Vectorizer ** 
- 매핑된 토큰을 벡터 형태로 변환

Dataset 
- vectorizer 이용하여 구성

DataLoader
- 미니배치 단위로 데이터셋 가져옴

Model
- 초기값: 첫 주소

## Vocabulary

In [14]:
class PAVocabulary(object):
    def __init__(self, token_to_idx = None, add_unk = True,
                 mask_token = "<MASK>", unk_token = "<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx:token for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._mask_token = mask_token
        self._unk_token = unk_token
        
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(self._unk_token)

    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token}
    
    @classmethod
    def from_serializable(cls, cstates):
        return cls(**cstates)
    
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" %index)
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)>"%len(self)
    
    def __len__(self):
        return len(self._token_to_idx)

## Vectorizer
- max_len에 대한 조건 없음

In [15]:
class PAVectorizer(object):
    def __init__(self, pa_vocab):
        self.pa_vocab = pa_vocab
        # self.max_la_length = max_pa_length
    
    def _vectorize(self, indices):
        vector_length = len(indices)
        vector = np.zeros(vector_length, dtype=np.int64)
        vector[:len(indices)] = indices
        return vector
        
    def _get_pa_indices(self, pa_list):
        # print(pa_list)
        # print(type(pa_list))
        # 벡터로 변환된 physical addr list 반환
        indices = [self.pa_vocab.lookup_token(token) for token in pa_list.split(" ")]
        return indices
    
    def vectorize(self, cstate):
        pa_indices = self._get_pa_indices(cstate)
        pa_vector = self._vectorize(indices=pa_indices)
        return {'pa_vector':pa_vector,
                'pa_length':len(pa_indices)}
        
    @classmethod
    def from_dataframe(cls, cstate_df):
        pa_vocab = PAVocabulary()
        pa_counts = Counter()
        for cstate in cstate_df.pa:
            for pa in cstate.split(" "):
                pa_counts[pa] += 1
        for cstate in cstate_df.label:
            pa_counts[cstate] += 1
        
        for pa, count in pa_counts.items():
            if count >= args.cut_off:
                pa_vocab.add_token(pa)
        print("vectorizer vocab len: ",len(pa_vocab))
        return cls(pa_vocab)
    
    @classmethod
    def from_serializable(cls, cstate_dict):
        pa_vocab = PAVocabulary.from_serializable(cstate_dict['pa_vocab'])
        return cls(pa_vocab)
    
    def to_serializable(self):
        return {'pa_vocab': self.pa_vocab.to_serializable()}

## Dataset

In [16]:
class CstateDataset(Dataset):
    def __init__(self, cstate_df, vectorizer):
        self.cstate_df = cstate_df
        self._vectorizer = vectorizer
        
        self.n_total = len(cstate_df)
        
        self.train_size = int(self.n_total * args.perc_train)
        self.train_df = self.cstate_df.loc[:self.train_size]
        
        self.val_size = int(self.n_total * args.perc_val)
        self.val_df = self.cstate_df.loc[self.train_size : self.train_size+self.val_size]
        
        self.test_size = self.n_total - (self.train_size + self.val_size)
        self.test_df = self.cstate_df.loc[self.train_size+self.val_size:]
        
        self.vocab_df_size = self.n_total
        self.vocab_df = self.cstate_df
        
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.val_size),
                             'test': (self.test_df, self.test_size),
                             'vocab': (self.vocab_df, self.vocab_df_size)}
        
        self.set_split('vocab')
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, cstate_df):
        # cstate_df = pd.read_csv(cstate_csv)
        vocab_cstate_df = cstate_df.loc[:int(len(cstate_df)*args.perc_vocab)]
        print("dataset df len:", len(vocab_cstate_df))
        vectorizer = PAVectorizer.from_dataframe(vocab_cstate_df)
        
        return cls(cstate_df, vectorizer)

    @classmethod
    def load_dataset_and_load_vectorizer(cls, cstate_df, vectorizer_filepath):
        vocab_cstate_df = cstate_df
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(vocab_cstate_df, vectorizer)
    
    @classmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return PAVectorizer.from_serializable(json.load(fp))
        
    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)
            
    def get_vectorizer(self):
        return self._vectorizer
    
    def set_split(self, split='train'):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        pa_vector = self._vectorizer.vectorize(row.pa)
        label_vector = self._vectorizer.vectorize(row.label)
        
        return {'x_data': pa_vector['pa_vector'],
                'y_target': label_vector['pa_vector'],
                'x_data_length': pa_vector['pa_length']}
        
    def get_num_batches(self, batch_size):
        return len(self) // batch_size

## DataLoader

In [17]:
def generate_batches(dataset, batch_size, shuffle=False, drop_last=True, device='cpu'):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict={}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

## Model

In [18]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn_fc = nn.Linear(in_features=hidden_size, out_features=1)
    
    def forward(self, x):
        # print(x.shape)
        x = self.attn_fc(x)
        # print(x.shape)
        x = F.softmax(x, dim=1)
        # print(x.shape)
        return x

In [19]:
class Prefetcher(nn.Module):
    def __init__(self, args, vocab_size):
        super(Prefetcher, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=args.embedding_size)
        self.lstm = nn.LSTM(input_size = args.embedding_size,
                            hidden_size=args.encoding_size,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True,
                            dropout=0.1)
        self.attention = Attention(args.encoding_size*2)
        self.fc2 = nn.Linear(in_features=args.max_len, out_features=vocab_size)
        self.softmax = nn.LogSoftmax()
        
    def forward(self, x):
        # print(x.shape)
        x = self.embedding(x)
        # print(x.shape)
        output, hidden = self.lstm(x)
        # print(output.shape)
        attn_weights = self.attention(output)
        # print(attn_weights.shape)
        attn_output = output * attn_weights
        # print(attn_output.shape)
        attn_output = torch.sum(attn_output, dim=-1)
        # print(attn_output.shape)
        x = self.fc2(attn_output)
        # print(x.shape)
        x = self.softmax(x)
        # print(x.shape)
        x = x.squeeze()
        # print(x.shape)
        return x

In [20]:
# 현재 사용가능한 디바이스로 환경변수 device 재설정
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(args.device)
# 학습에 사용할 데이터셋 파일 가져와서 Dataset 객체 만들기 
dataset = CstateDataset.load_dataset_and_make_vectorizer(dataset_df)
# dataset 객체를 만들면 안에서 vectorizer 객체도 생성되기 때문에 여기서 vectorizer 뽑아낼 수 있음
vectorizer = dataset.get_vectorizer()

cuda:0
dataset df len: 1600000
vectorizer vocab len:  2369


In [21]:
model = Prefetcher(args, len(vectorizer.pa_vocab)).to(args.device)

In [22]:
len(vectorizer.pa_vocab)

2369

In [23]:
# optimizer = optim.Adam(model.parameters(), lr=args.lr)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# 학습률을 매 스텝마다 0.5배로 조정
# 아래 학습의 경우, 에폭마다 0.5배하도록 했음 -> 적은데이터 수로 인해 학습속도가 매우 빨라서 매 에폭마다 조절 필요하다고 판단함
# 어차피 매 에폭마다 학습률 줄이는 방향으로 조절할 것이기 때문에 가장 단순한 StepLR 통해 학습률을 조정하고자 했음
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20, eta_min=0)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)
#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()


In [24]:
# 학습 중 학습데이터 및 검증데이터에 대한 손실값을 출력하기 위한 변수
epoch_train_loss = 0.0
epoch_val_loss = 0.0
# 학습 중 손실값 혹은 metric 적용에 대한 결과값 등을 기록해두기 위한 리스트
logs=[]
# 학습을 시작할 epoch 지정
start_epoch = 0
weight_file = f'models/model_{start_epoch}.pth'
# pretrained model load
if start_epoch!=0:
    pre_weights = torch.load(weight_file, map_location=args.device)
    model.load_state_dict(pre_weights)
    
for epoch in range(start_epoch, args.num_epoch):
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    # train_log_df = pd.DataFrame()
    dataset.set_split('train')
    train_batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    total = dataset.get_num_batches(args.batch_size)
    train_iterator = tqdm(train_batch_generator, desc="Training (X / X Steps) (loss=X.X)", dynamic_ncols=True)
    # for batch_idx, batch_dict in enumerate(train_batch_generator):
    for batch_idx, batch_dict in enumerate(train_iterator):
        # if batch_idx >= 24990:
        #     print(batch_idx, ":", batch_dict['x_data'].shape)
        
        optimizer.zero_grad()
        y_pred = model(batch_dict['x_data'].to(args.device))
        # 확률 분포로 변환 (소프트맥스 함수 사용)
        # probabilities = F.softmax(y_pred, dim=1)
        # loss = criterion(y_pred, batch_dict['y_target'].float().to(args.device))
        # 타겟을 정수형으로 변환
        # y_target = batch_dict['y_target'].to(torch.long).to(args.device)

        # Negative Log Likelihood Loss를 사용하여 손실 계산
        # loss = F.nll_loss(torch.log(probabilities), y_target)
        # print(y_pred.shape)
        # print(torch.argmax(y_pred, dim=1))
        # print(y_pred)
        # print(batch_dict['y_target'].shape)
        # print(batch_dict['y_target'])
        loss = criterion(y_pred, batch_dict['y_target'].squeeze().to(args.device))
        epoch_train_loss += loss.item()
        train_iterator.set_description("Training (%d / %d Steps) (loss=%2.5f)" %(batch_idx, total, loss))
        loss.backward()
        optimizer.step()
    # epoch loss log
    # output_train_df = pd.DataFrame({'epoch':epoch+1, 'loss':epoch_train_loss})
    # train_log_df = pd.concat([train_log_df, output_train_df])
    # train_log_df.to_csv("train_log.csv")
    # scheduler.step()

    # validation
    if(epoch+1) % 1 == 0:
        # val_log_df = pd.DataFrame()
        dataset.set_split('val')
        val_batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        val_iterator = tqdm(val_batch_generator, desc="Validation (X / X Steps) (loss=X.X)", dynamic_ncols=True)
        for batch_idx, batch_dict in enumerate(val_iterator):
            with torch.no_grad():
                output = model(batch_dict['x_data'].to(args.device))
                loss = criterion(output, batch_dict['y_target'].squeeze().to(args.device))
                epoch_val_loss += loss.item()
                val_iterator.set_description("Validation (%d / %d Steps) (loss=%2.5f)" %(batch_idx, total, loss))
        # output_val_df = pd.DataFrame({'epoch':epoch+1, 'loss':epoch_val_loss})
        # val_log_df = pd.concat([val_log_df, output_val_df])
        # val_log_df.to_csv("val_log.csv")
    scheduler.step(epoch_val_loss/dataset.get_num_batches(32))
    # scheduler.step()
    log_epoch = {'epoch':epoch+1, 'train_loss':epoch_train_loss, 'val_loss':epoch_val_loss}
    logs.append(log_epoch)
    log_df = pd.DataFrame(logs)
    log_df.to_csv("log_output.csv")
    
    # model save
    if(epoch+1) % 5 == 0:
        torch.save(model.state_dict(), 'models/model_'+str(epoch+1)+'.pth')
    

  x = self.softmax(x)
Training (1 / 12500 Steps) (loss=7.77438): : 0it [00:00, ?it/s]

Training (12499 / 12500 Steps) (loss=6.29466): : 12500it [12:27, 16.72it/s]
Validation (4999 / 12500 Steps) (loss=5.51433): : 5000it [02:39, 31.29it/s]
Training (12499 / 12500 Steps) (loss=5.10047): : 12500it [12:24, 16.79it/s]
Validation (4999 / 12500 Steps) (loss=5.17477): : 5000it [02:40, 31.24it/s]
Training (12499 / 12500 Steps) (loss=4.49891): : 12500it [12:19, 16.90it/s]
Validation (4999 / 12500 Steps) (loss=5.14660): : 5000it [02:40, 31.06it/s]
Training (12499 / 12500 Steps) (loss=4.21748): : 12500it [12:21, 16.87it/s]
Validation (4999 / 12500 Steps) (loss=5.06553): : 5000it [02:40, 31.11it/s]
Training (12499 / 12500 Steps) (loss=4.15906): : 12500it [12:22, 16.83it/s]
Validation (4999 / 12500 Steps) (loss=5.01620): : 5000it [02:40, 31.07it/s]
Training (12499 / 12500 Steps) (loss=4.13451): : 12500it [12:27, 16.72it/s]
Validation (4999 / 12500 Steps) (loss=4.97807): : 5000it [02:41, 30.94it/s]
Training (12499 / 12500 Steps) (loss=4.05425): : 12500it [12:24, 16.79it/s]
Validation (

KeyboardInterrupt: 

## Test

In [25]:
model.load_state_dict(torch.load('models/model_20.pth'))
model = model.to(args.device)
result_df = pd.DataFrame()
dataset.set_split('test')
test_batch_generator = generate_batches(dataset, batch_size=1, device=args.device)
test_iterator = tqdm(test_batch_generator, desc="Test (X / X Steps)", dynamic_ncols=True)
test_total = dataset.get_num_batches(1)
correct = 0
with torch.no_grad():
    # for _, data in enumerate(test_batch_generator):
    for idx, data in enumerate(test_iterator):
        output = model(data['x_data'].to(args.device))
        topk_vals, topk_indices = torch.topk(output, 20)
        toplist = topk_indices.detach().cpu().tolist()
        target = data['y_target'].squeeze().cpu().detach().numpy()
        output_df = pd.DataFrame({'pred':[toplist], 'target':target})
        result_df = pd.concat([result_df, output_df])
        if target in toplist:
            correct+=1
        test_iterator.set_description("Test (%d / %d Steps)" %(idx, test_total))
print(correct)
print(correct/test_total*100)
result_df.to_csv('test_result.csv', index=False)

  x = self.softmax(x)
Test (0 / 480000 Steps): : 0it [00:00, ?it/s]

Test (479999 / 480000 Steps): : 480000it [2:35:27, 51.46it/s]


203312
42.35666666666666


In [None]:
topk_vals.detach().cpu()

tensor([-3.2905, -3.5925, -3.7852, -3.7986, -3.8506, -3.8742, -4.0041, -4.0068,
        -4.0105, -4.0223])

In [None]:
topk_indices

tensor([6231, 6180, 6182, 6444, 6103, 5933, 6216, 6226, 6262, 6081],
       device='cuda:0')

In [26]:
cor_10 = 0
for idx, row in result_df.iterrows():
    if row.target in row.pred[:10]:
        cor_10+=1
    

In [27]:
cor_10 / len(result_df) * 100

25.130000000000003