In [22]:
proj_list = [
    'boringssl_total', 'c-ares_total',
    'freetype2_total', 'guetzli_total',
    'harfbuzz_total', 'libpng_total',
    'libssh_total', 'libxml2_total',
    'pcre_total', 'proj4_total',
    're2_total', 'sqlite3_total',
    'total', 'vorbis_total',
    'woff2_total', 'wpantund_total'
]

In [23]:
target_project = 0

In [24]:
from sklearn.model_selection import train_test_split
import torch
import torch, gc

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import timeit

import data
import data_loader as dl
import initializer as init
import trainer
import tester
import predictor
import model_util as mu
import pretrained_model as pm

import os

In [25]:
gc.collect()
torch.cuda.empty_cache()

print(torch.cuda.memory_summary(device=None, abbreviated=False))

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   33272 KB |    3171 MB |  151500 GB |  151500 GB |
|       from large pool |   18285 KB |    3152 MB |  151011 GB |  151011 GB |
|       from small pool |   14987 KB |      21 MB |     488 GB |     488 GB |
|---------------------------------------------------------------------------|
| Active memory         |   33272 KB |    3171 MB |  151500 GB |  151500 GB |
|       from large pool |   18285 KB |    3152 MB |  151011 GB |  151011 GB |
|       from small pool |   14987 KB |      21 MB |     488 GB |     488 GB |
|---------------------------------------------------------------

In [26]:
prefix_np, postfix_np, label_np = data.getSingleProjectData(proj_list, proj_list[target_project])

Getting data for "boringssl_total" from "c-ares_total"
Getting data for "boringssl_total" from "freetype2_total"
Getting data for "boringssl_total" from "guetzli_total"
Getting data for "boringssl_total" from "harfbuzz_total"
Getting data for "boringssl_total" from "libpng_total"
Getting data for "boringssl_total" from "libssh_total"
Getting data for "boringssl_total" from "libxml2_total"
Getting data for "boringssl_total" from "pcre_total"
Getting data for "boringssl_total" from "proj4_total"
Getting data for "boringssl_total" from "re2_total"
Getting data for "boringssl_total" from "sqlite3_total"
Getting data for "boringssl_total" from "vorbis_total"
Getting data for "boringssl_total" from "woff2_total"
Getting data for "boringssl_total" from "wpantund_total"


In [27]:
test_prefix, test_postfix, test_label = data.getTestData(proj_list[target_project])

In [28]:
train_prefix, val_prefix, train_postfix, val_postfix, train_label, val_label = train_test_split(
    prefix_np, postfix_np, label_np, test_size = 0.2, random_state = 43
)

# train_prefix, val_prefix, train_postfix, val_postfix, train_label, val_label = train_test_split(
#     train_prefix, train_postfix, train_label, test_size = 0.2, random_state = 43
# )

In [29]:
train_dataloader, val_dataloader, test_dataloader =\
    dl.data_loader(
        train_prefix, train_postfix,
        val_prefix, val_postfix,
        test_prefix, test_postfix,
        train_label, val_label, test_label,
        batch_size=1000
    )

In [30]:
# PyTorch TensorBoard support
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('../tensorboard/real_OVF_9/tests')

In [31]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3070


In [32]:
# ====================
# set parameters here
# ====================

title = proj_list[target_project] + '9_real_OVF_8'
epochs = 40

embed_dim = 128
max_len, source_code_tokens, token_choices = data.getInfo()
pretrained_token2vec = pm.load_pretrained_model(source_code_tokens, embed_dim)
pretrained_token2vec = torch.tensor(pretrained_token2vec)


input_size = max_len
hidden_size = 200
num_classes = max(token_choices) + 1
rnn_layers = 2

num_filters = [100, 200, 100]
kernel_sizes = [15, 21, 114]

dropout = 0.2

learning_rate = 0.001
# weight_decay = 1e-4
weight_decay = 0

model_name = "RNN"
optim_name = "Adam"
loss_fn_name = "CEL"

pretrained_model = pretrained_token2vec
freeze_embedding = False,

In [33]:
trainer.set_seed(42)

model, optimizer, loss_fn = init.initialize_model(
    vocab_size=input_size,
    embed_dim=embed_dim,
    hidden_size=hidden_size,
    num_classes=num_classes,
    rnn_layers=rnn_layers,
    num_filters=num_filters,
    kernel_sizes=kernel_sizes,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    model_name=model_name,
    optim_name=optim_name,
    loss_fn_name=loss_fn_name,
    pretrained_model=pretrained_model,
    freeze_embedding=freeze_embedding,
    device=device,
)

print(model)

doing with pretrained model!!!
C_rnn(
  (emb): Embedding(213, 128)
  (lstm1): RNN(
    (rnn): LSTM(128, 200, num_layers=2, batch_first=True, bidirectional=True)
  )
  (lstm2): RNN(
    (rnn): LSTM(128, 200, num_layers=2, batch_first=True, bidirectional=True)
  )
  (fc1): Linear(in_features=800, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=213, bias=True)
  (dp1): Dropout(p=0.0, inplace=False)
  (dp2): Dropout(p=0.0, inplace=False)
)


In [34]:
start_time = timeit.default_timer()

trainer.train(
    epochs=epochs,
    title=title,
    writer=writer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn
)

end_time = (timeit.default_timer() - start_time) / 60.0

Start training...

 Epoch  |  Train Loss  | Train Acc  | Val Loss | Val Acc | Elapsed
--------------------------------------------------------------------------------
   1    |   0.506723   | 84.736997  | 0.237084 | 91.85  | 503.27
   2    |   0.204268   | 92.851030  | 0.183072 | 93.52  | 513.30
   3    |   0.165076   | 94.085280  | 0.159514 | 94.30  | 522.99
   4    |   0.141381   | 94.907802  | 0.143373 | 94.91  | 509.56
   5    |   0.123777   | 95.524632  | 0.134381 | 95.28  | 517.07
   6    |   0.110012   | 95.993474  | 0.127751 | 95.51  | 516.75
   7    |   0.098260   | 96.414377  | 0.123562 | 95.75  | 513.31
   8    |   0.088032   | 96.776055  | 0.123096 | 95.81  | 507.74
   9    |   0.078231   | 97.129342  | 0.123421 | 95.89  | 502.37
  10    |   0.070107   | 97.406869  | 0.123893 | 96.08  | 503.71
  11    |   0.062621   | 97.688224  | 0.125557 | 96.07  | 503.76
  12    |   0.055872   | 97.927870  | 0.128080 | 96.17  | 501.62
  13    |   0.050022   | 98.141217  | 0.132010 | 96.2

In [35]:
mu.saveModel(title, model)

In [36]:
model = mu.getModel(title)
print(model)

C_rnn(
  (emb): Embedding(213, 128)
  (lstm1): RNN(
    (rnn): LSTM(128, 200, num_layers=2, batch_first=True, bidirectional=True)
  )
  (lstm2): RNN(
    (rnn): LSTM(128, 200, num_layers=2, batch_first=True, bidirectional=True)
  )
  (fc1): Linear(in_features=800, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=213, bias=True)
  (dp1): Dropout(p=0.0, inplace=False)
  (dp2): Dropout(p=0.0, inplace=False)
)


In [37]:
loss, acc = tester.test(test_dataloader=test_dataloader,
                        device=device,
                        model=model,
                        title=title)

test loss:  5.2233346480203755
test acc:  77.2922480620155


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


saved precision and recall results to file!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
with open('../result/real_OVF_9', 'a') as f:
        text = title + '\t |\tloss: ' + str(loss) + '\t |\tacc: ' + str(acc) + '\t |\t time: ' + str(round(end_time, 3)) + ' min\n'
        f.write(text)

In [39]:
mu.graphModel(train_dataloader, model, writer)

uploaded model graph to tensorboard!


In [40]:
# prefix =[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# postfix = [2, 91, 2, 56, 2, 106, 47, 2, 134, 128, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 91, 2, 119, 91, 2, 56, 2, 106, 47, 2, 119, 128, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 91, 2, 119, 91, 2, 56, 2, 106, 47, 2, 119, 128, 50, 88, 33, 124, 49, 48, 134, 47, 2, 119]
# label_type = 128

In [41]:
# prefix =[0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 119, 2, 47, 134, 48, 49, 124, 33, 88, 50, 128, 119, 2, 47, 106, 2, 56, 2, 91, 119, 2, 91, 2, 56, 2, 48, 49, 2, 47, 48, 88, 50, 128, 119, 2, 47, 106, 2, 56, 2, 91, 119, 2, 91, 2, 56, 2, 48, 49, 2, 47, 48, 88, 50]
# postfix = [48, 2, 56, 2, 106, 91, 2, 56, 2, 91, 2, 56, 2, 106, 47, 2, 134, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 106, 91, 2, 56, 2, 91, 2, 56, 2, 106, 47, 2, 134, 128, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 106, 91, 2, 56, 2, 91, 2, 56, 2, 106, 47, 2, 134]
# label_type = 128

In [42]:
# predictor.predict(prefix, postfix, model)