In [1]:
# Dataset identifiers handed to data.getSingleProjectData; the first entry
# is the one selected by the loading cell.
proj_list = [
    'boringssl_total',
    'c-ares_total',
    'freetype2_total',
    'guetzli_total',
    'harfbuzz_total',
    'libpng_total',
    'libssh_total',
    'libxml2_total',
    'pcre_total',
    'proj4_total',
    're2_total',
    'sqlite3_total',
    'total',
    'vorbis_total',
    'woff2_total',
    'wpantund_total',
]

In [2]:
from sklearn.model_selection import train_test_split
import torch

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import data
import data_loader as dl
import initializer as init
import trainer
import tester
import predictor
import model_util as mu
import pretrained_model as pm

import os
# CUDA runtime configuration for this kernel: request blocking kernel
# launches and expose only device "0".
# NOTE(review): these are set after `import torch`; presumably fine as long
# as no CUDA call has been made yet -- confirm.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [None]:
# Fetch the prefix / postfix / label arrays for the first project listed
# in `proj_list`.
(prefix_np, postfix_np, label_np) = data.getSingleProjectData(
    proj_list,
    proj_list[0],
)

In [None]:
# Two-stage hold-out split with a fixed seed so the partitions are repeatable.
# Stage 1: reserve 10% of all samples as the test set.
(
    train_prefix, test_prefix,
    train_postfix, test_postfix,
    train_label, test_label,
) = train_test_split(
    prefix_np, postfix_np, label_np,
    test_size=0.1,
    random_state=43,
)

# Stage 2: reserve 10% of the remaining samples for validation (about 9% of
# the full data), leaving roughly 81% for training.
(
    train_prefix, val_prefix,
    train_postfix, val_postfix,
    train_label, val_label,
) = train_test_split(
    train_prefix, train_postfix, train_label,
    test_size=0.1,
    random_state=43,
)

In [None]:
# Build one loader per split. The arguments are positional: the prefix/postfix
# pair of each split first (train, val, test), then the three label arrays.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(
    train_prefix, train_postfix,
    val_prefix, val_postfix,
    test_prefix, test_postfix,
    train_label, val_label, test_label,
)

In [None]:
# TensorBoard logging setup.
# NOTE(review): `timestamp` is computed here but the writer below logs to a
# fixed directory, so every run shares the same log location.
timestamp = f'{datetime.now():%Y%m%d_%H%M%S}'
writer = SummaryWriter(log_dir='../tensorboard/dev/tests')

In [3]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise CPU,
# and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3070


In [4]:
# ====================
# set parameters here
# ====================

# Run identity and schedule; `title` is also used when saving and testing
# the model.
title = 'dev-test1'
epochs = 10

# Token embeddings: load pretrained vectors of width `embed_dim` for the
# source-code tokens reported by data.getInfo(), then convert to a tensor.
embed_dim = 50
max_len, source_code_tokens, token_choices = data.getInfo()
pretrained_token2vec = pm.load_pretrained_model(source_code_tokens, embed_dim)
pretrained_token2vec = torch.tensor(pretrained_token2vec)


# NOTE(review): `input_size` is taken from `max_len`, yet the model-building
# cell passes it as `vocab_size` -- confirm that this is intended.
input_size = max_len
hidden_size = 30
# Class ids are assumed to be 0-based, hence the +1 -- TODO confirm.
num_classes = max(token_choices) + 1
rnn_layers = 1

# Convolution settings; forwarded to init.initialize_model together with the
# recurrent settings above.
num_filters = [100, 200, 100]
kernel_sizes = [15, 21, 114]

dropout = 0.0

# Optimizer settings.
learning_rate = 0.001
weight_decay = 1e-4

# String selectors interpreted by init.initialize_model.
model_name = "RNN"
optim_name = "Adam"
loss_fn_name = "CEL"

pretrained_model = pretrained_token2vec
# Must be a plain bool. With a trailing comma this would be the one-element
# tuple (False,), which is truthy and would read as "freeze" in any
# truthiness check.
freeze_embedding = False

In [None]:
# Seed first so that model construction is repeatable.
trainer.set_seed(42)

# Build the model together with its optimizer and loss function from the
# values defined in the parameter cell.
model, optimizer, loss_fn = init.initialize_model(
    # component selection
    model_name=model_name,
    optim_name=optim_name,
    loss_fn_name=loss_fn_name,
    # embedding
    vocab_size=input_size,
    embed_dim=embed_dim,
    pretrained_model=pretrained_model,
    freeze_embedding=freeze_embedding,
    # network shape
    hidden_size=hidden_size,
    num_classes=num_classes,
    rnn_layers=rnn_layers,
    num_filters=num_filters,
    kernel_sizes=kernel_sizes,
    dropout=dropout,
    # optimisation
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    # placement
    device=device,
)

print(model)

In [None]:
# Run training for `epochs` epochs on the train/validation loaders, passing
# the TensorBoard writer and the run title through to the trainer.
trainer.train(
    title=title,
    epochs=epochs,
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    writer=writer,
)

In [None]:
# Store the current model under the run title.
mu.saveModel(
    title,
    model,
)

In [5]:
# Load the model stored under the configured run title. Using `title` rather
# than repeating the literal keeps this cell in step with the save and test
# cells when the title is changed in the parameter cell.
model = mu.getModel(title)
print(model)

C_rnn(
  (emb): Embedding(213, 50)
  (lstm1): RNN(
    (rnn): LSTM(50, 30, batch_first=True, bidirectional=True)
  )
  (lstm2): RNN(
    (rnn): LSTM(50, 30, batch_first=True, bidirectional=True)
  )
  (fc1): Linear(in_features=120, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=213, bias=True)
  (dp1): Dropout(p=0.0, inplace=False)
  (dp2): Dropout(p=0.0, inplace=False)
)


In [None]:
# Evaluate the current model on the held-out test loader.
tester.test(
    title=title,
    model=model,
    test_dataloader=test_dataloader,
    device=device,
)

In [None]:
# Hand the model, the training loader and the TensorBoard writer to the
# graphing helper (presumably to log the network graph -- confirm in model_util).
mu.graphModel(
    train_dataloader,
    model,
    writer,
)

In [6]:
# Hand-written sample: an all-zero prefix (presumably padding only -- confirm)
# paired with a token-id postfix; `label_type` records the expected class id.
# NOTE(review): the later sample cell assigns the same three names, so these
# values are replaced if both cells are run in order.
prefix =[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
postfix = [2, 91, 2, 56, 2, 106, 47, 2, 134, 128, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 91, 2, 119, 91, 2, 56, 2, 106, 47, 2, 119, 128, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 91, 2, 119, 91, 2, 56, 2, 106, 47, 2, 119, 128, 50, 88, 33, 124, 49, 48, 134, 47, 2, 119]
label_type = 128

In [8]:
# Hand-written sample: a prefix that starts with zeros (presumably left
# padding -- confirm) followed by token ids, plus a token-id postfix.
# `label_type` records the expected class id; it is not passed to the
# prediction call and serves only as a reference for reading its output.
prefix =[0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 119, 2, 47, 134, 48, 49, 124, 33, 88, 50, 128, 119, 2, 47, 106, 2, 56, 2, 91, 119, 2, 91, 2, 56, 2, 48, 49, 2, 47, 48, 88, 50, 128, 119, 2, 47, 106, 2, 56, 2, 91, 119, 2, 91, 2, 56, 2, 48, 49, 2, 47, 48, 88, 50]
postfix = [48, 2, 56, 2, 106, 91, 2, 56, 2, 91, 2, 56, 2, 106, 47, 2, 134, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 106, 91, 2, 56, 2, 91, 2, 56, 2, 106, 47, 2, 134, 128, 50, 88, 48, 47, 2, 49, 48, 2, 56, 2, 106, 91, 2, 56, 2, 91, 2, 56, 2, 106, 47, 2, 134]
label_type = 128

In [9]:
# Run the model on whichever sample cell was executed last; compare the
# output against `label_type` by eye.
predictor.predict(
    prefix,
    postfix,
    model,
)

#1) 128: 82.90623474121094%
#2) 50: 13.419479370117188%
#3) 118: 0.6971883177757263%
#4) 88: 0.46470627188682556%
#5) 131: 0.32658690214157104%
#6) 212: 0.22190125286579132%
#7) 133: 0.22031375765800476%
#8) 134: 0.17750635743141174%
#9) 113: 0.16408100724220276%
#10) 115: 0.15977948904037476%
