In [1]:
# Project lists for every dataset version, keyed by version name.
# Earlier revisions of this cell assigned `proj_list` three times in a row, so
# the 'version2' and 'version3' lists were immediately overwritten (dead
# stores) and the list/version pairing lived only in comments. Keeping them in
# one table makes the pairing explicit and lets `version` be the single switch.
PROJ_LISTS_BY_VERSION = {
    'version2': [
        'boringssl', 'c-ares',
        'freetype2', 'guetzli',
        'harfbuzz', 'lcms',
        'libarchive', 'libpng',
        'libssh', 'libxml2',
        'pcre2', 'proj4',
        're2', 'sqlite3',
        'vorbis', 'woff2',
        'wpantund'
    ],
    'version3': [
        'boringssl', 'c-ares',
        'freetype2', 'guetzli',
        'harfbuzz', 'lcms',
        'libpng', 'libssh',
        'libxml2', 'pcre2',
        'proj4', 're2',
        'sqlite3', 'vorbis',
        'woff2', 'wpantund'
    ],
    'version4': [
        'total_aspell', 'total_boringssl', 'total_c-ares', 'total_exiv2',
        'total_freetype2', 'total_grok', 'total_guetzli', 'total_harfbuzz',
        'total_lcms', 'total_libarchive', 'total_libexif', 'total_libhtp',
        'total_libpng', 'total_libsndfile', 'total_libssh', 'total_libxml2',
        'total_ndpi', 'total_openthread', 'total_pcre2', 'total_proj4',
        'total_re2', 'total_sqlite3', 'total_usrsctp', 'total_vorbis',
        'total_woff2', 'total_wpantund', 'total_yara', 'total_zstd'
    ],
}

# Dataset version used by the rest of the notebook (passed to data.getTrainData).
version = 'version4'

# Copy the selected list so later mutation of `proj_list` cannot alter the table.
proj_list = list(PROJ_LISTS_BY_VERSION[version])

In [2]:
# Index into `proj_list` of the target project; `proj_list[target_project]` is
# what gets passed to data.getTrainData as the target (0 selects the first entry).
target_project = 0

In [3]:
from sklearn.model_selection import train_test_split
import torch
import torch, gc

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import timeit

import data
import data_loader as dl
import initializer as init
import trainer
import tester
# import predictor
import model_util as mu

import os

In [4]:
# Configure the CUDA-related environment variables up front, before any
# torch.cuda call is made in this cell.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (it is
# documented as disabling asynchronous kernel launches) — confirm it is meant to
# stay on for full training runs.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Run Python garbage collection first, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

# Uncomment to inspect GPU memory usage:
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [5]:
# get all data except target project
# Loads four arrays via the project-local `data` module, passing the full
# project list, the target project's name and the dataset version.
# NOTE(review): the captured log for this cell includes
# 'Getting data for "total_aspell" from "total_aspell"', i.e. the target project
# itself appears among the sources — confirm whether getTrainData really
# excludes it as the comment above says.
# `label_len_np` is not referenced again in this notebook.
prefix_np, postfix_np, label_np, label_len_np = data.getTrainData(proj_list, proj_list[target_project], version)

Getting data for "total_aspell" from "total_aspell"
Getting data for "total_aspell" from "total_boringssl"
Getting data for "total_aspell" from "total_c-ares"
Getting data for "total_aspell" from "total_exiv2"
Getting data for "total_aspell" from "total_freetype2"
Getting data for "total_aspell" from "total_grok"
Getting data for "total_aspell" from "total_guetzli"
Getting data for "total_aspell" from "total_harfbuzz"
Getting data for "total_aspell" from "total_lcms"
Getting data for "total_aspell" from "total_libarchive"
Getting data for "total_aspell" from "total_libexif"
Getting data for "total_aspell" from "total_libhtp"
Getting data for "total_aspell" from "total_libpng"
Getting data for "total_aspell" from "total_libsndfile"
Getting data for "total_aspell" from "total_libssh"
Getting data for "total_aspell" from "total_libxml2"
Getting data for "total_aspell" from "total_ndpi"
Getting data for "total_aspell" from "total_openthread"
Getting data for "total_aspell" from "total_pcre

In [6]:
# get target project data
# test_prefix, test_postfix, test_label, test_label_len = data.getTestData(proj_list[target_project], version)

In [7]:
# Both splits hold out 20% of their input and use the same fixed seed, so the
# overall proportions are 64% train / 16% validation / 20% test.
split_options = {'test_size': 0.2, 'random_state': 43}

# Step 1: carve the test set out of the full data.
(
    train_prefix, test_prefix,
    train_postfix, test_postfix,
    train_label, test_label,
) = train_test_split(prefix_np, postfix_np, label_np, **split_options)

# Step 2: carve the validation set out of what remains; the train_* names are
# rebound to the smaller, final training portion.
(
    train_prefix, val_prefix,
    train_postfix, val_postfix,
    train_label, val_label,
) = train_test_split(train_prefix, train_postfix, train_label, **split_options)

In [8]:
# Report the number of samples in each split (test, train, validation order).
for split_heading, split_labels in (
    ('test: ', test_label),
    ('train: ', train_label),
    ('validation: ', val_label),
):
    print(split_heading, len(split_labels))

test:  410806
train:  1314579
validation:  328645


In [9]:
# Build the train / validation / test loaders in one call to the project-local
# helper. Arguments are positional: prefix/postfix pairs for each split, then
# the three label arrays; batches hold 1000 samples.
loaders = dl.data_loader(
    train_prefix, train_postfix,
    val_prefix, val_postfix,
    test_prefix, test_postfix,
    train_label, val_label, test_label,
    batch_size=1000
)
train_dataloader, val_dataloader, test_dataloader = loaders

In [10]:
# Experiment label: used in the TensorBoard log directory, as the prefix of the
# run `title`, as an argument to mu.saveModel / mu.getModel, and as the name of
# the stats file under ../stat/.
# NOTE(review): this is 'version3' while the data `version` chosen in the first
# cell is 'version4' — confirm the mismatch is intentional.
overall_title = 'version3'

In [11]:
# TensorBoard events for this experiment go under ../tensorboard/<overall_title>/tests.
tensorboard_dir = '../tensorboard/' + overall_title + '/tests'
writer = SummaryWriter(tensorboard_dir)

# Wall-clock stamp of when this cell ran, formatted as YYYYmmdd_HHMMSS.
# NOTE(review): `timestamp` is not used anywhere else in this notebook, so every
# run logs into the same directory — confirm whether it was meant to be part of
# the log path.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

In [12]:
# Choose the compute device: fall back to the CPU when CUDA is unavailable,
# otherwise use the GPU and report how many devices are visible and the name
# of device 0.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print(f'There are {gpu_count} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3070


In [13]:
# ====================
# set parameters here
# ====================

# Run identity: `title` names this run within the `overall_title` experiment and
# is passed to trainer.train and mu.saveModel / mu.getModel further down.
title = overall_title + '_01'
epochs = 40

# Dataset metadata from the project-local `data` module.
# `source_code_tokens` is not referenced again in this notebook.
max_len, source_code_tokens, token_choices = data.getInfo()

# Optimizer settings.
learning_rate = 0.001
weight_decay = 0.0

# Input and output sizes are both one more than the largest value in
# `token_choices`; compute it once instead of repeating the expression.
# (presumably token ids start at 0, making this the vocabulary size — confirm)
vocab_size = max(token_choices) + 1

# Model architecture settings, all forwarded to init.initialize_model.
embed_dim = 100
hidden_size = 200
n_layers = 1
output_size = vocab_size
dropout = 0.0
max_length = max_len
input_size = vocab_size
# `device` is used as set by the device-selection cell; the former no-op
# self-assignment `device = device` has been removed.

# Descriptive labels; they are not passed to init.initialize_model in this notebook.
model_name = "seq2seq"
optim_name = "Adam"
loss_fn_name = "CEL"

# Forwarded to trainer.train.
teacher_forcing_ratio = 0.75

In [14]:
# Seed through the project-local trainer helper before the model is built.
trainer.set_seed(42)

# Gather every hyperparameter from the parameter cell into one mapping and hand
# it to the initializer, which returns the model, loss function and optimizer.
model_config = dict(
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    embed_dim=embed_dim,
    hidden_size=hidden_size,
    n_layers=n_layers,
    output_size=output_size,
    dropout=dropout,
    max_length=max_length,
    input_size=input_size,
    device=device,
)
model, loss_fn, optimizer = init.initialize_model(**model_config)

In [15]:
# Everything trainer.train needs for this run, collected in one place.
train_kwargs = dict(
    epochs=epochs,
    title=title,
    writer=writer,
    teacher_forcing_ratio=teacher_forcing_ratio,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
)

# Time the whole training call.
start_time = timeit.default_timer()
trainer.train(**train_kwargs)
elapsed_seconds = timeit.default_timer() - start_time

# NOTE: despite its name, `end_time` is the elapsed training time in minutes;
# the stats-file cell writes it out with a "min" suffix.
end_time = elapsed_seconds / 60.0

Start training...

 Epoch  |  Train Loss  | Train Acc  | Val Loss | Val Acc | Elapsed
--------------------------------------------------------------------------------
   1    |   0.349649   | 91.386621  | 0.345125 | 91.43  | 420.67
   2    |   0.194348   | 94.687268  | 0.286111 | 92.45  | 410.60
   3    |   0.165480   | 95.430662  | 0.271827 | 93.12  | 415.86
   4    |   0.154132   | 95.772078  | 0.262359 | 93.56  | 414.74
   5    |   0.142117   | 96.104193  | 0.264979 | 93.68  | 408.07
   6    |   0.136009   | 96.290837  | 0.244227 | 93.95  | 413.36
   7    |   0.127757   | 96.508432  | 0.247498 | 94.15  | 406.61
   8    |   0.125461   | 96.603577  | 0.226503 | 94.35  | 412.71
   9    |   0.117911   | 96.803037  | 0.220454 | 94.44  | 407.17
  10    |   0.112779   | 96.955959  | 0.207217 | 94.65  | 410.50
  11    |   0.108809   | 97.076537  | 0.216272 | 94.83  | 412.44
  12    |   0.103575   | 97.224970  | 0.224589 | 94.94  | 408.99
  13    |   0.100172   | 97.323037  | 0.202202 | 94.9

In [16]:
# Persist the trained model through the project-local model_util helper,
# identified by the experiment label and the run title.
mu.saveModel(overall_title, title, model)

In [17]:
# Rebind `model` to whatever mu.getModel returns for the same experiment label
# and run title that were just passed to mu.saveModel, then print its structure.
# (presumably this reloads the saved model from disk — confirm in model_util)
model = mu.getModel(overall_title, title)
print(model)

MySeq2Seq(
  (prefixEncoder): Encoder(
    (embedding): Embedding(155, 100)
    (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
    (hidden_fc): Linear(in_features=200, out_features=100, bias=True)
    (cell_fc): Linear(in_features=200, out_features=100, bias=True)
    (dp): Dropout(p=0.0, inplace=False)
  )
  (postfixEncoder): Encoder(
    (embedding): Embedding(155, 100)
    (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
    (hidden_fc): Linear(in_features=200, out_features=100, bias=True)
    (cell_fc): Linear(in_features=200, out_features=100, bias=True)
    (dp): Dropout(p=0.0, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(155, 100)
    (lstm): LSTM(100, 200, batch_first=True, bidirectional=True)
  )
  (attn): Attention(
    (fc): Linear(in_features=800, out_features=155, bias=True)
    (dp): Dropout(p=0.0, inplace=False)
  )
)


In [18]:
# Evaluate the reloaded model on the held-out test loader with the same loss
# function used in training; the helper returns a (loss, accuracy) pair.
test_metrics = tester.test(
    test_dataloader=test_dataloader,
    model=model,
    loss_fn=loss_fn,
    device=device,
)
loss, acc = test_metrics

test loss:  0.19950869256846335
test acc:  95.99090243902438


In [19]:
# Build one tab-separated summary line for this run: title, test loss, test
# accuracy and training time in minutes (rounded to 3 decimals).
text = ''.join([
    title,
    '\t |\tloss: ', str(loss),
    '\t |\tacc: ', str(acc),
    '\t |\t time: ', str(round(end_time, 3)),
    ' min\n',
])

# Append (never overwrite) so the file accumulates one line per run.
with open('../stat/' + overall_title, 'a') as stat_file:
    stat_file.write(text)

In [20]:
# Upload the model graph to TensorBoard through the project-local helper.
# This is the last use of `writer` in the notebook, so close it afterwards:
# the SummaryWriter was never closed or flushed before, which can leave
# buffered events unwritten while the kernel stays alive. The close happens in
# `finally` so it also runs if graphModel raises.
try:
    mu.graphModel(train_dataloader, model, writer, device)
finally:
    writer.close()

uploaded model graph to tensorboard!
