In [1]:
from datetime import datetime

import torch
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was picked so the captured output documents the hardware used.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project's `data` module; get_df() returns three values.
# `df` must expose `sourceCode` and `classLabel` (both are read in the next cell).
# NOTE(review): size_info / size_dict are not used in the visible cells — presumably
# dataset size statistics; confirm against data.get_df.
df, size_info, size_dict = data.get_df()

In [4]:
# Extract the model inputs (source code) and targets (class labels) from `df`
# via `.values`, bound together so the pair is always taken from the same frame.
sourceCode_np, codeClass_np = df.sourceCode.values, df.classLabel.values

In [5]:
# Tokenize, build vocabulary, encode tokens.
# utils.tokenize returns the tokenized sources, the `ch2idx` vocabulary mapping and
# `max_len`; utils.encode then turns the tokens into `input_ids` using that mapping
# and length. len(ch2idx) is later passed to the model as vocab_size.
# NOTE(review): presumably sequences are padded/truncated to max_len — confirm in utils.encode.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [6]:
# Hand the vocabulary mapping to the info_recorder module.
# NOTE(review): presumably this persists ch2idx so the same encoding can be reused
# outside this notebook — confirm what record_ch2idx writes and where.
ir.record_ch2idx(ch2idx)

In [7]:
# Encode the class labels. The helper returns three values: the encoded labels
# (used as the targets in the train/validation split), the `class2idx` mapping, and
# `num_classes`.
# NOTE(review): the training cells pass len(class2idx) rather than num_classes —
# presumably the two are equal; confirm in utils.tokenize_encode_class.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [8]:
# Hold out 20% of the encoded samples (and their labels) for validation.
# random_state is fixed so the split is identical on every run.
# train_test_split is imported in the notebook's top import cell.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [9]:
# Load data to PyTorch DataLoader.
# The train and validation splits are passed to the project's data_loader helper,
# which returns one loader per split; batch_size=50 is passed as a keyword argument.
# NOTE(review): shuffling/sampling behaviour is decided inside dl.data_loader — confirm there.
train_dataloader, val_dataloader = dl.data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)

In [10]:
# PyTorch TensorBoard support.
# SummaryWriter and datetime are imported in the notebook's top import cell.
# One writer is created here and passed to every tn.train call below, each of which
# also receives its own run label (e.g. 'EMB_test1').

# NOTE(review): `timestamp` is not referenced by any visible cell and the log directory
# below is fixed, so repeated notebook runs write into the same 'rnnEmbLay/tests'
# directory. Kept as-is in case later cells use it — confirm before removing.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('rnnEmbLay/tests')

In [11]:
# RNN run with embed_dim=20, logged under the label 'EMB_test1'.
# NOTE(review): this cell used to be headed "CNN-rand", but modelType is "RNN".
# The variable name `cnn_rand` is kept because other cells may reference it.
tn.set_seed(42)  # reseed before building the model so each run starts from the same RNG state

# Hyperparameters for this run, gathered in one place and unpacked into the initializer.
rnn_hparams = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=20,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.2,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
cnn_rand, optimizer = init.initilize_model(**rnn_hparams)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    'EMB_test1',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.975701   |  2.839595  |   11.93   |   99.74  
   2    |   2.713847   |  2.507514  |   21.07   |  100.65  
   3    |   2.420895   |  2.288824  |   28.70   |  100.93  
   4    |   2.210128   |  2.048845  |   35.71   |  101.19  
   5    |   1.954627   |  1.778286  |   45.06   |  101.27  
   6    |   1.663059   |  1.502168  |   54.57   |  101.28  
   7    |   1.400794   |  1.252069  |   62.84   |  101.27  
   8    |   1.207395   |  1.102171  |   67.40   |  101.28  
   9    |   1.071432   |  1.004654  |   70.34   |  101.31  
  10    |   0.968053   |  0.927908  |   72.74   |  101.35  
  11    |   0.889113   |  0.871712  |   74.51   |  102.06  
  12    |   0.827378   |  0.842852  |   75.36   |  101.34  
  13    |   0.776249   |  0.805540  |   76.10   |  101.30  
  14    |   0.730918   |  0.785301  |   76.96   |  101.30  
  15    |   0.693305

In [12]:
# RNN run with embed_dim=50, logged under the label 'EMB_test2'.
# NOTE(review): this cell used to be headed "CNN-rand", but modelType is "RNN".
# The variable name `cnn_rand` is kept because other cells may reference it.
tn.set_seed(42)  # reseed before building the model so each run starts from the same RNG state

# Hyperparameters for this run, gathered in one place and unpacked into the initializer.
rnn_hparams = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=50,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.2,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
cnn_rand, optimizer = init.initilize_model(**rnn_hparams)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    'EMB_test2',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.879673   |  2.556890  |   20.29   |  100.49  
   2    |   2.393848   |  2.247700  |   31.07   |  101.26  
   3    |   2.115112   |  1.945344  |   39.82   |  101.44  
   4    |   1.831760   |  1.673026  |   49.29   |  101.47  
   5    |   1.518497   |  1.303801  |   61.53   |  101.51  
   6    |   1.230478   |  1.106801  |   67.41   |  101.46  
   7    |   1.047943   |  0.961553  |   71.72   |  101.46  
   8    |   0.929843   |  0.884785  |   74.03   |  101.48  
   9    |   0.846466   |  0.838950  |   75.44   |  101.45  
  10    |   0.785719   |  0.790321  |   76.81   |  101.51  
  11    |   0.732973   |  0.767378  |   77.40   |  101.51  
  12    |   0.690738   |  0.743681  |   78.32   |  101.51  
  13    |   0.654648   |  0.714335  |   79.21   |  101.48  
  14    |   0.619500   |  0.701817  |   79.68   |  101.50  
  15    |   0.587777

In [13]:
# RNN run with embed_dim=70, logged under the label 'EMB_test3'.
# NOTE(review): this cell used to be headed "CNN-rand", but modelType is "RNN".
# The variable name `cnn_rand` is kept because other cells may reference it.
tn.set_seed(42)  # reseed before building the model so each run starts from the same RNG state

# Hyperparameters for this run, gathered in one place and unpacked into the initializer.
rnn_hparams = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=70,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.2,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
cnn_rand, optimizer = init.initilize_model(**rnn_hparams)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    'EMB_test3',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.844391   |  2.552644  |   20.14   |  100.97  
   2    |   2.383381   |  2.275306  |   29.70   |  101.82  
   3    |   2.096223   |  1.933571  |   39.77   |  101.92  
   4    |   1.793507   |  1.610627  |   50.73   |  102.00  
   5    |   1.424716   |  1.221779  |   63.68   |  101.98  
   6    |   1.162608   |  1.057564  |   68.75   |  101.97  
   7    |   1.000601   |  0.966052  |   71.90   |  101.97  
   8    |   0.896507   |  0.882964  |   74.20   |  102.03  
   9    |   0.821875   |  0.814919  |   76.19   |  102.06  
  10    |   0.761787   |  0.788362  |   76.99   |  102.02  
  11    |   0.714178   |  0.773136  |   77.40   |  102.05  
  12    |   0.673500   |  0.739848  |   78.59   |  102.01  
  13    |   0.635817   |  0.743026  |   78.59   |  101.99  
  14    |   0.604778   |  0.699512  |   79.73   |  101.99  
  15    |   0.575052