In [1]:
from datetime import datetime

import torch
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Choose where the model and tensors will live: the CUDA device when PyTorch
# reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project-local `data` module. get_df() returns
# three values; only `df` is consumed by the cells visible in this notebook.
# NOTE(review): the contents of size_info / size_dict are not shown here — see data.get_df.
df, size_info, size_dict = data.get_df()

In [4]:
# Pull the raw values of the source-code column and of the class-label column
# out of `df`; they feed the tokenization and label-encoding cells that follow.
sourceCode_np, codeClass_np = df.sourceCode.values, df.classLabel.values

In [5]:
# Tokenize, build vocabulary, encode tokens.
# utils.tokenize returns the tokenized samples together with `ch2idx` and `max_len`;
# utils.encode then converts the tokenized samples into `input_ids` using both.
# NOTE(review): presumably ch2idx maps each token/character to an integer id and
# max_len is the length samples are padded to — confirm in utils.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [6]:
# Hand the vocabulary mapping to the info_recorder module.
# NOTE(review): presumably this persists ch2idx so it can be reused outside this
# notebook (e.g. at inference time) — confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [7]:
# Encode the class labels. The helper returns the encoded labels, the
# `class2idx` mapping and `num_classes`.
# NOTE(review): the training cells pass len(class2idx) to the model rather than
# num_classes; presumably the two are equal — confirm in utils.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [8]:
# Split the encoded samples and their labels into training and validation sets.
# test_size=0.2 holds out 20% of the samples for validation, and the fixed
# random_state makes the shuffle (and therefore the split) reproducible.
# `train_test_split` is imported in the notebook's first (imports) cell.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [9]:
# Wrap the training and validation splits in PyTorch DataLoaders, using a
# batch size of 50.
train_dataloader, val_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    train_labels,
    val_labels,
    batch_size=50,
)

In [10]:
# PyTorch TensorBoard support.
# `SummaryWriter` and `datetime` are imported in the notebook's first (imports) cell.

# Wall-clock stamp of this session, formatted as YYYYMMDD_HHMMSS.
# NOTE(review): `timestamp` is not referenced by any cell visible here. If
# per-session log directories are wanted, it could be appended to the log path
# below; left unchanged so existing TensorBoard paths keep working.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Single writer shared by all training runs in this notebook; every run logs
# into the same 'rnnDropout/tests' directory.
writer = SummaryWriter('rnnDropout/tests')

In [11]:
# # RNN run, dropout=0.2 (disabled): embedding vectors are randomly initialized.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                            vocab_size=len(ch2idx),
#                                            embed_dim=100,
#                                            hidden_size=100,
#                                            num_classes=len(class2idx),
#                                            n_layers=3,
#                                            dropout=0.2,
#                                            learning_rate=0.25,
#                                            optimizerName="Adadelta",
#                                            modelType="RNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '05_test1', writer, val_dataloader, epochs=20)

In [12]:
# # RNN run, dropout=0.4 (disabled): embedding vectors are randomly initialized.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                            vocab_size=len(ch2idx),
#                                            embed_dim=100,
#                                            hidden_size=100,
#                                            num_classes=len(class2idx),
#                                            n_layers=3,
#                                            dropout=0.4,
#                                            learning_rate=0.25,
#                                            optimizerName="Adadelta",
#                                            modelType="RNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '05_test2', writer, val_dataloader, epochs=20)

In [13]:
# # RNN run, dropout=0.5 (disabled): embedding vectors are randomly initialized.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                            vocab_size=len(ch2idx),
#                                            embed_dim=100,
#                                            hidden_size=100,
#                                            num_classes=len(class2idx),
#                                            n_layers=3,
#                                            dropout=0.5,
#                                            learning_rate=0.25,
#                                            optimizerName="Adadelta",
#                                            modelType="RNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '05_test3', writer, val_dataloader, epochs=20)

In [14]:
# RNN run with dropout=0.6. Despite the variable name `cnn_rand`, the model
# requested here is modelType="RNN"; per the original note its embedding
# vectors start from a random initialization.
# The seed is fixed at 42, the same value used by the other dropout runs.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    # model architecture
    modelType="RNN",
    vocab_size=len(ch2idx),
    num_classes=len(class2idx),
    embed_dim=100,
    hidden_size=100,
    n_layers=3,
    dropout=0.6,
    # optimizer and placement
    optimizerName="Adadelta",
    learning_rate=0.25,
    device=device,
)

# Train for 20 epochs; '05_test4' is the run label handed to the trainer along
# with the shared TensorBoard writer and the validation loader.
tn.train(
    device, cnn_rand, optimizer, train_dataloader,
    '05_test4', writer, val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.868705   |  2.556974  |   19.16   |  101.09  
   2    |   2.375006   |  2.180175  |   31.81   |  102.27  
   3    |   2.135508   |  1.922206  |   40.59   |  102.37  
   4    |   1.906106   |  1.751261  |   46.26   |  102.41  
   5    |   1.718330   |  1.545089  |   53.52   |  102.44  
   6    |   1.562286   |  1.403214  |   58.17   |  102.45  
   7    |   1.419094   |  1.253615  |   63.24   |  102.43  
   8    |   1.284506   |  1.157939  |   66.24   |  102.47  
   9    |   1.172933   |  1.087968  |   68.47   |  102.46  
  10    |   1.078332   |  0.995820  |   71.13   |  102.43  
  11    |   1.002619   |  0.930820  |   73.20   |  102.42  
  12    |   0.943884   |  0.947030  |   72.97   |  102.47  
  13    |   0.897706   |  0.861017  |   75.00   |  102.40  
  14    |   0.858312   |  0.830905  |   76.07   |  102.45  
  15    |   0.822813

In [15]:
# RNN run with dropout=0.8. Despite the variable name `cnn_rand`, the model
# requested here is modelType="RNN"; per the original note its embedding
# vectors start from a random initialization.
# The seed is fixed at 42, the same value used by the other dropout runs.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    # model architecture
    modelType="RNN",
    vocab_size=len(ch2idx),
    num_classes=len(class2idx),
    embed_dim=100,
    hidden_size=100,
    n_layers=3,
    dropout=0.8,
    # optimizer and placement
    optimizerName="Adadelta",
    learning_rate=0.25,
    device=device,
)

# Train for 20 epochs; '05_test5' is the run label handed to the trainer along
# with the shared TensorBoard writer and the validation loader.
tn.train(
    device, cnn_rand, optimizer, train_dataloader,
    '05_test5', writer, val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.956503   |  2.720240  |   14.93   |  101.54  
   2    |   2.602651   |  2.396043  |   25.21   |  102.20  
   3    |   2.343279   |  2.247315  |   29.80   |  102.30  
   4    |   2.197451   |  1.999378  |   38.19   |  102.35  
   5    |   2.057893   |  1.875390  |   41.88   |  102.34  
   6    |   1.912420   |  1.729005  |   47.01   |  102.37  
   7    |   1.766952   |  1.553942  |   52.36   |  102.31  
   8    |   1.632930   |  1.413228  |   58.00   |  102.31  
   9    |   1.515231   |  1.361403  |   60.35   |  102.36  
  10    |   1.419730   |  1.235845  |   63.54   |  102.30  
  11    |   1.326418   |  1.149700  |   66.76   |  102.38  
  12    |   1.245950   |  1.090337  |   68.86   |  102.36  
  13    |   1.182854   |  1.046332  |   69.94   |  102.33  
  14    |   1.126985   |  1.003518  |   71.43   |  102.36  
  15    |   1.082748