In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Choose the compute device: CUDA when PyTorch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of device 0.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project's `data` module. get_df() returns three
# values, bound here as df, size_info and size_dict.
# NOTE(review): size_info and size_dict are not used by any cell visible in this
# notebook view — confirm whether they are still needed.
df, size_info, size_dict = data.get_df()

In [4]:
# Extract the `sourceCode` and `classLabel` columns of df as plain arrays.
# (presumably NumPy arrays, assuming df is a pandas DataFrame — TODO confirm)
sourceCode_np = df.sourceCode.values
codeClass_np = df.classLabel.values

In [5]:
# Tokenize the source-code samples, then encode them.
# utils.tokenize returns three values, bound here as the tokenized samples, the
# ch2idx mapping and max_len; utils.encode receives all three and returns input_ids.
# NOTE(review): the name ch2idx suggests a character -> index vocabulary and max_len
# a padding length — confirm against utils.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [6]:
# Pass the ch2idx mapping to the info recorder.
# NOTE(review): presumably this persists the vocabulary for reuse outside this
# notebook — confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [7]:
# Encode the class labels. The helper returns three values, bound here as
# encoded_class2idx (used below as the label array for the train/val split),
# class2idx (its len() is passed as num_classes when building models) and num_classes.
# NOTE(review): the num_classes variable itself is not referenced by the visible
# cells, which pass len(class2idx) instead — confirm the two agree.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded samples (and their labels) for validation.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)
train_inputs, val_inputs, train_labels, val_labels = split_parts

In [9]:
# Wrap the train/validation inputs and labels in data loaders via the project's
# data_loader helper, which returns a (train, validation) pair. batch_size=50 is
# passed through to that helper.
# NOTE(review): presumably these are torch.utils.data.DataLoader objects — confirm in data_loader.
train_dataloader, val_dataloader = dl.data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)

In [10]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Wall-clock stamp formatted as YYYYMMDD_HHMMSS.
# NOTE(review): timestamp is not used by any cell visible in this notebook view; the
# log directory below is a fixed path, so every run handed this writer logs into the
# same 'rnnDropout/tests' directory.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Single TensorBoard writer shared by the training cells that follow.
writer = SummaryWriter('rnnDropout/tests')

In [11]:
# Dropout sweep, run '05_test1': request an RNN model with dropout=0.2.
# The seed is reset first; the other sweep cells use the same seed value.
tn.set_seed(42)

rnn_config = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.2,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
# The variable is called `cnn_rand`, but the model requested here is modelType="RNN".
cnn_rand, optimizer = init.initilize_model(**rnn_config)

# Train for 20 epochs, passing the shared TensorBoard writer and the run name.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '05_test1',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.823952   |  2.432185  |   24.52   |  101.07  
   2    |   2.309772   |  2.083710  |   35.46   |  101.84  
   3    |   2.004178   |  1.825442  |   45.02   |  102.01  
   4    |   1.740131   |  1.562580  |   53.09   |  102.01  
   5    |   1.486980   |  1.329832  |   60.77   |  102.01  
   6    |   1.261412   |  1.137670  |   66.76   |  102.06  
   7    |   1.074080   |  0.997973  |   70.83   |  102.08  
   8    |   0.944747   |  0.903516  |   73.75   |  102.10  
   9    |   0.854611   |  0.851210  |   75.19   |  102.06  
  10    |   0.784563   |  0.810694  |   76.20   |  102.05  
  11    |   0.730961   |  0.777766  |   77.28   |  102.08  
  12    |   0.683919   |  0.757563  |   78.04   |  102.06  
  13    |   0.645395   |  0.734402  |   78.75   |  102.04  
  14    |   0.611134   |  0.714035  |   79.36   |  102.09  
  15    |   0.580495

In [12]:
# Dropout sweep, run '05_test2': request an RNN model with dropout=0.4.
# The seed is reset first; the other sweep cells use the same seed value.
tn.set_seed(42)

rnn_config = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.4,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
# The variable is called `cnn_rand`, but the model requested here is modelType="RNN".
cnn_rand, optimizer = init.initilize_model(**rnn_config)

# Train for 20 epochs, passing the shared TensorBoard writer and the run name.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '05_test2',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.825629   |  2.402298  |   25.38   |  101.31  
   2    |   2.315094   |  2.158259  |   33.49   |  102.02  
   3    |   2.090477   |  1.950373  |   39.61   |  102.14  
   4    |   1.861408   |  1.722677  |   47.31   |  102.12  
   5    |   1.643316   |  1.479126  |   55.59   |  102.16  
   6    |   1.436844   |  1.302561  |   61.52   |  102.09  
   7    |   1.248599   |  1.156432  |   66.33   |  102.13  
   8    |   1.121808   |  1.046453  |   69.48   |  102.14  
   9    |   1.014069   |  0.957872  |   72.25   |  102.10  
  10    |   0.932655   |  0.890147  |   74.29   |  102.12  
  11    |   0.868323   |  0.842976  |   75.43   |  102.13  
  12    |   0.818919   |  0.812031  |   76.55   |  102.09  
  13    |   0.773097   |  0.794851  |   76.79   |  102.10  
  14    |   0.737619   |  0.784428  |   77.26   |  102.12  
  15    |   0.706197

In [13]:
# Dropout sweep, run '05_test3': request an RNN model with dropout=0.5.
# The seed is reset first; the other sweep cells use the same seed value.
tn.set_seed(42)

rnn_config = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
# The variable is called `cnn_rand`, but the model requested here is modelType="RNN".
cnn_rand, optimizer = init.initilize_model(**rnn_config)

# Train for 20 epochs, passing the shared TensorBoard writer and the run name.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '05_test3',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.853099   |  2.470686  |   23.54   |  101.22  
   2    |   2.355768   |  2.159328  |   32.56   |  101.87  


In [None]:
# Dropout sweep, run '05_test4': request an RNN model with dropout=0.6.
# The seed is reset first; the other sweep cells use the same seed value.
tn.set_seed(42)

rnn_config = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.6,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
# The variable is called `cnn_rand`, but the model requested here is modelType="RNN".
cnn_rand, optimizer = init.initilize_model(**rnn_config)

# Train for 20 epochs, passing the shared TensorBoard writer and the run name.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '05_test4',
    writer,
    val_dataloader,
    epochs=20,
)

In [None]:
# Dropout sweep, run '05_test5': request an RNN model with dropout=0.8.
# The seed is reset first; the other sweep cells use the same seed value.
tn.set_seed(42)

rnn_config = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.8,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)
# The variable is called `cnn_rand`, but the model requested here is modelType="RNN".
cnn_rand, optimizer = init.initilize_model(**rnn_config)

# Train for 20 epochs, passing the shared TensorBoard writer and the run name.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '05_test5',
    writer,
    val_dataloader,
    epochs=20,
)