In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Choose the compute device once; later cells pass `device` to model
# initialisation and to the training loop.
if not torch.cuda.is_available():
    # CPU fallback when CUDA is not usable.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    # Only the name of GPU index 0 is reported.
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project's `data` module. get_df() returns three
# values; only `df` is used by the cells visible here.
# NOTE(review): the contents of `size_info` / `size_dict` are not shown in this
# notebook -- see data.get_df for their meaning.
df, size_info, size_dict = data.get_df()

In [4]:
# Pull the two columns used downstream out of `df`: `sourceCode` is what gets
# tokenized as model input, `classLabel` is what gets encoded as the target.
# `.values` presumably yields NumPy arrays (assuming `df` is a pandas
# DataFrame -- TODO confirm against data.get_df).
sourceCode_np = df.sourceCode.values
codeClass_np = df.classLabel.values

In [5]:
# Tokenize the raw source-code strings, then encode them into `input_ids`.
# utils.tokenize returns three values, kept here as the tokenized samples, the
# vocabulary mapping `ch2idx` (presumably character -> index, judging by the
# name -- TODO confirm in utils) and `max_len`.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
# utils.encode receives the same vocabulary and `max_len`; its result is what
# the train/validation split consumes as the model inputs.
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [6]:
# Pass the vocabulary mapping to info_recorder. What record_ch2idx does with it
# is not visible in this notebook -- presumably it persists the mapping so the
# same encoding can be reused later; TODO confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [7]:
# Encode the class labels. Three values come back: the encoded labels (used as
# the targets in the train/validation split), the `class2idx` mapping and
# `num_classes`.
# NOTE(review): `num_classes` is not used by the visible cells; the model cells
# pass len(class2idx) instead -- confirm the two always agree.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded samples for validation; the fixed random_state
# keeps the split repeatable across executions.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [9]:
# Wrap the train/validation inputs and labels in PyTorch DataLoaders, using a
# batch size of 50.
train_dataloader, val_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    train_labels,
    val_labels,
    batch_size=50,
)

In [10]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Stamp the log directory with the session start time. Previously `timestamp`
# was computed but never used, so every execution of the notebook wrote its
# event files into the same 'rnn124char/tests' directory; because the run names
# passed to tn.train (e.g. '01_test1') are identical on every execution, old
# and new runs could not be told apart in TensorBoard.
# Pointing TensorBoard at the parent directory (--logdir rnn124char) shows
# every session side by side.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'rnn124char/tests_{timestamp}')

In [11]:
# Run '01_test1': RNN with hidden_size=50, n_layers=2, trained with Adadelta.
# tn.set_seed(42) is called before each run, presumably so that every
# configuration starts from the same RNG state.
# NOTE: the variable is still called `cnn_rand`, but the model requested here
# is modelType="RNN".
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=50,
    num_classes=len(class2idx),
    n_layers=2,
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_test1',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.730921   |  2.503849  |   22.21   |   24.43  
   2    |   2.239495   |  2.005803  |   38.46   |   24.64  
   3    |   1.848176   |  1.673680  |   49.33   |   24.78  
   4    |   1.493737   |  1.338903  |   60.20   |   24.78  
   5    |   1.250419   |  1.150411  |   65.73   |   24.97  
   6    |   1.114477   |  1.063708  |   68.70   |   24.84  
   7    |   1.020908   |  0.968873  |   71.27   |   24.70  
   8    |   0.958348   |  0.936184  |   72.58   |   24.76  
   9    |   0.911067   |  0.906889  |   73.22   |   24.60  
  10    |   0.874477   |  0.880099  |   73.91   |   24.87  
  11    |   0.843929   |  0.854971  |   74.76   |   24.78  
  12    |   0.816332   |  0.843822  |   75.13   |   24.68  
  13    |   0.794476   |  0.828054  |   75.82   |   24.65  
  14    |   0.775141   |  0.810172  |   76.27   |   24.93  
  15    |   0.756575

In [12]:
# Run '02_test2': RNN with hidden_size=50, n_layers=3, trained with Adadelta.
# tn.set_seed(42) is called before each run, presumably so that every
# configuration starts from the same RNG state.
# NOTE: the variable is still called `cnn_rand`, but the model requested here
# is modelType="RNN".
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=50,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '02_test2',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.843421   |  2.582490  |   19.75   |   33.67  
   2    |   2.432237   |  2.257529  |   29.61   |   33.92  
   3    |   2.174925   |  2.042747  |   37.40   |   34.04  
   4    |   1.908771   |  1.766219  |   45.79   |   34.08  
   5    |   1.629903   |  1.458067  |   55.61   |   34.36  
   6    |   1.365654   |  1.258776  |   62.45   |   34.20  
   7    |   1.164778   |  1.071908  |   68.39   |   34.08  
   8    |   1.039109   |  1.019801  |   70.00   |   34.12  
   9    |   0.959090   |  0.933719  |   72.59   |   34.28  
  10    |   0.903277   |  0.896790  |   73.65   |   34.16  
  11    |   0.860945   |  0.857170  |   74.78   |   34.13  
  12    |   0.826874   |  0.839464  |   75.68   |   34.22  
  13    |   0.799564   |  0.833808  |   75.51   |   34.47  
  14    |   0.775882   |  0.813425  |   75.99   |   34.22  
  15    |   0.754942

In [13]:
# Run '03_test3': RNN with hidden_size=50, n_layers=4, trained with Adadelta.
# tn.set_seed(42) is called before each run, presumably so that every
# configuration starts from the same RNG state.
# NOTE: the variable is still called `cnn_rand`, but the model requested here
# is modelType="RNN".
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=50,
    num_classes=len(class2idx),
    n_layers=4,
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '03_test3',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.991901   |  2.788659  |   13.86   |   42.68  
   2    |   2.628950   |  2.392666  |   25.02   |   43.05  
   3    |   2.303975   |  2.232597  |   30.92   |   43.40  
   4    |   2.085336   |  1.997028  |   39.29   |   43.53  
   5    |   1.871652   |  1.725135  |   47.38   |   43.72  
   6    |   1.636855   |  1.504001  |   54.55   |   43.87  
   7    |   1.444612   |  1.327927  |   60.32   |   43.48  
   8    |   1.259884   |  1.154879  |   66.10   |   43.64  
   9    |   1.117096   |  1.035296  |   69.58   |   43.50  
  10    |   1.020326   |  0.969382  |   71.31   |   43.92  
  11    |   0.952639   |  0.935990  |   72.86   |   43.99  
  12    |   0.898908   |  0.885044  |   74.17   |   43.90  
  13    |   0.859873   |  0.846507  |   74.93   |   43.70  
  14    |   0.827956   |  0.840797  |   75.33   |   43.77  
  15    |   0.800852

In [14]:
# Run '04_test4': RNN with hidden_size=100, n_layers=2, trained with Adadelta.
# tn.set_seed(42) is called before each run, presumably so that every
# configuration starts from the same RNG state.
# NOTE: the variable is still called `cnn_rand`, but the model requested here
# is modelType="RNN".
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=2,
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '04_test4',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.664194   |  2.362713  |   26.03   |   69.61  
   2    |   2.100706   |  1.884608  |   42.00   |   69.99  
   3    |   1.678286   |  1.449050  |   56.15   |   69.97  
   4    |   1.313627   |  1.152669  |   65.92   |   69.82  
   5    |   1.078557   |  0.986937  |   70.74   |   69.93  
   6    |   0.941663   |  0.915712  |   72.87   |   69.93  
   7    |   0.856832   |  0.842735  |   75.39   |   69.87  
   8    |   0.794432   |  0.820550  |   75.80   |   69.84  
   9    |   0.746776   |  0.789277  |   76.72   |   70.03  
  10    |   0.708373   |  0.760467  |   77.64   |   69.94  
  11    |   0.674547   |  0.739573  |   78.48   |   69.78  
  12    |   0.645105   |  0.734952  |   78.40   |   69.99  
  13    |   0.619190   |  0.715613  |   79.24   |   69.93  
  14    |   0.596978   |  0.718090  |   79.30   |   70.00  
  15    |   0.574560

In [15]:
# Run '05_test5': RNN with hidden_size=100, n_layers=3, trained with Adadelta.
# tn.set_seed(42) is called before each run, presumably so that every
# configuration starts from the same RNG state.
# NOTE: the variable is still called `cnn_rand`, but the model requested here
# is modelType="RNN".
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '05_test5',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.798207   |  2.444780  |   23.82   |  101.71  
   2    |   2.290440   |  2.254048  |   31.60   |  102.41  
   3    |   2.038372   |  1.855263  |   42.11   |  102.49  
   4    |   1.732166   |  1.544083  |   53.15   |  102.54  
   5    |   1.392043   |  1.242708  |   63.00   |  102.51  
   6    |   1.123448   |  1.015105  |   70.13   |  102.56  
   7    |   0.963892   |  0.920218  |   72.77   |  102.54  
   8    |   0.869023   |  0.843860  |   75.14   |  102.52  
   9    |   0.799516   |  0.809308  |   76.10   |  102.51  
  10    |   0.749964   |  0.781782  |   77.13   |  102.58  
  11    |   0.707085   |  0.755400  |   77.70   |  102.49  
  12    |   0.669204   |  0.729884  |   78.70   |  102.53  
  13    |   0.640186   |  0.714768  |   79.08   |  102.51  
  14    |   0.612234   |  0.703977  |   79.31   |  102.50  
  15    |   0.587658

In [16]:
# Run '06_test6': RNN with hidden_size=100, n_layers=4, trained with Adadelta.
# tn.set_seed(42) is called before each run, presumably so that every
# configuration starts from the same RNG state.
# NOTE: the variable is still called `cnn_rand`, but the model requested here
# is modelType="RNN".
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=4,
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '06_test6',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.971360   |  2.713250  |   15.07   |  134.38  
   2    |   2.477760   |  2.390184  |   26.23   |  135.51  
   3    |   2.149523   |  1.968181  |   39.33   |  135.90  
   4    |   1.736620   |  1.511146  |   53.01   |  135.94  
   5    |   1.352171   |  1.186959  |   64.70   |  135.96  
   6    |   1.103340   |  1.016152  |   69.93   |  135.96  
   7    |   0.966288   |  0.936134  |   72.37   |  135.97  
   8    |   0.877038   |  0.860335  |   74.66   |  135.98  
   9    |   0.812605   |  0.838636  |   75.45   |  135.91  
  10    |   0.763026   |  0.789155  |   76.92   |  135.97  
  11    |   0.720501   |  0.769108  |   77.41   |  135.97  
  12    |   0.686484   |  0.752485  |   77.93   |  135.96  
  13    |   0.657089   |  0.722284  |   78.85   |  135.93  
  14    |   0.629699   |  0.710446  |   79.30   |  135.96  
  15    |   0.605109