In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Choose the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. Everything downstream receives this `device` object.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was picked so the run log records the hardware that was used.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project-local `data` module. get_df() returns
# three values; `df` is the table the following cells read columns from
# (presumably a pandas DataFrame — TODO confirm). The contents of `size_info`
# and `size_dict` are not visible from this notebook — verify in data.py.
df, size_info, size_dict = data.get_df()

In [4]:
# Pull the two columns used downstream out of the table as plain arrays:
# the raw source text (model input) and its class label (target).
sourceCode_np, codeClass_np = df["sourceCode"].values, df["classLabel"].values

In [5]:
# Tokenize, build vocabulary, encode tokens.
# utils.tokenize returns three values: the tokenized sources, the `ch2idx`
# vocabulary mapping, and `max_len` (presumably the longest token sequence —
# TODO confirm against utils.py).
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
# Convert the tokenized sources to model inputs using the vocabulary and
# max_len (presumably id lookup plus padding to max_len — TODO confirm).
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [6]:
# Hand the vocabulary mapping to the project-local info_recorder module.
# NOTE(review): presumably this persists ch2idx so the same encoding can be
# reused outside this notebook — confirm what it writes and where.
ir.record_ch2idx(ch2idx)

In [7]:
# Encode the class labels. The helper returns three values: the encoded labels
# (used as the targets for the train/validation split), the `class2idx`
# mapping, and `num_classes`.
# NOTE(review): the training cells pass len(class2idx) rather than num_classes;
# presumably the two are equal — confirm against utils.py.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded samples (and their labels) for validation.
# random_state is pinned so the same split is produced on every run.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [9]:
# Wrap the train/validation splits in PyTorch DataLoaders via the project-local
# data_loader helper, using a batch size of 50.
train_dataloader, val_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    train_labels,
    val_labels,
    batch_size=50,
)

In [10]:
# TensorBoard logging setup.
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

# Wall-clock stamp of when this cell ran (YYYYMMDD_HHMMSS).
# NOTE(review): `timestamp` is not used by any cell visible here; the log
# directory below is a fixed path, so every run writes into the same folder.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# Single writer shared by the training runs that follow.
writer = SummaryWriter(log_dir='rnn256char/tests')

In [11]:
# Run 'DP_test1': model built with modelType="RNN", n_layers=3 and dropout=0.0.
# NOTE(review): despite the `cnn_rand` variable name, the model requested here
# is an RNN; the name is kept unchanged in case other cells reference it.
tn.set_seed(42)  # seed is fixed before the model is constructed
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.0,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

# Train for 20 epochs with the shared TensorBoard writer; the string argument
# is presumably the run label used for logging/checkpoints — TODO confirm.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    'DP_test1',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.807893   |  2.645428  |   18.97   |  186.63  
   2    |   2.353638   |  2.225731  |   30.84   |  188.70  
   3    |   2.027780   |  1.772656  |   44.81   |  188.93  
   4    |   1.556977   |  1.320040  |   60.31   |  188.96  
   5    |   1.114545   |  0.956324  |   72.48   |  188.98  
   6    |   0.867155   |  0.817077  |   76.33   |  188.97  
   7    |   0.737082   |  0.737780  |   78.45   |  188.94  
   8    |   0.652884   |  0.664833  |   80.81   |  189.01  
   9    |   0.587133   |  0.623439  |   81.85   |  189.01  
  10    |   0.536878   |  0.609780  |   82.23   |  188.99  
  11    |   0.495430   |  0.582661  |   83.32   |  188.98  
  12    |   0.460122   |  0.576914  |   83.39   |  188.98  
  13    |   0.429441   |  0.565946  |   83.81   |  188.99  
  14    |   0.399769   |  0.568472  |   83.84   |  189.04  
  15    |   0.374020

In [12]:
# Run 'DP_test2': same configuration as the 'DP_test1' cell except dropout=0.2.
# NOTE(review): despite the `cnn_rand` variable name, the model requested here
# is an RNN. This cell rebinds `cnn_rand` and `optimizer`, replacing the
# objects created by the previous training cell.
tn.set_seed(42)  # same seed as the other run so the two are comparable
cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.2,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="RNN",
)

# Train for 20 epochs with the shared TensorBoard writer; the string argument
# is presumably the run label used for logging/checkpoints — TODO confirm.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    'DP_test2',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.836307   |  2.541316  |   20.66   |  193.52  
   2    |   2.416505   |  2.238413  |   30.87   |  194.85  
   3    |   2.141685   |  1.980672  |   37.82   |  195.13  
   4    |   1.800566   |  1.565983  |   51.24   |  195.14  
   5    |   1.424599   |  1.206122  |   64.40   |  195.17  
   6    |   1.102148   |  0.949354  |   72.39   |  195.16  
   7    |   0.900007   |  0.809557  |   76.76   |  195.17  
   8    |   0.780842   |  0.732761  |   78.76   |  195.16  
   9    |   0.702952   |  0.676676  |   80.31   |  195.16  
  10    |   0.645045   |  0.641828  |   81.48   |  195.17  
  11    |   0.597486   |  0.619581  |   82.09   |  195.15  
  12    |   0.562586   |  0.603063  |   82.55   |  195.12  
  13    |   0.531856   |  0.581834  |   83.31   |  195.13  
  14    |   0.505095   |  0.570875  |   83.45   |  195.10  
  15    |   0.481715