In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Reference (tutorial on CNNs for sentence classification):
# https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Choose the compute device once: CUDA when a GPU is visible, otherwise CPU.
# `device` is what the model-initialisation and training cells receive.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project-local `data` module.
# Only `df` is used by the visible cells; `size_info` and `size_dict` are
# returned alongside it but not referenced in this part of the notebook.
# NOTE(review): their contents are not visible here — see data.get_df().
df, size_info, size_dict = data.get_df()

In [5]:
# Pull the `sourceCode` and `classLabel` columns out of `df` via `.values`;
# these feed the tokenisation and label-encoding cells respectively.
# NOTE(review): assumes `df` is a pandas DataFrame so `.values` yields NumPy
# arrays (as the `_np` suffixes suggest) — confirm against data.get_df().
sourceCode_np = df.sourceCode.values
codeClass_np = df.classLabel.values

In [6]:
# Tokenize, build vocabulary, encode tokens.
# utils.tokenize returns the tokenized samples, the vocabulary mapping
# (`ch2idx`) and `max_len`; utils.encode then takes all three and produces
# `input_ids`, the model inputs that get split into train/validation sets.
# NOTE(review): the name `ch2idx` suggests character-level tokens, and
# `max_len` is presumably the padding length — confirm in utils.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the vocabulary mapping to the project-local info_recorder module.
# NOTE(review): presumably this persists `ch2idx` so the same encoding can be
# reused outside this notebook — confirm what record_ch2idx writes and where.
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels. `encoded_class2idx` is used as the label array for
# the train/validation split, and `len(class2idx)` sizes the model's output
# layer in the experiment cells.
# NOTE(review): `num_classes` is returned here but the visible cells use
# `len(class2idx)` instead — presumably the two are equal; confirm in utils.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded samples for validation. The fixed random_state
# keeps the split identical across notebook re-runs.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [10]:
# Wrap the train/validation splits in PyTorch DataLoaders (batch size 50).
train_dataloader, val_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    train_labels,
    val_labels,
    batch_size=50,
)

In [11]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Put the timestamp in the log directory name. Previously `timestamp` was
# computed but never used, so every re-run of the notebook appended new event
# files to the same 'stableKZ/cnn' directory and mixed old and new runs.
# Logs stay under 'stableKZ/', so `tensorboard --logdir stableKZ` still finds
# them; each notebook execution just gets its own sub-directory.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# One writer is shared by all experiment cells below; each passes its own run
# tag to tn.train alongside this writer.
writer = SummaryWriter(f'stableKZ/cnn_{timestamp}')

In [12]:
# Experiment '01_4Layer_8KZ' (CNN-rand: embeddings are randomly initialized):
# four filter banks, each with filter size 8.
tn.set_seed(42)  # same seed in every experiment cell

cnn_rand, optimizer = init.initilize_model(
    modelType="CNN",
    device=device,
    # Input vocabulary, embedding width and number of output classes.
    vocab_size=len(ch2idx),
    embed_dim=100,
    num_classes=len(class2idx),
    # Convolution settings: the filter size is the variable under study.
    filter_sizes=[8, 8, 8, 8],
    num_filters=[100, 200, 200, 100],
    dropout=0.5,
    # Optimizer settings.
    optimizerName="Adadelta",
    learning_rate=0.25,
)

# Train for 20 epochs; the run tag and the shared `writer` are handed to the
# trainer together with the validation loader.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_4Layer_8KZ',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.618258   |  1.038399  |   71.15   |   42.27  
   2    |   1.011508   |  0.822850  |   76.84   |   41.56  
   3    |   0.857686   |  0.747237  |   79.16   |   41.63  
   4    |   0.776052   |  0.694726  |   80.57   |   41.71  
   5    |   0.715773   |  0.665403  |   81.11   |   41.47  
   6    |   0.675068   |  0.652499  |   81.62   |   41.38  
   7    |   0.639666   |  0.635737  |   82.29   |   41.57  
   8    |   0.610466   |  0.622353  |   82.50   |   41.87  
   9    |   0.590977   |  0.619485  |   82.81   |   41.82  
  10    |   0.570781   |  0.613228  |   83.11   |   41.48  
  11    |   0.551683   |  0.607242  |   83.14   |   41.68  
  12    |   0.537102   |  0.601666  |   83.36   |   41.59  
  13    |   0.523023   |  0.602169  |   83.30   |   42.30  
  14    |   0.515597   |  0.600037  |   83.39   |   41.97  
  15    |   0.501656

In [13]:
# Experiment '02_4Layer_16KZ' (CNN-rand: embeddings are randomly initialized):
# four filter banks, each with filter size 16.
tn.set_seed(42)  # same seed in every experiment cell

cnn_rand, optimizer = init.initilize_model(
    modelType="CNN",
    device=device,
    # Input vocabulary, embedding width and number of output classes.
    vocab_size=len(ch2idx),
    embed_dim=100,
    num_classes=len(class2idx),
    # Convolution settings: the filter size is the variable under study.
    filter_sizes=[16, 16, 16, 16],
    num_filters=[100, 200, 200, 100],
    dropout=0.5,
    # Optimizer settings.
    optimizerName="Adadelta",
    learning_rate=0.25,
)

# Train for 20 epochs; the run tag and the shared `writer` are handed to the
# trainer together with the validation loader.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '02_4Layer_16KZ',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.549276   |  0.977869  |   72.90   |   60.29  
   2    |   0.947057   |  0.786983  |   77.79   |   59.76  
   3    |   0.780520   |  0.705370  |   80.21   |   59.74  
   4    |   0.680181   |  0.670607  |   81.17   |   59.94  
   5    |   0.608945   |  0.641855  |   82.01   |   60.62  
   6    |   0.553080   |  0.625989  |   82.44   |   59.91  
   7    |   0.509075   |  0.618870  |   82.91   |   59.96  
   8    |   0.472580   |  0.612303  |   83.22   |   60.22  
   9    |   0.444007   |  0.620402  |   83.55   |   59.92  
  10    |   0.417853   |  0.607397  |   83.77   |   59.77  
  11    |   0.394852   |  0.615766  |   83.75   |   60.17  
  12    |   0.376072   |  0.616638  |   83.80   |   61.30  
  13    |   0.359796   |  0.629411  |   84.02   |   61.98  
  14    |   0.345248   |  0.626226  |   84.07   |   60.74  
  15    |   0.333054

In [14]:
# Experiment '03_4Layer_32KZ' (CNN-rand: embeddings are randomly initialized):
# four filter banks, each with filter size 32.
tn.set_seed(42)  # same seed in every experiment cell

cnn_rand, optimizer = init.initilize_model(
    modelType="CNN",
    device=device,
    # Input vocabulary, embedding width and number of output classes.
    vocab_size=len(ch2idx),
    embed_dim=100,
    num_classes=len(class2idx),
    # Convolution settings: the filter size is the variable under study.
    filter_sizes=[32, 32, 32, 32],
    num_filters=[100, 200, 200, 100],
    dropout=0.5,
    # Optimizer settings.
    optimizerName="Adadelta",
    learning_rate=0.25,
)

# Train for 20 epochs; the run tag and the shared `writer` are handed to the
# trainer together with the validation loader.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '03_4Layer_32KZ',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.630295   |  1.099746  |   68.81   |   88.06  
   2    |   1.013283   |  0.862471  |   75.76   |   87.74  
   3    |   0.781505   |  0.769756  |   78.16   |   87.90  
   4    |   0.640039   |  0.732046  |   79.55   |   88.01  
   5    |   0.537669   |  0.721706  |   79.99   |   88.16  
   6    |   0.458645   |  0.714242  |   80.34   |   87.73  
   7    |   0.402419   |  0.728888  |   80.70   |   87.96  
   8    |   0.356646   |  0.730557  |   81.01   |   87.67  
   9    |   0.319621   |  0.760302  |   81.20   |   87.64  
  10    |   0.287817   |  0.765290  |   81.05   |   87.81  
  11    |   0.266825   |  0.804605  |   80.67   |   87.80  
  12    |   0.250435   |  0.797842  |   81.20   |   87.90  
  13    |   0.231023   |  0.799861  |   81.23   |   87.53  
  14    |   0.217110   |  0.825971  |   81.41   |   87.88  
  15    |   0.207337

In [15]:
# Experiment '04_4Layer_64KZ' (CNN-rand: embeddings are randomly initialized):
# four filter banks, each with filter size 64.
tn.set_seed(42)  # same seed in every experiment cell

cnn_rand, optimizer = init.initilize_model(
    modelType="CNN",
    device=device,
    # Input vocabulary, embedding width and number of output classes.
    vocab_size=len(ch2idx),
    embed_dim=100,
    num_classes=len(class2idx),
    # Convolution settings: the filter size is the variable under study.
    filter_sizes=[64, 64, 64, 64],
    num_filters=[100, 200, 200, 100],
    dropout=0.5,
    # Optimizer settings.
    optimizerName="Adadelta",
    learning_rate=0.25,
)

# Train for 20 epochs; the run tag and the shared `writer` are handed to the
# trainer together with the validation loader.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '04_4Layer_64KZ',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.765748   |  1.251953  |   64.18   |  146.07  
   2    |   1.136959   |  1.026207  |   70.85   |  147.91  
   3    |   0.821731   |  0.949986  |   73.05   |  149.56  
   4    |   0.608853   |  0.934778  |   73.94   |  144.26  
   5    |   0.463158   |  0.958263  |   74.42   |  144.32  
   6    |   0.370929   |  0.980603  |   74.52   |  144.27  
   7    |   0.301239   |  1.012578  |   74.89   |  140.04  
   8    |   0.261558   |  1.055656  |   74.83   |  135.76  
   9    |   0.224346   |  1.109466  |   74.99   |  132.54  
  10    |   0.200673   |  1.106147  |   74.81   |  143.58  
  11    |   0.177811   |  1.138093  |   74.87   |  145.48  
  12    |   0.165410   |  1.175365  |   74.98   |  148.27  
  13    |   0.150731   |  1.197865  |   75.09   |  145.55  
  14    |   0.143135   |  1.229035  |   75.13   |  145.88  
  15    |   0.132451