In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Reference (CNN for sentence classification tutorial): https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Select the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. Later cells pass `device` to model setup and training.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

if not has_cuda:
    print('No GPU available, using the CPU instead.')
else:
    # Report the number of visible GPUs and the name of device index 0.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [8]:
# Load the dataset through the project's `data` module. get_df() returns three
# values; of these, only `df` is read by the cells visible in this notebook chunk.
# NOTE(review): `size_info` / `size_dict` look like dataset-size summaries — confirm in data.get_df.
df, size_info, size_dict = data.get_df()

In [9]:
# Pull the raw inputs (`sourceCode` column) and their targets (`classLabel`
# column) out of `df` as plain value arrays for tokenization and label encoding.
sourceCode_np, codeClass_np = df.sourceCode.values, df.classLabel.values

In [11]:
# Tokenize the source-code strings, build the token-to-index vocabulary, and
# encode every sample as a sequence of vocabulary indices.
# NOTE(review): the name `ch2idx` suggests character-level tokens, and `max_len`
# is presumably the length sequences are padded to — confirm in utils.tokenize / utils.encode.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [13]:
# Hand the vocabulary mapping to the info recorder.
# NOTE(review): presumably this persists the mapping for reuse at inference time — confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [15]:
# Encode the class labels. Going by the names on the left, this yields the
# encoded label per sample, the class-to-index mapping, and the class count
# (TODO confirm against utils.tokenize_encode_class).
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed random_state makes
# the split repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 43}
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids, encoded_class2idx, **split_options
)

In [18]:
# Wrap the train/validation splits in PyTorch DataLoaders via the project's
# data_loader helper, using a batch size of 50.
# NOTE(review): shuffling/sampling behaviour is decided inside dl.data_loader — confirm there.
train_dataloader, val_dataloader = dl.data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)

In [21]:
# TensorBoard logging: one SummaryWriter per notebook session, with the log
# directory suffixed by the session start time so earlier logs are not reused.
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_dir = 'betterCNN/sourceCodeCNN{}'.format(timestamp)
writer = SummaryWriter(log_dir)

In [23]:
# CNN-rand baseline: word vectors are randomly initialized.
# This run uses a single filter size (8) with 100 filters.
tn.set_seed(42)  # seed before building the model so the run is repeatable
cnn_rand, optimizer = init.initilize_model(device=device,
                                        vocab_size=len(ch2idx),
                                        embed_dim=20,
                                        filter_sizes=[8],
                                        num_filters=[100],
                                        learning_rate=0.25,
                                        num_classes=len(class2idx),
                                        dropout=0.5)
# Use a run label distinct from the four-filter-size run ('4Layer'): both runs
# share the same `writer`, so an identical label makes them indistinguishable
# wherever tn.train uses it (TODO confirm: logging tag and/or checkpoint name).
tn.train(device, cnn_rand, optimizer, train_dataloader, '1Layer', writer, val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.127972   |  1.456987  |   59.99   |   11.04  
   2    |   1.583577   |  1.238361  |   65.91   |   10.19  
   3    |   1.463917   |  1.153546  |   67.99   |   10.19  
   4    |   1.410556   |  1.110863  |   68.90   |   10.22  
   5    |   1.371479   |  1.079923  |   69.81   |   10.30  
   6    |   1.348754   |  1.062568  |   69.97   |   10.27  
   7    |   1.329906   |  1.048667  |   70.62   |   10.29  
   8    |   1.317575   |  1.030562  |   70.93   |   10.21  
   9    |   1.306439   |  1.031103  |   70.97   |   10.19  
  10    |   1.300152   |  1.022082  |   71.22   |   10.28  
  11    |   1.292616   |  1.009193  |   71.45   |   10.26  
  12    |   1.288032   |  1.005755  |   71.63   |   10.19  
  13    |   1.283285   |  0.999962  |   71.66   |   10.18  
  14    |   1.276215   |  0.997508  |   71.82   |   10.21  
  15    |   1.271875

In [24]:
# CNN-rand with four filter sizes: word vectors are randomly initialized.
tn.set_seed(42)

# Hyperparameters for this run, gathered in one place before building the model.
model_config = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=20,
    filter_sizes=[8, 16, 32, 64],
    num_filters=[100, 200, 200, 100],
    learning_rate=0.25,
    num_classes=len(class2idx),
    dropout=0.5,
)
cnn_rand, optimizer = init.initilize_model(**model_config)

# Train for 20 epochs, evaluating on the validation loader.
tn.train(
    device, cnn_rand, optimizer, train_dataloader, '4Layer', writer, val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.771193   |  1.141484  |   68.77   |   28.82  
   2    |   1.144154   |  0.934006  |   73.76   |   28.99  
   3    |   0.975043   |  0.850122  |   76.05   |   29.38  
   4    |   0.877829   |  0.806139  |   77.16   |   29.25  
   5    |   0.808553   |  0.777856  |   78.03   |   29.13  
   6    |   0.757765   |  0.762757  |   78.46   |   29.03  
   7    |   0.716770   |  0.758471  |   78.73   |   28.87  
   8    |   0.684832   |  0.752787  |   78.82   |   28.74  
   9    |   0.653707   |  0.748775  |   79.01   |   28.99  
  10    |   0.631112   |  0.747849  |   79.20   |   28.92  
  11    |   0.609794   |  0.752750  |   79.28   |   29.57  
  12    |   0.590289   |  0.762333  |   79.53   |   28.57  
  13    |   0.573721   |  0.756216  |   79.41   |   28.36  
  14    |   0.561561   |  0.772601  |   79.59   |   28.51  
  15    |   0.549910