In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Reference: CNN sentence-classification tutorial - https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected so the captured output records the hardware used.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project-local `data` module. get_df() returns
# three values; of these, only `df` is read again by the cells shown here.
# NOTE(review): `size_info` / `size_dict` look like dataset-size summaries -
# confirm against data.get_df() before relying on them.
df, size_info, size_dict = data.get_df()

In [5]:
# Extract the model inputs (`sourceCode` column) and the targets (`classLabel`
# column) from the frame as plain value arrays for the tokenization steps below.
sourceCode_np = df["sourceCode"].values
codeClass_np = df["classLabel"].values

In [6]:
# Tokenize the raw source strings and build the token -> index vocabulary
# (`ch2idx`), then encode every tokenized sample with that vocabulary.
# NOTE(review): the name `ch2idx` suggests character-level tokens, and `max_len`
# is presumably the length every sample is padded to by utils.encode - confirm
# both in the utils module.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the vocabulary mapping produced by utils.tokenize to the info_recorder
# module.
# NOTE(review): presumably this persists the mapping so the same encoding can
# be reproduced later (e.g. at inference time) - confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels. `encoded_class2idx` is what gets split into
# train/val/test labels below, and len(class2idx) is later passed to the model
# as its number of output classes.
# NOTE(review): `num_classes` returned here is not read by the cells shown;
# it is presumably equal to len(class2idx) - confirm in utils.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

# First split: hold out 10% of all encoded samples as the test set. The fixed
# random_state makes the partition repeatable across runs.
(train_inputs, test_inputs,
 train_labels, test_labels) = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.1,
    random_state=43,
)

In [10]:
# Second split: carve a validation set out of the remaining training data
# (10% of the 90% left after the test hold-out, i.e. about 9% of all samples).
# Caution: this rebinds train_inputs / train_labels from their own previous
# values, so re-running this cell without first re-running the test split
# shrinks the training set again.
(train_inputs, val_inputs,
 train_labels, val_labels) = train_test_split(
    train_inputs,
    train_labels,
    test_size=0.1,
    random_state=43,
)

In [11]:
# Wrap the train / validation / test splits in DataLoaders via the project's
# data_loader helper, using mini-batches of 50 samples.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(
    train_inputs, val_inputs, test_inputs,
    train_labels, val_labels, test_labels,
    batch_size=50,
)

In [12]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Give every run its own log directory. Previously `timestamp` was computed but
# never used, so all runs wrote into the same 'cnnDD/cnn' directory and their
# curves were merged into a single TensorBoard run. Pointing TensorBoard at
# `--logdir cnnDD` still shows every run, now as separate entries.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'cnnDD/cnn_{timestamp}')

In [13]:
# CNN-rand variant: embeddings are randomly initialized (no pretrained vectors
# are passed to the initializer).
# Fix the RNG seed before building the model so the run is repeatable.
tn.set_seed(42)

cnn_rand, optimizer = init.initilize_model(
    # Where the model lives and which architecture to build.
    device=device,
    modelType="CNN",
    # Input/output dimensions derived from the vocabularies built above.
    vocab_size=len(ch2idx),
    num_classes=len(class2idx),
    embed_dim=100,
    # Convolution branches: filter_sizes[i] is paired with num_filters[i].
    filter_sizes=[8, 16, 16, 8],
    num_filters=[100, 200, 200, 100],
    dropout=0.5,
    # Optimizer settings.
    optimizerName="Adadelta",
    learning_rate=0.25,
)

# Train for 20 epochs, validating on val_dataloader and logging to `writer`.
# NOTE(review): '01_test2' is presumably the run/checkpoint name - confirm in
# trainer.train.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_test2',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.607090   |  0.997892  |   72.21   |   47.38  
   2    |   0.974105   |  0.791910  |   77.58   |   45.03  
   3    |   0.806800   |  0.712327  |   79.94   |   45.07  
   4    |   0.708188   |  0.676445  |   80.96   |   45.07  
   5    |   0.639850   |  0.635927  |   82.33   |   45.07  
   6    |   0.587380   |  0.615193  |   82.81   |   45.09  
   7    |   0.547045   |  0.603775  |   83.13   |   45.07  
   8    |   0.509194   |  0.593805  |   83.83   |   45.08  
   9    |   0.486893   |  0.594752  |   83.92   |   45.08  
  10    |   0.457497   |  0.582666  |   84.35   |   45.08  
  11    |   0.436989   |  0.584612  |   84.38   |   45.05  
  12    |   0.420171   |  0.590012  |   84.46   |   45.06  
  13    |   0.400792   |  0.600629  |   84.34   |   45.07  
  14    |   0.387504   |  0.584277  |   84.60   |   45.06  
  15    |   0.374274

In [14]:
# Evaluate the trained model on the held-out test split; the captured output
# below shows the resulting test loss and accuracy.
# NOTE(review): whether ts.test also returns these metrics is not visible
# here - confirm in the tester module.
ts.test(device, cnn_rand, test_dataloader)

test loss:  0.6299812532429184
test acc:  84.26666666666667
