In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Reference tutorial (CNN for sentence classification): https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Choose the compute device once; later cells pass `device` to model setup and training.
if not torch.cuda.is_available():
    # No CUDA device visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    # Report how many GPUs there are and the name of GPU 0.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project's `data` module.
# Only `df` is consumed by the cells visible in this notebook section; what
# `size_info` and `size_dict` contain is not shown here — TODO document.
df, size_info, size_dict = data.get_df()

In [5]:
# Pull the model inputs (source code) and targets (class labels) out of the
# frame as plain arrays. Assumes `df` is a pandas DataFrame — TODO confirm.
sourceCode_np = df["sourceCode"].values
codeClass_np = df["classLabel"].values

In [6]:
# Tokenize, build vocabulary, encode tokens
# `utils.tokenize` yields the tokenized sources, the `ch2idx` vocabulary mapping
# and `max_len`; `utils.encode` then uses all three to produce `input_ids`.
# NOTE(review): the name `ch2idx` suggests a character-level vocabulary and
# `max_len` a padding/truncation length — confirm against utils.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the vocabulary mapping to the info recorder.
# NOTE(review): presumably persisted so the same encoding can be reused outside
# this notebook — confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels. Judging by the names, this returns the encoded labels,
# the class-to-index mapping and the number of classes — confirm in utils.
# NOTE(review): the model cells visible in this section pass len(class2idx)
# rather than `num_classes`; presumably the same value — verify.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded samples for validation; the fixed random_state
# keeps the split identical across notebook runs.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [10]:
# Wrap the train/validation splits in PyTorch DataLoaders via the project's
# `data_loader` helper, using a batch size of 50.
train_dataloader, val_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    train_labels,
    val_labels,
    batch_size=50,
)

In [11]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Tag the log directory with the time this cell ran. Previously `timestamp` was
# computed but never used, so every re-run of the notebook wrote its event files
# into the same '512chars/cnn' directory and mixed old and new runs.
# All runs stay under '512chars/', so `tensorboard --logdir 512chars` still
# shows them side by side.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'512chars/cnn_{timestamp}')

In [12]:
# Experiment '01_4Layer_512data_8KS': four filter groups, each with
# filter size 8, dropout 0.5, Adadelta at learning rate 0.25.
# CNN-rand: no pretrained vectors are passed in, so embeddings are presumably
# randomly initialized inside `init.initilize_model` — confirm there.
tn.set_seed(42)  # same seed in every experiment cell

cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 8, 8, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)

# Train for 20 epochs; the run-name string and the shared TensorBoard writer
# are handed to the trainer together with the validation loader.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_4Layer_512data_8KS',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.208024   |  0.687872  |   81.41   |  104.41  
   2    |   0.677051   |  0.535532  |   85.25   |  104.53  
   3    |   0.566109   |  0.489545  |   86.57   |  104.63  
   4    |   0.505964   |  0.446910  |   87.80   |  104.67  
   5    |   0.467027   |  0.430970  |   88.25   |  104.66  
   6    |   0.435583   |  0.425795  |   88.53   |  104.63  
   7    |   0.412378   |  0.403669  |   88.91   |  104.63  
   8    |   0.391239   |  0.395171  |   89.38   |  104.61  
   9    |   0.373679   |  0.387955  |   89.60   |  104.63  
  10    |   0.357510   |  0.379645  |   89.85   |  104.62  
  11    |   0.345772   |  0.382043  |   89.82   |  104.60  
  12    |   0.335334   |  0.380302  |   89.90   |  104.61  
  13    |   0.320171   |  0.378990  |   90.03   |  104.60  
  14    |   0.313365   |  0.372272  |   90.25   |  104.58  
  15    |   0.304238

In [13]:
# Experiment '01_4Layer_512data_8+16KS': same setup as the all-8 run, but the
# two middle filter groups use filter size 16 (filter_sizes=[8, 16, 16, 8]).
# CNN-rand: no pretrained vectors are passed in, so embeddings are presumably
# randomly initialized inside `init.initilize_model` — confirm there.
tn.set_seed(42)  # same seed in every experiment cell

cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 16, 16, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)

# Train for 20 epochs; the run-name string and the shared TensorBoard writer
# are handed to the trainer together with the validation loader.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_4Layer_512data_8+16KS',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.183757   |  0.625697  |   83.00   |  132.86  
   2    |   0.618340   |  0.497635  |   86.53   |  132.71  
   3    |   0.506801   |  0.446053  |   87.89   |  132.60  
   4    |   0.441065   |  0.416229  |   88.75   |  132.55  
   5    |   0.391872   |  0.391453  |   89.26   |  132.52  
   6    |   0.354988   |  0.382385  |   89.68   |  132.49  
   7    |   0.322876   |  0.377792  |   89.91   |  133.03  
   8    |   0.299157   |  0.362116  |   90.30   |  132.74  
   9    |   0.275040   |  0.362983  |   90.58   |  132.47  
  10    |   0.256259   |  0.358487  |   90.85   |  132.63  
  11    |   0.241221   |  0.366385  |   90.81   |  132.26  
  12    |   0.227594   |  0.375654  |   90.77   |  132.30  
  13    |   0.217501   |  0.372700  |   91.01   |  132.35  
  14    |   0.205863   |  0.368005  |   91.03   |  132.56  
  15    |   0.195554

In [14]:
# Experiment '01_4Layer_512data_65DP_8+16KS': filter_sizes=[8, 16, 16, 8] as in
# the 8+16 run, with dropout raised from 0.5 to 0.65.
# CNN-rand: no pretrained vectors are passed in, so embeddings are presumably
# randomly initialized inside `init.initilize_model` — confirm there.
tn.set_seed(42)  # same seed in every experiment cell

cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 16, 16, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.65,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)

# Train for 20 epochs; the run-name string and the shared TensorBoard writer
# are handed to the trainer together with the validation loader.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_4Layer_512data_65DP_8+16KS',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.272304   |  0.673995  |   82.02   |  169.25  
   2    |   0.717860   |  0.549097  |   85.30   |  313.46  
   3    |   0.613829   |  0.489380  |   86.83   |  311.64  
   4    |   0.556838   |  0.462381  |   87.48   |  313.87  
   5    |   0.521028   |  0.442982  |   87.99   |  311.56  
   6    |   0.489890   |  0.426444  |   88.47   |  311.34  
   7    |   0.464771   |  0.421841  |   88.52   |  313.86  
   8    |   0.444185   |  0.410466  |   89.05   |  192.49  
   9    |   0.427912   |  0.410958  |   89.24   |  132.24  
  10    |   0.414225   |  0.404322  |   89.44   |  132.17  
  11    |   0.405319   |  0.400589  |   89.67   |  132.11  
  12    |   0.391491   |  0.394453  |   89.69   |  132.11  
  13    |   0.378846   |  0.391588  |   89.93   |  132.18  
  14    |   0.369864   |  0.388876  |   90.01   |  132.17  
  15    |   0.364897