In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Reference tutorial (CNN for sentence classification): https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Pick the compute device once: CUDA when it is usable, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected so the captured output documents the hardware used.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project-local `data` module. Besides the
# DataFrame, get_df() returns two size-related objects whose exact contents
# are not visible in this notebook -- TODO confirm against data.get_df.
df, size_info, size_dict = data.get_df()

In [5]:
# Extract the underlying arrays of the source-code and class-label columns.
sourceCode_np = df["sourceCode"].values
codeClass_np = df["classLabel"].values

In [6]:
# Tokenize the source strings, build the vocabulary, then encode tokens as ids.
# utils.tokenize returns the tokenized inputs, a token-to-index mapping
# (`ch2idx`; presumably character-level -- TODO confirm) and `max_len`.
# All three are passed on to utils.encode (presumably so every sequence is
# padded to a common length -- verify in utils).
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the vocabulary mapping to the project-local info_recorder module
# (presumably persists it for later reuse -- TODO confirm in info_recorder).
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels. The helper returns the encoded labels, a
# class-to-index mapping and the number of classes (as the names suggest;
# exact types are defined in utils -- TODO confirm).
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded samples for validation; the fixed random_state
# keeps the split identical from one run to the next.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [10]:
# Wrap the train/validation splits in PyTorch DataLoaders, using batch_size=50.
train_dataloader, val_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    train_labels,
    val_labels,
    batch_size=50,
)

In [11]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Timestamp identifying this notebook session. It used to be computed but never
# used, so every session wrote its event files into the same directory.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Give each session its own sub-directory. The root stays '248chars/cnn', so
# pointing TensorBoard at '248chars/cnn' (or '248chars') still finds all runs.
writer = SummaryWriter(f'248chars/cnn/{timestamp}')

In [12]:
# # Experiment 01 (disabled) - CNN-rand: word vectors are randomly initialized; filter_sizes all 8, dropout 0.5.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                            vocab_size=len(ch2idx),
#                                            embed_dim=100,
#                                            filter_sizes=[8, 8, 8, 8],
#                                            num_filters=[100, 200, 200, 100],
#                                            num_classes=len(class2idx),
#                                            dropout=0.5,
#                                            learning_rate=0.25,
#                                            optimizerName="Adadelta",
#                                            modelType="CNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '01_4Layer_248data_8KS', writer, val_dataloader, epochs=20)

In [13]:
# # Experiment 02 (disabled) - CNN-rand: word vectors are randomly initialized; filter_sizes [8, 8, 16, 16], dropout 0.5.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                            vocab_size=len(ch2idx),
#                                            embed_dim=100,
#                                            filter_sizes=[8, 8, 16, 16],
#                                            num_filters=[100, 200, 200, 100],
#                                            num_classes=len(class2idx),
#                                            dropout=0.5,
#                                            learning_rate=0.25,
#                                            optimizerName="Adadelta",
#                                            modelType="CNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '02_4Layer_w/16_8KS', writer, val_dataloader, epochs=20)

In [14]:
# # Experiment 03 (disabled) - CNN-rand: word vectors are randomly initialized; filter_sizes [8, 16, 16, 8], dropout 0.5.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                            vocab_size=len(ch2idx),
#                                            embed_dim=100,
#                                            filter_sizes=[8, 16, 16, 8],
#                                            num_filters=[100, 200, 200, 100],
#                                            num_classes=len(class2idx),
#                                            dropout=0.5,
#                                            learning_rate=0.25,
#                                            optimizerName="Adadelta",
#                                            modelType="CNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '03_4Layer_mid16_8KS', writer, val_dataloader, epochs=20)

In [15]:
# Experiment 04, dropout 0.25 - CNN-rand: word vectors are randomly initialized.
# NOTE(review): the run tag below says "w/16", but filter_sizes equals the
# configuration of the commented-out "mid16" experiment -- confirm the tag.
tn.set_seed(42)

# Collect the hyper-parameters in one place before building the model.
model_kwargs = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 16, 16, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.25,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)
cnn_rand, optimizer = init.initilize_model(**model_kwargs)

# Train for 20 epochs, passing the shared TensorBoard writer and the run tag.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '04_4Layer_w/16_DP0.25_8KS',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.276246   |  0.743430  |   79.41   |   80.12  
   2    |   0.669179   |  0.590509  |   83.72   |   82.23  
   3    |   0.509532   |  0.548174  |   84.52   |   82.40  
   4    |   0.413553   |  0.503855  |   86.09   |   80.83  
   5    |   0.343636   |  0.500661  |   86.33   |   79.33  
   6    |   0.289125   |  0.497284  |   86.75   |   78.14  
   7    |   0.244581   |  0.500031  |   87.06   |   78.00  
   8    |   0.210959   |  0.511446  |   86.96   |  101.67  
   9    |   0.186363   |  0.525254  |   87.04   |  213.46  
  10    |   0.162785   |  0.526551  |   87.18   |   97.78  
  11    |   0.145472   |  0.556575  |   87.40   |   82.43  
  12    |   0.130451   |  0.561892  |   87.40   |   82.67  
  13    |   0.117535   |  0.581733  |   87.16   |   82.49  
  14    |   0.109264   |  0.592301  |   87.20   |   83.47  
  15    |   0.099375

In [16]:
# Experiment 04, dropout 0.75 - CNN-rand: word vectors are randomly initialized.
# NOTE(review): the run tag below says "w/16", but filter_sizes equals the
# configuration of the commented-out "mid16" experiment -- confirm the tag.
tn.set_seed(42)

# Collect the hyper-parameters in one place before building the model.
model_kwargs = dict(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 16, 16, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.75,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)
cnn_rand, optimizer = init.initilize_model(**model_kwargs)

# Train for 20 epochs, passing the shared TensorBoard writer and the run tag.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '04_4Layer_w/16_DP0.75_8KS',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.638853   |  0.980057  |   73.36   |   83.61  
   2    |   1.052225   |  0.792415  |   78.88   |   82.69  
   3    |   0.924927   |  0.714917  |   80.65   |   82.97  
   4    |   0.858078   |  0.666322  |   82.06   |   83.47  
   5    |   0.819791   |  0.640179  |   82.80   |   83.29  
   6    |   0.787651   |  0.620616  |   83.26   |   82.70  
   7    |   0.768803   |  0.615219  |   83.52   |   82.55  
   8    |   0.750129   |  0.592537  |   84.05   |   80.98  
   9    |   0.734515   |  0.587970  |   84.23   |   81.33  
  10    |   0.721551   |  0.584782  |   84.25   |   80.88  
  11    |   0.710671   |  0.582200  |   84.48   |   80.82  
  12    |   0.699656   |  0.567211  |   84.61   |   81.36  
  13    |   0.690464   |  0.565315  |   84.66   |   81.48  
  14    |   0.680634   |  0.560116  |   85.01   |   81.33  
  15    |   0.671501