In [1]:
import torch

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn

In [2]:
# Reference: CNN for sentence classification tutorial - https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Choose the compute device once: a CUDA GPU when one is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Report what was selected so the run log records the hardware used.
if use_cuda:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project's `data` module. get_df() returns three
# values: `df` (presumably a pandas DataFrame, given the column access below)
# and two size-related objects.
# NOTE(review): the contents of size_info / size_dict are not visible from this
# notebook - confirm in data.py.
df, size_info, size_dict = data.get_df()

In [5]:
# Pull the source-code column (model input) and the class-label column (target)
# out of df via `.values` (presumably NumPy arrays, assuming df is a pandas DataFrame).
sourceCode_np = df.sourceCode.values
codeClass_np = df.classLabel.values

In [6]:
# Tokenize, build vocabulary, encode tokens.
# utils.tokenize returns the tokenized inputs, the ch2idx vocabulary mapping and max_len;
# utils.encode then converts the tokens to input_ids using that mapping and max_len
# (presumably padding/truncating every sample to max_len - TODO confirm in utils).
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the vocabulary mapping to info_recorder (presumably persisted so the same
# encoding can be reused outside this notebook - confirm in info_recorder).
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels. Judging by the names, the helper returns the encoded
# labels, the class2idx mapping and the number of classes.
# Note: despite its name, encoded_class2idx is used below as the label array
# passed to train_test_split, not as a mapping.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded samples for validation; the fixed random_state
# keeps the split identical across re-runs.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    encoded_class2idx,
    test_size=0.2,
    random_state=43,
)

In [10]:
# Wrap the train/validation splits in PyTorch DataLoaders (batch_size=50).
train_dataloader, val_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    train_labels,
    val_labels,
    batch_size=50,
)

In [11]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# NOTE(review): `timestamp` is computed but not used by any cell visible here;
# either fold it into the log directory / run name or drop it if no later cell reads it.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# The writer uses a fixed log directory; it is later passed to tn.train together
# with what looks like a per-run name, so every experiment logs under 'dirKS/cnn'.
writer = SummaryWriter('dirKS/cnn')

In [12]:
# # CNN-rand: Word vectors are randomly initialized.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                         vocab_size=len(ch2idx),
#                                         embed_dim=100,
#                                         filter_sizes=[64, 32, 16, 8],
#                                         num_filters=[100, 200, 200, 100],
#                                         num_classes=len(class2idx),
#                                         dropout=0.5,
#                                         learning_rate=0.25,
#                                         optimizerName="Adadelta",
#                                         modelType="CNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '01_4Layer_lessKS', writer, val_dataloader, epochs=20)

In [13]:
# # CNN-rand: Word vectors are randomly initialized.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                         vocab_size=len(ch2idx),
#                                         embed_dim=100,
#                                         filter_sizes=[8, 16, 32, 64],
#                                         num_filters=[100, 200, 200, 100],
#                                         num_classes=len(class2idx),
#                                         dropout=0.5,
#                                         learning_rate=0.25,
#                                         optimizerName="Adadelta",
#                                         modelType="CNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '02_4Layer_moreKS', writer, val_dataloader, epochs=20)

In [14]:
# # CNN-rand: Word vectors are randomly initialized.
# tn.set_seed(42)
# cnn_rand, optimizer = init.initilize_model(device=device,
#                                         vocab_size=len(ch2idx),
#                                         embed_dim=100,
#                                         filter_sizes=[32, 16, 8],
#                                         num_filters=[100, 200, 100],
#                                         num_classes=len(class2idx),
#                                         dropout=0.5,
#                                         learning_rate=0.25,
#                                         optimizerName="Adadelta",
#                                         modelType="CNN")
                                        
# tn.train(device, cnn_rand, optimizer, train_dataloader, '03_3Layer_lessKS', writer, val_dataloader, epochs=20)

In [15]:
# CNN-rand: embeddings are randomly initialized (no pretrained vectors).
# Active variant: filter_sizes [8, 16, 32] with num_filters [100, 200, 100].
tn.set_seed(42)  # seed first; presumably makes the random initialization repeatable

# Hyperparameters are gathered in one mapping so a variant of this run only
# needs to change the entries that differ.
cnn_rand_config = {
    "device": device,
    "vocab_size": len(ch2idx),
    "embed_dim": 100,
    "filter_sizes": [8, 16, 32],
    "num_filters": [100, 200, 100],
    "num_classes": len(class2idx),
    "dropout": 0.5,
    "learning_rate": 0.25,
    "optimizerName": "Adadelta",
    "modelType": "CNN",
}
cnn_rand, optimizer = init.initilize_model(**cnn_rand_config)

# Train for 20 epochs, evaluating on the validation loader; the string argument
# is presumably the run name used for logging/checkpoints - confirm in trainer.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '04_3Layer_moreKS',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.653917   |  1.076117  |   69.87   |   43.18  
   2    |   1.050978   |  0.856041  |   75.85   |   42.49  
   3    |   0.880146   |  0.770578  |   78.00   |   42.27  
   4    |   0.780376   |  0.727186  |   79.35   |   42.30  
   5    |   0.708870   |  0.699039  |   80.15   |   43.00  
   6    |   0.658159   |  0.682416  |   80.57   |   42.76  
   7    |   0.613191   |  0.668153  |   81.21   |   40.62  
   8    |   0.572898   |  0.658098  |   81.42   |   42.56  
   9    |   0.544534   |  0.649805  |   81.75   |   42.90  
  10    |   0.517814   |  0.653204  |   82.08   |   43.57  
  11    |   0.497013   |  0.646188  |   82.29   |   43.03  
  12    |   0.474245   |  0.649149  |   82.29   |   42.90  
  13    |   0.462767   |  0.650199  |   82.61   |   42.87  
  14    |   0.443475   |  0.683116  |   82.42   |   42.29  
  15    |   0.433253