In [1]:
from pathlib import Path

import torch
import pandas as pd
from sklearn import metrics

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Reference (CNN sentence-classification tutorial): https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Pick the compute device once; later cells pass `device` to model
# initialisation, training and testing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of GPU 0.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project-local `data` module. `df` is read in the
# next cell via its `sourceCode` and `classLabel` columns; what `size_info` and
# `size_dict` hold is not visible in this notebook — TODO confirm in data.get_df().
df, size_info, size_dict = data.get_df()

In [5]:
# Extract the two columns used downstream as raw value arrays: the source-code
# text (fed to the tokenizer) and the class label (fed to the label encoder).
sourceCode_np, codeClass_np = df.sourceCode.values, df.classLabel.values

In [6]:
# Tokenize, build vocabulary, encode tokens
# `utils` is project-local. Judging by the names, `ch2idx` is the token -> index
# vocabulary and `max_len` a sequence length — TODO confirm in utils.
print("Tokenizing...\n")
# tokenize() returns three values; all three are handed straight to encode() below.
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
# `input_ids` is the feature array that the train/val/test split consumes.
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the vocabulary to the project-local info recorder (presumably persisted so
# the same mapping can be reused later — TODO confirm in info_recorder).
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels with the project-local helper. `encoded_class2idx` is
# used as the label array for the data split, and `len(class2idx)` sets the
# model's number of output classes; `num_classes` is not read in the visible cells.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

# Both splits live in ONE cell and start from the full arrays, so the cell is
# idempotent: splitting `train_inputs` into itself in a separate cell would
# shrink the training set by another 10% every time that cell is re-run.

# Step 1: hold out 10% of all samples as the test set.
trainval_inputs, test_inputs, trainval_labels, test_labels = train_test_split(
    input_ids, encoded_class2idx, test_size = 0.1, random_state = 43
)

# Step 2: hold out 10% of the remainder for validation
# (roughly 81% train / 9% validation / 10% test overall).
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    trainval_inputs, trainval_labels, test_size = 0.1, random_state = 43
)

# Every sample must land in exactly one of the three partitions.
assert len(train_inputs) + len(val_inputs) + len(test_inputs) == len(input_ids)

In [11]:
# Wrap the three partitions in PyTorch DataLoaders, using a batch size of 50.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(
    train_inputs,
    val_inputs,
    test_inputs,
    train_labels,
    val_labels,
    test_labels,
    batch_size=50,
)

In [12]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Give every notebook run its own timestamped sub-directory. With a single fixed
# log directory, event files from repeated runs pile up side by side and their
# curves are drawn on top of each other in TensorBoard. Pointing TensorBoard at
# `with_prec_recall/tests` still shows all runs, now as separate entries.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'with_prec_recall/tests/{timestamp}')

In [13]:
# Experiment "01_test3_cnn" — CNN-rand (word vectors are randomly initialized),
# filter sizes 8/16/16/8. The seed is set through the project trainer before the
# model is built (presumably to make initialisation repeatable — TODO confirm).
tn.set_seed(42)

cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 16, 16, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)

# Train for 20 epochs with the validation loader; the run name and the shared
# TensorBoard writer are passed through to the trainer.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_test3_cnn',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.154640   |  0.639006  |   82.35   |  129.19  
   2    |   0.627950   |  0.505511  |   86.22   |  129.27  
   3    |   0.513300   |  0.453222  |   87.42   |  129.32  
   4    |   0.446759   |  0.423555  |   88.24   |  129.22  
   5    |   0.396855   |  0.400179  |   88.95   |  129.15  
   6    |   0.360231   |  0.398795  |   89.16   |  129.13  
   7    |   0.328866   |  0.378936  |   89.68   |  129.10  
   8    |   0.304838   |  0.364237  |   90.19   |  129.06  
   9    |   0.282731   |  0.364575  |   90.42   |  129.07  
  10    |   0.265651   |  0.360790  |   90.57   |  129.07  
  11    |   0.247932   |  0.360132  |   90.59   |  129.06  
  12    |   0.234224   |  0.371967  |   90.80   |  129.02  
  13    |   0.223537   |  0.368326  |   90.78   |  129.02  
  14    |   0.211308   |  0.364104  |   91.03   |  129.05  
  15    |   0.205210

In [14]:
# Evaluate the model currently bound to `cnn_rand` on the held-out test loader.
# Later cells call `.cpu()` on both results before handing them to sklearn, so
# they are presumably torch tensors — TODO confirm in tester.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

test loss:  0.37255751395686754
test acc:  91.64285714285715


In [15]:
# Confusion matrix for the test set; arguments are passed as (true labels, predictions).
print(metrics.confusion_matrix(tot_label.cpu(), tot_pred.cpu()))

[[ 902    3    3    7    1    2    6    2    0    2    2    4    5    6
     1    0    0    1    3    1   10]
 [   6  947    7    3    5    0    0    1    1    1    3    9    2    0
     0    2    4    0    2    1    6]
 [   6    3  952    0    4    1    2    5    0    1    0    2   34    0
     1    0    4    7    4    2    2]
 [   5    3    1  798    1    7    6    0    0   18   55    3    0    1
     3    1    2    0    1    0   85]
 [   0    6    4    0 1001    0    0    1    2    2    2   13    4    1
     0    1    1    1    3    3    5]
 [   1    5    0    8    1  829   31    1    0    1    7   20    4    0
     1    1    1    0    3    0   29]
 [   5    0    3    3    1   19  951    0    0    2    5   22    5    1
     1    3    2    0    3    0    9]
 [   2    6    1    2    8    1    1  911    0    0    0    2    5    1
     1    3    4    0    2    0    1]
 [   0    0    0    0    0    0    0    0 1002    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   3    

In [16]:
# Classification report for the test predictions, requested as a dict
# (output_dict=True) so it can be turned into a table.
results = metrics.classification_report(tot_label.cpu(), tot_pred.cpu(), output_dict=True)
# Transpose so that each report entry becomes a row.
results_df = pd.DataFrame.from_dict(results).transpose()

# pandas does not create missing parent directories, so make sure `../result`
# exists first — otherwise the export would fail only after training has finished.
report_path = Path('../result/03_final_with_test_512a.xlsx')
report_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_excel(report_path, sheet_name='final_with_test_512a')


In [17]:
# Experiment "01_test4_cnn" — CNN-rand (word vectors are randomly initialized),
# same settings as "01_test3_cnn" except that all four filter sizes are 8.
# Note: this rebinds `cnn_rand` and `optimizer`, replacing the previous model.
tn.set_seed(42)

cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 8, 8, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)

# Train for 20 epochs with the validation loader; the run name and the shared
# TensorBoard writer are passed through to the trainer.
tn.train(
    device,
    cnn_rand,
    optimizer,
    train_dataloader,
    '01_test4_cnn',
    writer,
    val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.237978   |  0.713495  |   80.30   |  101.54  
   2    |   0.688631   |  0.554891  |   84.84   |  101.49  
   3    |   0.573060   |  0.487026  |   86.43   |  101.45  
   4    |   0.511364   |  0.458646  |   87.16   |  101.43  
   5    |   0.471863   |  0.436352  |   87.87   |  101.39  
   6    |   0.440895   |  0.420905  |   88.12   |  101.37  
   7    |   0.414610   |  0.414445  |   88.49   |  101.39  
   8    |   0.392261   |  0.399562  |   88.92   |  101.36  
   9    |   0.376906   |  0.391564  |   89.31   |  101.35  
  10    |   0.360639   |  0.389785  |   89.34   |  101.33  
  11    |   0.346487   |  0.387506  |   89.60   |  101.34  
  12    |   0.332179   |  0.391459  |   89.57   |  101.35  
  13    |   0.323386   |  0.384420  |   89.74   |  101.31  
  14    |   0.316238   |  0.375497  |   89.93   |  101.30  
  15    |   0.305780

In [18]:
# Evaluate the model currently bound to `cnn_rand` (experiment "01_test4_cnn")
# on the held-out test loader.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Classification report as a dict (output_dict=True), transposed so that each
# report entry becomes a row.
results = metrics.classification_report(tot_label.cpu(), tot_pred.cpu(), output_dict=True)
results_df = pd.DataFrame.from_dict(results).transpose()

# pandas does not create missing parent directories, so make sure `../result`
# exists first — otherwise the export would fail only after training has finished.
report_path = Path('../result/04_final_with_test_512b.xlsx')
report_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_excel(report_path, sheet_name='final_with_test_512b')

test loss:  0.36603207735433463
test acc:  90.82380952380953
