In [1]:
import torch
import pandas as pd
from sklearn import metrics

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Reference: CNN for sentence classification tutorial
# https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Select the compute device once; later cells pass `device` to the model, trainer and tester helpers.
if not torch.cuda.is_available():
    # CPU fallback when no CUDA device is visible.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project-local `data` module.
# Only `df` is consumed by the cells visible in this notebook; what `size_info` and
# `size_dict` contain is not shown here — TODO confirm against data.get_df().
df, size_info, size_dict = data.get_df()

In [5]:
# Pull the source-code column and the class-label column out of the frame
# (assumes `df` is a pandas DataFrame — TODO confirm against data.get_df()).
sourceCode_np = df["sourceCode"].values
codeClass_np = df["classLabel"].values

In [6]:
# Tokenize, build vocabulary, encode tokens
# utils.tokenize returns the tokenized samples, the `ch2idx` mapping (its length is later
# used as the model's vocab_size) and `max_len`; utils.encode then produces `input_ids`
# from the tokens using that mapping and length.
# NOTE(review): presumably character-level tokens padded to `max_len` — confirm in utils.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the `ch2idx` mapping to the project-local info_recorder module.
# NOTE(review): presumably persists the vocabulary for later reuse — confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels. `encoded_class2idx` is the label array that gets split into
# train/val/test labels, and len(class2idx) is used as the model's number of output classes.
# NOTE(review): `num_classes` is not used in the cells visible here — presumably equal to
# len(class2idx); confirm in utils.tokenize_encode_class.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, encoded_class2idx, test_size = 0.1, random_state = 43
)

In [10]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    train_inputs, train_labels, test_size = 0.1, random_state = 43
)

In [11]:
# Wrap the three splits in PyTorch DataLoaders, using a batch size of 50.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(
    train_inputs, val_inputs, test_inputs,
    train_labels, val_labels, test_labels,
    batch_size=50,
)

In [12]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Tag the log directory with the session start time so each session's event files land
# in their own run directory instead of piling up in one shared folder. Previously
# `timestamp` was computed but never used.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'with_prec_recall/cnn_{timestamp}')

In [13]:
# CNN-rand: the embedding vectors are randomly initialized.
# Seed first so that model construction and training are repeatable.
tn.set_seed(42)

cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 16, 16, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)

# Train for 20 epochs; the recorded log shows validation loss/accuracy reported per epoch.
tn.train(
    device, cnn_rand, optimizer, train_dataloader,
    '01_test3_cnn', writer, val_dataloader,
    epochs=20,
)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.391910   |  0.835852  |   76.81   |   73.39  
   2    |   0.794447   |  0.648879  |   82.21   |   73.20  
   3    |   0.652138   |  0.581333  |   83.94   |   73.18  
   4    |   0.573458   |  0.551433  |   84.61   |   73.15  
   5    |   0.518128   |  0.528120  |   85.33   |   73.09  
   6    |   0.472800   |  0.512077  |   85.94   |   73.03  
   7    |   0.441053   |  0.507951  |   85.93   |   73.01  
   8    |   0.410278   |  0.496286  |   86.48   |   73.01  
   9    |   0.386647   |  0.493021  |   86.71   |   73.02  
  10    |   0.364926   |  0.495663  |   86.63   |   73.01  
  11    |   0.345848   |  0.496871  |   86.92   |   73.01  
  12    |   0.331930   |  0.499155  |   86.80   |   73.00  
  13    |   0.315864   |  0.512689  |   86.71   |   73.03  
  14    |   0.304009   |  0.499484  |   86.89   |   73.00  
  15    |   0.293900

In [14]:
# Evaluate the trained model on the held-out test split. The recorded output shows the
# helper printing test loss and accuracy.
# NOTE(review): `tot_pred` / `tot_label` are presumably tensors holding all test predictions
# and labels (`.cpu()` is called on them in the metric cells) — confirm in tester.test.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

test loss:  0.5283101548751196
test acc:  87.73809523809524


In [15]:
# Confusion matrix over the test set: rows are true classes, columns are predicted classes.
confusion = metrics.confusion_matrix(tot_label.cpu(), tot_pred.cpu())
print(confusion)

[[ 848    1    9    7    0    4   18    0    0    5    1   15    7   12
     2    3    5    0    6    3   15]
 [   0  890    7    5    5    2    2    3    6    6    1   21    4    5
     0    7   12    2   10    5    7]
 [   3    2  930    3    1    0    6    3    0    1    1    4   40    0
     0    2   18    6    5    4    1]
 [   9    6    9  699    3   15   19    4    1   15   55    6    0    2
     2    2    1    1    3    4  134]
 [   1    6    4    2  979    0    0    2    1    1    2   20    2    3
     1    4    9    0    8    3    2]
 [   3    1    0   19    1  832   24    4    0    1    6   29    0    0
     0    1    0    0    4    0   18]
 [  11    1    5    7    0   32  914    1    0    7    0   31    6    2
     3    2    2    1    4    1    5]
 [   6    6    6    6    3    4    1  870    0    3    3    7    1    3
     2    7    8    1    5    5    4]
 [   0    0    0    0    0    0    0    0 1000    0    0    0    0    0
     0    0    0    0    0    0    2]
 [   3    

In [16]:
# Build the classification report as a dict, turn it into a table with one row per
# report entry, and export that table to a spreadsheet.
results = metrics.classification_report(
    tot_label.cpu(),
    tot_pred.cpu(),
    output_dict=True,
)
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel(
    '../result/precision_recall_results.xlsx',
    sheet_name='final_with_test_512a',
)


In [None]:
# CNN-rand variant: same setup as the earlier training cell, but every filter size is 8.
# NOTE(review): this rebinds `cnn_rand` / `optimizer` and reuses both the run name
# '01_test3_cnn' and the same `writer` as the earlier training cell — presumably this
# overwrites or mixes with that run's artifacts; confirm in trainer.train.
tn.set_seed(42)

cnn_rand, optimizer = init.initilize_model(
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    filter_sizes=[8, 8, 8, 8],
    num_filters=[100, 200, 200, 100],
    num_classes=len(class2idx),
    dropout=0.5,
    learning_rate=0.25,
    optimizerName="Adadelta",
    modelType="CNN",
)

tn.train(
    device, cnn_rand, optimizer, train_dataloader,
    '01_test3_cnn', writer, val_dataloader,
    epochs=20,
)