In [1]:
import torch
import pandas as pd
from sklearn import metrics

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was picked so the recorded output documents the hardware used.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project-local `data` module. `get_df()` returns
# three values; of these, only `df` is used by the cells visible in this
# notebook (its `sourceCode` and `classLabel` columns are read next).
# NOTE(review): the contents of `size_info` / `size_dict` cannot be determined
# from this notebook -- check data.get_df.
df, size_info, size_dict = data.get_df()

In [4]:
# Extract the model inputs and targets from the frame as raw arrays:
# the `sourceCode` column feeds the tokenizer, `classLabel` feeds the label encoder.
sourceCode_np = df["sourceCode"].values
codeClass_np = df["classLabel"].values

In [5]:
# Tokenize, build vocabulary, encode tokens
# `utils.tokenize` yields the tokenized inputs, the vocabulary mapping
# (`ch2idx`, whose length is later used as the model's vocab size) and
# `max_len`; `utils.encode` then converts the tokens into `input_ids` using
# that mapping and length.
# NOTE(review): presumably `ch2idx` maps token -> integer id and `max_len` is
# the padding length -- confirm in utils.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [6]:
# Hand the vocabulary mapping to the project-local info recorder.
# NOTE(review): presumably this persists `ch2idx` so the same vocabulary can be
# reused outside this notebook -- confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [7]:
# Encode the class labels. The call returns three values: the encoded labels
# (`encoded_class2idx`, used as the targets when the data is split), the label
# mapping (`class2idx`, whose length sets the model's output size) and
# `num_classes`.
# NOTE(review): `num_classes` is presumably equal to len(class2idx) -- confirm in utils.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [8]:
from sklearn.model_selection import train_test_split

# Train / validation / test split, done in one idempotent cell.
#
# Step 1 holds out 10% of all samples as the test set; step 2 holds out 10% of
# the remainder as the validation set. The intermediate (train + val) portion
# gets its own `trainval_*` names instead of being written back into
# `train_inputs` / `train_labels`: previously the second split re-assigned
# `train_inputs` from itself, so re-running that cell silently shrank the
# training set a little more each time. With the same inputs and the same
# fixed `random_state`, the resulting six arrays are the same as before on a
# fresh run.
trainval_inputs, test_inputs, trainval_labels, test_labels = train_test_split(
    input_ids, encoded_class2idx, test_size = 0.1, random_state = 43
)

train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    trainval_inputs, trainval_labels, test_size = 0.1, random_state = 43
)

In [10]:
# Load data to PyTorch DataLoader
# Arguments are passed positionally: the three input splits first
# (train, val, test), then the three label splits in the same order.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(
    train_inputs, val_inputs, test_inputs,
    train_labels, val_labels, test_labels,
    batch_size=50,
)

In [11]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('overfitRNN/tests')

In [12]:
# Build and train the model for this run.
# NOTE: despite the legacy variable name `cnn_rand`, this cell requests
# modelType="RNN" (3 layers, hidden size 100, no dropout) -- it is not a CNN.
# The name is kept because the evaluation cell refers to it.
# The seed is fixed before initialization so the run starts from the same state.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=100,
                                           hidden_size=100,
                                           num_classes=len(class2idx),
                                           n_layers=3,
                                           dropout=0,
                                           learning_rate=0.25,
                                           optimizerName="Adadelta",
                                           modelType="RNN")

# Train for up to 40 epochs; 'test7' is the run label passed to the trainer
# together with the TensorBoard writer and the validation loader.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test7', writer, val_dataloader, epochs=40)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.875202   |  2.532537  |   19.51   |  104.85  
   2    |   2.360859   |  2.168046  |   31.42   |  105.30  
   3    |   1.863051   |  1.506281  |   53.42   |  106.01  
   4    |   1.289822   |  1.161392  |   65.64   |  105.27  
   5    |   0.998004   |  0.964615  |   71.84   |  105.78  
   6    |   0.854869   |  0.836482  |   75.52   |  105.54  
   7    |   0.760332   |  0.796348  |   76.57   |  106.12  
   8    |   0.687627   |  0.745261  |   78.37   |  105.46  
   9    |   0.625972   |  0.736644  |   78.71   |  105.83  
  10    |   0.573777   |  0.714541  |   79.46   |  105.69  
  11    |   0.527650   |  0.722827  |   79.56   |  106.03  
  12    |   0.484513   |  0.690556  |   80.26   |  105.33  
  13    |   0.446231   |  0.679794  |   80.84   |  105.83  
  14    |   0.410169   |  0.699966  |   80.66   |  105.74  
  15    |   0.377796

In [13]:
# Run the trained model over the test loader and collect every prediction
# together with its true label.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Build scikit-learn's classification report as a dict; both tensors are moved
# to the CPU first so scikit-learn can read them.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)

# One row per report entry, one column per metric, then export to Excel.
results_df = pd.DataFrame(results).T
results_df.to_excel('../result/07_overfitRNN_test7.xlsx', sheet_name='sheet1')

test loss:  1.3971700340154625
test acc:  81.05714285714286
