In [47]:
import torch
import pandas as pd

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Reference (CNN for sentence classification): https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [3]:
# Pick the compute device once; later cells pass `device` to the model helpers.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of GPU 0.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [4]:
# Load the dataset through the project-local `data` module.
# Only `df` is consumed by the cells visible in this notebook (its `sourceCode`
# and `classLabel` columns); `size_info` and `size_dict` are not referenced
# again here. NOTE(review): confirm what they contain and whether they are needed.
df, size_info, size_dict = data.get_df()

In [5]:
# Extract the two columns used downstream via `.values`:
# `sourceCode` is what gets tokenized, `classLabel` is what gets label-encoded.
sourceCode_np = df.sourceCode.values
codeClass_np = df.classLabel.values

In [6]:
# Tokenize, build vocabulary, encode tokens.
# `utils.tokenize` yields the tokenized inputs, the `ch2idx` mapping (its length
# is later used as the model's vocab_size) and `max_len`; `utils.encode` then
# produces `input_ids` from those three values.
# NOTE(review): the name `ch2idx` suggests character-level tokens and `max_len`
# a padding length; confirm against utils.tokenize / utils.encode.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Hand the vocabulary mapping to the project-local info_recorder module.
# NOTE(review): presumably this persists `ch2idx` so the same mapping can be
# reused outside this notebook; confirm what record_ch2idx writes and where.
ir.record_ch2idx(ch2idx)

In [8]:
# Encode the class labels.
# `encoded_class2idx` is used as the target array in the train/test split, and
# `len(class2idx)` is what the model is configured with as its class count.
# NOTE(review): `num_classes` is returned here but not used in the visible
# cells; confirm it always equals len(class2idx).
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the encoded samples (and their labels) as the test set.
# random_state pins the shuffle so the split is the same on every run.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, encoded_class2idx, test_size = 0.1, random_state = 43
)

In [10]:
# Carve a validation set out of the training portion: 10% of the remaining 90%,
# i.e. about 9% of all samples, leaving about 81% for training.
# NOTE(review): this rebinds `train_inputs` / `train_labels`, so the cell is not
# idempotent. Re-running it without first re-running the train/test split
# shrinks the training set again.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    train_inputs, train_labels, test_size = 0.1, random_state = 43
)

In [11]:
# Load data to PyTorch DataLoader.
# The project-local helper receives the three input splits followed by the three
# label splits and returns one loader per split (train, validation, test),
# configured with batch_size=50.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(train_inputs, val_inputs, test_inputs, train_labels, val_labels, test_labels, batch_size=50)

In [12]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Timestamp of this run, formatted as YYYYMMDD_HHMMSS.
# NOTE(review): `timestamp` is not used anywhere in the visible cells. The
# writer below always logs to the fixed directory 'preRec/cnn', so repeated
# runs share one log directory. Confirm whether the timestamp was meant to be
# part of the log path.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# TensorBoard writer; it is only referenced by the (commented-out) training call.
writer = SummaryWriter('preRec/cnn')

In [13]:
# CNN-rand: Word vectors are randomly initialized.
# Seed first so the model initialisation is repeatable.
tn.set_seed(42)
# Build the model and its optimizer through the project-local initializer.
# vocab_size and num_classes come from the mappings built earlier in the
# notebook; the remaining values are hard-coded hyperparameters.
# NOTE(review): how `filter_sizes` and `num_filters` pair up (four entries each)
# is decided inside init.initilize_model; confirm there.
cnn_rand, optimizer = init.initilize_model(device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=100,
                                           filter_sizes=[8, 8, 8, 8],
                                           num_filters=[100, 200, 200, 100],
                                           num_classes=len(class2idx),
                                           dropout=0.5,
                                           learning_rate=0.25,
                                           optimizerName="Adadelta",
                                           modelType="CNN")

# NOTE(review): the training call below is commented out, and no checkpoint is
# loaded in the visible cells, so `cnn_rand` reaches the evaluation cell with
# freshly initialised weights. The recorded test output (about 4.85% accuracy
# over 21 classes, every sample predicted as the last class) is consistent with
# an untrained model. Re-enable training or load trained weights before
# reading anything into the test metrics.
# tn.train(device, cnn_rand, optimizer, train_dataloader, '01_test1', writer, val_dataloader, epochs=20)

In [14]:
# Evaluate `cnn_rand` on the test loader with the project-local tester.
# It prints the test loss and accuracy and returns the collected predictions and
# labels, which the metric cells use after calling `.cpu()` on them.
# The printed accuracy is a percentage: 4.85 here corresponds to the 0.0485
# accuracy shown by the classification report.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

test loss:  3.0753957981155033
test acc:  4.852380952380953


In [15]:
from sklearn import metrics

In [17]:
# Confusion matrix of true labels (first argument, rows) against predictions
# (second argument, columns). `.cpu()` is applied to both before they are
# handed to scikit-learn.
print(metrics.confusion_matrix(tot_label.cpu(), tot_pred.cpu()))

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  961]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0 1000]
 [   0    0    0    0    0    0    0    5    0    0    0    0    0    0
     0    0    0    0    0    0 1025]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  990]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0 1050]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  943]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0 1035]
 [   0    0    0    0    0    0    0    3    0    0    0    0    0    0
     0    0    0    0    0    0  948]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0 1002]
 [   0    

In [50]:
# Per-class precision / recall / F1 / support as a nested dict (output_dict=True).
# Classes that are never predicted have an undefined precision. scikit-learn
# already reports 0.0 for them, but by default it also emits an
# UndefinedMetricWarning on each computation. zero_division=0 keeps the same 0.0
# values and makes that choice explicit, without the warning noise.
a = metrics.classification_report(
    tot_label.cpu(), tot_pred.cpu(), output_dict=True, zero_division=0
)

# One column per class label plus the accuracy / macro avg / weighted avg
# summaries; rows are precision, recall, f1-score and support.
b = pd.DataFrame.from_dict(a)

print(b)

# Save the report as a spreadsheet; the path is relative to the working directory.
b.to_excel('pandas_to_excel.xlsx', sheet_name='new_sheet_name')


             0.0     1.0     2.0    3.0     4.0    5.0     6.0         7.0  \
precision    0.0     0.0     0.0    0.0     0.0    0.0     0.0    0.250000   
recall       0.0     0.0     0.0    0.0     0.0    0.0     0.0    0.003155   
f1-score     0.0     0.0     0.0    0.0     0.0    0.0     0.0    0.006231   
support    961.0  1000.0  1030.0  990.0  1050.0  943.0  1035.0  951.000000   

              8.0    9.0  ...   14.0   15.0    16.0    17.0   18.0    19.0  \
precision     0.0    0.0  ...    0.0    0.0     0.0     0.0    0.0     0.0   
recall        0.0    0.0  ...    0.0    0.0     0.0     0.0    0.0     0.0   
f1-score      0.0    0.0  ...    0.0    0.0     0.0     0.0    0.0     0.0   
support    1002.0  984.0  ...  996.0  988.0  1033.0  1020.0  960.0  1015.0   

                  20.0  accuracy     macro avg  weighted avg  
precision     0.048411  0.048524      0.014210      0.013664  
recall        1.000000  0.048524      0.047769      0.048524  
f1-score      0.092351  0.048

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
