In [1]:
import torch
import pandas as pd
from sklearn import metrics
import numpy as np

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Pick the compute device once: the CUDA GPU when PyTorch can see one,
# otherwise the CPU. Every later cell moves tensors/models to `device`.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if not use_cuda:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project-local `data` module. `df` has to expose
# `sourceCode` and `classLabel` columns (both are read in the next cell); what
# `size_info` and `size_dict` contain is not visible from this notebook.
df, size_info, size_dict = data.get_df()

In [4]:
# Pull the model inputs (source-code text) and targets (class labels) out of
# the frame as raw arrays for the tokenizer / label encoder below.
sourceCode_np, codeClass_np = df.sourceCode.values, df.classLabel.values

In [5]:
# NOTE(review): this import sits mid-notebook rather than in the top import
# cell, so the gensim dependency is easy to miss when skimming the header.
import gensim
# Load pretrained word2vec vectors stored in the binary word2vec format
# (binary=True). The path is relative to the notebook's working directory, so
# the cell only works when the notebook is launched from its own folder.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../model/GoogleNews-vectors-negative300.bin', binary=True)

In [6]:
# Tokenize, build vocabulary, encode tokens.
# `utils.tokenize` returns the tokenized sources, the token -> index mapping
# `ch2idx` (it is indexed by token in `load_pretrained_vectors`), and `max_len`.
# `utils.encode` then turns the tokens into `input_ids` using that mapping and
# `max_len` (presumably padding to that length -- confirm in utils).
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Vocabulary tokens and their indices as parallel lists (same dict order).
# NOTE(review): neither global is referenced again in the visible notebook;
# `load_pretrained_vectors` builds its own local `word_list`. Candidate for
# removal if no hidden cell relies on them.
word_list = list(ch2idx.keys())
ch_list = list(ch2idx.values())

In [8]:
def load_pretrained_vectors(vocab=None, vectors=None, embed_dim=300):
    """Build an embedding matrix for a vocabulary, seeded from pretrained vectors.

    Every row starts as a uniform random draw from [-0.25, 0.25); the row of
    the ``'<pad>'`` token is then zeroed, and finally every token that exists
    in ``vectors`` has its row overwritten with the pretrained vector. (If
    ``'<pad>'`` itself is present in ``vectors`` its row is overwritten too,
    exactly as in the original implementation.)

    Args:
        vocab: Mapping of token -> row index; must contain ``'<pad>'``.
            Defaults to the notebook-level ``ch2idx``.
        vectors: Object supporting ``token in vectors`` and ``vectors[token]``
            (e.g. a gensim ``KeyedVectors`` or a plain dict). Defaults to the
            notebook-level ``word2vec_model``.
        embed_dim: Width of each embedding row; must match the length of the
            pretrained vectors. Defaults to 300.

    Returns:
        A NumPy array of shape ``(len(vocab), embed_dim)``.

    Side effects:
        Prints how many vocabulary tokens were found in ``vectors``. Draws
        from NumPy's global RNG, so seed it beforehand for reproducible rows.
    """
    # Fall back to the notebook globals so the original zero-argument call
    # `load_pretrained_vectors()` keeps working unchanged.
    if vocab is None:
        vocab = ch2idx
    if vectors is None:
        vectors = word2vec_model

    embeddings = np.random.uniform(-0.25, 0.25, (len(vocab), embed_dim))
    embeddings[vocab['<pad>']] = np.zeros((embed_dim,))

    # Walk (token, index) pairs directly. The previous version recovered each
    # token with `id_list.index(i)` inside the loop, which is O(n^2) and only
    # reproduces what the mapping already provides.
    count = 0
    for word, idx in vocab.items():
        if word in vectors:
            count += 1
            embeddings[idx] = vectors[word]

    print(f"There are {count} / {len(vocab)} pretrained vectors found.")

    return embeddings

In [9]:
# Build the embedding matrix from the pretrained vectors and wrap it as a tensor.
# NOTE(review): the NumPy matrix is float64 and torch.tensor keeps that dtype;
# confirm the consumer casts to float32 before mixing it with model weights.
embeddings = torch.tensor(load_pretrained_vectors())

There are 74 / 97 pretrained vectors found.


In [10]:
# Sanity check: one row per vocabulary entry, one column per embedding dimension.
embeddings.shape

torch.Size([97, 300])

In [11]:
# Hand the vocabulary mapping to the project-local info recorder. What it
# writes (and where) is defined in info_recorder and not visible here.
ir.record_ch2idx(ch2idx)

In [12]:
# Encode the class labels. `encoded_class2idx` is used as the label array for
# the train/test split below and `class2idx` sizes the classifier output
# (`len(class2idx)`); presumably it maps class name -> index -- confirm in utils.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the encoded samples as the final test set. The fixed
# random_state makes the partition repeatable across runs.
# NOTE(review): the split is not stratified by class -- confirm that is intended.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, encoded_class2idx, test_size = 0.1, random_state = 43
)

In [14]:
# Carve a validation set out of the remaining 90%: 10% of it, i.e. 9% of the
# full data, leaving roughly 81% / 9% / 10% for train / val / test.
# This rebinds `train_inputs` / `train_labels`, so re-running this cell alone
# shrinks the training set again -- re-run the previous cell first.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    train_inputs, train_labels, test_size = 0.1, random_state = 43
)

In [15]:
# Load data to PyTorch DataLoader: the project-local helper wraps the three
# splits into train / validation / test loaders with a batch size of 50.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(train_inputs, val_inputs, test_inputs, train_labels, val_labels, test_labels, batch_size=50)

In [16]:
# Peek at the first training batch only, to confirm the tensor type the
# model will receive once the batch is on the target device.
for step, batch in enumerate(train_dataloader):
    # Load batch to GPU
    b_input_ids, b_labels = (tensor.to(device) for tensor in batch)
    print(b_input_ids.type())
    break

torch.cuda.LongTensor


In [17]:
# Peek at the first training batch only, to confirm the input shape the model
# will receive once the batch is on the target device.
for step, batch in enumerate(train_dataloader):
    b_input_ids, b_labels = (tensor.to(device) for tensor in batch)
    print(b_input_ids.shape)
    break

torch.Size([50, 124])


In [18]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# NOTE(review): `timestamp` is computed but never used in the visible notebook.
# The writer always logs to the fixed directory 'overfitRNN/tests', so repeated
# runs accumulate in the same folder. If per-run directories were intended,
# the timestamp should be part of the log path -- confirm.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('overfitRNN/tests')

In [19]:
# RNN-rand: embeddings are randomly initialized (pretrained_embedding=None).
# Despite the `cnn_rand` variable name, this cell builds an RNN
# (modelType="RNN"); the name is kept because later cells reference it.
# The seed is set first so model initialization is repeatable.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(pretrained_embedding=None,
                                           device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=100,
                                           hidden_size=100,
                                           num_classes=len(class2idx),
                                           n_layers=3,
                                           dropout=0.2,
                                           learning_rate=0.01,
                                           optimizerName="Adam",
                                           modelType="RNN")

# Show the architecture so the run log records what was actually trained.
print(cnn_rand)

# Train for 40 epochs, validating on `val_dataloader`; 'test46' is the run
# label passed to the trainer together with the TensorBoard writer.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test46', writer, val_dataloader, epochs=40)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 100)
  (rnn): LSTM(100, 100, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   3.044270   |  3.043047  |   5.89    |  103.32  
   2    |   3.035444   |  3.002513  |   8.08    |  103.00  
   3    |   2.920043   |  2.849290  |   11.67   |  103.72  
   4    |   2.924457   |  3.057142  |   4.65    |  104.26  
   5    |   2.954536   |  2.777857  |   12.10   |  104.87  
   6    |   2.637661   |  2.394476  |   23.62   |  104.61  
   7    |   2.544741   |  2.374283  |   24.98   |  104.72  
   8    |   2.258255   |  2.056419  |   34.97   |  104.56  
   9    |   1.968518   |  1.779808  |   44.22 

In [20]:
# Evaluate the trained model on the held-out test set.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Per-class precision / recall / F1 as a dict. The tensors are moved to the
# CPU first because scikit-learn cannot read CUDA tensors.
results = metrics.classification_report(
    tot_label.cpu(),
    tot_pred.cpu(),
    output_dict=True,
)

# One row per class / aggregate, then export to a spreadsheet.
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel('../result/46_overfitRNN_test46.xlsx', sheet_name='sheet1')

test loss:  0.7465288404907499
test acc:  78.69047619047619
