In [1]:
import torch
import pandas as pd
from sklearn import metrics
import numpy as np

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Pick the compute device once: the CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU. Later cells move the model and batches to `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project's `data` module. The call returns three values;
# `df` is the table whose `sourceCode` and `classLabel` columns are read in the next cell.
# NOTE(review): `size_info` / `size_dict` are not used in the cells visible here — confirm they are needed.
df, size_info, size_dict = data.get_df()

In [4]:
# Pull the model inputs (`sourceCode`) and the targets (`classLabel`) out of `df` as plain value arrays.
sourceCode_np = df.sourceCode.values
codeClass_np = df.classLabel.values

In [5]:
# Load the pretrained GoogleNews word2vec vectors (binary word2vec format) as gensim KeyedVectors.
# The path is relative to the notebook's working directory.
# NOTE(review): gensim is imported here rather than in the import cell at the top of the notebook.
import gensim
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../model/GoogleNews-vectors-negative300.bin', binary=True)

In [6]:
# Tokenize the source strings and build the vocabulary (`ch2idx`, token -> index),
# then encode every tokenized sample with that vocabulary.
# NOTE(review): presumably `max_len` is the length samples are padded to by utils.encode — confirm in utils.
print("Tokenizing...\n")
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Parallel lists of the vocabulary's tokens and their ids, in the mapping's own order.
word_list = [*ch2idx.keys()]
ch_list = [*ch2idx.values()]

In [8]:
def load_pretrained_vectors(token2idx=None, pretrained=None, embed_dim=300, pad_token='<pad>'):
    """Build an embedding matrix for a vocabulary, filled from pretrained vectors where possible.

    Every row starts as a draw from U(-0.25, 0.25). The padding row is then set
    to zeros, and each token that is present in ``pretrained`` has its row
    replaced by the pretrained vector.

    Args:
        token2idx: Mapping of token -> row index. Defaults to the notebook-level
            ``ch2idx`` so the original zero-argument call keeps working.
        pretrained: Any object supporting ``token in pretrained`` and
            ``pretrained[token]`` (e.g. gensim ``KeyedVectors`` or a dict).
            Defaults to the notebook-level ``word2vec_model``.
        embed_dim: Width of each embedding row; must match the pretrained
            vectors. Defaults to 300.
        pad_token: Token whose row is zeroed. Defaults to ``'<pad>'``.

    Returns:
        numpy.ndarray of shape ``(len(token2idx), embed_dim)``.

    Raises:
        KeyError: If ``pad_token`` is not in ``token2idx``.
    """
    if token2idx is None:
        token2idx = ch2idx
    if pretrained is None:
        pretrained = word2vec_model

    embeddings = np.random.uniform(-0.25, 0.25, (len(token2idx), embed_dim))
    embeddings[token2idx[pad_token]] = np.zeros((embed_dim,))

    # Walk the mapping directly instead of searching a list of ids for every
    # index: linear instead of quadratic, and no dependence on the ids being
    # listed as an exact 0..n-1 sequence.
    count = 0
    for word, idx in token2idx.items():
        if word in pretrained:
            count += 1
            embeddings[idx] = pretrained[word]

    print(f"There are {count} / {len(token2idx)} pretrained vectors found.")

    return embeddings

In [9]:
# Build the embedding matrix from the pretrained vectors and wrap it in a tensor in one step.
embeddings = torch.tensor(load_pretrained_vectors())

There are 74 / 97 pretrained vectors found.


In [10]:
# Sanity check: one row per vocabulary entry and 300 columns (the width allocated in load_pretrained_vectors).
embeddings.shape

torch.Size([97, 300])

In [11]:
# Hand the vocabulary mapping to the project's info_recorder helper.
# NOTE(review): presumably this persists ch2idx so the same encoding can be reused later — confirm in info_recorder.
ir.record_ch2idx(ch2idx)

In [12]:
# Encode the class labels. The three results are used below as: the per-sample targets
# (`encoded_class2idx`, split together with input_ids) and the label mapping (`class2idx`,
# whose length sets the model's output size).
# NOTE(review): `num_classes` is returned here but later cells use len(class2idx) instead — presumably equal; confirm.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [13]:
# NOTE(review): imported here rather than in the import cell at the top of the notebook.
from sklearn.model_selection import train_test_split

# Hold out 10% of the encoded samples as the test set. random_state is fixed so the
# split is the same on every run.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, encoded_class2idx, test_size = 0.1, random_state = 43
)

In [14]:
# Carve the validation set out of the remaining training data: 10% of the 90% left after
# the test split, i.e. roughly 81% train / 9% validation / 10% test overall.
# NOTE(review): this overwrites train_inputs / train_labels with their own subset, so the
# cell is not idempotent — re-running it without re-running the test-split cell first
# shrinks the training set again.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    train_inputs, train_labels, test_size = 0.1, random_state = 43
)

In [15]:
# Wrap the three splits in PyTorch DataLoaders via the project's data_loader helper.
# Each batch unpacks to (input ids, labels) in the cells below; batch_size=50 samples per batch.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(train_inputs, val_inputs, test_inputs, train_labels, val_labels, test_labels, batch_size=50)

In [16]:
# Peek at the first training batch only, to confirm the tensor type once it is on `device`.
for step, batch in enumerate(train_dataloader):
    # Move every tensor of the batch to the selected device.
    b_input_ids, b_labels = [t.to(device) for t in batch]
    print(b_input_ids.type())
    break

torch.cuda.LongTensor


In [17]:
# Peek at the first training batch only, to confirm its shape (batch size x sequence length).
for step, batch in enumerate(train_dataloader):
    b_input_ids, b_labels = [t.to(device) for t in batch]
    print(b_input_ids.shape)
    break

torch.Size([50, 124])


In [18]:
# PyTorch TensorBoard support
# NOTE(review): these imports live here rather than in the import cell at the top of the notebook.
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# NOTE(review): `timestamp` is not used by any cell visible here — possibly intended to
# make the log directory unique per session; confirm or remove.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Single writer shared by every training run below; all of them log under 'overfitRNN/tests'.
writer = SummaryWriter('overfitRNN/tests')

In [20]:
# Run 'test29': RNN classifier (modelType="RNN") with embeddings learned from scratch
# (pretrained_embedding=None) and dropout 0.2.
# The variable keeps its historical name `cnn_rand` because the evaluation cell reads it.
tn.set_seed(42)

rnn_hparams = dict(
    pretrained_embedding=None,
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.2,
    learning_rate=0.001,
    optimizerName="Adam",
    modelType="RNN",
)
cnn_rand, optimizer = init.initilize_model(**rnn_hparams)

print(cnn_rand)

# Train for up to 40 epochs, validating on val_dataloader and logging through `writer`.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test29', writer, val_dataloader, epochs=40)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 100)
  (rnn): LSTM(100, 100, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.216269   |  1.451440  |   56.29   |  110.46  
   2    |   1.197605   |  0.951771  |   72.31   |  111.21  
   3    |   0.923150   |  0.833584  |   75.70   |  110.29  
   4    |   0.808864   |  0.772096  |   77.42   |  111.10  
   5    |   0.732118   |  0.731341  |   78.65   |  110.83  
   6    |   0.672669   |  0.693519  |   79.79   |  111.44  
   7    |   0.626605   |  0.680132  |   80.35   |  111.24  
   8    |   0.591741   |  0.662210  |   81.07   |  111.15  
   9    |   0.561495   |  0.642473  |   81.27 

In [21]:
# Evaluate the trained model on the test split and export the classification report.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Both tensors are moved to the CPU before being handed to scikit-learn.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
# Transpose so that each report entry becomes a row of the spreadsheet.
results_df = pd.DataFrame(results).T
results_df.to_excel('../result/29_overfitRNN_test29.xlsx', sheet_name='sheet1')

test loss:  0.6552621992925803
test acc:  83.61904761904762


In [22]:
# Run 'test30': RNN classifier (modelType="RNN") with embeddings learned from scratch
# (pretrained_embedding=None) and dropout 0.5.
# The variable keeps its historical name `cnn_rand` because the evaluation cell reads it.
tn.set_seed(42)

rnn_hparams = dict(
    pretrained_embedding=None,
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.5,
    learning_rate=0.001,
    optimizerName="Adam",
    modelType="RNN",
)
cnn_rand, optimizer = init.initilize_model(**rnn_hparams)

print(cnn_rand)

# Train for up to 40 epochs, validating on val_dataloader and logging through `writer`.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test30', writer, val_dataloader, epochs=40)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 100)
  (rnn): LSTM(100, 100, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.320105   |  1.618197  |   49.52   |  110.14  
   2    |   1.452031   |  1.120646  |   66.98   |  111.27  
   3    |   1.122011   |  0.956497  |   72.38   |  111.68  
   4    |   0.993471   |  0.870230  |   75.03   |  111.09  
   5    |   0.915376   |  0.820823  |   76.61   |  111.17  
   6    |   0.860823   |  0.784516  |   77.24   |  111.49  
   7    |   0.821962   |  0.766448  |   77.84   |  111.41  
   8    |   0.788362   |  0.746976  |   78.38   |  111.09  
   9    |   0.760905   |  0.726712  |   79.16 

In [23]:
# Evaluate the trained model on the test split and export the classification report.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Both tensors are moved to the CPU before being handed to scikit-learn.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
# Transpose so that each report entry becomes a row of the spreadsheet.
results_df = pd.DataFrame(results).T
results_df.to_excel('../result/30_overfitRNN_test30.xlsx', sheet_name='sheet1')

test loss:  0.6403531291832526
test acc:  82.4047619047619


In [24]:
# Run 'test31': RNN classifier (modelType="RNN") with embeddings learned from scratch
# (pretrained_embedding=None) and dropout 0.3.
# The variable keeps its historical name `cnn_rand` because the evaluation cell reads it.
tn.set_seed(42)

rnn_hparams = dict(
    pretrained_embedding=None,
    device=device,
    vocab_size=len(ch2idx),
    embed_dim=100,
    hidden_size=100,
    num_classes=len(class2idx),
    n_layers=3,
    dropout=0.3,
    learning_rate=0.001,
    optimizerName="Adam",
    modelType="RNN",
)
cnn_rand, optimizer = init.initilize_model(**rnn_hparams)

print(cnn_rand)

# Train for up to 40 epochs, validating on val_dataloader and logging through `writer`.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test31', writer, val_dataloader, epochs=40)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 100)
  (rnn): LSTM(100, 100, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.133701   |  1.370714  |   57.70   |  112.75  
   2    |   1.193715   |  0.967960  |   71.85   |  112.11  
   3    |   0.943489   |  0.848652  |   75.04   |  111.96  
   4    |   0.836379   |  0.793319  |   76.96   |  112.33  
   5    |   0.767676   |  0.757590  |   77.87   |  114.73  
   6    |   0.715471   |  0.710469  |   79.34   |  115.85  
   7    |   0.674643   |  0.692945  |   79.60   |  111.99  
   8    |   0.642583   |  0.677809  |   80.53   |  110.22  
   9    |   0.617396   |  0.668722  |   80.53 

In [25]:
# Evaluate the trained model on the test split and export the classification report.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Both tensors are moved to the CPU before being handed to scikit-learn.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
# Transpose so that each report entry becomes a row of the spreadsheet.
results_df = pd.DataFrame(results).T
results_df.to_excel('../result/31_overfitRNN_test31.xlsx', sheet_name='sheet1')

test loss:  0.6016800244294461
test acc:  83.55714285714285
