In [1]:
import torch
import pandas as pd
from sklearn import metrics
import numpy as np

import data
import utils
import info_recorder as ir
import data_loader as dl
import initializer as init
import trainer as tn
import tester as ts

In [2]:
# Pick the compute device: the default CUDA device when PyTorch can see a GPU,
# otherwise fall back to the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [3]:
# Load the dataset through the project's `data` module. `get_df` returns three values;
# `df` is presumably a pandas DataFrame (later cells read df.sourceCode / df.classLabel).
# The contents of size_info / size_dict are defined in data.get_df — not visible here.
df, size_info, size_dict = data.get_df()

In [4]:
# Extract the `sourceCode` (model input) and `classLabel` (target) columns via `.values`.
sourceCode_np = df.sourceCode.values
codeClass_np = df.classLabel.values

In [5]:
import gensim
# Load the GoogleNews word2vec vectors (binary word2vec format) as gensim KeyedVectors.
# The path is relative to the notebook's working directory.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../model/GoogleNews-vectors-negative300.bin', binary=True)

In [6]:
# Tokenize, build vocabulary, encode tokens
print("Tokenizing...\n")
# utils.tokenize returns three values, bound here as the tokenized sources, the
# token -> index vocabulary (`ch2idx`) and `max_len`.
tokenized_sourceCodes, ch2idx, max_len = utils.tokenize(sourceCode_np)
# Turn the tokens into index sequences using the vocabulary.
# Presumably pads each sequence to max_len — confirm in utils.encode.
input_ids = utils.encode(tokenized_sourceCodes, ch2idx, max_len)

Tokenizing...



In [7]:
# Vocabulary tokens and their indices as two parallel lists (same dict iteration order).
# NOTE(review): neither name is read by any later cell visible in this notebook —
# candidates for removal; confirm nothing outside this view depends on them.
word_list = list(ch2idx.keys())
ch_list = list(ch2idx.values())

In [8]:
def load_pretrained_vectors(word2idx=None, vectors=None, embed_dim=None):
    """Build an embedding matrix for a vocabulary, filled with pretrained vectors.

    Every row starts as a uniform random draw from [-0.25, 0.25). The ``'<pad>'`` row
    is then zeroed, and each vocabulary token that is present in ``vectors`` has its
    row replaced by the pretrained vector.

    Args:
        word2idx: Mapping from token to row index. Defaults to the notebook-level
            ``ch2idx``.
        vectors: Lookup supporting ``token in vectors`` and ``vectors[token]``
            (e.g. gensim ``KeyedVectors`` or a plain dict). Defaults to the
            notebook-level ``word2vec_model``.
        embed_dim: Width of each embedding row. Defaults to ``vectors.vector_size``
            when that attribute exists, otherwise 300.

    Returns:
        numpy.ndarray of shape ``(len(word2idx), embed_dim)``.

    Raises:
        KeyError: If ``word2idx`` has no ``'<pad>'`` entry.
    """
    # Fall back to the notebook globals so the original zero-argument call still works.
    if word2idx is None:
        word2idx = ch2idx
    if vectors is None:
        vectors = word2vec_model
    if embed_dim is None:
        embed_dim = getattr(vectors, "vector_size", 300)

    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), embed_dim))
    embeddings[word2idx['<pad>']] = np.zeros((embed_dim,))

    # Walk the vocabulary once; the previous per-index list.index() lookup was
    # quadratic in the vocabulary size.
    count = 0
    for word, idx in word2idx.items():
        if word in vectors:
            count += 1
            embeddings[idx] = vectors[word]

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

In [9]:
# Load pretrained vectors
# The NumPy matrix is float64, and torch.tensor would keep that dtype. PyTorch layers
# default to float32, so cast explicitly to avoid a dtype mismatch if this matrix is
# later handed to a model as its pretrained embedding.
embeddings = load_pretrained_vectors()
embeddings = torch.tensor(embeddings, dtype=torch.float32)

There are 74 / 97 pretrained vectors found.


In [10]:
# Sanity check: expect (vocabulary size, embedding width).
embeddings.shape

torch.Size([97, 300])

In [11]:
# Hand the vocabulary mapping to the project's info recorder.
# Presumably persists it for later reuse — confirm in info_recorder.record_ch2idx.
ir.record_ch2idx(ch2idx)

In [12]:
# Encode the class labels. The helper returns three values, bound here as the encoded
# labels, the class -> index mapping and the number of classes.
# Exact types are defined in utils.tokenize_encode_class — not visible here.
encoded_class2idx, class2idx, num_classes = utils.tokenize_encode_class(codeClass_np)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the encoded samples as the test set. random_state fixes the shuffle
# so the split is repeatable across runs.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, encoded_class2idx, test_size = 0.1, random_state = 43
)

In [14]:
# Carve a validation set (10%) out of the remaining training data.
# NOTE(review): this rebinds train_inputs / train_labels, so re-running only this cell
# splits the already-reduced training set again. Re-run the previous split cell first.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    train_inputs, train_labels, test_size = 0.1, random_state = 43
)

In [15]:
# Load data to PyTorch DataLoader
# The project helper returns one loader per split (train / val / test); batch size is 50.
train_dataloader, val_dataloader, test_dataloader = dl.data_loader(train_inputs, val_inputs, test_inputs, train_labels, val_labels, test_labels, batch_size=50)

In [16]:
# Peek at the first training batch only, to check the tensor type of the inputs.
for step, batch in enumerate(train_dataloader):
    # Move both tensors of the batch onto the selected device.
    b_input_ids, b_labels = (tensor.to(device) for tensor in batch)
    print(b_input_ids.type())
    break

torch.cuda.LongTensor


In [17]:
# Peek at the first training batch only, to check the shape of the inputs.
for step, batch in enumerate(train_dataloader):
    b_input_ids, b_labels = [tensor.to(device) for tensor in batch]
    print(b_input_ids.shape)
    break

torch.Size([50, 124])


In [18]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# NOTE(review): `timestamp` is computed but not used by any cell visible here. The
# writer logs to the fixed directory below, so all runs in this notebook share it.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('overfitRNN/tests')

In [19]:
# RNN run 'test32' (modelType="RNN", the `cnn_rand` variable name notwithstanding).
# No pretrained embedding is passed (pretrained_embedding=None).
# Settings for this run: embed/hidden size 100, 3 layers, dropout 0.0.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(pretrained_embedding=None,
                                           device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=100,
                                           hidden_size=100,
                                           num_classes=len(class2idx),
                                           n_layers=3,
                                           dropout=0.0,
                                           learning_rate=0.001,
                                           optimizerName="Adam",
                                           modelType="RNN")

print(cnn_rand)

# Train with epochs=40, passing the run label, the shared TensorBoard writer and the
# validation loader to the project trainer.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test32', writer, val_dataloader, epochs=40)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 100)
  (rnn): LSTM(100, 100, num_layers=3, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.898185   |  1.174756  |   65.07   |  106.55  
   2    |   0.975482   |  0.860161  |   75.05   |  106.70  
   3    |   0.764490   |  0.751425  |   78.03   |  107.93  
   4    |   0.645863   |  0.712622  |   79.27   |  108.22  
   5    |   0.561175   |  0.668954  |   80.52   |  107.08  
   6    |   0.490347   |  0.676423  |   80.52   |  106.13  
   7    |   0.430986   |  0.677771  |   81.03   |  106.35  
   8    |   0.385158   |  0.682339  |   81.60   |  105.90  
   9    |   0.341640   |  0.695439  |   81.36   |  106.48  

In [20]:
# Evaluate the most recently trained model on the held-out test loader.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Build the classification report as a dict, tabulate it with one row per report
# entry, and write it to this run's spreadsheet.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel('../result/32_overfitRNN_test32.xlsx', sheet_name='sheet1')

test loss:  1.2159057943593887
test acc:  80.82857142857142


In [21]:
# RNN run 'test33' (modelType="RNN", the `cnn_rand` variable name notwithstanding).
# No pretrained embedding is passed (pretrained_embedding=None).
# Settings for this run: embed/hidden size 100, 3 layers, dropout 0.1.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(pretrained_embedding=None,
                                           device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=100,
                                           hidden_size=100,
                                           num_classes=len(class2idx),
                                           n_layers=3,
                                           dropout=0.1,
                                           learning_rate=0.001,
                                           optimizerName="Adam",
                                           modelType="RNN")

print(cnn_rand)

# Train with epochs=40, passing the run label, the shared TensorBoard writer and the
# validation loader to the project trainer.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test33', writer, val_dataloader, epochs=40)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 100)
  (rnn): LSTM(100, 100, num_layers=3, batch_first=True, dropout=0.1, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.394310   |  1.833423  |   41.71   |  110.52  
   2    |   1.405896   |  1.087618  |   67.84   |  110.01  
   3    |   0.970974   |  0.856075  |   75.17   |  111.03  
   4    |   0.807076   |  0.787752  |   77.10   |  110.38  
   5    |   0.711184   |  0.721605  |   78.64   |  110.38  
   6    |   0.639781   |  0.697769  |   79.51   |  109.99  
   7    |   0.583452   |  0.673317  |   80.29   |  110.63  
   8    |   0.540771   |  0.651374  |   81.27   |  110.71  
   9    |   0.501685   |  0.644377  |   81.61 

In [22]:
# Evaluate the most recently trained model on the held-out test loader.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Build the classification report as a dict, tabulate it with one row per report
# entry, and write it to this run's spreadsheet.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel('../result/33_overfitRNN_test33.xlsx', sheet_name='sheet1')

test loss:  0.7444851442107132
test acc:  83.25238095238096


In [23]:
# RNN run 'test34' (modelType="RNN", the `cnn_rand` variable name notwithstanding).
# No pretrained embedding is passed (pretrained_embedding=None).
# Settings for this run: embed/hidden size 200, 4 layers, dropout 0.2.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(pretrained_embedding=None,
                                           device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=200,
                                           hidden_size=200,
                                           num_classes=len(class2idx),
                                           n_layers=4,
                                           dropout=0.2,
                                           learning_rate=0.001,
                                           optimizerName="Adam",
                                           modelType="RNN")

print(cnn_rand)

# Train with epochs=40, passing the run label, the shared TensorBoard writer and the
# validation loader to the project trainer.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test34', writer, val_dataloader, epochs=40)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 200)
  (rnn): LSTM(200, 200, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=400, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.940018   |  1.130193  |   66.66   |  297.49  
   2    |   1.006795   |  0.856133  |   75.11   |  296.69  
   3    |   0.818541   |  0.783719  |   77.38   |  298.04  
   4    |   0.717860   |  0.723074  |   79.04   |  296.47  
   5    |   0.648208   |  0.694091  |   79.98   |  296.92  
   6    |   0.596966   |  0.656593  |   81.42   |  297.68  
   7    |   0.552798   |  0.635621  |   81.82   |  296.58  
   8    |   0.518125   |  0.636903  |   82.09   |  296.85  
   9    |   0.490046   |  0.615496  |   82.71 

In [24]:
# Evaluate the most recently trained model on the held-out test loader.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Build the classification report as a dict, tabulate it with one row per report
# entry, and write it to this run's spreadsheet.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel('../result/34_overfitRNN_test34.xlsx', sheet_name='sheet1')

test loss:  0.6726503895506972
test acc:  83.85714285714285


In [25]:
# RNN run 'test35' (modelType="RNN", the `cnn_rand` variable name notwithstanding).
# No pretrained embedding is passed (pretrained_embedding=None).
# Settings for this run: embed/hidden size 200, 3 layers, dropout 0.2.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(pretrained_embedding=None,
                                           device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=200,
                                           hidden_size=200,
                                           num_classes=len(class2idx),
                                           n_layers=3,
                                           dropout=0.2,
                                           learning_rate=0.001,
                                           optimizerName="Adam",
                                           modelType="RNN")

print(cnn_rand)

# Train with epochs=60, passing the run label, the shared TensorBoard writer and the
# validation loader to the project trainer.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test35', writer, val_dataloader, epochs=60)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 200)
  (rnn): LSTM(200, 200, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=400, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   1.755507   |  1.034776  |   69.71   |  218.13  
   2    |   0.941308   |  0.836113  |   75.71   |  217.94  
   3    |   0.782569   |  0.743861  |   78.59   |  219.43  
   4    |   0.692912   |  0.695817  |   79.94   |  218.58  
   5    |   0.630653   |  0.667529  |   80.86   |  218.87  
   6    |   0.582690   |  0.653435  |   80.98   |  218.73  
   7    |   0.547676   |  0.638264  |   81.59   |  218.87  
   8    |   0.515845   |  0.623400  |   82.23   |  219.36  
   9    |   0.489851   |  0.637845  |   82.02 

In [26]:
# Evaluate the most recently trained model on the held-out test loader.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Build the classification report as a dict, tabulate it with one row per report
# entry, and write it to this run's spreadsheet.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel('../result/35_overfitRNN_test35.xlsx', sheet_name='sheet1')

test loss:  0.6712065548414275
test acc:  83.25238095238096


In [27]:
# RNN run 'test36' (modelType="RNN", the `cnn_rand` variable name notwithstanding).
# No pretrained embedding is passed (pretrained_embedding=None).
# Settings for this run: embed/hidden size 100, 3 layers, dropout 0.2.
tn.set_seed(42)
cnn_rand, optimizer = init.initilize_model(pretrained_embedding=None,
                                           device=device,
                                           vocab_size=len(ch2idx),
                                           embed_dim=100,
                                           hidden_size=100,
                                           num_classes=len(class2idx),
                                           n_layers=3,
                                           dropout=0.2,
                                           learning_rate=0.001,
                                           optimizerName="Adam",
                                           modelType="RNN")

print(cnn_rand)

# Train with epochs=60, passing the run label, the shared TensorBoard writer and the
# validation loader to the project trainer.
tn.train(device, cnn_rand, optimizer, train_dataloader, 'test36', writer, val_dataloader, epochs=60)

doing without pretrained model!!!
RNNClassifier(
  (emb): Embedding(97, 100)
  (rnn): LSTM(100, 100, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=200, out_features=300, bias=True)
  (fc4): Linear(in_features=300, out_features=21, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.178552   |  1.475914  |   54.58   |  110.62  
   2    |   1.215547   |  0.996089  |   70.68   |  110.37  
   3    |   0.930331   |  0.850798  |   74.87   |  110.81  
   4    |   0.807368   |  0.761877  |   77.71   |  110.44  
   5    |   0.730663   |  0.726634  |   78.86   |  110.27  
   6    |   0.672792   |  0.691436  |   79.78   |  110.71  
   7    |   0.631053   |  0.676140  |   80.41   |  110.52  
   8    |   0.593968   |  0.686432  |   80.15   |  110.60  
   9    |   0.565463   |  0.642568  |   81.37 

In [28]:
# Evaluate the most recently trained model on the held-out test loader.
tot_pred, tot_label = ts.test(device, cnn_rand, test_dataloader)

# Build the classification report as a dict, tabulate it with one row per report
# entry, and write it to this run's spreadsheet.
results = metrics.classification_report(
    y_true=tot_label.cpu(),
    y_pred=tot_pred.cpu(),
    output_dict=True,
)
results_df = pd.DataFrame.from_dict(results).T
results_df.to_excel('../result/36_overfitRNN_test36.xlsx', sheet_name='sheet1')

test loss:  0.6682467930373691
test acc:  83.65714285714286
