# Transformer for Java Analysis (Partial)

This notebook is intended to be run on a Unix server with a single GPU and PyTorch preconfigured to run on the machine.

The first two experiments were done for method name prediction; the third switches over to snippet summarization.

In [None]:
import sys

# Step 1: change directory so that OpenNMT-py actually works
# NOTE: the path is relative to the kernel's current directory, so this cell is
#       not re-runnable within one session (a second run looks for
#       OpenNMT-py/OpenNMT-py).
%cd OpenNMT-py

# Step 2: install some additional needed packages for OpenNMT
# pip is invoked through the kernel's own interpreter (sys.executable) so the
# packages land in the environment this notebook imports from, rather than in
# whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install -r requirements.txt
# and because there is some weird cuda mismatch, this is what we need to use to make training work...
# !pip uninstall torch
# !pip install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
!{sys.executable} -m pip install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
!{sys.executable} -m pip install https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl

# Step 3: Remove previous run data
# !rm data/*.pt # prepped.vocab.pt, prepped.train.0.pt, prepped.valid.0.pt, etc

In [2]:
import torch
# Sanity check that PyTorch can reach the GPU: display the name of CUDA device 0.
torch.cuda.get_device_name(0)

'Tesla K80'

# Experiment 1: Predicting method names from tokenized, AST java

source: java-small (code2seq)

In [None]:
# Experiment 1 preprocessing: turn the src/tgt train and validation text files
# into OpenNMT data files saved under the data/prepped prefix
# (prepped.vocab.pt, prepped.train.N.pt, prepped.valid.N.pt).
# NOTE: Have to use a reasonable shard size to break up the training data
#       or we'll run out of RAM and crash
!python preprocess.py \
    -train_src data/src.train.txt \
    -train_tgt data/tgt.train.txt \
    -valid_src data/src.val.txt \
    -valid_tgt data/tgt.val.txt \
    -shard_size 100000 \
    -save_data data/prepped

In [None]:
# cmds for running directly in the shell
# (kept as comments: without a leading `!` these lines are not valid Python and
#  would raise a SyntaxError when the notebook is run top to bottom; copy a
#  line into a terminal, minus the leading "# ", to use it)

# with pre-trained model (fails)
# python train.py -train_from model/first_attempt06022019_step_60000.pt -data data/prepped -save_model model/first_attempt06022019 -keep_checkpoint 2 -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 -encoder_type transformer -decoder_type transformer -position_encoding -train_steps 200000  -max_generator_batches 0 -dropout 0.1 -batch_size 40 -batch_type tokens -normalization tokens -accum_count 2 -optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 0.2 -max_grad_norm 0 -param_init 0 -param_init_glorot -label_smoothing 0.1 -valid_steps 10000 -save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0
# from scratch (hopefully doesn't fail anymore)
# python train.py -data data/prepped -save_model model/second_attempt06022019 -keep_checkpoint 2 -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 -encoder_type transformer -decoder_type transformer -position_encoding -train_steps 200000  -max_generator_batches 0 -dropout 0.1 -batch_size 40 -batch_type tokens -normalization tokens -accum_count 2 -optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 0.2 -max_grad_norm 0 -param_init 0 -param_init_glorot -label_smoothing 0.1 -valid_steps 10000 -save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0

In [None]:
# transformer
# Experiment 1 training: a transformer encoder/decoder (6 layers, 8 heads,
# -rnn_size and -word_vec_size 512, feed-forward size 2048) trained for 200000
# steps on the data/prepped files produced by the preprocess cell.
# Checkpoints are written every 10000 steps under the
# model/second_attempt06022019 prefix (-keep_checkpoint 2); the translate cell
# further down loads the step-200000 checkpoint.
!python train.py \
    -data data/prepped \
    -save_model model/second_attempt06022019 \
    -keep_checkpoint 2 \
    -layers 6 \
    -rnn_size 512 \
    -word_vec_size 512 \
    -transformer_ff 2048 \
    -heads 8 \
    -encoder_type transformer \
    -decoder_type transformer \
    -position_encoding \
    -train_steps 200000  \
    -max_generator_batches 0 \
    -dropout 0.1 \
    -batch_size 40 \
    -batch_type tokens \
    -normalization tokens \
    -accum_count 2 \
    -optim adam \
    -adam_beta2 0.998 \
    -decay_method noam \
    -warmup_steps 8000 \
    -learning_rate 0.2 \
    -max_grad_norm 0 \
    -param_init 0  \
    -param_init_glorot \
    -label_smoothing 0.1 \
    -valid_steps 10000 \
    -save_checkpoint_steps 10000 \
    -world_size 1 \
    -gpu_ranks 0

# originally used a learning rate of 2 despite OpenNMT's recommendation
# of using 0.0001
# NOTE(review): the command above passes -learning_rate 0.2, not 2 -- confirm
# which value the reported results were actually trained with.

[2019-06-02 09:20:15,687 INFO]  * src vocab size = 50002
[2019-06-02 09:20:15,688 INFO]  * tgt vocab size = 50004
[2019-06-02 09:20:15,688 INFO] Building model...
[2019-06-02 09:20:21,197 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(50002, 512, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=512, out_features=512, bias=True)
          (linear_values): Linear(in_features=512, out_features=512, bias=True)
          (linear_query): Linear(in_features=512, out_features=512, bias=True)
          (softmax): Softmax()
          (dropout): Dropout(p=0.1)
          (final_linear): Linear(in_features=512, out_features=512, bias=True)
        )
     

In [3]:
# Earlier variant of the translate command, kept for reference only. It also
# requested BLEU/ROUGE reporting, -replace_unk and an explicit batch size.
# NOTE(review): the inline "#-tgt" / "#-verbose" lines sit inside a backslash
# continuation, so simply uncommenting this block would presumably cut the
# command short at the first "#" -- verify before reusing.
# !python translate.py \
#     -model model/second_attempt06022019_step_200000.pt \
#     -src data/src.test.txt \
#     #-tgt data/tgt.test.txt \
#     -output pred.txt \
#     -report_bleu \
#     -report_rouge \
#     -report_time \
#     -replace_unk \
#     #-verbose \
#     -batch_size 30 \
#     -gpu 0

# Experiment 1 evaluation: translate data/src.test.txt with the step-200000
# checkpoint on GPU 0 and write the predictions to pred.txt. -tgt passes the
# reference file (presumably what the GOLD scores in the output are computed on).
# NOTE(review): the translate cells of Experiments 2 and 3 also write pred.txt,
# so each run overwrites the previous predictions unless the file is saved first.
!python translate.py \
    -model model/second_attempt06022019_step_200000.pt \
    -src data/src.test.txt \
    -tgt data/tgt.test.txt \
    -output pred.txt \
    -report_time \
    -gpu 0

[2019-06-04 03:09:39,389 INFO] Translating shard 0.
PRED AVG SCORE: -1.6411, PRED PPL: 5.1609
GOLD AVG SCORE: -2.2565, GOLD PPL: 9.5492
Total translation time (s): 38.122161
Average translation time (s): 0.003812
Tokens per second: 262.314616
[2019-06-04 03:10:18,092 INFO] Translating shard 1.
PRED AVG SCORE: -1.6601, PRED PPL: 5.2601
GOLD AVG SCORE: -2.4936, GOLD PPL: 12.1050
Total translation time (s): 37.308131
Average translation time (s): 0.003731
Tokens per second: 267.904063
[2019-06-04 03:10:55,744 INFO] Translating shard 2.
PRED AVG SCORE: -1.6499, PRED PPL: 5.2064
GOLD AVG SCORE: -2.4776, GOLD PPL: 11.9125
Total translation time (s): 14.461360
Average translation time (s): 0.003762
Tokens per second: 265.811800


# Experiment 2: Predicting method names from tokenized, AST java

However, this time, instead of using the encoding originally provided by code2seq, swap the pipes and commas with spaces and reprocess. This actually made it a lot better, because the pipes were viewed as just another character in a really long token, which resulted in an incredibly sparse vocabulary, and most predictions were simply set to `<unk>` for "unknown". With the original encoding, most of the time the model didn't know what to say.

source: java-small (code2seq)

In [2]:
def replace_pipe_and_comma_w_space(infile_name, outfile_name):
    """Copy a text file, replacing every '|' and ',' with a single space.

    Args:
        infile_name: path of the text file to read.
        outfile_name: path of the file to write; created or overwritten.

    Returns:
        'done' once the copy has been written, or 'please use different names'
        (without touching either file) when both arguments refer to the same file.
    """
    import os  # local import so this cell does not rely on another cell's imports

    # Opening the output with 'w' truncates it immediately, so refuse whenever
    # both names resolve to one file -- not only when the two strings are equal
    # (e.g. 'a.txt' vs './a.txt', or a symlink / hard link to the input).
    same_file = os.path.realpath(infile_name) == os.path.realpath(outfile_name)
    if not same_file and os.path.exists(infile_name) and os.path.exists(outfile_name):
        same_file = os.path.samefile(infile_name, outfile_name)
    if same_file:
        return 'please use different names'

    # Explicit encoding keeps the result independent of the machine's locale.
    with open(infile_name, 'r', encoding='utf-8') as infile, \
            open(outfile_name, 'w', encoding='utf-8') as outfile:
        for line in infile:
            new_line = line.rstrip('\n').replace("|", " ").replace(",", " ")
            # Every output line is newline-terminated, so a final input line
            # without a trailing newline gains one.
            outfile.write(new_line + '\n')
    return 'done'

In [4]:
%cd OpenNMT-py/data
# Write a space-separated copy ("<side>2.<split>.txt") of every source and
# target split, in the order src train/val/test then tgt train/val/test.
status = None
for side in ('src', 'tgt'):
    for split in ('train', 'val', 'test'):
        status = replace_pipe_and_comma_w_space(
            '{}.{}.txt'.format(side, split),
            '{}2.{}.txt'.format(side, split),
        )
# Display the status of the last conversion as the cell output.
status

/home/jupyter/OpenNMT-py/data


'done'

In [5]:
# Step back up from data/ to the OpenNMT-py root, which is where the
# preprocess/train/translate commands below are run from.
%cd ..

/home/jupyter/OpenNMT-py


In [6]:
# Experiment 2 preprocessing: same as Experiment 1 but on the space-separated
# src2/tgt2 files, saved under the data/prepped_w_spaces prefix.
# NOTE(review): -shard_size is 1000000 here versus 100000 in the other
# preprocess cells -- confirm the 10x larger value is intentional.
!python preprocess.py \
    -train_src data/src2.train.txt \
    -train_tgt data/tgt2.train.txt \
    -valid_src data/src2.val.txt \
    -valid_tgt data/tgt2.val.txt \
    -shard_size 1000000 \
    -save_data data/prepped_w_spaces

[2019-06-04 05:34:26,003 INFO] Extracting features...
[2019-06-04 05:34:26,003 INFO]  * number of source features: 0.
[2019-06-04 05:34:26,003 INFO]  * number of target features: 0.
[2019-06-04 05:34:26,003 INFO] Building `Fields` object...
[2019-06-04 05:34:26,003 INFO] Building & saving training data...
[2019-06-04 05:34:26,003 INFO] Reading source and target files: data/src2.train.txt data/tgt2.train.txt.
[2019-06-04 05:34:27,069 INFO] Building shard 0.
[2019-06-04 05:34:53,565 INFO]  * saving 0th train data shard to data/prepped_w_spaces.train.0.pt.
[2019-06-04 05:34:56,486 INFO] Building shard 1.
[2019-06-04 05:35:20,842 INFO]  * saving 1th train data shard to data/prepped_w_spaces.train.1.pt.
[2019-06-04 05:35:24,146 INFO] Building shard 2.
[2019-06-04 05:35:47,669 INFO]  * saving 2th train data shard to data/prepped_w_spaces.train.2.pt.
[2019-06-04 05:35:50,744 INFO] Building shard 3.
[2019-06-04 05:36:14,657 INFO]  * saving 3th train data shard to data/prepped_w_spaces.train.3.

In [11]:
# transformer
# Experiment 2 training: same transformer configuration as Experiment 1, but
# reading data/prepped_w_spaces and saving checkpoints under the
# model/encoded_w_spaces prefix. Differences from the Experiment 1 command:
# -batch_size 32 (was 40) and -valid_steps 200001 (was 10000).
# NOTE(review): -valid_steps (200001) is larger than -train_steps (200000), so
# presumably no validation pass runs during this training -- confirm that
# skipping validation was intended.
!python train.py \
    -data data/prepped_w_spaces \
    -save_model model/encoded_w_spaces \
    -keep_checkpoint 2 \
    -layers 6 \
    -rnn_size 512 \
    -word_vec_size 512 \
    -transformer_ff 2048 \
    -heads 8 \
    -encoder_type transformer \
    -decoder_type transformer \
    -position_encoding \
    -train_steps 200000  \
    -max_generator_batches 0 \
    -dropout 0.1 \
    -batch_size 32 \
    -batch_type tokens \
    -normalization tokens \
    -accum_count 2 \
    -optim adam \
    -adam_beta2 0.998 \
    -decay_method noam \
    -warmup_steps 8000 \
    -learning_rate 0.2 \
    -max_grad_norm 0 \
    -param_init 0  \
    -param_init_glorot \
    -label_smoothing 0.1 \
    -valid_steps 200001 \
    -save_checkpoint_steps 10000 \
    -world_size 1 \
    -gpu_ranks 0

# originally used a learning rate of 2 despite OpenNMT's recommendation
# of using 0.0001
# NOTE(review): the command above passes -learning_rate 0.2, not 2 -- confirm
# which value the reported results were actually trained with.

[2019-06-04 07:19:08,452 INFO]  * src vocab size = 32428
[2019-06-04 07:19:08,452 INFO]  * tgt vocab size = 6072
[2019-06-04 07:19:08,452 INFO] Building model...
[2019-06-04 07:19:12,246 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(32428, 512, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=512, out_features=512, bias=True)
          (linear_values): Linear(in_features=512, out_features=512, bias=True)
          (linear_query): Linear(in_features=512, out_features=512, bias=True)
          (softmax): Softmax()
          (dropout): Dropout(p=0.1)
          (final_linear): Linear(in_features=512, out_features=512, bias=True)
        )
      

In [24]:
# Experiment 2 evaluation: translate data/src2.test.txt with the step-200000
# encoded_w_spaces checkpoint on GPU 0, in shards of 1000 with batch size 5 and
# -replace_unk enabled; predictions go to pred.txt.
# NOTE(review): pred.txt is also the output of the Experiment 1 and 3 translate
# cells, so this run overwrites any earlier predictions in that file.
!python translate.py \
    -model model/encoded_w_spaces_step_200000.pt \
    -src data/src2.test.txt \
    -tgt data/tgt2.test.txt \
    -output pred.txt \
    -shard_size 1000 \
    -batch_size 5 \
    -report_time \
    -replace_unk \
    -gpu 0

[2019-06-05 00:45:42,795 INFO] Translating shard 0.
PRED AVG SCORE: -1.8667, PRED PPL: 6.4672
GOLD AVG SCORE: -4.4272, GOLD PPL: 83.7006
Total translation time (s): 152.506381
Average translation time (s): 0.152506
Tokens per second: 9.101259
[2019-06-05 00:48:15,402 INFO] Translating shard 1.
PRED AVG SCORE: -2.1031, PRED PPL: 8.1913
GOLD AVG SCORE: -4.9395, GOLD PPL: 139.6958
Total translation time (s): 173.910670
Average translation time (s): 0.173911
Tokens per second: 7.181848
[2019-06-05 00:51:09,390 INFO] Translating shard 2.
PRED AVG SCORE: -2.0947, PRED PPL: 8.1228
GOLD AVG SCORE: -4.6251, GOLD PPL: 102.0170
Total translation time (s): 168.198397
Average translation time (s): 0.168198
Tokens per second: 7.538716
[2019-06-05 00:53:57,663 INFO] Translating shard 3.
PRED AVG SCORE: -2.0005, PRED PPL: 7.3924
GOLD AVG SCORE: -4.3738, GOLD PPL: 79.3483
Total translation time (s): 155.833411
Average translation time (s): 0.155833
Tokens per second: 8.810691
[2019-06-05 00:56:33,569 I

# Experiment 3: Summarizing from tokenized java

source: http://leclair.tech/data/funcom/#procdata

In [26]:
# Experiment 3 preprocessing: the funcom function/comment files are the source
# and target sides; outputs are saved under the data/funcom/_prepped/prepped prefix.
!python preprocess.py \
    -train_src data/funcom/train/functions.train \
    -train_tgt data/funcom/train/comments.train \
    -valid_src data/funcom/valid/functions.valid \
    -valid_tgt data/funcom/valid/comments.valid \
    -shard_size 100000 \
    -save_data data/funcom/_prepped/prepped

[2019-06-05 03:55:44,050 INFO] Extracting features...
[2019-06-05 03:55:44,050 INFO]  * number of source features: 0.
[2019-06-05 03:55:44,050 INFO]  * number of target features: 0.
[2019-06-05 03:55:44,050 INFO] Building `Fields` object...
[2019-06-05 03:55:44,050 INFO] Building & saving training data...
[2019-06-05 03:55:44,050 INFO] Reading source and target files: data/funcom/train/functions.train data/funcom/train/comments.train.
[2019-06-05 03:55:44,097 INFO] Building shard 0.
[2019-06-05 03:55:47,867 INFO]  * saving 0th train data shard to data/funcom/_prepped/prepped.train.0.pt.
[2019-06-05 03:55:50,841 INFO] Building shard 1.
[2019-06-05 03:55:54,421 INFO]  * saving 1th train data shard to data/funcom/_prepped/prepped.train.1.pt.
[2019-06-05 03:55:57,420 INFO] Building shard 2.
[2019-06-05 03:56:00,806 INFO]  * saving 2th train data shard to data/funcom/_prepped/prepped.train.2.pt.
[2019-06-05 03:56:03,455 INFO] Building shard 3.
[2019-06-05 03:56:06,716 INFO]  * saving 3th tr

In [30]:
# transformer
# Experiment 3 training: same transformer configuration as Experiment 1, but
# reading data/funcom/_prepped/prepped and saving checkpoints under the
# model/funcom_06042019 prefix. Differences from the Experiment 1 command:
# -keep_checkpoint 1 (was 2) and -valid_steps 50000 (was 10000).
!python train.py \
    -data data/funcom/_prepped/prepped \
    -save_model model/funcom_06042019 \
    -keep_checkpoint 1 \
    -layers 6 \
    -rnn_size 512 \
    -word_vec_size 512 \
    -transformer_ff 2048 \
    -heads 8 \
    -encoder_type transformer \
    -decoder_type transformer \
    -position_encoding \
    -train_steps 200000  \
    -max_generator_batches 0 \
    -dropout 0.1 \
    -batch_size 40 \
    -batch_type tokens \
    -normalization tokens \
    -accum_count 2 \
    -optim adam \
    -adam_beta2 0.998 \
    -decay_method noam \
    -warmup_steps 8000 \
    -learning_rate 0.2 \
    -max_grad_norm 0 \
    -param_init 0  \
    -param_init_glorot \
    -label_smoothing 0.1 \
    -valid_steps 50000 \
    -save_checkpoint_steps 10000 \
    -world_size 1 \
    -gpu_ranks 0

# originally used a learning rate of 2 despite OpenNMT's recommendation
# of using 0.0001
# NOTE(review): the command above passes -learning_rate 0.2, not 2 -- confirm
# which value the reported results were actually trained with.

[2019-06-05 04:11:20,361 INFO]  * src vocab size = 50002
[2019-06-05 04:11:20,362 INFO]  * tgt vocab size = 50004
[2019-06-05 04:11:20,362 INFO] Building model...
[2019-06-05 04:11:25,420 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(50002, 512, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=512, out_features=512, bias=True)
          (linear_values): Linear(in_features=512, out_features=512, bias=True)
          (linear_query): Linear(in_features=512, out_features=512, bias=True)
          (softmax): Softmax()
          (dropout): Dropout(p=0.1)
          (final_linear): Linear(in_features=512, out_features=512, bias=True)
        )
     

In [31]:
# Experiment 3 evaluation: translate the funcom test functions with the
# step-200000 funcom checkpoint on GPU 0, passing the test comments as -tgt;
# predictions go to pred.txt.
# NOTE(review): pred.txt is also the output of the Experiment 1 and 2 translate
# cells, so this run overwrites any earlier predictions in that file.
!python translate.py \
    -model model/funcom_06042019_step_200000.pt \
    -src data/funcom/test/functions.test \
    -tgt data/funcom/test/comments.test \
    -output pred.txt \
    -report_time \
    -gpu 0

[2019-06-05 17:34:55,409 INFO] Translating shard 0.
PRED AVG SCORE: -1.2553, PRED PPL: 3.5090
GOLD AVG SCORE: -3.6657, GOLD PPL: 39.0816
Total translation time (s): 226.015424
Average translation time (s): 0.022602
Tokens per second: 236.098931
[2019-06-05 17:38:41,691 INFO] Translating shard 1.
PRED AVG SCORE: -1.2895, PRED PPL: 3.6310
GOLD AVG SCORE: -3.8377, GOLD PPL: 46.4206
Total translation time (s): 221.970803
Average translation time (s): 0.022197
Tokens per second: 238.734101
[2019-06-05 17:42:23,875 INFO] Translating shard 2.
PRED AVG SCORE: -1.2815, PRED PPL: 3.6020
GOLD AVG SCORE: -3.8054, GOLD PPL: 44.9431
Total translation time (s): 222.623388
Average translation time (s): 0.022262
Tokens per second: 234.809112
[2019-06-05 17:46:06,729 INFO] Translating shard 3.
PRED AVG SCORE: -1.0835, PRED PPL: 2.9551
GOLD AVG SCORE: -3.6356, GOLD PPL: 37.9252
Total translation time (s): 195.466068
Average translation time (s): 0.019547
Tokens per second: 277.459923
[2019-06-05 17:49:22