NOTE: The model was trained for 10 epochs, taking around 13 hours on a GPU, yielding outputs that are reasonably understandable. It needs additional training on multiple GPUs for further improvement and close to correct output.

Check `loss_plot.png` to see the training progress, which has not yet reached saturation, suggesting that more training iterations are required.

In [28]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/python-dataset-source-code-summarization/preprocess.py
/kaggle/input/python-dataset-source-code-summarization/python_test_dataset.csv
/kaggle/input/python-dataset-source-code-summarization/python_Sample_dataset.csv
/kaggle/input/python-dataset-source-code-summarization/python_dataset/python_dataset.csv
/kaggle/input/1epoch-model-sourcecodemodel/x_tokenizer.pickle
/kaggle/input/1epoch-model-sourcecodemodel/trained_model.h5
/kaggle/input/1epoch-model-sourcecodemodel/y_tokenizer.pickle
/kaggle/input/1epoch-model-sourcecodemodel/decoder_model.h5
/kaggle/input/1epoch-model-sourcecodemodel/encoder_model_epochs_10.h5
/kaggle/input/1epoch-model-sourcecodemodel/decoder_model_epochs_10.h5
/kaggle/input/1epoch-model-sourcecodemodel/trained_model_epochs_10.h5
/kaggle/input/1epoch-model-sourcecodemodel/encoder_model.h5


In [29]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
# from preprocess import preprocess, tokenize

import warnings
warnings.filterwarnings('ignore')

In [30]:
import sys
sys.path.append('/kaggle/input/python-dataset-source-code-summarization')

from preprocess import preprocess, tokenize

**Loading trained files:**

In [31]:
# tokenizers
x_tokenizer_path = "/kaggle/input/1epoch-model-sourcecodemodel/x_tokenizer.pickle"
y_tokenizer_path = "/kaggle/input/1epoch-model-sourcecodemodel/y_tokenizer.pickle"
with open(x_tokenizer_path, 'rb') as handle:
    x_tokenizer = pickle.load(handle)
with open(y_tokenizer_path, 'rb') as handle:
    y_tokenizer = pickle.load(handle)

# models
from tensorflow.keras.models import load_model

model = load_model('/kaggle/input/1epoch-model-sourcecodemodel/trained_model.h5')
encoder_model = load_model('/kaggle/input/1epoch-model-sourcecodemodel/encoder_model_epochs_10.h5')
decoder_model = load_model('/kaggle/input/1epoch-model-sourcecodemodel/decoder_model_epochs_10.h5')

In [32]:
# confirm loadings
x_vocab_size = x_tokenizer.num_words + 1
print("X Vocab Size: ", x_vocab_size)
y_vocab_size = y_tokenizer.num_words + 1
print("Y Vocab Size: ", y_vocab_size)

X Vocab Size:  26090
Y Vocab Size:  10603


In [33]:
model.summary()
# encoder_model.summary()
# decoder_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 100, 200)     5218000     ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 100, 300),   601200      ['embedding[0][0]']              
                                 (None, 300),                                                     
                                 (None, 300)]                                                     
                                                                                              

### Utilities for Prediction:

In [34]:
max_code_len = 100
max_summary_len = 25

In [35]:
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index

In [36]:
# reverse_target_word_index

In [37]:
# for generating the predicted decoded sequence
def predict_summary(input_seq):

    # Encode the input as state vectors.
    (e_out, e_h, e_c) = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c], verbose=0)

        # Sample a token with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # TODO: Pick 2nd max prob, if sampled_token_index=0
        if(sampled_token_index == 0):
            sampled_token_index = np.argsort(output_tokens[0, -1, :])[-2]

        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop word.
        if sampled_token == 'eostok' or len(decoded_sentence.split()) >= max_summary_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)      # Check

    return decoded_sentence

In [38]:
# Utility functions to generate strings sequences from integer (token) sequences
def seq2summary(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0 and i != target_word_index['sostok'] and i != target_word_index['eostok']:
            newString += reverse_target_word_index[i] + ' '

    return newString

def seq2code(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0:
            newString += reverse_source_word_index[i] + ' '

    return newString

### Test Dataset and Prediction:

In [39]:
testpath = "/kaggle/input/python-dataset-source-code-summarization/python_test_dataset.csv"
test = pd.read_csv(testpath)
test = test[['code', 'docstring']]
print(test.shape)
test.head()

(99, 2)


Unnamed: 0,code,docstring
0,"sum(d * 10 ** i for i, d in enumerate(x[::-1]))",How to convert a list of multiple integers int...
1,"r = int(''.join(map(str, x)))",How to convert a list of multiple integers int...
2,datetime.strptime('2010-11-13 10:33:54.227806'...,how to convert a datetime string back to datet...
3,"[(i, sum(j) / len(j)) for i, j in list(d.items...",Averaging the values in a dictionary based on ...
4,"zip([1, 2], [3, 4])",zip lists in python


In [40]:
# preprocess test
post_test = preprocess(test.copy(), max_code_len, max_summary_len)
print(post_test.shape)
post_test.head()

(99, 2)


Unnamed: 0,code,docstring
0,sum i for d in enumerate ::,sostok how to convert list of multiple integer...
1,r int join map str,sostok how to convert list of multiple integer...
2,datetime strptime m h s,sostok how to convert datetime string back to ...
3,sum len for j in list items,sostok averaging the values in dictionary base...
4,zip,sostok zip lists in python eostok


In [41]:
# tokenize test
test_code = tokenize(list(post_test['code'].values), max_pad_len=max_code_len, tokenizer_path=x_tokenizer_path, thresh=3, fit_on_texts=False)
test_summary = tokenize(list(post_test['docstring'].values), max_pad_len=max_summary_len, tokenizer_path=y_tokenizer_path, thresh=2, fit_on_texts=False)

In [42]:
test_summary[:5]

array([[   1,    6,    5,   45,   11,    7,   34,  295,   32,  149,  188,
           2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   1,    6,    5,   45,   11,    7,   34,  295,   32,  149,  188,
           2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   1,    6,    5,   45,  126,   16,  474,    5,  126,   52,    2,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   1, 2799,    9,   31,    4,   38,   91,   24,    9,   67,    2,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   1,  372,   57,    4,    3,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]], dtype=int32)

In [43]:
# predict for some of the sequences
for i in range(0, 5):
    # print ('Orig Code:', test['code'][i])
    print ('Post Code:', seq2code(test_code[i]))
    print ('Original summary:', seq2summary(test_summary[i]))
    print ('Predicted summary:', predict_summary(test_code[i].reshape(1, max_code_len)))
    print ('\n')

Post Code: sum i for d in enumerate 
Original summary: how to convert list of multiple integers into single integer 
Predicted summary:  how to get the index of the first day of week in python


Post Code: r int join map str 
Original summary: how to convert list of multiple integers into single integer 
Predicted summary:  how to get the first item of an item in list


Post Code: datetime strptime m h s 
Original summary: how to convert datetime string back to datetime object 
Predicted summary:  how to get the current directory in python


Post Code: sum len for j in list items 
Original summary: averaging the values in dictionary based on the key 
Predicted summary:  how to get the first element of list in python


Post Code: zip 
Original summary: zip lists in python 
Predicted summary:  how to get the current directory in python




TODO: Most code lost its meaning after preprocess ?

In [44]:
# prediction for any new input
from tensorflow.keras.preprocessing.sequence import pad_sequences
def predict(input_string):
    input_seq = x_tokenizer.texts_to_sequences([input_string])
    input_seq = pad_sequences(input_seq, maxlen=max_code_len, padding='post')
    pred_sent = predict_summary(test_code[i].reshape(1, max_code_len))
    return pred_sent

In [45]:
predict("print last modified time ctime os path getmtime file")

' how to get the current directory in python'

In [46]:
from nltk.translate.bleu_score import sentence_bleu

In [48]:
# BLEU Score for test
bleu_score = 0
diff = 0

for i in range(len(test_code)):

    pred_summary = predict_summary(test_code[i].reshape(1, max_code_len))
    # print(pred_summary)
    # print(seq2summary(y_val[i]))

    # BLEU score
    curr_bleu = sentence_bleu(pred_summary, seq2summary(test_summary[i]).split())
#     curr_bleu = sacrebleu.sentence_bleu(pred_summary, (post_test['docstring'][i]).split(), smooth_method='exp').score
#     curr_bleu = sentence_bleu(pred_summary.split(), post_test['docstring'][i].split())
#     print(curr_bleu)
    bleu_score += curr_bleu

    # Check if below threshold
    # if curr_bleu < 5:
    #    diff += 1

# average_test_bleu_score = bleu_score / (len(test_code) - diff)
average_test_bleu_score = bleu_score / (len(test_code))
print("Average BLEU Score on val:", average_test_bleu_score)

0.01
avrg BLEU

In [None]:
# WARNING
# /opt/conda/lib/python3.10/site-packages/nltk/translate/bleu_score.py:490: UserWarning: 
# Corpus/Sentence contains 0 counts of 2-gram overlaps.
# BLEU scores might be undesirable; use SmoothingFunction().
#   warnings.warn(_msg)