In [165]:
%config IPCompleter.greedy=True
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from nltk.translate.bleu_score import sentence_bleu
import scipy.sparse
import json

In [166]:
k_best = 5

def get_line_from_file_by_id(filename, id):
    with open(filename) as inf:
        for i, line in enumerate(inf):
            if i == id:
                return line
            
def extract_train_diff(id):
    return get_line_from_file_by_id('data/train.diff', id)
            
def extract_target_commit_message(id):
    return get_line_from_file_by_id('data/train.msg', id)

In [167]:
new_diffs = []
with open('data/test.diff', 'r') as inf:
    for line in inf:
        new_diffs.append(line.strip())
#new_diffs

In [168]:
#load vocabulary
with open('data/vocabulary.json', 'r') as inf:
    vocabulary = json.load(inf) 
vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern=r'\S+', stop_words=['<nl>'])

analyzer = vectorizer.build_analyzer()

#load bag-of-words matrix
train_bow_matrix = scipy.sparse.load_npz('data/bow_matrix.npz')
#train_bow_matrix.toarray()

In [169]:
newdiff_bow_matrix = vectorizer.transform(new_diffs)
newdiff_bow_matrix.shape

(3000, 59678)

In [171]:
cosine_matrix = cosine_distances(newdiff_bow_matrix, train_bow_matrix)
cosine_matrix.shape

(3000, 26208)

In [173]:
candidates_indices = [(row.argsort()[:k_best]).tolist() for row in cosine_matrix]
len(candidates_indices[0]), len(candidates_indices)

(5, 3000)

In [177]:
def choose_best(diff, ids):
    best_bleu = 0.0
    best_id = ids[0]
    for id in ids:
        reference = extract_train_diff(id)
        score = sentence_bleu([analyzer(reference)], analyzer(diff))
        if score > best_bleu:
            best_bleu = score
            best_id = id
    return best_id

messages_id = []
for i, diff in enumerate(new_diffs):
    cur_id = choose_best(diff, candidates_indices[i])
    messages_id.append(cur_id)
    if i % 100 == 0:
        print(i)
    
#print(new_diffs, extract_train_diff(0), extract_train_diff(1), extract_train_diff(2), extract_train_diff(3))
    
len(messages_id)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0
100


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900


3000

In [178]:
with open('data/generated.msg', 'w') as ouf:
    for i, message_id in enumerate(messages_id):
        message = extract_target_commit_message(message_id)
        ouf.write(message)
        if i % 100 == 0:
            print(i)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900


In [179]:
analyzer('updated CHANGELOG for 2 . 1 . 0 - SNAPSHOT .')

['updated', 'changelog', 'for', '2', '.', '1', '.', '0', '-', 'snapshot', '.']