In [114]:
%config IPCompleter.greedy=True
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from nltk.translate.bleu_score import sentence_bleu
import scipy.sparse
import json

In [115]:
# Number of nearest training rows (smallest cosine distance) kept as candidates for each test diff.
k_best = 5

def get_line_from_file_by_id(filename, id):
    """Return the line at zero-based position ``id`` of the file ``filename``.

    The file is scanned from the top on every call. The returned line keeps
    its trailing newline if it has one. ``None`` is returned when the file has
    no line at that position (including negative ``id`` values).
    """
    with open(filename) as handle:
        position = 0
        for text in handle:
            if position == id:
                return text
            position += 1
    return None
            
def get_line_from_filelist_by_id(filelist, id):
    """Return the element of the in-memory sequence ``filelist`` at index ``id``.

    Plain indexing is used, so whatever the sequence does for negative or
    out-of-range indices applies here unchanged.
    """
    entry = filelist[id]
    return entry

# Keep every training diff in memory, one list entry per file line
# (each entry retains its trailing newline).
with open('data/train.diff', 'r') as diff_file:
    train_diff_list = list(diff_file)

# Same for the training commit messages; the notebook looks these up with the
# same ids it uses for train_diff_list.
with open('data/train.msg', 'r') as msg_file:
    train_msg_list = list(msg_file)
            
#def extract_train_diff(id):
#    return get_line_from_file_by_id('data/train.diff', id)

def extract_train_diff(id):
    """Return entry ``id`` of the preloaded ``train_diff_list``."""
    return train_diff_list[id]
            
#def extract_target_commit_message(id):
#    return get_line_from_file_by_id('data/train.msg', id)

def extract_target_commit_message(id):
    """Return entry ``id`` of the preloaded ``train_msg_list``."""
    return train_msg_list[id]

In [116]:
# Test diffs, one per file line, with surrounding whitespace stripped.
new_diffs = []
with open('data/test.diff', 'r') as test_file:
    new_diffs.extend(raw_line.strip() for raw_line in test_file)
#new_diffs

In [117]:
%%time
# Read the fixed vocabulary from disk.
with open('data/vocabulary.json', 'r') as vocab_file:
    vocabulary = json.load(vocab_file)

# Tokens are runs of non-whitespace characters; the '<nl>' marker is dropped
# as a stop word.
vectorizer = CountVectorizer(
    vocabulary=vocabulary,
    token_pattern=r'\S+',
    stop_words=['<nl>'],
)

# Callable applying the vectorizer's preprocessing and tokenization to a
# single string; used later to tokenize diffs for BLEU scoring.
analyzer = vectorizer.build_analyzer()

# Precomputed bag-of-words matrix, stored as a SciPy sparse .npz file.
train_bow_matrix = scipy.sparse.load_npz('data/bow_matrix.npz')

CPU times: user 39.4 ms, sys: 15.6 ms, total: 55 ms
Wall time: 55 ms


In [118]:
%%time
# Encode every test diff as a bag-of-words row over the vectorizer's fixed vocabulary.
newdiff_bow_matrix = vectorizer.transform(new_diffs)
#newdiff_bow_matrix.shape

CPU times: user 185 ms, sys: 3.82 ms, total: 189 ms
Wall time: 189 ms


In [119]:
%%time
# Pairwise cosine distances: row i holds the distance from test diff i to every row of train_bow_matrix.
cosine_matrix = cosine_distances(newdiff_bow_matrix, train_bow_matrix)
#cosine_matrix.shape

CPU times: user 2.73 s, sys: 504 ms, total: 3.24 s
Wall time: 3.1 s


In [120]:
%%time
# For each test diff, keep the column indices of the k_best smallest cosine
# distances, ordered from nearest to farthest.
candidates_indices = []
for distances in cosine_matrix:
    nearest = distances.argsort()[:k_best]
    candidates_indices.append(nearest.tolist())
#len(candidates_indices[0]), len(candidates_indices)

CPU times: user 3.73 s, sys: 31.1 ms, total: 3.76 s
Wall time: 3.79 s


In [121]:
# Shape check: (number of test diffs, number of rows in train_bow_matrix).
cosine_matrix.shape

(1796, 26208)

In [122]:
def choose_best(diff, ids):
    """Pick the candidate training diff most similar to ``diff`` by BLEU.

    Parameters
    ----------
    diff : str
        The new diff to find a match for.
    ids : sequence of int
        Candidate ids accepted by ``extract_train_diff``; must be non-empty.

    Returns
    -------
    The id whose training diff scores the highest sentence-level BLEU against
    ``diff``. Ties keep the earliest candidate, and ``ids[0]`` is returned
    when no candidate scores above zero.

    Raises
    ------
    IndexError
        If ``ids`` is an empty list.
    """
    best_id = ids[0]
    best_bleu = 0.0
    # Tokenize the hypothesis once: it does not depend on the candidate.
    hypothesis = analyzer(diff)
    for candidate_id in ids:  # named to avoid shadowing the built-in ``id``
        reference = analyzer(extract_train_diff(candidate_id))
        score = sentence_bleu([reference], hypothesis)
        if score > best_bleu:
            best_bleu = score
            best_id = candidate_id
    return best_id

In [123]:
%%time
# For every test diff, re-rank its nearest-neighbour candidates with BLEU and
# remember the id of the winning training example.
messages_id = []
for position, test_diff in enumerate(new_diffs):
    messages_id.append(choose_best(test_diff, candidates_indices[position]))
    # Lightweight progress indicator: one line per hundred diffs.
    if position % 100 == 0:
        print(position)
    
#print(new_diffs, extract_train_diff(0), extract_train_diff(1), extract_train_diff(2), extract_train_diff(3))
    
#len(messages_id)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
CPU times: user 8.26 s, sys: 41.1 ms, total: 8.3 s
Wall time: 8.32 s


In [124]:
# Write the selected commit message for every test diff, one message per line.
with open('data/generated.msg', 'w') as ouf:
    for message_id in messages_id:
        message = extract_target_commit_message(message_id)
        # Training lines keep their trailing newline, but the last line of the
        # source file may lack one; add it so consecutive messages never end
        # up fused on a single output line.
        if not message.endswith('\n'):
            message += '\n'
        ouf.write(message)