In [22]:
import json
import ujson
import glob
import time
import multiprocessing
import re
import os
import time
from gensim.models import word2vec, Word2Vec
from gensim.models.word2vec import PathLineSentences
from itertools import repeat
from scipy import spatial
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np
import codecs
from nltk.corpus import stopwords
import math
import functools
import operator

stopwords = set(stopwords.words('english'))

custom_stopwords = [s.split("'")[0] for s in stopwords]
custom_stopwords = [s for s in custom_stopwords if len(s) > 1]
print(custom_stopwords)

['had', 'any', 'it', 'they', 'that', 'but', 'should', 'until', 'other', 'shouldn', 'below', 'more', 'down', 'weren', 'so', 'own', 'hers', 'very', 'will', 'how', 'after', 'doesn', 'or', 'its', 'am', 'she', 'aren', 'their', 'than', 'his', 'few', 'over', 'hasn', 'you', 'didn', 'couldn', 'needn', 'shan', 'shouldn', 'me', 'our', 'do', 'wasn', 'her', 'don', 'why', 'who', 're', 'just', 'don', 'from', 'same', 'it', 'at', 'wouldn', 'have', 'haven', 'during', 'has', 'themselves', 'you', 'where', 'both', 'shan', 'doing', 'are', 'when', 'hadn', 'weren', 'what', 'up', 'you', 'your', 'most', 'he', 'doesn', 'there', 'isn', 'of', 'for', 'and', 'himself', 'against', 'we', 'such', 'needn', 'out', 'hasn', 'if', 'whom', 'theirs', 'above', 'which', 'no', 'not', 'my', 'didn', 'these', 'here', 'aren', 'ma', 'wasn', 'being', 'some', 'should', 'as', 've', 'herself', 'this', 'with', 'that', 'mustn', 'having', 'you', 'yourself', 'through', 'those', 'can', 'is', 'were', 'be', 'couldn', 'hadn', 'you', 'further', '

In [23]:
# load the extracted java-related GitHub data
def load_github_issues():
    with open('./output/issue-comments-revised.jsonl') as issue_comments_f:
        issue_list = []
        comments_list = []

        for line in issue_comments_f:
            obj = ujson.loads(line)

            comments_list.append({
                'body': obj['body'],
                'repo_name': obj['repo_name'],
                'html_url': obj['html_url'],
                'issue_id': obj['issue']['id']
            })

            issue = obj['issue']
            issue['repo_name'] = obj['repo_name']
            issue_list.append(issue)

        issues_df = pd.DataFrame(issue_list)
        issues_df = issues_df.drop_duplicates(subset=['id'])
        comments_df = pd.DataFrame(comments_list)
        
        return (issues_df, comments_df)


issues_df, comments_df = load_github_issues()
print("Number of issues: {}".format(len(issues_df.index)))
print("Number of comments: {}".format(len(comments_df.index)))

Number of issues: 627450
Number of comments: 1855870


In [24]:
# find the issues that contain code blocks
code_issues_df = issues_df.dropna(subset=['body'])
code_issues_df = code_issues_df[code_issues_df['body'].str.contains('```')]
print("Number of issues with code block/s: {}".format(len(code_issues_df.index)))

Number of issues with code block/s: 85318


In [25]:
code_issues_df.head()

Unnamed: 0,body,html_url,id,number,repo_name,title
2,https://github.com/jooby-project/jooby/blob/ma...,https://github.com/jooby-project/jooby/issues/965,284439800,965,jooby-project/jooby,2nd thymeleaf code snippet (in the documentati...
3,When updating Spotless from `3.6.0` to `3.7.0`...,https://github.com/diffplug/spotless/issues/182,285279535,182,diffplug/spotless,Unable to store input properties... when upgra...
10,I'm using Google Guava [Preconditions](https:/...,https://github.com/uber/NullAway/issues/47,268767363,47,uber/NullAway,NullAway doesn't recognize Guava Preconditions...
19,> Citing added javadoc:\r\n\r\nImmutables appl...,https://github.com/immutables/immutables/issue...,285281793,740,immutables/immutables,Style-level fence for annotation classpath aut...
33,"Hi, i try to build ffmpeg for windows 32 bit \...",https://github.com/bytedeco/javacpp-presets/is...,285279281,503,bytedeco/javacpp-presets,Build for windows-x86 not working


In [38]:
block_comments_df = comments_df[comments_df['issue_id'].isin(code_issues_df['id'])].copy()
print('Number of comments for issues with code block/s: {}'.format(len(block_comments_df.index)))

Number of comments for issues with code block/s: 290019


In [32]:
def preprocess(doc):
    # remove all code blocks
    doc = re.sub(r'```([^```]*)```', '', doc)
            
    # remove urls
    doc = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',             
        '', doc)
    
    # remove line break characters
    doc = re.sub(r'[\r\n]', ' ', doc)
    
    # remove apostrophes/suffixes
    doc = re.sub(r"'\w |\w' ", ' ', doc)
    
    # remove numbers
    doc = re.sub(r'(\d\.?)+', ' ', doc)
        
    # replace all punctuation except for full stop with space
    doc = re.sub(r'[^A-Za-z\.]', ' ', doc)
        
    # normalise full stops
    doc = re.sub(r'\s\.\.+', '.', doc)
    
    # remove more than 1 whitespace
    doc = re.sub('\s\s+', ' ', doc)
    
    # remove leading and trailing whitespace
    doc = doc.strip()
    
    return doc

def sent_tokenize(doc):
    docs = doc.split('. ')
    docs = [t for t in docs if t != '']
    return docs

def doc_tokenize(doc):
    doc = re.sub('\.', ' ', doc) # remove full stops
    
    tokens = [t.lower() for t in doc.split() if len(t) > 1]
    tokens = [t for t in tokens if t not in custom_stopwords]
    return tokens

def tokenize(sentence):
    sentence = re.sub('\.', ' ', sentence) # remove full stops
    
    tokens = sentence.split()
    tokens = [t.lower() for t in tokens if len(t) > 1]
    tokens = [t for t in tokens if t not in custom_stopwords]
    return tokens

In [33]:
# load the non-dreprecated java doc caveat sentences
def load_caveats():
    caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
    caveats_list = []

    files = glob.glob(caveat_files_dir + '*.json')
    for file in files:
        with open(file) as f:
            arr = ujson.load(f)
            full_class_name = os.path.splitext(os.path.basename(file))[0]
            simple_class_name = full_class_name.split('.')[-1]
            for caveat in arr:
                if not caveat['deprecated'] and 'name' in caveat:
                    for sentence in caveat['sentences']:
                        caveats_list.append({
                            'simple_class_name': simple_class_name,
                            'full_class_name': full_class_name,
                            'api': caveat['name'],
                            'sentence': sentence,
                            'type': 'body'
                        })

                    # add all misc level sentences
                    for misc_obj in caveat['caveat_misc']:
                        if misc_obj['name'] in ['Parameters:', 'Throws:']:
                            for obj in misc_obj['list']:
                                for misc_sentence in obj['sentences']:
                                    caveats_list.append({
                                        'simple_class_name': simple_class_name,
                                        'full_class_name': full_class_name,
                                        'api': caveat['name'],
                                        'sentence': misc_sentence,
                                        'type': 'misc'
                                    })

                        else:
                            for s in misc_obj['list']:
                                caveats_list.append({
                                        'simple_class_name': simple_class_name,
                                        'full_class_name': full_class_name,
                                        'api': caveat['name'],
                                        'sentence': s,
                                        'type': 'misc'
                                    })

    return pd.DataFrame(caveats_list)

caveats = load_caveats()
print('Number of caveat sentences: {}'.format(len(caveats.index)))

Number of caveat sentences: 73831


In [10]:
caveats.head()

Unnamed: 0,api,full_class_name,sentence,simple_class_name,type
0,getBlockedTime,java.lang.management.ThreadInfo,This method returns -1 if thread contention mo...,ThreadInfo,body
1,getBlockedTime,java.lang.management.ThreadInfo,This statistic is reset when the thread conten...,ThreadInfo,body
2,getBlockedTime,java.lang.management.ThreadInfo,the approximate accumulated elapsed time in mi...,ThreadInfo,misc
3,getBlockedTime,java.lang.management.ThreadInfo,if the Java virtual machine does not support t...,ThreadInfo,misc
4,getWaitedTime,java.lang.management.ThreadInfo,This method returns -1 if thread contention mo...,ThreadInfo,body


In [34]:
caveats['preprocessed'] = caveats['sentence'].apply(lambda x: preprocess(x))
caveats['tokens'] = caveats['preprocessed'].map(lambda x: tokenize(x))

In [8]:
# Concatenate text in the issue all comments for each issue
# issue_to_concat_text = {}
# for index, row in code_issues_df.iterrows():
#     assoc_comments_df = comments_df[comments_df['issue_id'] == row['id']]
    
#     concat_text = row['title'] + row['body']
    
#     for j, comment_row in assoc_comments_df.iterrows():
#         concat_text += row['body']
        
#     issue_to_concat_text[row['id']] = concat_text

In [14]:
# Identify the relevant issues for all caveats
# apis = set()
# for caveat in caveats_list:
#     apis.add((caveat['simple_class_name'], caveat['api']))
    
# print('Number of apis: {}'.format(len(apis)))
    
# def get_relevant_issues(api_tuple, issue_to_text_dict):
#     relevant = []
#     for id in issue_to_text_dict:
#         if api_tuple[0] in issue_to_text_dict[id]:
#             if (api_tuple[0] == api_tuple[1]) or api_tuple[1] in issue_to_text_dict[id]:
#                 relevant.append(id)
            
#     return relevant

# p = multiprocessing.Pool(2)
# relevant_issues = p.starmap(get_relevant_issues, zip(get_relevant_issues, repeat(issue_to_concat_text)))

Number of apis: 21942


Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlo

In [17]:
# with open('./output/relevant_issues_for_caveats.jsonl', 'w+') as f:
#     for i, api in enumerate(apis):
#         f.write(ujson.dumps({'class': api[0], 'api': api[1], 'issue_ids': relevant_issues[i]}) + '\n')

In [39]:
def tokenize_sentence_list(sent_list):
    return [tokenize(x) for x in sent_list]

def calculate_preprocessed_comment_sentences(df):
    df['preprocessed_comments'] = df['body'].map(lambda x: preprocess(x))
    print('Preprocessed all comments...')
    
    df['tokenised_para'] = df['preprocessed_comments'].map(lambda x: doc_tokenize(x))
    print('Tokenized all paragraphs...')
    
    df['sentences'] = df['preprocessed_comments'].map(lambda x: sent_tokenize(x))
    print('Completed sentence tokenization...')
    
    df['tokenised_sentences'] = df['sentences'].map(lambda x: tokenize_sentence_list(x))
    print('Tokenized all sentences...')
                            
calculate_preprocessed_comment_sentences(block_comments_df)

Preprocessed all comments...
Tokenized all paragraphs...
Completed sentence tokenization...
Tokenized all sentences...


In [40]:
# Get all comments that contain non-empty preprocessed sentences
preprocessed_comments_df = block_comments_df[block_comments_df.astype(str)['tokenised_sentences'] != '[]']
print(len(block_comments_df.index))
print(len(preprocessed_comments_df.index))

290019
286010


In [41]:
# write the comment sentences to file, alongside relevant info to retrieve df row later
with open('./output/issue-comment-sentences-new.txt', 'w+') as f_out_sents, \
    open('./output/associated-sentence-df-index-new.txt', 'w+') as f_out_index, \
    open('./output/associated-sentence-issue-ids-new.txt', 'w+') as f_out_id:
        
    sent_str = ''
    index_str = ''
    id_str = ''
    
    c = 0
    for i in preprocessed_comments_df.index:
        issue_id = preprocessed_comments_df.loc[i, 'issue_id']
        sentences = preprocessed_comments_df.loc[i, 'tokenised_sentences']
        for sentence in sentences:
            sent_str += ' '.join(sentence) + '\n'
            index_str += str(i) + '\n'
            id_str += str(issue_id) + '\n'
            c += 1
            
            if c >= 100000:
                f_out_sents.write(sent_str)
                f_out_index.write(index_str)
                f_out_id.write(id_str)
                sent_str = ''
                index_str = ''
                id_str = ''
                
                c = 0
    
    if len(sent_str) > 0:
        f_out_sents.write(sent_str)
    if len(index_str) > 0:
        f_out_index.write(index_str)
    if len(id_str) > 0:
        f_out_id.write(id_str)    

In [42]:
cores = multiprocessing.cpu_count()

start = time.clock()
model = word2vec.Word2Vec(PathLineSentences('./output/issue-comment-sentences-new.txt'), size=100, window=5, min_count=5, workers=cores-1, iter=1, sg=1)
end =time.clock()
training_time=end-start
print('end training and cost ' + str(training_time)+ ' s')

end training and cost 69.08170100000007 s


In [43]:
# save word2vec model
model.save('./output/word2vec-new.model')
model.wv.save_word2vec_format('./output/word2vec-new.txt')

In [44]:
# calculate idf
idf = {}
with open('./output/issue-comment-sentences-new.txt') as f:
    start = time.clock()
    lines = f.readlines()
    print('Finished reading sentences from file...')    
    vocab = list(Word2Vec.load('./output/word2vec-new.model').wv.vocab.keys())
    N = len(lines)
    docs = [sentence.split() for sentence in lines]
    
    for doc in docs:
        for word in set(doc):
            if word not in idf:
                idf[word] = 1
            else:
                idf[word] += 1
    
    for word in idf:
        idf[word] = math.log(N / float(idf[word] + 1))
    
    end = time.clock()
    training_time=end-start
    print('IDF computation time: {}s'.format(training_time))

Finished reading sentences from file...
IDF computation time: 5.940250999999989s


In [45]:
s_avg = 0 # avg doc length
with open('./output/issue-comment-sentences-new.txt','r', encoding='utf-8') as f:
    lines = f.readlines()
    
    doc_lengths = [len(line.split()) for line in lines]
    s_avg = sum(doc_lengths) / len(doc_lengths)
    print("average document length: {}".format(s_avg)) 

# Calculate combination scores of word2vec and bm25
def bm25(doc, s2, idf):
    score = 0
    k1 = 1.5
    b = 0.75

    for w in doc:
        idf_s = idf.get(w, 1)
        bm25_ra = s2.count(w) * (k1 + 1)
        bm25_rb = s2.count(w) + k1 * (1 - b + b * len(s2) / s_avg)
        score += idf_s * (bm25_ra / bm25_rb)
    return score

def compute(s1, s2, voc):   
    v2 = np.array([voc[s] for s in s2 if s in voc])
    v2 = v2.sum(axis=0)

    v1 = np.array([voc[s] for s in s1 if s in voc])
    v1 = v1.sum(axis=0)
    
    return 1 - spatial.distance.cosine(v1, v2)

def cosine(sentences, s2, voc):
    s1_df_score = pd.Series(sentences)
    s1_df_score = s1_df_score.map(lambda x: compute(x, s2, voc))
    
    s1_df_score.dropna(inplace=True)
    return s1_df_score.sort_values(ascending=False).head(1)
        
def load_voc(file_voc):
    vector_file = codecs.open(file_voc, 'r', encoding='utf-8')
    line = vector_file.readline()
    voc_size, vec_dim = map(int, line.split(' '))
    embedding = dict()
    line = vector_file.readline()
    while line:
        items = line.split(' ')
        item = items[0]
        vec = np.array(items[1:], dtype='float32')
        embedding[item] = vec
        line = vector_file.readline()
    return embedding

average document length: 10.688945395875214


In [46]:
vocab = load_voc('./output/word2vec-new.txt')
    
relevant_issues = {}
with open('./output/relevant_issues_for_caveats.jsonl') as f:
    for line in f:
        obj = ujson.loads(line)
        relevant_issues[(obj['class'], obj['api'])] = obj['issue_ids']
print('Number of APIs with at least 1 relevant issue: {}'.format(len(relevant_issues)))

Number of APIs with at least 1 relevant issue: 21942


In [47]:
# TF-IDF score computation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_tf_idf_query_similarity(vectorizer, vecs, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_tfidf, vecs).flatten()
    return sim_scores

In [48]:
tf_idf_vectorizer = None
tf_idf_vectors = None
with open('./output/issue-comment-sentences-new.txt') as f:
    docs = f.readlines()
    
    tf_idf_vectorizer = TfidfVectorizer(lowercase=None)
    tf_idf_vectors = tf_idf_vectorizer.fit_transform(docs)
    print("Computed TF-IDF vectors for all documents...")

Computed TF-IDF vectors for all documents...


In [49]:
# create dicts to map issue sentences to metadata
issue_id_to_sentence_index = {}
with open('./output/associated-sentence-issue-ids-new.txt') as f2:
    ids = f2.readlines()
    ids = [int(x) for x in ids]
    
    for index, id in enumerate(ids):
        if not id in issue_id_to_sentence_index:
            issue_id_to_sentence_index[id] = []
        issue_id_to_sentence_index[id].append(index)
        
sentence_index_to_df_index = {}
with open('./output/associated-sentence-df-index-new.txt') as f:
    indices = f.readlines()
    indices = [int(x) for x in indices]

    for index, df_index in enumerate(indices):
        sentence_index_to_df_index[index] = df_index

In [50]:
start = time.clock()

with open('./output/tfidf_results_new.jsonl', 'w+') as f:
    complete_tfidf_sim_results = ''
    c = 0
    # calculate scores for each caveat sentence
    for i in caveats.index:
        key = (caveats.loc[i,'simple_class_name'], caveats.loc[i,'api'])
        if key in relevant_issues:
            issue_ids = relevant_issues[key]

            if len(issue_ids) > 0:
                indices = []

                for id in issue_ids:
                    if id in issue_id_to_sentence_index:
                        indices += issue_id_to_sentence_index[id]

                if len(indices) > 0:
                    relevant_vecs = tf_idf_vectors[indices,:]
                    sim_scores = get_tf_idf_query_similarity(tf_idf_vectorizer, relevant_vecs, ' '.join(caveats.loc[i, 'tokens']))

                    scores = []
                    for j, score in enumerate(sim_scores):
                        if score > 0:
                            comment_index = sentence_index_to_df_index[indices[j]]
                            scores.append({
                                'score': score,
                                'sentence_index': indices[j]
                            })

                    if len(scores) > 0:
                        complete_tfidf_sim_results += ujson.dumps({
                            'simple_class_name': key[0],
                            'api': key[1],
                            'caveat_sentence': caveats.loc[i, 'sentence'],
                            'caveat_sentence_id': i,
                            'tfidf_sim_scores': scores
                        }) + '\n'

                        c += 1

                        if c >= 2000:
                            f.write(complete_tfidf_sim_results)

                            c = 0
                            complete_tfidf_sim_results = ''
                        
    if len(complete_tfidf_sim_results) > 0:
        f.write(complete_tfidf_sim_results)
           
    end = time.clock()
    training_time=end-start
    print('Cosine similarity for TF-IDF vectors computation time: {}s'.format(training_time))

Cosine similarity for TF-IDF vectors computation time: 306.95871099999977s


In [51]:
with open('./output/combined_sim_results_new.jsonl', 'w+') as f_combo_out, \
    open('./output/word2vec_results_new.jsonl', 'w+') as f_w2v_out, \
    open('./output/bm25_results_new.jsonl', 'w+') as f_bm25_out, \
    open('./output/ir_error_log_new.jsonl', 'w+') as f_err:
        
    start = time.clock()
    complete_combined_results = ''
    complete_bm25_results = ''
    complete_word2vec_results = ''
    errors = ''
    c=0
    
    for i in caveats.index:
        key = (caveats.loc[i,'simple_class_name'], caveats.loc[i,'api'])
        if key in relevant_issues:
            issue_ids = relevant_issues[key]

            if len(issue_ids) > 0:
                try:
                    caveat_sent = caveats.loc[i,'tokens']
                    combined_sim_results = []
                    bm25_results = []
                    word2vec_results = []

                    # retrieve issue comment sentences that are relevant
                    relevant_comments_df = preprocessed_comments_df[preprocessed_comments_df['issue_id'].isin(issue_ids)]

                    sim_w2v = relevant_comments_df['tokenised_sentences'].apply(cosine, s2=caveat_sent, voc=vocab)
                    sim_bm25 = relevant_comments_df['tokenised_para'].apply(bm25, s2=caveat_sent, idf=idf)

                    sim_bm25 = (sim_bm25 - sim_bm25.min()) / (sim_bm25.max() - sim_bm25.min())
                    sim_bm25 = pd.to_numeric(sim_bm25, downcast='float')
                    sim_w2v = pd.to_numeric(sim_w2v[0], downcast='float')

                    if len(sim_bm25) != 0 or len(sim_w2v) != 0:
                        # word2vec cosine similarity
                        for j in sim_w2v.index:
                            word2vec_results.append({
                                'score': str(sim_w2v[j]),
                                'issue_number': int(relevant_comments_df.loc[j, 'issue_id']),
                                'comment': relevant_comments_df.loc[j,'body']
                            })

                        # bm25 score
                        for j in sim_bm25.index:
                            bm25_results.append({
                                'score': str(sim_bm25[j]),
                                'issue_number': int(relevant_comments_df.loc[j, 'issue_id']),
                                'comment': relevant_comments_df.loc[j,'body']
                            })

                        # calculate combination similarity score
                        combined_sim = 0.5 * sim_bm25.add(0.5 * sim_w2v, fill_value=0)
                        for j in combined_sim.index:
                            combined_sim_results.append({
                                'issue_number': int(relevant_comments_df.loc[j,'issue_id']),
                                'comment': relevant_comments_df.loc[j,'body'],
                                'score': str(combined_sim[j]),
                            })

                        # Write results to relevant files
                        if len(word2vec_results) > 0:
                            complete_word2vec_results += ujson.dumps({
                                'simple_class_name': key[0],
                                'api': key[1],
                                'caveat_sentence': caveats.loc[i, 'sentence'],
                                'caveat_sentence_id': i,
                                'w2v_results': word2vec_results
                            }) + '\n'

                        if len(bm25_results) > 0:
                            complete_bm25_results += ujson.dumps({
                                'simple_class_name': key[0],
                                'api': key[1],
                                'caveat_sentence': caveats.loc[i, 'sentence'],
                                'caveat_sentence_id': i,
                                'bm25_results': bm25_results
                            }) + '\n'

                        if len(combined_sim_results) > 0:
                            complete_combined_results += ujson.dumps({
                                'simple_class_name': key[0],
                                'api': key[1],
                                'caveat_sentence': caveats.loc[i, 'sentence'],
                                'caveat_sentence_id': i,
                                'combination_results': combined_sim_results
                            }) + '\n'
                        
                        c+=1
                        
                        # write all results to file
                        if c >= 2000:
                            c = 0
                            
                            f_combo_out.write(complete_combined_results)
                            f_bm25_out.write(complete_bm25_results)
                            f_w2v_out.write(complete_word2vec_results)
                            
                            # reset output strings
                            complete_combined_results = ''
                            complete_bm25_results = ''
                            complete_word2vec_results = ''
                            
                            break
                            
                except Exception as e:
                    errors += ujson.dumps({'caveat_index': i, 'error': e}) + '\n'
    
    # write any buffered results remaining
    if len(complete_combined_results) > 0:
        f_combo_out.write(complete_combined_results)
    if len(complete_bm25_results) > 0:
        f_bm25_out.write(complete_bm25_results)
    if len(complete_word2vec_results) > 0:
        f_w2v_out.write(complete_word2vec_results)
        
    # write error log
    f_err.write(errors)
    
    end = time.clock()
    training_time=end-start
    print('Similarity computation time: {}s'.format(training_time))

Similarity computation time: 1137.973101s
