In [1]:
import json
import ujson
import glob
import time
import multiprocessing
import re
import os
import time
from gensim.models import word2vec, Word2Vec
from gensim.models.word2vec import PathLineSentences
from itertools import repeat
from scipy import spatial
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np
import codecs
from nltk.corpus import stopwords
import math
import functools
import operator
from random import sample

stopwords = set(stopwords.words('english'))

custom_stopwords = [s.split("'")[0] for s in stopwords]
custom_stopwords = [s for s in custom_stopwords if len(s) > 1]
print(custom_stopwords)

['don', 'more', 'don', 'theirs', 'those', 'does', 'you', 'there', 'needn', 'herself', 'doesn', 'over', 'his', 'was', 'she', 'after', 'he', 'our', 'her', 'just', 'will', 'of', 'same', 'against', 'during', 'down', 'them', 'ourselves', 'below', 'into', 'some', 'hadn', 'shouldn', 'has', 'when', 'who', 'such', 'than', 'you', 'you', 'having', 'here', 'shan', 'whom', 'doesn', 'won', 'most', 'these', 'other', 'each', 'so', 'should', 'you', 'wasn', 'wasn', 'it', 'didn', 'its', 'any', 'this', 'between', 'shouldn', 'isn', 'do', 'did', 'aren', 'an', 'itself', 'how', 'too', 'for', 'off', 'couldn', 'is', 'she', 'at', 'which', 'all', 'what', 'it', 'myself', 'then', 'mightn', 'few', 'weren', 'that', 'can', 'only', 'while', 'had', 'being', 'no', 'under', 'himself', 'up', 'weren', 'about', 'hers', 'my', 'both', 'didn', 'be', 'on', 'll', 'hasn', 'through', 'that', 'yourselves', 'haven', 'why', 'are', 'if', 'your', 'from', 'in', 'by', 'couldn', 'ma', 'their', 'and', 'hadn', 'me', 're', 'now', 'we', 'they'

In [2]:
# load the extracted java-related GitHub data
def load_github_issues():
    with open('./output/issue-comments-revised.jsonl') as issue_comments_f:
        issue_list = []
        comments_list = []

        for line in issue_comments_f:
            obj = ujson.loads(line)

            comments_list.append({
                'body': obj['body'],
                'repo_name': obj['repo_name'],
                'html_url': obj['html_url'],
                'issue_id': obj['issue']['id']
            })

            issue = obj['issue']
            issue['repo_name'] = obj['repo_name']
            issue_list.append(issue)

        issues_df = pd.DataFrame(issue_list)
        issues_df = issues_df.drop_duplicates(subset=['id'])
        comments_df = pd.DataFrame(comments_list)
        
        return (issues_df, comments_df)


issues_df, comments_df = load_github_issues()
print("Number of issues: {}".format(len(issues_df.index)))
print("Number of comments: {}".format(len(comments_df.index)))

Number of issues: 627450
Number of comments: 1855870


In [3]:
# find the issues that contain code blocks
code_issues_df = issues_df.dropna(subset=['body'])
code_issues_df = code_issues_df[code_issues_df['body'].str.contains('```')]
print("Number of issues with code block/s: {}".format(len(code_issues_df.index)))

Number of issues with code block/s: 85318


In [4]:
code_issues_df.head()

Unnamed: 0,body,html_url,id,number,repo_name,title
2,https://github.com/jooby-project/jooby/blob/ma...,https://github.com/jooby-project/jooby/issues/965,284439800,965,jooby-project/jooby,2nd thymeleaf code snippet (in the documentati...
3,When updating Spotless from `3.6.0` to `3.7.0`...,https://github.com/diffplug/spotless/issues/182,285279535,182,diffplug/spotless,Unable to store input properties... when upgra...
10,I'm using Google Guava [Preconditions](https:/...,https://github.com/uber/NullAway/issues/47,268767363,47,uber/NullAway,NullAway doesn't recognize Guava Preconditions...
19,> Citing added javadoc:\r\n\r\nImmutables appl...,https://github.com/immutables/immutables/issue...,285281793,740,immutables/immutables,Style-level fence for annotation classpath aut...
33,"Hi, i try to build ffmpeg for windows 32 bit \...",https://github.com/bytedeco/javacpp-presets/is...,285279281,503,bytedeco/javacpp-presets,Build for windows-x86 not working


In [5]:
block_comments_df = comments_df[comments_df['issue_id'].isin(code_issues_df['id'])].copy()
print('Number of comments for issues with code block/s: {}'.format(len(block_comments_df.index)))

Number of comments for issues with code block/s: 290019


In [6]:
def preprocess(doc):
    # remove all code blocks
    doc = re.sub(r'```([^```]*)```', '', doc)
            
    # remove urls
    doc = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',             
        '', doc)
    
    # remove line break characters
    doc = re.sub(r'[\r\n]', ' ', doc)
    
    # remove apostrophes/suffixes
    doc = re.sub(r"'\w |\w' ", ' ', doc)
    
    # remove numbers
    doc = re.sub(r'(\d\.?)+', ' ', doc)
        
    # replace all punctuation except for full stop with space
    doc = re.sub(r'[^A-Za-z\.]', ' ', doc)
        
    # normalise full stops
    doc = re.sub(r'\s\.\.+', '.', doc)
    
    # remove more than 1 whitespace
    doc = re.sub('\s\s+', ' ', doc)
    
    # remove leading and trailing whitespace
    doc = doc.strip()
    
    return doc

def sent_tokenize(doc):
    docs = doc.split('. ')
    docs = [t for t in docs if t != '']
    return docs

def doc_tokenize(doc):
    doc = re.sub('\.', ' ', doc) # remove full stops
    
    tokens = [t.lower() for t in doc.split() if len(t) > 1]
    tokens = [t for t in tokens if t not in custom_stopwords]
    return tokens

def tokenize(sentence):
    sentence = re.sub('\.', ' ', sentence) # remove full stops
    
    tokens = sentence.split()
    tokens = [t.lower() for t in tokens if len(t) > 1]
    tokens = [t for t in tokens if t not in custom_stopwords]
    return tokens

In [8]:
# load the non-dreprecated java doc caveat sentences
def load_caveats():
    caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
    caveats_list = []

    files = glob.glob(caveat_files_dir + '*.json')
    for file in files:
        with open(file) as f:
            arr = ujson.load(f)
            full_class_name = os.path.splitext(os.path.basename(file))[0]
            simple_class_name = full_class_name.split('.')[-1]
            for caveat in arr:
                if not caveat['deprecated'] and 'name' in caveat:
                    for sentence in caveat['sentences']:
                        caveats_list.append({
                            'simple_class_name': simple_class_name,
                            'full_class_name': full_class_name,
                            'api': caveat['name'],
                            'sentence': sentence,
                            'type': 'body'
                        })

                    # add all misc level sentences
                    for misc_obj in caveat['caveat_misc']:
                        if misc_obj['name'] in ['Parameters:', 'Throws:']:
                            for obj in misc_obj['list']:
                                for misc_sentence in obj['sentences']:
                                    sentence = ''
                                    if misc_obj['name'] == 'Parameters:':
                                        sentence = obj['parameter'] + ' ' + misc_sentence
                                    else:
                                        sentence = obj['exception'] + ' ' + misc_sentence
                                    caveats_list.append({
                                        'simple_class_name': simple_class_name,
                                        'full_class_name': full_class_name,
                                        'api': caveat['name'],
                                        'sentence': sentence,
                                        'type': 'misc'
                                    })

                        else:
                            for s in misc_obj['list']:
                                caveats_list.append({
                                        'simple_class_name': simple_class_name,
                                        'full_class_name': full_class_name,
                                        'api': caveat['name'],
                                        'sentence': s,
                                        'type': 'misc'
                                    })

    return pd.DataFrame(caveats_list)

caveats = load_caveats()
print('Number of caveat sentences: {}'.format(len(caveats.index)))

Number of caveat sentences: 73831


In [9]:
caveats['preprocessed'] = caveats['sentence'].apply(lambda x: preprocess(x))
caveats['tokens'] = caveats['preprocessed'].map(lambda x: tokenize(x))

In [279]:
caveats.head()

Unnamed: 0,api,full_class_name,sentence,simple_class_name,type,preprocessed,tokens
0,getBlockedTime,java.lang.management.ThreadInfo,This method returns -1 if thread contention mo...,ThreadInfo,body,This method returns if thread contention monit...,"[method, returns, thread, contention, monitori..."
1,getBlockedTime,java.lang.management.ThreadInfo,This statistic is reset when the thread conten...,ThreadInfo,body,This statistic is reset when the thread conten...,"[statistic, reset, thread, contention, monitor..."
2,getBlockedTime,java.lang.management.ThreadInfo,the approximate accumulated elapsed time in mi...,ThreadInfo,misc,the approximate accumulated elapsed time in mi...,"[approximate, accumulated, elapsed, time, mill..."
3,getBlockedTime,java.lang.management.ThreadInfo,UnsupportedOperationException if the Java virt...,ThreadInfo,misc,UnsupportedOperationException if the Java virt...,"[unsupportedoperationexception, java, virtual,..."
4,getWaitedTime,java.lang.management.ThreadInfo,This method returns -1 if thread contention mo...,ThreadInfo,body,This method returns if thread contention monit...,"[method, returns, thread, contention, monitori..."


In [42]:
block_comments_df.head()

Unnamed: 0,body,html_url,issue_id,repo_name,preprocessed_comments,tokenised_para,sentences,tokenised_sentences
2,Just an example to see how to use the `thymel...,https://github.com/jooby-project/jooby/issues/...,284439800,jooby-project/jooby,Just an example to see how to use the thymelea...,"[example, see, use, thymeleaf, api, need, test...",[Just an example to see how to use the thymele...,"[[example, see, use, thymeleaf, api, need, tes..."
3,@nedtwigg Would it be practical for you to upg...,https://github.com/diffplug/spotless/issues/18...,285279535,diffplug/spotless,nedtwigg Would it be practical for you to upgr...,"[nedtwigg, would, practical, upgrade, propriet...",[nedtwigg Would it be practical for you to upg...,"[[nedtwigg, would, practical, upgrade, proprie..."
7,"Nope - 4.4.1 brings a different, unrelated pro...",https://github.com/diffplug/spotless/issues/18...,285279535,diffplug/spotless,Nope brings a different unrelated problem,"[nope, brings, different, unrelated, problem]",[Nope brings a different unrelated problem],"[[nope, brings, different, unrelated, problem]]"
10,`Objects.requireNonNull` should also be suppor...,https://github.com/uber/NullAway/issues/47#iss...,268767363,uber/NullAway,Objects.requireNonNull should also be supported.,"[objects, requirenonnull, also, supported]",[Objects.requireNonNull should also be support...,"[[objects, requirenonnull, also, supported]]"
14,"Darn, that's a shame. :<\r\n\r\nNothing comes ...",https://github.com/diffplug/spotless/issues/18...,285279535,diffplug/spotless,Darn that a shame. Nothing comes to mind yet b...,"[darn, shame, nothing, comes, mind, yet, might...","[Darn that a shame, Nothing comes to mind yet ...","[[darn, shame], [nothing, comes, mind, yet, mi..."


In [7]:
def tokenize_sentence_list(sent_list):
    return [tokenize(x) for x in sent_list]

def calculate_preprocessed_comment_sentences(df):
    df['preprocessed_comments'] = df['body'].map(lambda x: preprocess(x))
    print('Preprocessed all comments...')
    
    df['tokenised_para'] = df['preprocessed_comments'].map(lambda x: doc_tokenize(x))
    print('Tokenized all paragraphs...')
    
    df['sentences'] = df['preprocessed_comments'].map(lambda x: sent_tokenize(x))
    print('Completed sentence tokenization...')
    
    df['tokenised_sentences'] = df['sentences'].map(lambda x: tokenize_sentence_list(x))
    print('Tokenized all sentences...')


# calculate_preprocessed_comment_sentences(block_comments_df)

Preprocessed all comments...
Tokenized all paragraphs...
Completed sentence tokenization...
Tokenized all sentences...


In [8]:
calculate_preprocessed_comment_sentences(comments_df)
print(len(comments_df.index))

Preprocessed all comments...
Tokenized all paragraphs...
Completed sentence tokenization...
Tokenized all sentences...
1855870


In [11]:
# Get all comments that contain non-empty preprocessed sentences
preprocessed_comments_df = block_comments_df[block_comments_df.astype(str)['tokenised_sentences'] != '[]'].copy()
preprocessed_comments_df['original_index'] = preprocessed_comments_df.index
print(len(block_comments_df.index))
print(len(preprocessed_comments_df.index))

290019
286010


In [12]:
# Determine which comments are relevant for each caveat
# Note: apply a class-name-must-also-appear-in-text restriction on apis that are found 
# in at least <ambigious_cutoff> comments to reduce computation later in bm25/w2v

class_and_apis = set()
relevant_comments_dict = {} # map apis to number of relevant comments
ambigious_cutoff = 1000 # number of comments before an api is considered ambiguous

for i in caveats.index:
    pair = (re.sub('<.*', '', caveats.loc[i, 'simple_class_name'].lower()), caveats.loc[i, 'api'].lower())
    class_and_apis.add(pair)

classes = set([a for a, b in class_and_apis])
apis = set([b for a, b in class_and_apis])

start = time.clock()
for i in preprocessed_comments_df.index:
    tokens = preprocessed_comments_df.loc[i, 'tokenised_para']
    
    for token in tokens:
        if token in apis:
            if not token in relevant_comments_dict:
                relevant_comments_dict[token] = set()
            relevant_comments_dict[token].add(i)
print("Completed relevant comment map for all APIs...")

ambiguous_apis = {} # map apis to list of possible classes
for api in relevant_comments_dict:
    if len(relevant_comments_dict[api]) > ambigious_cutoff:
        for c in classes:
            if (c, api) in class_and_apis:
                if not api in ambiguous_apis:
                    ambiguous_apis[api] = set()
                ambiguous_apis[api].add(c)
print("Total of {} ambiguous APIs found...".format(len(ambiguous_apis)))
                
relevant_comments_dict = {} # map api to set of relevant comment indices
restricted_relevant_comments_dict = {} # map <class, api> pairs to set of relevant comment indices
         
for i in preprocessed_comments_df.index:
    tokens = preprocessed_comments_df.loc[i, 'tokenised_para']
    
    for token in tokens:
        if token in apis:
            if token in ambiguous_apis:
                for c in ambiguous_apis[token]:
                    if c in tokens:
                        if not (c, token) in restricted_relevant_comments_dict:
                            restricted_relevant_comments_dict[(c, token)] = set()
                        restricted_relevant_comments_dict[(c, token)].add(i)
            else:
                if not token in relevant_comments_dict:
                    relevant_comments_dict[token] = set()
                relevant_comments_dict[token].add(i)
            
print("Created mappings for all APIs to relevant comments...")
            
end =time.clock()
training_time=end-start
print('Relevant comments for each caveat calculated in ' + str(training_time)+ ' s')

Completed relevant comment map for all APIs...
Total of 213 ambiguous APIs found...
Created mappings for all APIs to relevant comments...
Relevant comments for each caveat calculated in 68.78327499999999 s


In [13]:
print(len(class_and_apis))
print(len(apis))

21932
11215


In [14]:
# limit number of comments considered relevant
for key in restricted_relevant_comments_dict:
    if len(restricted_relevant_comments_dict[key]) >= ambigious_cutoff:
        restricted_relevant_comments_dict[key] = \
            sample(restricted_relevant_comments_dict[key], ambigious_cutoff)

In [352]:
# write the comment sentences to file, alongside relevant info to retrieve df row later
with open('./output/issue-comment-sentences-new.txt', 'w+') as f_out_sents, \
    open('./output/comment_index_to_sentence_index.jsonl', 'w+') as f_out_index:
        
    sent_str = ''
    index_str = ''
    
    c = 0
    for i in preprocessed_comments_df.index:
        sentences = preprocessed_comments_df.loc[i, 'tokenised_sentences']
        sentence_indices = []
        for sentence in sentences:
            sent_str += ' '.join(sentence) + '\n'
            sentence_indices.append(c)
            c += 1
            
        index_str += ujson.dumps({'comment_index': i, 'sentence_indices': sentence_indices}) + '\n'
    
    f_out_sents.write(sent_str)
    f_out_index.write(index_str)  

In [234]:
cores = multiprocessing.cpu_count()

start = time.clock()
model = word2vec.Word2Vec(PathLineSentences('./output/issue-comment-sentences-new.txt'), size=100, window=5, min_count=5, workers=cores-1, iter=1, sg=1)
end =time.clock()
training_time=end-start
print('end training and cost ' + str(training_time)+ ' s')

end training and cost 69.31662900000083 s


In [235]:
# save word2vec model
model.save('./output/word2vec-new.model')
model.wv.save_word2vec_format('./output/word2vec-new.txt')

In [17]:
# calculate idf
idf = {}
with open('./output/issue-comment-sentences-new.txt') as f:
    start = time.clock()
    lines = f.readlines()
    print('Finished reading sentences from file...')    
    vocab = list(Word2Vec.load('./output/word2vec-new.model').wv.vocab.keys())
    N = len(lines)
    docs = [sentence.split() for sentence in lines]
    
    for doc in docs:
        for word in set(doc):
            if word not in idf:
                idf[word] = 1
            else:
                idf[word] += 1
    
    for word in idf:
        idf[word] = math.log(N / float(idf[word] + 1))
    
    end = time.clock()
    training_time=end-start
    print('IDF computation time: {}s'.format(training_time))

Finished reading sentences from file...
IDF computation time: 4.449582000000021s


In [18]:
s_avg = 0 # avg doc length
with open('./output/issue-comment-sentences-new.txt','r', encoding='utf-8') as f:
    lines = f.readlines()
    
    doc_lengths = [len(line.split()) for line in lines]
    s_avg = sum(doc_lengths) / len(doc_lengths)
    print("average document length: {}".format(s_avg)) 

# Calculate combination scores of word2vec and bm25
def bm25(doc, s2, idf):
    score = 0
    k1 = 1.5
    b = 0.75

    for w in doc:
        idf_s = idf.get(w, 1)
        bm25_ra = s2.count(w) * (k1 + 1)
        bm25_rb = s2.count(w) + k1 * (1 - b + b * len(s2) / s_avg)
        score += idf_s * (bm25_ra / bm25_rb)
    return score

def compute(s1, s2, voc):   
    v2 = np.array([voc[s] for s in s2 if s in voc])
    v2 = v2.sum(axis=0)

    v1 = np.array([voc[s] for s in s1 if s in voc])
    v1 = v1.sum(axis=0)
    
    return 1 - spatial.distance.cosine(v1, v2)

def cosine(sentences, s2, voc):
    s1_df_score = pd.Series(sentences)
    s1_df_score = s1_df_score.map(lambda x: compute(x, s2, voc))
    
    s1_df_score.dropna(inplace=True)
    return s1_df_score.sort_values(ascending=False).head(1)
        
def load_voc(file_voc):
    vector_file = codecs.open(file_voc, 'r', encoding='utf-8')
    line = vector_file.readline()
    voc_size, vec_dim = map(int, line.split(' '))
    embedding = dict()
    line = vector_file.readline()
    while line:
        items = line.split(' ')
        item = items[0]
        vec = np.array(items[1:], dtype='float32')
        embedding[item] = vec
        line = vector_file.readline()
    return embedding

average document length: 10.688945395875214


In [19]:
# TF-IDF score computation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_tf_idf_query_similarity(vectorizer, vecs, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_tfidf, vecs).flatten()
    return sim_scores

In [20]:
vocab = load_voc('./output/word2vec-new.txt')
tf_idf_vectorizer = None
tf_idf_vectors = None
with open('./output/issue-comment-sentences-new.txt') as f:
    docs = f.readlines()
    
    tf_idf_vectorizer = TfidfVectorizer(lowercase=None)
    tf_idf_vectors = tf_idf_vectorizer.fit_transform(docs)
    print("Computed TF-IDF vectors for all documents...")

Computed TF-IDF vectors for all documents...


In [21]:
# create dict to map issue sentences to metadata
comment_index_to_sentence_indices = {}
sentence_index_to_comment_index = {}
with open('./output/comment_index_to_sentence_index.jsonl') as f:
    for line in f:
        d = ujson.loads(line)
        comment_index_to_sentence_indices[d['comment_index']] = d['sentence_indices']
        
        for index in d['sentence_indices']:
            sentence_index_to_comment_index[index] = d['comment_index']

In [23]:
start = time.clock()

with open('./output/tfidf_results.jsonl', 'w+') as f:
    complete_tfidf_sim_results = ''
    c = 0
    # calculate scores for each caveat sentence
    for i in caveats.index:       
        api = caveats.loc[i,'api'].lower()
        class_name = re.sub('<.*', '', caveats.loc[i, 'simple_class_name'].lower())
        pair = (class_name, api)
        has_relevant_comments = False
        comment_indices = []
        
        if api in relevant_comments_dict:
            comment_indices = relevant_comments_dict[api]
            has_relevant_comments = True
        elif pair in restricted_relevant_comments_dict:
            comment_indices = restricted_relevant_comments_dict[pair]
            has_relevant_comments = True
    
        comment_indices = list(comment_indices)
        
        if has_relevant_comments:
            indices = []
            for index in comment_indices:
                if index in comment_index_to_sentence_indices:
                    indices += comment_index_to_sentence_indices[index]

            if len(indices) > 0:
                relevant_vecs = tf_idf_vectors[indices,:]
                sim_scores = get_tf_idf_query_similarity(tf_idf_vectorizer, relevant_vecs, ' '.join(caveats.loc[i, 'tokens']))

                scores = {}
                for j, score in enumerate(sim_scores):
                    if score > 0:
                        comment_id = sentence_index_to_comment_index[indices[j]]
                        if not comment_id in scores or scores[comment_id]['score'] < score:
                            scores[comment_id] = {
                                'score': score,
                                'comment_id': comment_id
                            }

                if len(scores) > 0:
                    scores = [scores[key] for key in scores]
                    complete_tfidf_sim_results += ujson.dumps({
                        'caveat_id': i,
                        'scores': scores
                    }) + '\n'

                    c += 1

                    if c >= 2000:
                        f.write(complete_tfidf_sim_results)

                        c = 0
                        complete_tfidf_sim_results = ''

    if len(complete_tfidf_sim_results) > 0:
        f.write(complete_tfidf_sim_results)
           
    end = time.clock()
    training_time=end-start
    print('Cosine similarity for TF-IDF vectors computation time: {}s'.format(training_time))

Cosine similarity for TF-IDF vectors computation time: 88.99516799999998s


In [265]:
with open('./output/combined_sim_results.jsonl', 'w+') as f_combo_out, \
    open('./output/word2vec_results.jsonl', 'w+') as f_w2v_out, \
    open('./output/bm25_results.jsonl', 'w+') as f_bm25_out, \
    open('./output/ir_error_log.jsonl', 'w+') as f_err:
        
    start = time.clock()
    complete_combined_results = ''
    complete_bm25_results = ''
    complete_word2vec_results = ''
    errors = ''
    c=0
    
    for i in caveats.index:
        api = caveats.loc[i,'api'].lower()
        class_name = re.sub('<.*', '', caveats.loc[i, 'simple_class_name'].lower())
        pair = (class_name, api)
        has_relevant_comments = False
        comment_indices = []
        
        if api in relevant_comments_dict:
            comment_indices = relevant_comments_dict[api]
            has_relevant_comments = True
        elif pair in restricted_relevant_comments_dict:
            comment_indices = restricted_relevant_comments_dict[pair]
            has_relevant_comments = True
            
        comment_indices = list(comment_indices)
        
        if has_relevant_comments:
            try:
                caveat_sent = caveats.loc[i,'tokens']
                combined_sim_results = []
                bm25_results = []
                word2vec_results = []

                # retrieve issue comment sentences that are relevant
                relevant_comments_df = preprocessed_comments_df[preprocessed_comments_df.index.isin(comment_indices)]
                sim_w2v = relevant_comments_df['tokenised_sentences'].apply(cosine, s2=caveat_sent, voc=vocab)
                sim_bm25 = relevant_comments_df['tokenised_para'].apply(bm25, s2=caveat_sent, idf=idf)

                sim_bm25 = (sim_bm25 - sim_bm25.min()) / (sim_bm25.max() - sim_bm25.min())
                sim_bm25 = pd.to_numeric(sim_bm25, downcast='float')
                sim_w2v = pd.to_numeric(sim_w2v[0], downcast='float')

                if len(sim_bm25) != 0 or len(sim_w2v) != 0:
                    # word2vec cosine similarity
                    for j in sim_w2v.index:
                        if not np.isnan(sim_w2v[j]):
                            word2vec_results.append({
                                'score': float(sim_w2v[j]),
                                'comment_id': int(relevant_comments_df.loc[j, 'original_index'])
                            })

                    # bm25 score
                    for j in sim_bm25.index:
                        if not np.isnan(sim_bm25[j]):
                            bm25_results.append({
                                'score': float(sim_bm25[j]),
                                'comment_id': int(relevant_comments_df.loc[j, 'original_index'])
                            })
                    # calculate combination similarity score
                    combined_sim = 0.5 * sim_bm25.add(0.5 * sim_w2v, fill_value=0)
                    for j in combined_sim.index:
                        if not np.isnan(combined_sim[j]):
                            combined_sim_results.append({
                                'score': float(combined_sim[j]),
                                'comment_id': int(relevant_comments_df.loc[j, 'original_index'])
                            })
                    # Write results to relevant files
                    if len(word2vec_results) > 0:
                        complete_word2vec_results += ujson.dumps({
                            'caveat_id': i,
                            'scores': word2vec_results
                        }) + '\n'
                        
                    if len(bm25_results) > 0:
                        complete_bm25_results += ujson.dumps({
                            'caveat_id': i,
                            'scores': bm25_results
                        }) + '\n'

                    if len(combined_sim_results) > 0:
                        complete_combined_results += ujson.dumps({
                            'caveat_id': i,
                            'scores': combined_sim_results
                        }) + '\n'

                    c+=1

                    # write buffered results to file
                    if c >= 2000:
                        c = 0

                        f_combo_out.write(complete_combined_results)
                        f_bm25_out.write(complete_bm25_results)
                        f_w2v_out.write(complete_word2vec_results)

                        # reset output strings
                        complete_combined_results = ''
                        complete_bm25_results = ''
                        complete_word2vec_results = ''

            except Exception as e:
                errors += ujson.dumps({'caveat_index': i, 'error': e}) + '\n'
    
    # write any buffered results remaining
    if len(complete_combined_results) > 0:
        f_combo_out.write(complete_combined_results.strip())
    if len(complete_bm25_results) > 0:
        f_bm25_out.write(complete_bm25_results.strip())
    if len(complete_word2vec_results) > 0:
        f_w2v_out.write(complete_word2vec_results.strip())
        
    # write error log
    f_err.write(errors)
    
    end = time.clock()
    training_time=end-start
    print('Similarity computation time: {}s'.format(training_time))

Similarity computation time: 8282.487697999997s


In [24]:
ids_to_label = set()


def output_label_ready_file(results_path, doccano_path, output_complete_path, ids_to_label):
    with open(results_path) as f, open(doccano_path, 'w+') as f_out_docanno, \
            open(output_complete_path, 'w+') as f_out:
        
        c = 0
        results = []
        for line in f:
            obj = ujson.loads(line)
            c += len(obj['scores'])

            obj['scores'] = sorted(obj['scores'], key=lambda x: x['score'], reverse=True)
            obj['scores'] = obj['scores'][:3] # limit to 3 results per caveat
            results.append(obj)

        print('Number of results: {}'.format(len(results)))
        print('Total number of scores: {}'.format(c))
        to_label = sample(results, 384)

        for obj in to_label:
            for res in obj['scores']:
                comment_index = res['comment_id']
                caveat_index = obj['caveat_id']
                f_out.write(ujson.dumps({
                    'score': res['score'],
                    'comment': preprocessed_comments_df.loc[comment_index, 'body'],
                    'class': caveats.loc[caveat_index, 'simple_class_name'],
                    'api': caveats.loc[caveat_index,'api'],
                    'caveat': caveats.loc[caveat_index, 'sentence'],
                    'html_url': preprocessed_comments_df.loc[comment_index, 'html_url']
                }) + '\n')
                
                class_in_body = re.sub('<.*', '', caveats.loc[i, 'simple_class_name'].lower()) in preprocessed_comments_df.loc[comment_index, 'tokenised_para']
                f_out_docanno.write(ujson.dumps({
                    'text': 'contains class: ' + str(class_in_body) + '\nclass: ' +  caveats.loc[caveat_index, 'simple_class_name'] + '\napi: ' + caveats.loc[caveat_index,'api'] + \
                        '\n--------------------------------------\ncaveat: ' + caveats.loc[caveat_index,'sentence'] + \
                        '\n--------------------------------------\ncomment: '+ re.sub(r'```([^```]*)```', '', preprocessed_comments_df.loc[comment_index, 'body']),
                    'labels': ['non-relevant']
                }) + '\n')

In [25]:
# input paths
tfidf_path = './output/tfidf_results.jsonl'
w2v_path = './output/word2vec_results.jsonl'
bm25_path = './output/bm25_results.jsonl'
combo_path = './output/combined_sim_results.jsonl'

# doccano labelling paths
tfidf_docanno_path = './output/tfidf_to_label.jsonl'
w2v_docanno_path = './output/w2v_to_label.jsonl'
bm25_docanno_path = './output/bm25_to_label.jsonl'
combo_docanno_path = './output/combo_to_label.jsonl'

# output paths
tfidf_sample_path = './output/tfidf_sample.jsonl'
w2v_sample_path = './output/w2v_sample.jsonl'
bm25_sample_path = './output/bm25_sample.jsonl'
combo_sample_path = './output/combo_sample.jsonl'

output_label_ready_file(tfidf_path, tfidf_docanno_path, tfidf_sample_path)
# output_label_ready_file(w2v_path, w2v_docanno_path, w2v_sample_path)
# output_label_ready_file(bm25_path, bm25_docanno_path, bm25_sample_path)
# output_label_ready_file(combo_path, combo_docanno_path, combo_sample_path)

Number of results: 29028
Total number of scores: 1621408


In [35]:
tfidf_labelled_path = './output/labelled/tfidf.jsonl'
w2v_labelled_path = './output/labelled/w2v.jsonl'
bm25_labelled_path = './output/labelled/bm25.jsonl'
combo_labelled_path = './output/labelled/combo.jsonl'

tfidf_output_labelled_path = './labelled_data/tfidf.jsonl'
w2v_output_labelled_path = './labelled_data/w2v.jsonl'
bm25_output_labelled_path = './labelled_data/bm25.jsonl'
combo_output_labelled_path = './labelled_data/combo.jsonl'

def write_labelled_doccano_relevance(labelled_path, sample_path, output_path):
    with open(labelled_path) as f, open(sample_path) as f2, open(output_path, 'w+') as f_out:
        labelled = []
        objs = []
        
        for line in f:
            labelled.append(ujson.loads(line))
        for line in f2:
            objs.append(ujson.loads(line))
            
        for i, obj in enumerate(objs):
            if len(labelled[i]['annotations']) == 0:
                obj['label'] = 'relevant'
            else:
                obj['label'] = 'non-relevant'
            f_out.write(ujson.dumps(obj) + '\n')
            
# write_labelled_doccano_relevance(tfidf_labelled_path, tfidf_sample_path, tfidf_output_labelled_path)
# write_labelled_doccano_relevance(w2v_labelled_path, w2v_sample_path, w2v_output_labelled_path)
# write_labelled_doccano_relevance(bm25_labelled_path, bm25_sample_path, bm25_output_labelled_path)
write_labelled_doccano_relevance(combo_labelled_path, combo_sample_path, combo_output_labelled_path)

KeyError: 'annotations'