In [None]:
import ujson
import glob
import time
import multiprocessing
import re
import os
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import time
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences
from itertools import repeat
from scipy import spatial
import pandas as pd
import numpy as np
import codecs

In [2]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [7]:
# load the extracted java-related GitHub data
issue_dict = {}

with open('./output/issue-comments-revised.jsonl') as issue_comments_f:

    for line in issue_comments_f:
        obj = ujson.loads(line)
        id = obj['issue']['id']
        
        if id not in issue_dict:
            issue_dict[id] = (obj['issue'], [])
            
        issue_dict[id][1].append(obj['body'])

In [14]:
# find issues with code blocks
issue_numbers_with_code_blocks = []

for key in issue_dict:
    issue = issue_dict[key][0]
    
    if issue['body'] and '```' in issue['body']:
        issue_numbers_with_code_blocks.append(key)

In [29]:
# load the java document caveats
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
caveats_dict = {}
class_to_caveat_ids = {}

files = glob.glob(caveat_files_dir + '*.json')
for file in files:
    with open(file) as f:
        arr = ujson.load(f)
        full_class_name = os.path.splitext(os.path.basename(file))[0]
        simple_class_name = full_class_name.split('.')[-1]
        for caveat in arr:
            caveats_dict[caveat['id']] = caveat
            
            if simple_class_name not in class_to_caveat_ids:
                class_to_caveat_ids[simple_class_name] = []
            class_to_caveat_ids[simple_class_name].append(caveat['id'])

In [196]:
# identify the relevant issues for all caveats
caveat_id_to_issue_number = {}
    
for key in class_to_caveat_ids:        
    for number in issue_numbers_with_code_blocks:
        obj = issue_dict[number]
    
        issue = obj[0]
        comments = obj[1]
    
        # concatenate all text associated with the issue
        concatenated_str = issue['title'] + issue['body']
        for comment in comments:
            concatenated_str += comment

        # check if the issue or one of its comments contains the class name as a string
        if key in concatenated_str: 

            # check each api associated with the class
            for caveat_id in class_to_caveat_ids[key]:
                caveat = caveats_dict[caveat_id]

                if 'name' in caveat and caveat['name'] in concatenated_str:

                    if caveat_id not in caveat_id_to_issue_number:
                        caveat_id_to_issue_number[caveat_id] = []
                    caveat_id_to_issue_number[caveat_id].append(number)   
                    
# write the relevant caveats for all issues to file to avoid recomputation
print(len(issue_number_to_caveat_ids))
with open('./output/relevant_issues_for_caveats.jsonl', 'w+') as f:
    for key in caveat_id_to_issue_number:    
        f.write(ujson.dumps({'caveat_id': key, 
                             'caveats': caveat_id_to_issue_number[key]}) + '\n')

KeyboardInterrupt: 

In [205]:
with open('./output/relevant_issues_for_caveats_2.jsonl') as f:
    caveat_id_to_issue_number = {}
    
    for line in f:
        obj = ujson.loads(line)
        caveat_id_to_issue_number[obj['caveat_id']] = obj['issues']

In [44]:
def preprocess(doc):
    if not doc:
        return []
    
    # remove leading/ending white spaces
    doc = doc.strip()
        
    # remove all inline code or code blocks
    doc = re.sub(r'```([^```]*)```', '', doc)
    doc = re.sub(r'`([^`]*)`', '', doc)
    
    # remove more than 1 whitespace
    doc = re.sub(' +', ' ', doc)
    
    # lowercase
    doc = doc.lower()

    return sent_tokenize(doc)

In [172]:
def tokenize(sentence):   
    # remove url links
    sentence = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',             
        '', sentence)
    
    # remove paths
    sentence = re.sub(r'/[^/]*(/[^/]*)*/?', '', sentence)
    
    # remove punctuation except for single space
    sentence = re.sub(r'[^A-Za-z ]', '', sentence)
         
    # return a list of the stemmed, tokenized words that are not stop words
    return [stemmer.stem(i) for i in word_tokenize(sentence) if i not in stop_words and len(i) <= 20]
    

In [191]:
# Extract issue comment sentences from all issues with code blocks
with open('./output/issue-comment-sentences.txt', 'w+') as f, \
        open('./output/sent_to_issue_number.txt', 'w+') as f_sent_to_num:
    for number in issue_numbers_with_code_blocks:
        comments = issue_dict[number][1]
        
        for comment in comments:
            sentences = preprocess(comment)
            
            for sentence in sentences:
                sentence = ' '.join(tokenize(sentence))
                if len(sentence) > 0:
                    f.write(sentence + '\n')
                    f_sent_to_num.write(str(number) + '\n')

In [58]:
cores = multiprocessing.cpu_count()

start = time.clock()
model = word2vec.Word2Vec(PathLineSentences('./output/issue-comment-sentences.txt'), size=100, window=5, min_count=5, workers=cores-1, iter=1, sg=1)
end =time.clock()
training_time=end-start
print('end training and cost ' + str(training_time)+ ' s')

end training and cost 39.47280299999966 s


In [59]:
# save word2vec model
model.save('./output/word2vec.model')
model.wv.save_word2vec_format('./output/word2vec.txt')

In [150]:
def compute_doc_counts(word, docs):
    
    c = 0
    for doc in docs:
        doc = doc.strip().split(' ')
        if word in doc:
            c += 1
            
    return c

def compute_idf(docs, vocab):
    N = len(docs)
    p = multiprocessing.Pool(cores - 1)
    counts = p.starmap(compute_doc_counts, zip(vocab, repeat(docs)))
    
    idf = {}
    for i, word in enumerate(vocab):
        idf[word] = math.log(N / float(counts[i] + 1))
    
    return idf

In [151]:
with open('./output/issue-comment-sentences.txt','r', encoding='utf-8') as f:
    lines = f.readlines()
    start = time.clock()
    
    vocab = Word2vec.load('./output/word2vec.model').wv.keys()
    idf = compute_idf(lines, vocab)
        
    end = time.clock()
    training_time=end-start
    print('IDF computation time: {}s'.format(training_time))
    
    with open('./output/idf.json', 'w+') as f:
        ujson.dump(idf, f)

IDF computation time: 72.48210000000017s


Process ForkPoolWorker-809:
Process ForkPoolWorker-810:
Process ForkPoolWorker-754:
Process ForkPoolWorker-824:
Process ForkPoolWorker-814:
Process ForkPoolWorker-827:
Process ForkPoolWorker-763:
Process ForkPoolWorker-822:
Process ForkPoolWorker-830:
Process ForkPoolWorker-818:
Process ForkPoolWorker-806:
Process ForkPoolWorker-828:
Process ForkPoolWorker-777:
Process ForkPoolWorker-802:
Process ForkPoolWorker-825:
Process ForkPoolWorker-807:
Process ForkPoolWorker-799:
Process ForkPoolWorker-832:
Process ForkPoolWorker-775:
Process ForkPoolWorker-758:
Process ForkPoolWorker-768:
Process ForkPoolWorker-801:
Process ForkPoolWorker-805:
Process ForkPoolWorker-815:
Process ForkPoolWorker-753:
Process ForkPoolWorker-804:
Process ForkPoolWorker-823:
Process ForkPoolWorker-800:
Process ForkPoolWorker-757:
Process ForkPoolWorker-796:
Process ForkPoolWorker-784:
Process ForkPoolWorker-788:
Process ForkPoolWorker-831:
Traceback (most recent call last):
Process ForkPoolWorker-790:
Process ForkP

  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/p

  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 25

  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  F

  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self.

  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:

  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/e

  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/thien/anaconda2/envs/nlp/lib/py

In [282]:
s_avg = 0 # avg doc length
with open('./output/issue-comment-sentences.txt','r', encoding='utf-8') as f:
    lines = f.readlines()
    
    doc_lengths = [len(line.split(' ')) for line in lines]
    s_avg = sum(doc_lengths) / len(doc_lengths)
    

# Calculate combination scores of word2vec and bm25
def bm25(s1, s2, idf):
    bm25 = 0
    k1 = 1.5
    b = 0.75
    
    s1 = s1
    s2 = s2

    for w in s1:
        idf_s = idf.get(w, 1)
        bm25_ra = s2.count(w) * (k1 + 1)
        bm25_rb = s2.count(w) + k1 * (1 - b + b * len(s2) / s_avg)
        bm25 += idf_s * (bm25_ra / bm25_rb)
    return bm25

def compute(s1, s2, voc):
    v2 = np.array([voc[s] for s in s2 if s in voc])
    v2 = v2.sum(axis=0)
    
    v1 = np.array([voc[s] for s in s1 if s in voc])
    v1 = v1.sum(axis=0)
    sim = 1 - spatial.distance.cosine(v1, v2)
    score = sim
    return score

def cosine(s1, s2, voc):
    s1_df_score = pd.Series(s1)
    s1_df_score = s1_df_score.apply(compute, s2=s2, voc=voc)
    if type(s1_df_score.dropna()) != 'NoneType':
        if len(s1_df_score.dropna()) != 0:
            return s1_df_score.sort_values(ascending=False).head(1)
        
def load_voc(file_voc):
    vector_file = codecs.open(file_voc, 'r', encoding='utf-8')
    line = vector_file.readline()
    voc_size, vec_dim = map(int, line.split(' '))
    embedding = dict()
    line = vector_file.readline()
    while line:
        items = line.split(' ')
        item = items[0]
        vec = np.array(items[1:], dtype='float32')
        embedding[item] = vec
        line = vector_file.readline()
    return embedding

In [227]:
def caveat_preprocess(sentence):
    # remove more than 1 whitespace
    sentence = re.sub(' +', ' ', sentence)
    
    # lowercase
    sentence = sentence.lower()
    
    return sentence

# preprocess the caveat strings into tokens
caveat_id_to_tokenized_sents = {}

for id in caveat_id_to_issue_number:
    if id not in caveat_id_to_tokenized_sents:
        caveat = caveats_dict[id]

        sents = []

        for sentence in caveat['sentences']:
            sentence = caveat_preprocess(sentence)
            sentence = tokenize(sentence)

            sents.append(sentence)

        for misc in caveat['caveat_misc']:
            if misc['name'] in ['Parameters:', 'Throws:']:
                for obj in misc['list']:
                    for misc_sent in obj['sentences']:
                        misc_sent = caveat_preprocess(misc_sent)
                        misc_sent = tokenize(misc_sent)

                        sents.append(misc_sent)
            else:
                for s in misc['list']:
                    s = caveat_preprocess(s)
                    s = tokenize(s)
                    sents.append(s)
            
        caveat_id_to_tokenized_sents[id] = sents

In [228]:
print(list(caveat_id_to_issue_number.keys())[:10])

[47107, 46884, 46885, 33471, 832, 833, 834, 835, 836, 837]


In [229]:
print(list(caveat_id_to_tokenized_sents.keys())[:10])
print(caveat_id_to_tokenized_sents[47107])

[47107, 46884, 46885, 33471, 832, 833, 834, 835, 836, 837]
[['possibl', 'null', 'object', 'bound'], ['possibl', 'null', 'attribut', 'bound']]


In [None]:
vocab = load_voc('./output/word2vec.txt')
topK = 10

with open('./output/idf.json') as f:
    idf = ujson.load(f)

corr_issue = [] # issue number for the ith sentence
with open('./output/sent_to_issue_number.txt') as f:
    for line in f:
        corr_issue.append(int(line.strip()))
    
with open('./output/issue-comment-sentences.txt') as f:
    github_sents = f.readlines()
    github_sents = [s.strip().split(' ') for s in github_sents]
    
relevant_issue_sentences_dict = {}
for id in caveat_id_to_issue_number:
    relevant_sents = []
    for i, other_issue_num in enumerate(corr_issue):
        if other_issue_num in issue_numbers:
            relevant_sents.append((i, other_issue_num))
            
    relevant_issue_sentences_dict[id] = relevant_sents

In [285]:
with open('/media/thien/Data Drive1/combined_sim_results.jsonl', 'w+') as f_out, \
    open('/media/thien/Data Drive1/combined_sim_results_errors.jsonl', 'a+') as f_err:
        
    # Compute similarity for each caveat sentence and GitHub issue comment sentence
    for id in caveat_id_to_issue_number:
        issue_numbers = caveat_id_to_issue_number[id]
        
        # retrieve the sentences relevant to the api
        relevant_issue_sentences = relevant_issue_sentences_dict[id]
        
        issue_sents = [github_sents[i] for i, num in relevant_issue_sentences]
        issue_nums = [num for i, num in relevant_issue_sentences]
        df = pd.DataFrame(dict(sentence=issue_sents, issue_number=issue_nums))
        
        for caveat_sent in caveat_id_to_tokenized_sents[id]:
            try:
                sim_w2v = df['sentence'].apply(cosine, s2=caveat_sent, voc=vocab)
                sim_bm25 = df['sentence'].apply(bm25, s2=caveat_sent, idf=idf)

                sim_bm25 = (sim_bm25 - sim_bm25.min()) / \
                    (sim_bm25.max() - sim_bm25.min())
            
                sim_bm25 = pd.to_numeric(sim_bm25, downcast='float')
                sim_w2v = pd.to_numeric(sim_w2v[0], downcast='float')

                if len(sim_bm25) != 0 or len(sim_w2v) != 0:
                    combined_sim = 0.5 * sim_bm25.add(0.5 * sim_w2v, fill_value=0)

                    for index, value in combined_sim.items():
                        print(index)
                        print(value)
                        f_out.write(ujson.dumps({
                            'issue_number': df.iloc[[index]]['issue_number'],
                            'issue_sent': df.iloc[[index]]['sentence'],
                            'caveat_sent': caveat_sent,
                            'caveat_id': id,
                            'api': caveats_dict[id]['name'],
                            'score': value
                        }))
                        
            except Exception as e:
                print(e)
                f_err.write(ujson.dumps({'id': id, 'sent': caveat_sent}) + '\n')

KeyboardInterrupt: 