In [95]:
import json
import ujson
import glob
import time
import multiprocessing
import re
import os
import time
from gensim.models import word2vec, Word2Vec
from gensim.models.word2vec import PathLineSentences
from itertools import repeat
from scipy import spatial
import pandas as pd
import numpy as np
import codecs
from nltk.corpus import stopwords
import math
stopwords = set(stopwords.words('english'))

custom_stopwords = [s.split("'")[0] for s in stopwords]
custom_stopwords = [s for s in custom_stopwords if len(s) > 1]
print(custom_stopwords)

['so', 'himself', 'through', 'should', 'now', 'couldn', 'him', 'all', 'by', 'mustn', 'were', 'do', 'shan', 'their', 'down', 'yourself', 'the', 'out', 'doing', 'will', 'ma', 'themselves', 'needn', 'you', 'no', 'he', 'such', 've', 'while', 'off', 'haven', 'own', 'about', 'up', 'both', 'there', 'my', 'being', 'more', 'mightn', 'again', 'that', 'it', 'hasn', 'are', 'needn', 'during', 'herself', 'wouldn', 'until', 'these', 're', 'mustn', 'your', 'if', 'each', 'does', 'doesn', 'between', 'you', 'his', 'some', 'not', 'haven', 'weren', 'hadn', 'those', 'with', 'has', 'was', 'most', 'why', 'isn', 'am', 'yourselves', 'ours', 'ourselves', 'have', 'wasn', 'who', 'as', 'me', 'what', 'after', 'that', 'any', 'an', 'didn', 'which', 'shan', 'you', 'had', 'couldn', 'before', 'is', 'once', 'where', 'at', 'for', 'they', 'further', 'aren', 'yours', 'into', 'other', 'below', 'myself', 'than', 'but', 'on', 'won', 'against', 'here', 'isn', 'shouldn', 'ain', 'don', 'wouldn', 'from', 'her', 'wasn', 'them', 'her

In [2]:
# load the extracted java-related GitHub data

def load_github_issues():
    with open('./output/issue-comments-revised.jsonl') as issue_comments_f:
        issue_list = []
        comments_list = []

        for line in issue_comments_f:
            obj = ujson.loads(line)

            comments_list.append({
                'body': obj['body'],
                'repo_name': obj['repo_name'],
                'html_url': obj['html_url'],
                'issue_id': obj['issue']['id']
            })

            issue = obj['issue']
            issue['repo_name'] = obj['repo_name']
            issue_list.append(issue)

        issues_df = pd.DataFrame(issue_list)
        issues_df = issues_df.drop_duplicates(subset=['id'])
        comments_df = pd.DataFrame(comments_list)
        
        return (issues_df, comments_df)


issues_df, comments_df = load_github_issues()
print("Number of issues: {}".format(len(issues_df.index)))
print("Number of comments: {}".format(len(comments_df.index)))

Number of issues: 627450
Number of comments: 1855870


In [3]:
# find the issues that contain code blocks
code_issues_df = issues_df.dropna(subset=['body'])
code_issues_df = code_issues_df[code_issues_df['body'].str.contains('```')]
print("Number of issues with code block/s: {}".format(len(code_issues_df.index)))

Number of issues with code block/s: 85318


In [188]:
comments_df.head()

Unnamed: 0,body,html_url,issue_id,repo_name
0,I can reproduce this with the below test snipp...,https://github.com/elastic/elasticsearch/issue...,264716524,elastic/elasticsearch
1,I don't really like the snackbars; I think we ...,https://github.com/ritvikkar/MovieFinder/issue...,285146015,ritvikkar/MovieFinder
2,Just an example to see how to use the `thymel...,https://github.com/jooby-project/jooby/issues/...,284439800,jooby-project/jooby
3,@nedtwigg Would it be practical for you to upg...,https://github.com/diffplug/spotless/issues/18...,285279535,diffplug/spotless
4,fixed,https://github.com/Alex-the-666/Ice_and_Fire/i...,258569936,Alex-the-666/Ice_and_Fire


In [4]:
def preprocess(doc):
    # remove all code blocks
    doc = re.sub(r'```([^```]*)```', '', doc)
            
    # remove urls
    doc = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',             
        '', doc)
    
    # remove line break characters
    doc = re.sub(r'[\r\n]', ' ', doc)
    
    # remove ownership apostrophe
    doc = re.sub(r"'\w |\w' ", ' ', doc)
    
    # remove numbers
    doc = re.sub(r'(\d\.?)+', ' ', doc)
        
    # replace all punctuation except for full stop with space
    doc = re.sub(r'[^A-Za-z\.]', ' ', doc)
        
    # normalise full stops
    doc = re.sub(r'\s\.\.+', '.', doc)
    
    # remove more than 1 whitespace
    doc = re.sub('\s\s+', ' ', doc)
    
    # remove leading and trailing whitespace
    doc = doc.strip()
    
    return doc

def sent_tokenize(doc):
    docs = doc.split('. ')
    docs = [t for t in docs if t != '']
    return docs

def doc_tokenize(doc):
    doc = re.sub('\.', ' ', doc) # remove full stops
    
    tokens = [t.lower() for t in doc.split() if len(t) > 1]
    tokens = [t for t in tokens if t not in custom_stopwords]
    return tokens

def tokenize(sentence):
    sentence = re.sub('\.', ' ', sentence) # remove full stops
    
    tokens = sentence.split()
    tokens = [t.lower() for t in tokens if len(t) > 1]
    tokens = [t for t in tokens if t not in custom_stopwords]
    return tokens

In [5]:
# load the non-dreprecated java doc caveat sentences

def load_caveats():
    caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
    caveats_list = []

    files = glob.glob(caveat_files_dir + '*.json')
    for file in files:
        with open(file) as f:
            arr = ujson.load(f)
            full_class_name = os.path.splitext(os.path.basename(file))[0]
            simple_class_name = full_class_name.split('.')[-1]
            for caveat in arr:
                if not caveat['deprecated'] and 'name' in caveat:
                    for sentence in caveat['sentences']:
                        caveats_list.append({
                            'simple_class_name': simple_class_name,
                            'full_class_name': full_class_name,
                            'api': caveat['name'],
                            'sentence': sentence,
                            'type': 'body'
                        })

                    # add all misc level sentences
                    for misc_obj in caveat['caveat_misc']:
                        if misc_obj['name'] in ['Parameters:', 'Throws:']:
                            for obj in misc_obj['list']:
                                for misc_sentence in obj['sentences']:
                                    caveats_list.append({
                                        'simple_class_name': simple_class_name,
                                        'full_class_name': full_class_name,
                                        'api': caveat['name'],
                                        'sentence': misc_sentence,
                                        'type': 'misc'
                                    })

                        else:
                            for s in misc_obj['list']:
                                caveats_list.append({
                                        'simple_class_name': simple_class_name,
                                        'full_class_name': full_class_name,
                                        'api': caveat['name'],
                                        'sentence': s,
                                        'type': 'misc'
                                    })

    return pd.DataFrame(caveats_list)

caveats = load_caveats()
print('Number of caveat sentences: {}'.format(len(caveats.index)))

Number of caveat sentences: 73831


In [8]:
caveats.head()

Unnamed: 0,api,full_class_name,sentence,simple_class_name,type
0,getBlockedTime,java.lang.management.ThreadInfo,This method returns -1 if thread contention mo...,ThreadInfo,body
1,getBlockedTime,java.lang.management.ThreadInfo,This statistic is reset when the thread conten...,ThreadInfo,body
2,getBlockedTime,java.lang.management.ThreadInfo,the approximate accumulated elapsed time in mi...,ThreadInfo,misc
3,getBlockedTime,java.lang.management.ThreadInfo,if the Java virtual machine does not support t...,ThreadInfo,misc
4,getWaitedTime,java.lang.management.ThreadInfo,This method returns -1 if thread contention mo...,ThreadInfo,body


In [6]:
caveats['preprocessed'] = caveats['sentence'].apply(lambda x: preprocess(x))
caveats['tokens'] = caveats['preprocessed'].map(lambda x: tokenize(x))

In [8]:
# Concatenate text in the issue all comments for each issue
# issue_to_concat_text = {}
# for index, row in code_issues_df.iterrows():
#     assoc_comments_df = comments_df[comments_df['issue_id'] == row['id']]
    
#     concat_text = row['title'] + row['body']
    
#     for j, comment_row in assoc_comments_df.iterrows():
#         concat_text += row['body']
        
#     issue_to_concat_text[row['id']] = concat_text

In [14]:
# Identify the relevant issues for all caveats
# apis = set()
# for caveat in caveats_list:
#     apis.add((caveat['simple_class_name'], caveat['api']))
    
# print('Number of apis: {}'.format(len(apis)))
    
# def get_relevant_issues(api_tuple, issue_to_text_dict):
#     relevant = []
#     for id in issue_to_text_dict:
#         if api_tuple[0] in issue_to_text_dict[id]:
#             if (api_tuple[0] == api_tuple[1]) or api_tuple[1] in issue_to_text_dict[id]:
#                 relevant.append(id)
            
#     return relevant

# p = multiprocessing.Pool(2)
# relevant_issues = p.starmap(get_relevant_issues, zip(get_relevant_issues, repeat(issue_to_concat_text)))

Number of apis: 21942


Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/thien/anaconda2/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlo

In [17]:
# with open('./output/relevant_issues_for_caveats.jsonl', 'w+') as f:
#     for i, api in enumerate(apis):
#         f.write(ujson.dumps({'class': api[0], 'api': api[1], 'issue_ids': relevant_issues[i]}) + '\n')

In [7]:
def tokenize_sentence_list(sent_list):
    return [tokenize(x) for x in sent_list]

def calculate_preprocessed_comment_sentences(df):
    df['preprocessed_comments'] = df['body'].map(lambda x: preprocess(x))
    print('Preprocessed all comments...')
    df['tokenised_para'] = df['preprocessed_comments'].map(lambda x: doc_tokenize(x))
    print('Tokenized all paragraphs...')
    df['sentences'] = df['preprocessed_comments'].map(lambda x: sent_tokenize(x))
    print('Completed sentence tokenization...')
    df['tokenised_sentences'] = df['sentences'].map(lambda x: tokenize_sentence_list(x))
    print('Tokenized all sentences...')
                
# with open('./output/issue-comment-sentences-2.txt', 'w+') as f, \
#     open('./output/associated-sentence-issue-ids-2.txt', 'w+') as f_2:

#     comment_sentences = []
#     sentences = ''
#     associated_issues = ''

#     for i in df.index:
#         issue_id = str(df.loc[i, 'issue_id'])

#         for tokens in df.loc[i, 'tokenised_sentences']:
#             sentence = ' '.join(tokens) # convert to gensim format for PathLineSentences
#             sentences += sentence + '\n'
#             associated_issues += issue_id + '\n'

#     f.write(sentences)
#     f_2.write(associated_issues)
            
calculate_preprocessed_comment_sentences(comments_df)

Preprocessed all comments...
Tokenized all paragraphs...
Completed sentence tokenization...
Tokenized all sentences...


In [63]:
cores = multiprocessing.cpu_count()

start = time.clock()
model = word2vec.Word2Vec(PathLineSentences('./output/issue-comment-sentences-2.txt'), size=100, window=5, min_count=5, workers=cores-1, iter=1, sg=1)
end =time.clock()
training_time=end-start
print('end training and cost ' + str(training_time)+ ' s')

end training and cost 450.77793999999994 s


In [64]:
# save word2vec model
model.save('./output/word2vec.model')
model.wv.save_word2vec_format('./output/word2vec.txt')

In [30]:
# calculate idf
idf = {}
with open('./output/issue-comment-sentences-2.txt') as f:
    start = time.clock()
    lines = f.readlines()
    print('Finished reading sentences from file...')    
    vocab = list(Word2Vec.load('./output/word2vec.model').wv.vocab.keys())
    N = len(lines)
    docs = [sentence.split() for sentence in lines]
    
    for doc in docs:
        for word in set(doc):
            if word not in idf:
                idf[word] = 1
            else:
                idf[word] += 1
    
    for word in idf:
        idf[word] = math.log(N / float(idf[word] + 1))
    
    end = time.clock()
    training_time=end-start
    print('IDF computation time: {}s'.format(training_time))

Finished reading sentences from file...
IDF computation time: 21.712850000000003s


In [148]:
s_avg = 0 # avg doc length
with open('./output/issue-comment-sentences-2.txt','r', encoding='utf-8') as f:
    lines = f.readlines()
    
    doc_lengths = [len(line.split()) for line in lines]
    s_avg = sum(doc_lengths) / len(doc_lengths)
    print("average document length: {}".format(s_avg)) 

# Calculate combination scores of word2vec and bm25
def bm25(doc, s2, idf):
    score = 0
    k1 = 1.5
    b = 0.75

    for w in doc:
        idf_s = idf.get(w, 1)
        bm25_ra = s2.count(w) * (k1 + 1)
        bm25_rb = s2.count(w) + k1 * (1 - b + b * len(s2) / s_avg)
        score += idf_s * (bm25_ra / bm25_rb)
    return score

def compute(s1, s2, voc):
    v2 = np.array([voc[s] for s in s2 if s in voc])
    v2 = v2.sum(axis=0)

    v1 = np.array([voc[s] for s in s1 if s in voc])
    v1 = v1.sum(axis=0)
    return 1 - spatial.distance.cosine(v1, v2)

def cosine(sentences, s2, voc):
    if not sentences:
        sentences = []
    s1_df_score = pd.Series(sentences)
    s1_df_score = s1_df_score.map(lambda x: compute(x, s2, voc))
    if type(s1_df_score.dropna()) != 'NoneType':
        if len(s1_df_score.dropna()) != 0:
            return s1_df_score.sort_values(ascending=False).head(1)
        
def load_voc(file_voc):
    vector_file = codecs.open(file_voc, 'r', encoding='utf-8')
    line = vector_file.readline()
    voc_size, vec_dim = map(int, line.split(' '))
    embedding = dict()
    line = vector_file.readline()
    while line:
        items = line.split(' ')
        item = items[0]
        vec = np.array(items[1:], dtype='float32')
        embedding[item] = vec
        line = vector_file.readline()
    return embedding

average document length: 11.103317980494701


In [140]:
vocab = load_voc('./output/word2vec.txt')
    
relevant_issues = {}
with open('./output/relevant_issues_for_caveats.jsonl') as f:
    for line in f:
        obj = ujson.loads(line)
        relevant_issues[(obj['class'], obj['api'])] = obj['issue_ids']

In [155]:
with open('/media/thien/Data Drive1/combined_sim_results.jsonl', 'w+') as f_out, \
    open('/media/thien/Data Drive1/combined_sim_results_errors.jsonl', 'w+') as f_err:
    
    for i in caveats.index:
        key = (caveats.loc[i,'simple_class_name'], caveats.loc[i,'api'])
        if key in relevant_issues:
            issue_ids = relevant_issues[key]
        
            if len(issue_ids) > 0:
                try:
                    caveat_sent = caveats.loc[i,'tokens']

                    # retrieve issue comment sentences that are relevant
                    relevant_comments_df = comments_df[comments_df['issue_id'].isin(issue_ids)]

                    sim_w2v = relevant_comments_df['tokenised_sentences'].apply(cosine, s2=caveat_sent, voc=vocab)
                    sim_bm25 = relevant_comments_df['tokenised_para'].apply(bm25, s2=caveat_sent, idf=idf)

                    sim_bm25 = (sim_bm25 - sim_bm25.min()) / (sim_bm25.max() - sim_bm25.min())
                    sim_bm25 = pd.to_numeric(sim_bm25, downcast='float')
                    sim_w2v = pd.to_numeric(sim_w2v[0], downcast='float')
                    
                    if len(sim_bm25) != 0 or len(sim_w2v) != 0:
                        combined_sim = 0.5 * sim_bm25.add(0.5 * sim_w2v, fill_value=0)

                        for j in combined_sim.index:
                            f_out.write(ujson.dumps({
                                'issue_number': int(relevant_comments_df.loc[j,'issue_id']),
                                'comment': relevant_comments_df.loc[j,'body'],
                                'caveat_sentence': caveats.loc[i,'sentence'],
                                'api': caveats.loc[i, 'api'],
                                'simple_class_name': caveats.loc[i,'simple_class_name'],
                                'full_class_name': caveats.loc[i, 'full_class_name'],
                                'html_url': relevant_comments_df.loc[j,'html_url'],
                                'score': str(combined_sim[j]),
                                'caveat_sentence_id': i
                            }) + '\n')

                except Exception as e:
                    f_err.write(ujson.dumps({'index': i, 'error': e}) + '\n')