In [20]:
import json
import ujson
import glob
import time
import multiprocessing
import re
import os
import time
from gensim.models import word2vec, Word2Vec
from gensim.models.word2vec import PathLineSentences
from itertools import repeat
from scipy import spatial
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np
import codecs
from nltk.corpus import stopwords
import math
import functools
import operator
from random import sample

# NLTK's English stopword list as a set (this rebinds the imported `stopwords` name).
stopwords = set(stopwords.words('english'))

# For every stopword keep only the text before its first apostrophe and drop
# anything that is a single character (or empty) afterwards.
custom_stopwords = [
    prefix
    for prefix in (word.split("'")[0] for word in stopwords)
    if len(prefix) > 1
]
print(custom_stopwords)

['through', 'shouldn', 'couldn', 'has', 'at', 'too', 'couldn', 'while', 'for', 'any', 'aren', 'an', 're', 'in', 'up', 'whom', 'as', 'yours', 'out', 'do', 'mustn', 'needn', 'myself', 've', 'didn', 'had', 'further', 'other', 'these', 'hasn', 'shan', 'hadn', 'over', 'but', 'just', 'until', 'are', 'which', 'above', 'ma', 'will', 'weren', 'because', 'his', 'won', 'if', 'during', 'doing', 'they', 'so', 'haven', 'mightn', 'itself', 'them', 'yourself', 'have', 'of', 'yourselves', 'this', 'before', 'below', 'more', 'there', 'under', 'did', 'we', 'the', 'wasn', 'few', 'mightn', 'what', 'hasn', 'should', 'most', 'didn', 'ours', 'and', 'our', 'himself', 'she', 'been', 'you', 'once', 'was', 'why', 'against', 'weren', 'theirs', 'their', 'such', 'after', 'its', 'now', 'won', 'when', 'shouldn', 'wouldn', 'to', 'you', 'shan', 'll', 'doesn', 'me', 'him', 'herself', 'nor', 'very', 'her', 'between', 'with', 'be', 'here', 'should', 'down', 'were', 'hadn', 'into', 'on', 'can', 'you', 'themselves', 'no', 'ar

In [5]:
# input paths
# One JSON-lines results file per similarity method; each is read line by line
# by the corresponding sampling cell further down.
tfidf_path = './output/tfidf_results_new.jsonl'
w2v_path = './output/word2vec_results_new.jsonl'
bm25_path = './output/bm25_results_new.jsonl'
combo_path = './output/combined_sim_results_new.jsonl'

# output paths
# Sampled results written by the sampling cells and later passed to
# convert_to_doccano.
tfidf_sample_path = './output/tfidf_sample.jsonl'
w2v_sample_path = './output/w2v_sample.jsonl'
bm25_sample_path = './output/bm25_sample.jsonl'
combo_sample_path = './output/combo_sample_path.jsonl'  # NOTE(review): ends in "_sample_path", unlike the "_sample" siblings above — confirm this file name is intended

In [13]:
def preprocess(doc):
    """Strip markdown and code noise from a comment body, leaving plain text.

    The steps run in this order: fenced code blocks and URLs are removed,
    line breaks become spaces, apostrophe suffixes and numbers are dropped,
    every character that is not an ASCII letter or a full stop becomes a
    space, and runs of whitespace are collapsed.

    Args:
        doc (str): raw comment text.

    Returns:
        str: cleaned text made only of ASCII letters, full stops and single
        spaces, without leading or trailing whitespace.
    """
    # remove all fenced code blocks; non-greedy matching with DOTALL removes a
    # block even when it spans several lines or contains backticks itself
    # (the previous character class `[^```]` stopped at the first backtick)
    doc = re.sub(r'```.*?```', '', doc, flags=re.DOTALL)

    # remove urls
    doc = re.sub(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',
        '', doc)

    # remove line break characters
    doc = re.sub(r'[\r\n]', ' ', doc)

    # remove apostrophes/suffixes
    doc = re.sub(r"'\w |\w' ", ' ', doc)

    # remove numbers
    doc = re.sub(r'(\d\.?)+', ' ', doc)

    # replace all punctuation except for full stop with space
    doc = re.sub(r'[^A-Za-z\.]', ' ', doc)

    # normalise full stops
    doc = re.sub(r'\s\.\.+', '.', doc)

    # remove more than 1 whitespace (raw string: '\s' is not a valid str escape)
    doc = re.sub(r'\s\s+', ' ', doc)

    # remove leading and trailing whitespace
    return doc.strip()

def sent_tokenize(doc):
    """Split text into sentence fragments on the '. ' delimiter.

    Empty fragments are discarded. The delimiter is consumed, so only a full
    stop that is not followed by a space stays attached to its fragment.
    """
    return [fragment for fragment in doc.split('. ') if fragment]

def doc_tokenize(doc):
    """Turn a preprocessed document into a flat list of content tokens.

    Full stops act as separators, tokens are lower-cased, and tokens that are
    a single character or appear in `custom_stopwords` are dropped.

    Args:
        doc (str): text to tokenise.

    Returns:
        list[str]: the remaining lower-cased tokens, in document order.
    """
    # A full stop is a plain character, so a literal replace is enough and
    # avoids the invalid '\.' escape the non-raw regex string relied on.
    words = doc.replace('.', ' ').split()

    tokens = [t.lower() for t in words if len(t) > 1]
    return [t for t in tokens if t not in custom_stopwords]

def tokenize(sentence):
    """Turn one sentence into a list of content tokens.

    Full stops act as separators, tokens are lower-cased, and tokens that are
    a single character or appear in `custom_stopwords` are dropped.

    Args:
        sentence (str): text to tokenise.

    Returns:
        list[str]: the remaining lower-cased tokens, in sentence order.
    """
    # A full stop is a plain character, so a literal replace is enough and
    # avoids the invalid '\.' escape the non-raw regex string relied on.
    words = sentence.replace('.', ' ').split()

    tokens = [t.lower() for t in words if len(t) > 1]
    return [t for t in tokens if t not in custom_stopwords]

In [14]:
# load the extracted java-related GitHub data
def load_github_issues(path='./output/issue-comments-revised.jsonl'):
    """Load issue comments from a JSON-lines file into two DataFrames.

    Every non-blank line is parsed as one comment object that carries `body`,
    `repo_name`, `html_url` and a nested `issue` object with an `id`.

    Args:
        path (str): JSON-lines file to read. Defaults to the location that
            was previously hard-coded.

    Returns:
        tuple: `(issues_df, comments_df)`. `issues_df` has one row per
        distinct issue `id`, built from the nested issue objects with
        `repo_name` added. `comments_df` has one row per comment with the
        columns `body`, `repo_name`, `html_url` and `issue_id`.
    """
    issue_list = []
    comments_list = []

    # Be explicit about the encoding rather than depend on the platform locale.
    with open(path, encoding='utf-8') as issue_comments_f:
        for line in issue_comments_f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline

            obj = ujson.loads(line)
            issue = obj['issue']

            comments_list.append({
                'body': obj['body'],
                'repo_name': obj['repo_name'],
                'html_url': obj['html_url'],
                'issue_id': issue['id']
            })

            # the nested issue dict is extended in place with its repo name
            issue['repo_name'] = obj['repo_name']
            issue_list.append(issue)

    # The same issue appears once per comment, so keep a single row per id.
    issues_df = pd.DataFrame(issue_list).drop_duplicates(subset=['id'])
    comments_df = pd.DataFrame(comments_list)

    return (issues_df, comments_df)


# Load both frames once and report how many rows each one holds.
issues_df, comments_df = load_github_issues()
print("Number of issues: {}".format(issues_df.shape[0]))
print("Number of comments: {}".format(comments_df.shape[0]))

Number of issues: 627450
Number of comments: 1855870


In [15]:
# Issues that have a body containing a markdown code fence.
code_issues_df = (
    issues_df
    .dropna(subset=['body'])
    .loc[lambda frame: frame['body'].str.contains('```')]
)
# Comments belonging to those issues; copied so that columns can be added
# without touching comments_df.
block_comments_df = comments_df.loc[comments_df['issue_id'].isin(code_issues_df['id'])].copy()

In [16]:
def tokenize_sentence_list(sent_list):
    """Apply `tokenize` to each sentence, returning one token list per sentence."""
    tokenised = []
    for sentence in sent_list:
        tokenised.append(tokenize(sentence))
    return tokenised

def calculate_preprocessed_comment_sentences(df):
    """Add the preprocessed and tokenised text columns to `df` in place.

    Each stage maps one existing column through a helper and stores the
    result in a new column, printing a progress message afterwards:

    - `preprocessed_comments`: `preprocess` applied to `body`
    - `tokenised_para`: `doc_tokenize` applied to `preprocessed_comments`
    - `sentences`: `sent_tokenize` applied to `preprocessed_comments`
    - `tokenised_sentences`: `tokenize_sentence_list` applied to `sentences`

    Nothing is returned; the caller's frame is modified directly.
    """
    stages = (
        ('preprocessed_comments', 'body', preprocess,
         'Preprocessed all comments...'),
        ('tokenised_para', 'preprocessed_comments', doc_tokenize,
         'Tokenized all paragraphs...'),
        ('sentences', 'preprocessed_comments', sent_tokenize,
         'Completed sentence tokenization...'),
        ('tokenised_sentences', 'sentences', tokenize_sentence_list,
         'Tokenized all sentences...'),
    )

    for target_column, source_column, transform, message in stages:
        df[target_column] = df[source_column].map(transform)
        print(message)
                            
# Adds the derived text columns to block_comments_df in place (the function returns nothing).
calculate_preprocessed_comment_sentences(block_comments_df)

Preprocessed all comments...
Tokenized all paragraphs...
Completed sentence tokenization...
Tokenized all sentences...


In [17]:
# Get all comments that contain non-empty preprocessed sentences.
# Only the 'tokenised_sentences' column is checked: testing each list's length
# avoids converting every column of the frame to strings just to compare one
# of them against '[]'.
preprocessed_comments_df = block_comments_df[block_comments_df['tokenised_sentences'].map(len) > 0]

In [64]:
# Preview the first rows of the issues table.
issues_df.head()

Unnamed: 0,body,html_url,id,number,repo_name,title
0,Elasticsearch 5.4.1\r\nRollover API problem\r\...,https://github.com/elastic/elasticsearch/issue...,264716524,26976,elastic/elasticsearch,Alias [test-schema-active-logs] has more than ...
1,A snackbar shows and then gets hidden by the b...,https://github.com/ritvikkar/MovieFinder/issue...,285146015,43,ritvikkar/MovieFinder,We need to make offline mode look better
2,https://github.com/jooby-project/jooby/blob/ma...,https://github.com/jooby-project/jooby/issues/965,284439800,965,jooby-project/jooby,2nd thymeleaf code snippet (in the documentati...
3,When updating Spotless from `3.6.0` to `3.7.0`...,https://github.com/diffplug/spotless/issues/182,285279535,182,diffplug/spotless,Unable to store input properties... when upgra...
4,https://pastebin.com/bT3JGcjN,https://github.com/Alex-the-666/Ice_and_Fire/i...,258569936,321,Alex-the-666/Ice_and_Fire,Hippogryph loot table might be broken


In [63]:
# Preview the first rows of the comments that kept at least one sentence after preprocessing.
preprocessed_comments_df.head()

Unnamed: 0,body,html_url,issue_id,repo_name,preprocessed_comments,tokenised_para,sentences,tokenised_sentences
2,Just an example to see how to use the `thymel...,https://github.com/jooby-project/jooby/issues/...,284439800,jooby-project/jooby,Just an example to see how to use the thymelea...,"[example, see, use, thymeleaf, api, need, test...",[Just an example to see how to use the thymele...,"[[example, see, use, thymeleaf, api, need, tes..."
3,@nedtwigg Would it be practical for you to upg...,https://github.com/diffplug/spotless/issues/18...,285279535,diffplug/spotless,nedtwigg Would it be practical for you to upgr...,"[nedtwigg, would, practical, upgrade, propriet...",[nedtwigg Would it be practical for you to upg...,"[[nedtwigg, would, practical, upgrade, proprie..."
7,"Nope - 4.4.1 brings a different, unrelated pro...",https://github.com/diffplug/spotless/issues/18...,285279535,diffplug/spotless,Nope brings a different unrelated problem,"[nope, brings, different, unrelated, problem]",[Nope brings a different unrelated problem],"[[nope, brings, different, unrelated, problem]]"
10,`Objects.requireNonNull` should also be suppor...,https://github.com/uber/NullAway/issues/47#iss...,268767363,uber/NullAway,Objects.requireNonNull should also be supported.,"[objects, requirenonnull, also, supported]",[Objects.requireNonNull should also be support...,"[[objects, requirenonnull, also, supported]]"
14,"Darn, that's a shame. :<\r\n\r\nNothing comes ...",https://github.com/diffplug/spotless/issues/18...,285279535,diffplug/spotless,Darn that a shame. Nothing comes to mind yet b...,"[darn, shame, nothing, comes, mind, yet, might...","[Darn that a shame, Nothing comes to mind yet ...","[[darn, shame], [nothing, comes, mind, yet, mi..."


In [18]:
# Map each sentence position (0-based line number in the index file) to the
# integer stored on that line, which is later used as a DataFrame index label.
sentence_index_to_df_index = {}
with open('./output/associated-sentence-df-index-new.txt') as f:
    indices = [int(raw) for raw in f]
    sentence_index_to_df_index.update(enumerate(indices))

In [27]:
# TF-IDF
with open(tfidf_path) as f, open(tfidf_sample_path, 'w+') as f_out:
    results = []
    c = 0
    for line in f:
        obj = ujson.loads(line)
        scores = obj['tfidf_sim_scores']
        # best score first
        scores.sort(key=lambda entry: entry['score'], reverse=True)
        c += len(scores)
        obj['tfidf_sim_scores'] = scores[:3]  # limit to 3 results per caveat
        results.append(obj)

    print('Number of results: {}'.format(len(results)))
    print('Total number of scores: {}'.format(c))
    # random subset of caveats whose top results will be labelled
    to_label = sample(results, 384)

    out_lines = []
    for obj in to_label:
        for res in obj['tfidf_sim_scores']:
            # translate the sentence position into the comment row it came from
            df_index = sentence_index_to_df_index[res['sentence_index']]
            row = preprocessed_comments_df.loc[df_index]
            out_lines.append(ujson.dumps({
                'score': res['score'],
                'caveat': obj['caveat_sentence'],
                'api': obj['api'],
                'class': obj['simple_class_name'],
                'comment': row['body'],
                'html_url': row['html_url']
            }))

    # one JSON object per line, without a trailing newline
    f_out.write('\n'.join(out_lines))

Number of results: 19101
Total number of scores: 2430521


In [32]:
# W2V
# Build the issue id -> html_url lookup once; filtering issues_df with a
# boolean mask for every sampled result rescans the whole frame each time.
html_url_by_issue_id = dict(zip(issues_df['id'], issues_df['html_url']))

with open(w2v_path) as f, open(w2v_sample_path, 'w') as f_out:
    results = []
    c = 0

    for line in f:
        obj = ujson.loads(line)
        # scores are compared as text here; 'nan' entries are discarded
        scored = [x for x in obj['w2v_results'] if x['score'] != 'nan']
        c += len(scored)

        for entry in scored:
            entry['score'] = float(entry['score'])

        scored.sort(key=lambda x: x['score'], reverse=True)
        obj['w2v_results'] = scored[:3]  # limit to 3 results per caveat
        results.append(obj)

    print('Number of results: {}'.format(len(results)))
    print('Total number of scores: {}'.format(c))
    # NOTE(review): no seed is set in this cell, so re-running it draws a
    # different subset of caveats.
    to_label = sample(results, 384)

    out_lines = []
    for obj in to_label:
        for res in obj['w2v_results']:
            # NOTE(review): 'issue_number' is matched against the issue 'id'
            # column, not 'number' — presumably it stores the id; confirm.
            # A value with no matching issue raises KeyError.
            out_lines.append(ujson.dumps({
                'score': res['score'],
                'comment': res['comment'],
                'class': obj['simple_class_name'],
                'api': obj['api'],
                'caveat': obj['caveat_sentence'],
                'html_url': html_url_by_issue_id[res['issue_number']]
            }))

    # one JSON object per line, without a trailing newline
    f_out.write('\n'.join(out_lines))
        

Number of results: 2000
Total number of scores: 433237


In [33]:
# BM25
# Build the issue id -> html_url lookup once; filtering issues_df with a
# boolean mask for every sampled result rescans the whole frame each time.
html_url_by_issue_id = dict(zip(issues_df['id'], issues_df['html_url']))

with open(bm25_path) as f, open(bm25_sample_path, 'w') as f_out:
    results = []
    c = 0

    for line in f:
        obj = ujson.loads(line)
        # scores are compared as text here; 'nan' entries are discarded
        scored = [x for x in obj['bm25_results'] if x['score'] != 'nan']
        c += len(scored)

        for entry in scored:
            entry['score'] = float(entry['score'])

        scored.sort(key=lambda x: x['score'], reverse=True)
        obj['bm25_results'] = scored[:3]  # limit to 3 results per caveat
        results.append(obj)

    print('Number of results: {}'.format(len(results)))
    print('Total number of scores: {}'.format(c))
    # NOTE(review): no seed is set in this cell, so re-running it draws a
    # different subset of caveats.
    to_label = sample(results, 384)

    out_lines = []
    for obj in to_label:
        for res in obj['bm25_results']:
            # NOTE(review): 'issue_number' is matched against the issue 'id'
            # column, not 'number' — presumably it stores the id; confirm.
            # A value with no matching issue raises KeyError.
            out_lines.append(ujson.dumps({
                'score': res['score'],
                'comment': res['comment'],
                'class': obj['simple_class_name'],
                'api': obj['api'],
                'caveat': obj['caveat_sentence'],
                'html_url': html_url_by_issue_id[res['issue_number']]
            }))

    # one JSON object per line, without a trailing newline
    f_out.write('\n'.join(out_lines))

Number of results: 2000
Total number of scores: 677202


In [34]:
# W2V + BM25 combination
# Build the issue id -> html_url lookup once; filtering issues_df with a
# boolean mask for every sampled result rescans the whole frame each time.
html_url_by_issue_id = dict(zip(issues_df['id'], issues_df['html_url']))

with open(combo_path) as f, open(combo_sample_path, 'w') as f_out:
    results = []
    c = 0

    for line in f:
        obj = ujson.loads(line)
        # scores are compared as text here; 'nan' entries are discarded
        scored = [x for x in obj['combination_results'] if x['score'] != 'nan']
        c += len(scored)

        for entry in scored:
            entry['score'] = float(entry['score'])

        scored.sort(key=lambda x: x['score'], reverse=True)
        obj['combination_results'] = scored[:3]  # limit to 3 results per caveat
        results.append(obj)

    print('Number of results: {}'.format(len(results)))
    print('Total number of scores: {}'.format(c))
    # NOTE(review): no seed is set in this cell, so re-running it draws a
    # different subset of caveats.
    to_label = sample(results, 384)

    out_lines = []
    for obj in to_label:
        for res in obj['combination_results']:
            # NOTE(review): 'issue_number' is matched against the issue 'id'
            # column, not 'number' — presumably it stores the id; confirm.
            # A value with no matching issue raises KeyError.
            out_lines.append(ujson.dumps({
                'score': res['score'],
                'comment': res['comment'],
                'class': obj['simple_class_name'],
                'api': obj['api'],
                'caveat': obj['caveat_sentence'],
                'html_url': html_url_by_issue_id[res['issue_number']]
            }))

    # one JSON object per line, without a trailing newline
    f_out.write('\n'.join(out_lines))

Number of results: 2000
Total number of scores: 681187


In [37]:
# convert to doccano format for labelling
def convert_to_doccano(path, output_path):
    """Rewrite a sample file with the extra fields used for labelling.

    Every JSON line read from `path` gets a `labels` list preset to
    `['not-relevant']` and a `text` field combining the record's class, API,
    caveat and comment (with fenced code blocks removed from the comment).
    The extended record is written as one JSON line to `output_path`.

    Args:
        path (str): JSON-lines file whose records have the keys `class`,
            `api`, `caveat` and `comment`.
        output_path (str): file to write; overwritten if it exists.
    """
    with open(path) as f, open(output_path, 'w') as f_out:
        for line in f:
            obj = ujson.loads(line)
            obj['labels'] = ['not-relevant']

            # Non-greedy matching with DOTALL removes a fenced block even when
            # it spans several lines or contains backticks itself (the
            # previous character class `[^```]` stopped at the first backtick).
            comment_text = re.sub(r'```.*?```', '', obj['comment'], flags=re.DOTALL)

            obj['text'] = (
                'Class: ' + obj['class']
                + '\nAPI: ' + obj['api']
                + '\n--------\nCaveat: ' + obj['caveat']
                + '\n--------\nComment: ' + comment_text
            )
            f_out.write(ujson.dumps(obj) + '\n')

# Convert every sample file into its to-label counterpart.
for sample_file, to_label_file in (
    (tfidf_sample_path, './output/tfidf_to_label.jsonl'),
    (w2v_sample_path, './output/w2v_to_label.jsonl'),
    (bm25_sample_path, './output/bm25_to_label.jsonl'),
    (combo_sample_path, './output/combo_to_label.jsonl'),
):
    convert_to_doccano(sample_file, to_label_file)

In [54]:
def write_modified_labelled_doccano(labelled_path, metadata_path, original_path, output_path):
    """Attach the chosen label name to each to-label record and write it out.

    The metadata file (a JSON array of objects with `id` and `text`) gives the
    label id -> label name mapping. For the i-th line of `original_path`, the
    `labels` field is removed and `label` is set to the name of the first
    annotation found on the i-th line of `labelled_path`.

    NOTE(review): records are paired purely by line position, which assumes
    the labelled export keeps the order of the original file — confirm.

    Args:
        labelled_path (str): JSON-lines export whose records carry an
            `annotations` list of objects with a `label` id.
        metadata_path (str): JSON file describing the labels.
        original_path (str): JSON-lines file that was sent for labelling.
        output_path (str): JSON-lines file to write; overwritten if present.
    """
    with open(labelled_path) as labelled_f, \
            open(original_path) as original_f, \
            open(metadata_path) as metadata_f:
        label_id_to_name = {
            entry['id']: entry['text'] for entry in ujson.load(metadata_f)
        }
        labelled_objs = [ujson.loads(raw) for raw in labelled_f]

        with open(output_path, 'w+') as f_out:
            for position, raw in enumerate(original_f):
                record = ujson.loads(raw)
                del record['labels']

                # only the first annotation of the matching record is used
                first_annotation = labelled_objs[position]['annotations'][0]
                record['label'] = label_id_to_name[first_annotation['label']]

                f_out.write(ujson.dumps(record) + '\n')

In [55]:
# Merge the labelled export for the combined method back into its records.
write_modified_labelled_doccano(
    labelled_path='./output/labelled_combo.jsonl',
    metadata_path='./output/combo_metadata.json',
    original_path='./output/combo_to_label.jsonl',
    output_path='./labelled_data/combo.jsonl',
)