In [1]:
import json
import nltk
import glob
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

caveat_files_dir = '/media/thien/Data Drive1/api_12_caveat_sentences/'

In [2]:
# load the extracted java-related GitHub data
issues = []
issue_comments = []
pull_requests = []
pull_request_comments = []

with open('./output/java-2018-issues.json') as issues_f, \
    open('./output/java-2018-issue-comments.json') as issue_comments_f,\
    open('./output/java-2018-pull-requests.json') as pull_requests_f,\
    open('./output/java-2018-pull-request-comments.json') as pull_request_comments_f:
    issues = json.load(issues_f)
    issue_comments = json.load(issue_comments_f)
    pull_requests = json.load(pull_requests_f)
    pull_request_comments = json.load(pull_request_comments_f)

caveats_dict = {}

files = glob.glob(caveat_files_dir + '*.json')
for file in files:
    with open(file) as f:
        arr = json.load(f)
        name = os.path.basename(file)
        
        caveats_dict[name] = arr

In [3]:
# Print number of GitHub data
print('Number of issues: {}'.format(len(issues)))
print('Number of issue comments: {}'.format(len(issue_comments)))
print('Number of pull requests: {}'.format(len(pull_requests)))
print('Number of pull request comments: {}'.format(len(pull_request_comments)))

Number of issues: 427886
Number of issue comments: 1659881
Number of pull requests: 413899
Number of pull request comments: 551345


In [4]:
def preprocess(doc):
    if not doc:
        return []
    
    # remove leading/ending white spaces
    doc = doc.strip()
        
    # remove all inline code or code blocks
    doc = re.sub(r'```([^```]*)```', '', doc)
    doc = re.sub(r'`([^`]*)`', '', doc)
    
    # remove more than 1 whitespace
    doc = re.sub(' +', ' ', doc)
    
    # lowercase
    doc = doc.lower()

    return sent_tokenize(doc)

In [5]:
def tokenize(sentence):   
    # remove url links
    sentence = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',             
        '', sentence)
    
    # remove paths
    sentence = re.sub(r'/[^/]*(/[^/]*)*/?', '', sentence)
    
    # remove punctuation except for single space
    sentence = re.sub(r'[^A-Za-z_ ]', '', sentence)
         
    # return a list of the stemmed, tokenized words that are not stop words
    return [stemmer.stem(i) for i in word_tokenize(sentence) if i not in stop_words]
    

In [46]:
sentences = []
for obj in issues:
    if obj['body']:
        for sentence in preprocess(obj['body']):
            sentences.append(tokenize(sentence))

In [83]:
print(len(sentences))

cv = CountVectorizer(
    lowercase=False,
    tokenizer=lambda x: x,
    binary=True,
    min_df=0.001
)  

x = cv.fit_transform(sentences)

1058658


In [84]:
print(cv.get_feature_names()[:100])
print(len(cv.get_feature_names()))

['_', '_at', 'abil', 'abl', 'accept', 'access', 'accord', 'account', 'achiev', 'across', 'action', 'activ', 'actual', 'ad', 'adapt', 'add', 'addit', 'addon', 'address', 'adjust', 'admin', 'advanc', 'affect', 'agent', 'aggreg', 'ago', 'algorithm', 'allow', 'almost', 'along', 'alreadi', 'also', 'altern', 'although', 'altscreen', 'alway', 'amount', 'analysi', 'android', 'anim', 'annot', 'anoth', 'answer', 'anymor', 'anyon', 'anyth', 'anyway', 'apach', 'api', 'apk', 'app', 'appear', 'appli', 'applic', 'appreci', 'approach', 'appropri', 'area', 'arent', 'argument', 'around', 'array', 'artifact', 'ask', 'assign', 'assigne', 'associ', 'assum', 'attach', 'attack', 'attempt', 'attribut', 'audio', 'authent', 'author', 'auto', 'autom', 'automat', 'avail', 'avoid', 'aw', 'away', 'awesom', 'b', 'back', 'backend', 'background', 'backup', 'bad', 'bar', 'base', 'basic', 'bazel', 'bean', 'becom', 'begin', 'behavior', 'behaviorth', 'behaviour', 'behind']
1154


In [113]:
c = 0
for obj in issue_comments:
    if 'body' in obj and obj['body'] and 'BufferedReader' in obj['body']:
        c += 1
        
print(c)

147
