In [55]:
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize

In [120]:
bugReport = "It seems like the Eclipse Java compiler fails to compile for nested inheritance. \
Best way to replicate it is to clone the Eclipse Collections Project and run a maven clean install .\
Make sure to skip tests for early feedback. The clean install using native javac will go through fine, \
however, if you try to compile the project using the Eclipse IDE then the same compilation fails. \
This issue is either in the javac or it is in the Eclipse Java compiler. Can you please take a look?\
The fix is simple to essentially add a cast explicitly, but having unnecessary casts is something we should not do."

In [121]:
def clean(text):
    text = text.lower()
    printable = set(string.printable)
    text = filter(lambda x: x in printable, text)
    text = "".join(list(text))
    return text

text = clean(bugReport)
text = word_tokenize(text)
print(text)

['it', 'seems', 'like', 'the', 'eclipse', 'java', 'compiler', 'fails', 'to', 'compile', 'for', 'nested', 'inheritance', '.', 'best', 'way', 'to', 'replicate', 'it', 'is', 'to', 'clone', 'the', 'eclipse', 'collections', 'project', 'and', 'run', 'a', 'maven', 'clean', 'install', '.make', 'sure', 'to', 'skip', 'tests', 'for', 'early', 'feedback', '.', 'the', 'clean', 'install', 'using', 'native', 'javac', 'will', 'go', 'through', 'fine', ',', 'however', ',', 'if', 'you', 'try', 'to', 'compile', 'the', 'project', 'using', 'the', 'eclipse', 'ide', 'then', 'the', 'same', 'compilation', 'fails', '.', 'this', 'issue', 'is', 'either', 'in', 'the', 'javac', 'or', 'it', 'is', 'in', 'the', 'eclipse', 'java', 'compiler', '.', 'can', 'you', 'please', 'take', 'a', 'look', '?', 'the', 'fix', 'is', 'simple', 'to', 'essentially', 'add', 'a', 'cast', 'explicitly', ',', 'but', 'having', 'unnecessary', 'casts', 'is', 'something', 'we', 'should', 'not', 'do', '.']


In [122]:
# POS tagging for later filtering
POS_tagging = nltk.pos_tag(text)
print (POS_tagging)

[('it', 'PRP'), ('seems', 'VBZ'), ('like', 'IN'), ('the', 'DT'), ('eclipse', 'NN'), ('java', 'NN'), ('compiler', 'NN'), ('fails', 'VBZ'), ('to', 'TO'), ('compile', 'VB'), ('for', 'IN'), ('nested', 'JJ'), ('inheritance', 'NN'), ('.', '.'), ('best', 'JJS'), ('way', 'NN'), ('to', 'TO'), ('replicate', 'VB'), ('it', 'PRP'), ('is', 'VBZ'), ('to', 'TO'), ('clone', 'VB'), ('the', 'DT'), ('eclipse', 'NN'), ('collections', 'NNS'), ('project', 'NN'), ('and', 'CC'), ('run', 'VB'), ('a', 'DT'), ('maven', 'JJ'), ('clean', 'JJ'), ('install', 'NN'), ('.make', 'VBP'), ('sure', 'JJ'), ('to', 'TO'), ('skip', 'VB'), ('tests', 'NNS'), ('for', 'IN'), ('early', 'JJ'), ('feedback', 'NN'), ('.', '.'), ('the', 'DT'), ('clean', 'JJ'), ('install', 'NN'), ('using', 'VBG'), ('native', 'JJ'), ('javac', 'NN'), ('will', 'MD'), ('go', 'VB'), ('through', 'IN'), ('fine', 'NN'), (',', ','), ('however', 'RB'), (',', ','), ('if', 'IN'), ('you', 'PRP'), ('try', 'VBP'), ('to', 'TO'), ('compile', 'VB'), ('the', 'DT'), ('projec

In [123]:
porter_stemmer = PorterStemmer()
stemmed_text = []

for word in POS_tagging:
    stemmed_text.append(str(porter_stemmer.stem(word[0])))

print (stemmed_text)

['it', 'seem', 'like', 'the', 'eclips', 'java', 'compil', 'fail', 'to', 'compil', 'for', 'nest', 'inherit', '.', 'best', 'way', 'to', 'replic', 'it', 'is', 'to', 'clone', 'the', 'eclips', 'collect', 'project', 'and', 'run', 'a', 'maven', 'clean', 'instal', '.make', 'sure', 'to', 'skip', 'test', 'for', 'earli', 'feedback', '.', 'the', 'clean', 'instal', 'use', 'nativ', 'javac', 'will', 'go', 'through', 'fine', ',', 'howev', ',', 'if', 'you', 'tri', 'to', 'compil', 'the', 'project', 'use', 'the', 'eclips', 'ide', 'then', 'the', 'same', 'compil', 'fail', '.', 'thi', 'issu', 'is', 'either', 'in', 'the', 'javac', 'or', 'it', 'is', 'in', 'the', 'eclips', 'java', 'compil', '.', 'can', 'you', 'pleas', 'take', 'a', 'look', '?', 'the', 'fix', 'is', 'simpl', 'to', 'essenti', 'add', 'a', 'cast', 'explicitli', ',', 'but', 'have', 'unnecessari', 'cast', 'is', 'someth', 'we', 'should', 'not', 'do', '.']


In [124]:
stopword = []

# words from different parts of speech are provided with these major tags - nouns,verbs and adjectives
POS_tags = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] 

# adding to the stop words list - words that do not belong to the above mentioned tags
for word in POS_tagging:
    if word[1] not in POS_tags:
        stopword.append(word[0])
        
# getting all the punctuations
punctuations = list(str(string.punctuation))

# adding all the punctuations to the stop word list for filtering the text
stopword = stopword + punctuations

In [125]:
# getting stopwords from the nltk library
library_stopwords = list(set(stopwords.words('english')))

# adding the library stop words to the main stop word list
stopwordList = stopword + punctuations + library_stopwords
stopwordList = set(stopwordList)

In [126]:
# filtering the text from the stopwordlist
preprocessedText = []
for word in stemmed_text:
    if word not in stopwordList:
        preprocessedText.append(word)
print (preprocessedText)

['seem', 'eclips', 'java', 'compil', 'fail', 'compil', 'nest', 'inherit', 'best', 'way', 'replic', 'eclips', 'collect', 'project', 'maven', 'clean', 'instal', 'sure', 'test', 'earli', 'feedback', 'clean', 'instal', 'use', 'nativ', 'javac', 'fine', 'howev', 'tri', 'compil', 'project', 'use', 'eclips', 'ide', 'compil', 'fail', 'thi', 'issu', 'javac', 'eclips', 'java', 'compil', 'pleas', 'look', 'fix', 'simpl', 'essenti', 'cast', 'explicitli', 'unnecessari', 'cast', 'someth']


In [127]:
vocabulary = list(set(preprocessedText))
print (vocabulary)

['compil', 'inherit', 'howev', 'look', 'essenti', 'earli', 'unnecessari', 'fix', 'seem', 'collect', 'nativ', 'pleas', 'fine', 'fail', 'best', 'maven', 'way', 'project', 'sure', 'clean', 'feedback', 'ide', 'simpl', 'explicitli', 'eclips', 'tri', 'test', 'java', 'cast', 'someth', 'use', 'replic', 'instal', 'thi', 'issu', 'nest', 'javac']


In [131]:
vocab_len = len(vocabulary)

# default weight 0.25
weighted_edge = np.full((vocab_len,vocab_len), 0.25)

score = np.zeros((vocab_len),dtype=np.float32)

window_size = 2
coocurrences = []

In [134]:
# Building graph using term co-occurrence into connecting edges
for i in range(0,vocab_len):    
    score[i]=0.25    
    for j in range(0,vocab_len):    
        if j==i:
            weighted_edge[i][j]=0.25 
        else:
            for start in range(0,(len(preprocessedText)-window_size)): 
                end = start + window_size
                window = preprocessedText[start:end]
                if (vocabulary[i] in window) and (vocabulary[j] in window):
                    index_i = start + window.index(vocabulary[i])
                    index_j = start + window.index(vocabulary[j])
                    if [index_of_i,index_of_j] not in coocurrences:
                        weighted_edge[i][j]+=1/math.fabs(index_i-index_j)
                        coocurrences.append([index_i,index_j])

In [135]:
# out denotes the vertices to which Vj is connected through outgoing links
out = np.zeros((vocab_len),dtype=np.float32)
for i in range(0,vocab_len):
    for j in range(0,vocab_len):
        out[i]+=weighted_edge[i][j]

In [136]:
MAX_ITERATIONS = 100
d=0.85 #damping factor
threshold = 0.0001 #heuristic threshold for convergence checking

#calculating the rank based on the formula
for itr in range(0,MAX_ITERATIONS):
    prev_score = np.copy(score)   
    for i in range(0,vocab_len):      
        final_weight = 0
        for j in range(0,vocab_len):
            if weighted_edge[i][j] != 0.25:
                final_weight += (weighted_edge[i][j]/out[j])*score[j]            
        score[i] = (1-d) + d*(final_weight)    
        #convergence condition
    if np.sum(np.fabs(prev_score-score)) <= threshold:
        break

In [137]:
for i in range(0,vocab_len):
    print("Score of "+vocabulary[i]+": "+str(score[i]))

Score of compil: 0.31237847
Score of inherit: 0.18491015
Score of howev: 0.18492743
Score of look: 0.18491009
Score of essenti: 0.18502384
Score of earli: 0.18491755
Score of unnecessari: 0.18503258
Score of fix: 0.1849303
Score of seem: 0.16760398
Score of collect: 0.18523373
Score of nativ: 0.18529461
Score of pleas: 0.1847054
Score of fine: 0.18511216
Score of fail: 0.2122533
Score of best: 0.18493074
Score of maven: 0.18496194
Score of way: 0.18494461
Score of project: 0.21985242
Score of sure: 0.18480936
Score of clean: 0.2161423
Score of feedback: 0.1847966
Score of ide: 0.18484564
Score of simpl: 0.18494011
Score of explicitli: 0.18503258
Score of eclips: 0.2858034
Score of tri: 0.18470703
Score of test: 0.18491866
Score of java: 0.2127219
Score of cast: 0.20242508
Score of someth: 0.15
Score of use: 0.22007836
Score of replic: 0.18507083
Score of instal: 0.2162999
Score of thi: 0.18449932
Score of issu: 0.18507174
Score of nest: 0.1847054
Score of javac: 0.2200657
