In [0]:
pip install wikipedia

In [0]:
pip install pyspellchecker

In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import nltk
import re
import wikipedia
import pandas as pd
import datetime
# import the math module  
import math
import nltk
from textblob import TextBlob
nltk.download('brown')

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Issue dataset. Later cells read its 'Title', 'PullRequestID', 'IssueID'
# and 'FixedByID' columns.
data = pd.read_csv('mono.csv', encoding='latin-1')

In [0]:
# TensorFlow Hub module for the Universal Sentence Encoder (large, version 3).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 
# Loaded once here; get_features() evaluates it inside a tf.Session.
embed = hub.Module(module_url)
# NLTK English stop words as a set, passed to remove_stopwords() by process_all().
stop_words = set(stopwords.words('english')) 

def get_features(texts):
    """Embed one or more texts with the module-level ``embed`` encoder.

    Args:
        texts: a single string or a list of strings. A single string
            (including instances of ``str`` subclasses) is wrapped in a
            one-element list before being passed to the encoder.

    Returns:
        The value of ``sess.run(embed(texts))`` -- presumably one embedding
        vector per input text (confirm against the hub module's signature).
    """
    # isinstance instead of an exact type check, so str subclasses are also
    # wrapped rather than handed to the encoder as a bare string.
    if isinstance(texts, str):
        texts = [texts]
    with tf.Session() as sess:
        # The module's variables and lookup tables have to be initialised in
        # this session before the embedding op can be evaluated.
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))

def remove_stopwords(stop_words, tokens):
    """Return a new list with every token that is not in ``stop_words``.

    The relative order of the remaining tokens is preserved.
    """
    return [tok for tok in tokens if tok not in stop_words]

def process_text(text):
    """Normalise a raw string before tokenisation.

    Steps, in order: drop non-ASCII characters, lower-case, replace URLs
    (``http`` followed by non-whitespace) and runs of ``#`` with a single
    space, then strip leading/trailing whitespace.
    """
    ascii_only = text.encode('ascii', errors='ignore').decode()
    cleaned = ascii_only.lower()
    # URLs first, then hash runs; each match becomes one space.
    for pattern in (r'http\S+', r'#+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

def camel_case_split(tokens):
    """Split every camelCase word of a text into its component words.

    The text is tokenised with ``word_tokenize``; inside each token a new
    word starts wherever a lower-case character is followed by an upper-case
    one (``"camelCase"`` -> ``"camel"``, ``"Case"``).

    Args:
        tokens: the input text (a single string, despite the name).

    Returns:
        A flat list of strings covering all tokens, in order, so the result
        can be fed straight to ``' '.join(...)``.
    """
    pieces = []
    for token in word_tokenize(tokens):
        current = [token[0]]
        # Walk the characters of this token only (not the whole input text).
        for ch in token[1:]:
            if current[-1].islower() and ch.isupper():
                # lower -> upper boundary: close the current word, start a new one
                pieces.append(''.join(current))
                current = [ch]
            else:
                current.append(ch)
        pieces.append(''.join(current))
    return pieces

def lemmatize(tokens):
    """Tokenise a text and return the WordNet lemma of each token, in order.

    Each token is first lemmatised as a verb; if that leaves it unchanged,
    the lemmatiser's default part of speech is used instead.
    """
    wnl = nltk.stem.WordNetLemmatizer()

    def best_lemma(word):
        as_verb = wnl.lemmatize(word, 'v')
        # Only fall back to the default lemmatisation when the verb form
        # did not change the token.
        return wnl.lemmatize(word) if as_verb == word else as_verb

    return [best_lemma(word) for word in word_tokenize(tokens)]

def stemm(tokens):
  """Tokenise a text and return the Porter stem of every token, in order."""
  stemmer = PorterStemmer()
  return [stemmer.stem(word) for word in word_tokenize(tokens)]

def correct_spelling(tokens):
    """Tokenise a text and spell-correct the words the checker does not know.

    Args:
        tokens: the input text (a single string, despite the name).

    Returns:
        A list of strings, one per token: known words unchanged, unknown
        words replaced by the checker's suggestion, or kept as-is when the
        checker has no suggestion.
    """
    spell = SpellChecker()
    corrected = []
    for word in word_tokenize(tokens):
        if spell.unknown([word]):
            # correction() can return None when no candidate is found; keep
            # the original word so callers can still ' '.join() the result.
            corrected.append(spell.correction(word) or word)
        else:
            corrected.append(word)
    return corrected
    
def wikisuggestion(token):
    """Suggest a replacement for every word of a text using Wikipedia search.

    For each token, the Wikipedia search results are scanned in order and the
    first one found in the module-level ``data`` is used. If no result
    qualifies (or the search returns nothing), the spelling checker's
    correction is used, falling back to the word itself when the checker has
    no suggestion.

    Args:
        token: the input text (a single string).

    Returns:
        A list with exactly one string per token.
    """
    # Build the checker once rather than once per word.
    spell = SpellChecker()
    result = []
    for w in word_tokenize(token):
        for candidate in wikipedia.search(w):
            # NOTE(review): `data` is the module-level DataFrame, and `in` on
            # a DataFrame tests column labels, not cell values -- confirm
            # which column's contents this was meant to search.
            if candidate in data:
                result.append(candidate)
                break
        else:
            # No search result matched (or there were none): spell-correct.
            result.append(spell.correction(w) or w)
    return result

def extract_Noun(text):
  """Return the ``noun_phrases`` that TextBlob extracts from ``text``."""
  return TextBlob(text).noun_phrases

def dot_replace(text):
  """Return ``text`` with every '.' replaced by a single space."""
  return " ".join(text.split("."))

def process_all(text):
    """Run the full text-cleaning pipeline on one string.

    Pipeline: process_text -> stop-word removal (on whitespace-split words)
    -> noun-phrase extraction -> stemming -> lemmatisation. Each stage's
    output is re-joined with single spaces before the next stage.
    camel_case_split and correct_spelling are not applied here.
    """
    cleaned = process_text(text)
    kept_words = remove_stopwords(stop_words, cleaned.split())
    noun_phrases = extract_Noun(' '.join(kept_words))
    stems = stemm(' '.join(noun_phrases))
    lemmas = lemmatize(' '.join(stems))
    return ' '.join(lemmas)

def unique_words(sentence):
    """Return the set of distinct lower-cased, whitespace-separated words."""
    lowered = sentence.lower()
    return {word for word in lowered.split()}

def feature_names(data):
    """Collect the distinct lower-cased words of a collection of sentences.

    Args:
        data: an iterable of strings.

    Returns:
        A list of the distinct whitespace-separated, lower-cased words, in
        order of first appearance (so the result is the same on every run).
    """
    # A dict gives O(1) membership checks and remembers insertion order, which
    # replaces the linear `w not in list` scan and the hash-dependent set order.
    first_seen = {}
    for sentence in data:
        for word in sentence.lower().split():
            first_seen.setdefault(word, None)
    return list(first_seen)

def estimate_clusters(data):
    """Heuristic cluster count derived from the number of items in ``data``.

    Returns ``sqrt(len(data) / 2)`` (a float) when ``len(data) / 2`` exceeds 1,
    otherwise the integer 2.
    """
    half_count = len(data) / 2
    if half_count <= 1:
        return 2
    return math.sqrt(half_count)

def evaluate_clusters(cluster_result_df):
    """Score a clustering against pull-request ground truth, row by row.

    ``cluster_result_df`` needs the columns 'Cluster', 'PullRequestID',
    'IssueID' and 'FixedByID'. Every row (issue) gets exactly one label; the
    first rule that matches wins:

      1. no other issue shares its PullRequestID                      -> TP
      2. another issue in the same cluster has a different
         PullRequestID and a different FixedByID                      -> FP
      3. another issue with the same PullRequestID is in a
         different cluster                                            -> FN
      4. no row in the same cluster has a different PullRequestID
         and a different FixedByID                                    -> TP
      5. otherwise                                                    -> TN

    Rows containing NaN are not counted as matches (``dropna``). One line is
    printed per row, followed by the totals.

    Returns:
        ``(TP, FP, FN)`` -- TN is printed but not returned.
    """
    frame = cluster_result_df

    def matches(mask):
        # Number of complete (NaN-free) rows selected by the boolean mask.
        return len(frame.where(mask).dropna())

    tally = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}
    for ind in frame.index:
        pull_ids, issue_ids = frame['PullRequestID'], frame['IssueID']
        clusters, fixers = frame['Cluster'], frame['FixedByID']

        same_pr = pull_ids == pull_ids[ind]
        other_pr = pull_ids != pull_ids[ind]
        other_issue = issue_ids != issue_ids[ind]
        same_cluster = clusters == clusters[ind]
        other_cluster = clusters != clusters[ind]
        other_fixer = fixers != fixers[ind]

        if matches(same_pr & other_issue) == 0:
            # The pull request fixed only this issue.
            verdict, tag = "TP", "Two: TP"
        elif matches(other_issue & same_cluster & other_pr & other_fixer) > 0:
            verdict, tag = "FP", "Two : FP"
        elif matches(same_pr & other_issue & other_cluster) > 0:
            verdict, tag = "FN", "Two : FN"
        elif matches(same_cluster & other_pr & other_fixer) == 0:
            verdict, tag = "TP", "Two: TP"
        else:
            verdict, tag = "TN", "Two : TN"

        tally[verdict] += 1
        print("Cluster: ", clusters[ind], "PullRequestID: ", pull_ids[ind], "Issue: ", issue_ids[ind], tag)

    print("Total :", "TP :", tally["TP"], "TN :", tally["TN"], "FP :", tally["FP"], "FN :", tally["FN"])
    return tally["TP"], tally["FP"], tally["FN"]

def calculate_results(true_positive, false_positive, false_negative):
    """Compute precision, recall and F1 from raw counts.

    Any ratio whose denominator is not positive is reported as 0.

    Returns:
        ``(precision, recall, f1)``
    """
    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative

    precision = true_positive / predicted_positive if predicted_positive > 0 else 0
    recall = true_positive / actual_positive if actual_positive > 0 else 0

    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0
    return precision, recall, f1

In [0]:
# Fixed seed so that the randomly initialised KMeans run is reproducible.
SEED = 42

# Clean every issue title, then embed the cleaned titles.
data_processed = list(map(process_all, list(data['Title'])))
BASE_VECTORS = get_features(data_processed)

#true_k = int(estimate_clusters(data)) 
# One cluster per distinct pull request in the dataset.
true_k = len(data["PullRequestID"].unique())
model = KMeans(n_clusters=true_k, init='random', max_iter=true_k*2, n_init=1, random_state=SEED)
model.fit(BASE_VECTORS)


print("Top terms per cluster:")
# NOTE(review): the centroids live in sentence-embedding space (BASE_VECTORS
# comes from get_features), so the columns of `order_centroids` are embedding
# dimensions, not indices into `terms`; nothing is printed after the header.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = feature_names(data_processed)

# Pair each cluster label with its issue's identifiers. zip() keeps the rows
# positionally aligned with model.labels_ regardless of the DataFrame's index.
cluster_result = [
    {'Cluster': label, 'PullRequestID': pull_request, 'IssueID': issue, 'FixedByID': fixer}
    for label, pull_request, issue, fixer in zip(
        model.labels_, data['PullRequestID'], data['IssueID'], data['FixedByID']
    )
]

cluster_result_df = pd.DataFrame(cluster_result)

### Save Cluster Result to CSV
cluster_result_df.to_csv('out.csv', index=False)

In [0]:
 # Ad-hoc inspection: rebuild the evaluation masks for the row labelled 1.
 filter1 = cluster_result_df['PullRequestID'] == cluster_result_df['PullRequestID'][1] #Same pull request as row 1
 filter2 = cluster_result_df['IssueID'] != cluster_result_df['IssueID'][1] #Different issue than row 1
 filter3 = cluster_result_df['Cluster'] != cluster_result_df['Cluster'][1] #Different cluster than row 1
 filter4 = cluster_result_df['Cluster'] == cluster_result_df['Cluster'][1] #Same cluster as row 1
 filter5 = cluster_result_df['PullRequestID'] != cluster_result_df['PullRequestID'][1] #Different pull request than row 1
 filter6 = cluster_result_df['FixedByID'] != cluster_result_df['FixedByID'][1] #Fixed by a different developer than row 1
 #len(cluster_result_df.where(filter2 & filter4).dropna())
 #len(cluster_result_df.where(filter2 & filter4 & filter5 & filter6).dropna())
 # Shows the other issues in row 1's cluster that differ in pull request OR
 # developer (evaluate_clusters combines filter5 and filter6 with AND instead).
 cluster_result_df.where(filter2 & filter4 & (filter5 |filter6)).dropna()


In [49]:
### Evaluate cluster
# evaluate_clusters returns (TP, FP, FN): the middle value is the false-positive
# count, which is what calculate_results takes as its second argument.
TP, FP, FN = evaluate_clusters(cluster_result_df)
P, R, F1 = calculate_results(TP, FP, FN)

Cluster:  23 PullRequestID:  18836 Issue:  16741 Two: TP
Cluster:  5 PullRequestID:  18539 Issue:  18467 Two : FP
Cluster:  2 PullRequestID:  18539 Issue:  16648 Two : FP
Cluster:  10 PullRequestID:  18529 Issue:  18455 Two : FP
Cluster:  10 PullRequestID:  18529 Issue:  18496 Two : FP
Cluster:  2 PullRequestID:  18459 Issue:  18388 Two : FP
Cluster:  26 PullRequestID:  18459 Issue:  18457 Two : FN
Cluster:  2 PullRequestID:  18458 Issue:  18388 Two : FP
Cluster:  26 PullRequestID:  18458 Issue:  18457 Two : FN
Cluster:  22 PullRequestID:  18390 Issue:  18180 Two : FN
Cluster:  16 PullRequestID:  18390 Issue:  18385 Two : FN
Cluster:  25 PullRequestID:  16309 Issue:  5 Two: TP
Cluster:  24 PullRequestID:  15043 Issue:  14970 Two : FP
Cluster:  21 PullRequestID:  15043 Issue:  12577 Two : FN
Cluster:  11 PullRequestID:  14969 Issue:  7377 Two : FP
Cluster:  12 PullRequestID:  14969 Issue:  14773 Two : FP
Cluster:  1 PullRequestID:  14967 Issue:  13901 Two : FP
Cluster:  24 PullRequestID

In [50]:
# Precision, recall and F1 as returned by calculate_results.
print(P, R, F1)

0.11904761904761904 0.2631578947368421 0.1639344262295082


In [0]:
def evaluate_clusters(cluster_result_df):
   """Score a clustering against pull-request ground truth, row by row.

   Reads the columns 'Cluster', 'PullRequestID', 'IssueID' and 'FixedByID'.
   Each row gets one label (first matching rule wins):
     1. no other issue shares its PullRequestID                        -> TP
     2. another issue in the same cluster has a different
        PullRequestID and a different FixedByID                        -> FP
     3. another issue with the same PullRequestID is in another cluster -> FN
     4. no row in the same cluster has a different PullRequestID and
        a different FixedByID                                          -> TP
     5. otherwise                                                      -> TN
   Prints one line per row and the totals.

   Returns (TP, FP, FN); TN is counted and printed but not returned.

   NOTE(review): an identical definition of this function appears earlier in
   this notebook; whichever cell runs last is the one in effect.
   """
   TP = 0
   TN = 0
   FP = 0
   FN = 0
   for ind in cluster_result_df.index:
      # Boolean masks over all rows, relative to the current row `ind`.
      filter1 = cluster_result_df['PullRequestID'] == cluster_result_df['PullRequestID'][ind] #Same pull request (class) as the current row
      filter2 = cluster_result_df['IssueID'] != cluster_result_df['IssueID'][ind] #Different issue
      filter3 = cluster_result_df['Cluster'] != cluster_result_df['Cluster'][ind] #Different cluster
      filter4 = cluster_result_df['Cluster'] == cluster_result_df['Cluster'][ind] #Same cluster
      filter5 = cluster_result_df['PullRequestID'] != cluster_result_df['PullRequestID'][ind] #Different pull request (class)
      filter6 = cluster_result_df['FixedByID'] != cluster_result_df['FixedByID'][ind] #Fixed by a different developer
      # where() blanks non-matching rows to NaN and dropna() removes them, so
      # len(...) is the number of matching rows that contain no NaN.
      similarIssues = cluster_result_df.where(filter1 & filter2)
      if(len(similarIssues.dropna()) > 0):    
        if(len(cluster_result_df.where(filter2 & filter4 & filter5 & filter6).dropna()) > 0): #Other issues in the same cluster with a different pull request and developer
              FP = FP + 1  
              print("Cluster: ", cluster_result_df['Cluster'][ind], "PullRequestID: ", cluster_result_df['PullRequestID'][ind],"Issue: ", cluster_result_df['IssueID'][ind], "Two : FP")
        elif(len(cluster_result_df.where(filter1 & filter2 & filter3).dropna()) > 0 ): #Other issues with the same pull request (class) but in a different cluster
              FN = FN + 1  
              print("Cluster: ", cluster_result_df['Cluster'][ind], "PullRequestID: ", cluster_result_df['PullRequestID'][ind],"Issue: ", cluster_result_df['IssueID'][ind], "Two : FN" )
        elif(len(cluster_result_df.where(filter4 & filter5 & filter6).dropna()) == 0): #No row in the same cluster has a different pull request and developer
              TP = TP + 1 
              print("Cluster: ", cluster_result_df['Cluster'][ind], "PullRequestID: ", cluster_result_df['PullRequestID'][ind],"Issue: ", cluster_result_df['IssueID'][ind], "Two: TP" )  
        else:
              TN = TN + 1 
              print("Cluster: ", cluster_result_df['Cluster'][ind], "PullRequestID: ", cluster_result_df['PullRequestID'][ind],"Issue: ", cluster_result_df['IssueID'][ind], "Two : TN")       
      else: 
              TP = TP + 1 # The pull request fixed only this one issue
              print("Cluster: ", cluster_result_df['Cluster'][ind], "PullRequestID: ", cluster_result_df['PullRequestID'][ind],"Issue: ", cluster_result_df['IssueID'][ind], "Two: TP") 
   print("Total :", "TP :", TP, "TN :", TN, "FP :", FP, "FN :", FN) 
   # TN is intentionally (or accidentally -- NOTE(review)) left out of the result.
   return TP,FP,FN

def calculate_results(true_positive, false_positive, false_negative):
    """Return ``(precision, recall, f1)`` for the given counts.

    A ratio is reported as 0 whenever its denominator is not greater than 0.
    """
    def ratio_or_zero(numerator, denominator):
        # Guarded division shared by all three metrics.
        if denominator > 0:
            return numerator / denominator
        return 0

    precision = ratio_or_zero(true_positive, true_positive + false_positive)
    recall = ratio_or_zero(true_positive, true_positive + false_negative)
    f1 = ratio_or_zero(2 * precision * recall, precision + recall)
    return precision, recall, f1