In [0]:
pip install wikipedia

In [0]:
pip install pyspellchecker

In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import nltk
import re
import wikipedia
import pandas as pd
import datetime

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

data = pd.read_csv('IssueRoslyn.csv', encoding='latin-1')

In [0]:
# tensroflow hub module for Universal sentence Encoder 
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 
embed = hub.Module(module_url)
stop_words = set(stopwords.words('english')) 

def get_features(texts):
    if type(texts) is str:
        texts = [texts]
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))

def remove_stopwords(stop_words, tokens):
    res = []
    for token in tokens:
        if not token in stop_words:
            res.append(token)
    return res

def process_text(text):
    text = text.encode('ascii', errors='ignore').decode()
    text = text.lower()
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'#+', ' ', text )
    text = text.strip() #Remove white space from beginning and ending
    return text

def camel_case_split(tokens): 
   words = []
   word_tokens = word_tokenize(tokens)
   for token in word_tokens:
        words = [[token[0]]]   
        for c in tokens[1:]: 
            if words[-1][-1].islower() and c.isupper(): 
                words.append(list(c)) 
            else: 
                words[-1].append(c)   
   return words

def lemmatize(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_list = []
    word_tokens = word_tokenize(tokens) 
    for token in word_tokens:
        lemma = lemmatizer.lemmatize(token, 'v')
        if lemma == token:
            lemma = lemmatizer.lemmatize(token)
        lemma_list.append(lemma)  
    return lemma_list

def stemm(tokens):  
  ps = PorterStemmer()
  stem_list = []
  word_tokens = word_tokenize(tokens) 
  for w in word_tokens:
      rootWord = ps.stem(w)
      stem_list.append(rootWord)
  return stem_list

def correct_spelling(tokens): 
    #print(tokens)
    spell = SpellChecker()
    spellchecked_list = []
    # find those words that may be misspelled
    word_tokens = word_tokenize(tokens) 
    for w in word_tokens:
      alist = []
      alist.append(w)
      if len(spell.unknown(alist)) == 0:
         spellchecked_list.append(w)
      else:
         spellchecked_list.append(wikisuggestion(w))
    #print(spellchecked_list)
    return spellchecked_list

def wikisuggestion(token):
    spell = SpellChecker()
    wiki_list = wikipedia.search(token)
    if len(wiki_list) == 0:  # No Suggested Word from Wiki, Correct Spelling with Python Spelling Checker
        return spell.correction(token)
    else:
        for wl in wiki_list:
           if wl in data: # Suggested the closest word based on the context. 
              return wl
           else:
              result = wikipedia.search(wl)[0]
              result = re.sub("[\(\[].*?[\)\]]", "", result)
              return result
   
def process_all(text):
    text = process_text(text)
    #text = ' '.join(remove_stopwords(stop_words, text.split()))
    #text = ' '.join(camel_case_split(text))
    text = ' '.join(correct_spelling(text))
    #text = ' '.join(stemm(text))
    #text = ' '.join(lemmatize(text))    
    return text

def unique_words(sentence):
    return set(sentence.lower().split())

def feature_names(data):
    uniquewords= []
    for s in data: 
       words = unique_words(s)
       for w in words:
         if w not in uniquewords: 
           uniquewords.append(w)
    return uniquewords

def estimate_clusters(data):
   totalyear = len(pd.to_datetime(data['CreatedDate']).dt.strftime("%y").drop_duplicates().tolist())
   totalissuescount = len(data)
   value = (totalissuescount/totalyear)/12
   if(value > 1): 
     return value
   else: 
     return 2

In [16]:
data_processed = list(map(process_all, list(data['Description'])))
BASE_VECTORS = get_features(data_processed)

true_k = int(estimate_clusters(data)) 
model = KMeans(n_clusters=true_k, init='random', max_iter=true_k, n_init=1)
model.fit(BASE_VECTORS)


print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = feature_names(data_processed)

#print assign vector
i = 0
cluster_result_df = pd.DataFrame(columns=['Cluster', 'PullRequestID', 'IssueID'])

for l in model.labels_:
    print(l, " : ", data_processed[i], data['PullRequestID'][i])  
    new_row = {'Cluster' : l , 'PullRequestID' : data['PullRequestID'][i], 'IssueID' : data['IssueID'][i]}
     #append row to the dataframe
    cluster_result_df.append(new_row, ignore_index=True)
    i = i +  1

### Save Cluster Result to CSV
cluster_result_df.to_csv('out.csv', index=False)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Top terms per cluster:
1  :  for example : of ` \microsoft.codeanalysis.csharp.emit.unittests.csproj ] 2020-02-12t18:37:16.5354826z [ error ] XUnit ( 0.0.0.0 ) : error : ( netcore_engineering_telemetry=test ) tests failed : f : \workspace\_work\1\s\artifacts\testresults\release\microsoft.codeanalysis.csharp.emit.unittests_netcoreapp3.1_x64.html [ netcoreapp3.1|x64 ] of ` 41630
1  :  compareallbytesemitted_debug console log summary there is a difference in the expected set of pdb bytes . long binary diff builds Push–pull strategy request | test failure Count | Quotation mark - | Quotation mark - | Quotation mark - | | [ 516959 ] ( | [ 517087 ] ( | [ 517683 ] ( | [ 517894 ] ( | [ 517914 ] ( | [ 517939 ] ( configurations - windows .NET Core release 41630
0  :  see the behavior in this test : of C-sharp [ fact ] public void localfunctionattribute ( ) { Constant  string text = @ Quotation mark using system ; class a : attribute { } class c { static void m ( ) { void local < [ a ] t > ( ) { 

In [0]:
def evaluate_clusters(cluster_result_df)
   TP = 0 
   FP = 0
   TN = 0 
   FN = 0 
   ##Calcuate True Positive Links 
   clusters = cluster_result_df.Cluster.unique()
   for row in cluster_result_df: 
      #Get List of Issues with same pullrequest ID
         

   

In [13]:
for ind in data.index:
  filter1 = data['PullRequestID'] == data['PullRequestID'][ind] #Same Pull Request
  filter2 = data['IssueID'] != data['IssueID'][ind] #Different Issue
  filter3 = data['Cluster'] != data['Cluster'][ind]
  similarIssues = data.where(filter1 & filter2)
  if(len(similarIssues.dropna()) > 0):
    #if(len(similarIssues.dropna()) > 2):
      #if(data.where(filter1 & filter2 & filter3).dropna() > 0 ): #Not in the Same Cluster as rest      
          #print("Issue: ", data['IssueID'][ind], "More Than 2 : FN" )
      #else: 
       #   print("Issue: ", data['IssueID'][ind], "More Than 2 : TN" )
    #else:
    if(len(data.where(filter1 & filter2 & filter3).dropna()) > 0 ): #Not in the Same Cluster as rest      
          print("Issue: ", data['IssueID'][ind], "Two : FN" )
    else: 
          print("Issue: ", data['IssueID'][ind], "Two: TN" )     
  else:
      print("Issue: ", data['IssueID'][ind], 1)

['for example : of ` \\microsoft.codeanalysis.csharp.emit.unittests.csproj ] 2020-02-12t18:37:16.5354826z [ error ] XUnit ( 0.0.0.0 ) : error : ( netcore_engineering_telemetry=test ) tests failed : f : \\workspace\\_work\\1\\s\\artifacts\\testresults\\release\\microsoft.codeanalysis.csharp.emit.unittests_netcoreapp3.1_x64.html [ netcoreapp3.1|x64 ] of `',
 'compareallbytesemitted_debug console log summary there is a difference in the expected set of pdb bytes . long binary diff builds Push–pull strategy request | test failure Count | Quotation mark - | Quotation mark - | Quotation mark - | | [ 516959 ] ( | [ 517087 ] ( | [ 517683 ] ( | [ 517894 ] ( | [ 517914 ] ( | [ 517939 ] ( configurations - windows .NET Core release',
 "see the behavior in this test : of C-sharp [ fact ] public void localfunctionattribute ( ) { Constant  string text = @ Quotation mark using system ; class a : attribute { } class c { static void m ( ) { void local < [ a ] t > ( ) { } } } Quotation mark ; var tree = 