In [0]:
pip install wikipedia 

In [0]:
pip install pyspellchecker

In [6]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import nltk
import re
import wikipedia

# Fetch the NLTK resources used below: the stop-word corpus (stopwords.words),
# the Punkt tokenizer models (word_tokenize) and WordNet (WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker

# TensorFlow Hub module for the Universal Sentence Encoder (large, version 3).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 
# Created once at module level; get_features() calls it to embed text.
embed = hub.Module(module_url)
# English stop words; process_all() passes this set to remove_stopwords().
stop_words = set(stopwords.words('english')) 

def get_features(texts):
    """Embed one text or a list of texts with the module-level ``embed``.

    Args:
        texts: A single string, or a sequence of strings. A single string
            (including ``str`` subclasses) is wrapped in a one-element list.

    Returns:
        Whatever ``sess.run(embed(texts))`` yields; callers in this notebook
        index it per input text (``[0]``) and call ``.ravel()`` / ``.shape``
        on the rows, so it is presumably a NumPy array with one row per text.
    """
    # isinstance also accepts str subclasses, which `type(...) is str` rejects.
    if isinstance(texts, str):
        texts = [texts]
    # A new session is opened for every call and the initializers are re-run
    # in it before the embedding op is evaluated.
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))

def remove_stopwords(stop_words, tokens):
    """Return the tokens that are not in ``stop_words``, in their original order.

    Args:
        stop_words: Container supporting ``in`` membership tests.
        tokens: Iterable of tokens to filter.

    Returns:
        A new list holding every token that is not a stop word.
    """
    return [tok for tok in tokens if tok not in stop_words]

def process_text(text):
    """Normalise raw text before tokenisation.

    Steps, in order: drop non-ASCII characters, lower-case, replace every
    ``http...`` run of non-whitespace with a space, replace every run of
    ``#`` with a space, and trim leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    ascii_lowered = text.encode('ascii', errors='ignore').decode().lower()
    without_urls = re.sub(r'http\S+', ' ', ascii_lowered)
    without_hashes = re.sub(r'#+', ' ', without_urls)
    return without_hashes.strip()

def camel_case_split(tokens):
    """Split every camelCase token of a text into its component words.

    The text is tokenised with ``word_tokenize``; each token is cut wherever a
    lower-case character is directly followed by an upper-case one
    (``"ToString"`` -> ``"To"``, ``"String"``).

    Note: the call to this function in ``process_all`` is commented out, and
    ``process_text`` lower-cases the text first, so the split only has an
    effect on text that still contains upper-case letters.

    Args:
        tokens: The text to split (a single string, despite the name).

    Returns:
        A flat list of word strings covering all tokens, suitable for
        ``' '.join(...)``.
    """
    words = []
    for token in word_tokenize(tokens):
        # Each part is a list of characters; start with the first character.
        parts = [[token[0]]]
        # Walk the rest of *this token* (not the whole input text).
        for c in token[1:]:
            if parts[-1][-1].islower() and c.isupper():
                parts.append([c])
            else:
                parts[-1].append(c)
        # Accumulate across tokens instead of keeping only the last one.
        words.extend(''.join(part) for part in parts)
    return words

def lemmatize(tokens):
    """Tokenise a text and lemmatise each token with WordNet.

    Every token is first lemmatised as a verb; if that leaves it unchanged,
    the lemmatiser's default part of speech is tried instead.

    Args:
        tokens: The text to lemmatise (a single string).

    Returns:
        A list with one lemma per token.
    """
    wordnet = nltk.stem.WordNetLemmatizer()

    def _lemma(word):
        as_verb = wordnet.lemmatize(word, 'v')
        return wordnet.lemmatize(word) if as_verb == word else as_verb

    return [_lemma(word) for word in word_tokenize(tokens)]

def stemm(tokens):
    """Tokenise a text and reduce each token to its Porter stem.

    Args:
        tokens: The text to stem (a single string).

    Returns:
        A list with one stem per token.
    """
    porter = PorterStemmer()
    return [porter.stem(word) for word in word_tokenize(tokens)]

def correct_spelling(tokens):
    """Tokenise a text and replace words the spell checker does not know.

    Words known to ``SpellChecker`` are kept as they are; every other word is
    replaced by whatever ``wikisuggestion`` returns for it.

    Args:
        tokens: The text to check (a single string).

    Returns:
        A list with one entry per token.
    """
    checker = SpellChecker()
    corrected = []
    for word in word_tokenize(tokens):
        # unknown() returns the subset of the given words it does not recognise.
        is_known = not checker.unknown([word])
        corrected.append(word if is_known else wikisuggestion(word))
    return corrected

def wikisuggestion(token):
    """Suggest a replacement for a word the spell checker does not know.

    Wikipedia search is tried first. If it returns nothing, the spell
    checker's best correction is used. Otherwise any search hit that is also
    an entry of the module-level ``data`` list wins; failing that, the top hit
    is searched again and the first title of that second search is returned
    with bracketed or parenthesised qualifiers removed.

    This performs network requests through ``wikipedia.search`` and reads the
    global ``data``, which must be defined before the first call.

    Args:
        token: The unknown word.

    Returns:
        A non-empty suggestion string; ``token`` itself when no usable
        suggestion is found.
    """
    wiki_list = wikipedia.search(token)
    if not wiki_list:
        # No suggestion from Wikipedia: fall back to the spell checker, whose
        # correction() may return None when it has no candidate.
        return SpellChecker().correction(token) or token

    # Prefer a hit that matches the context; check every hit, not just the first.
    for wl in wiki_list:
        if wl in data:
            return wl

    # Otherwise resolve the top hit; guard against an empty second search.
    related = wikipedia.search(wiki_list[0])
    result = related[0] if related else wiki_list[0]
    # Strip "(...)" / "[...]" qualifiers such as "(programming language)".
    result = re.sub(r"[\(\[].*?[\)\]]", "", result)
    # Never hand an empty string back to the caller's ' '.join().
    return result if result.strip() else token
   
def process_all(text):
    """Run the full pre-processing pipeline on one text.

    Order: ``process_text`` clean-up, stop-word removal on whitespace-split
    words, spelling correction, stemming, lemmatisation. Each stage's tokens
    are re-joined with single spaces before the next stage runs.
    (A ``camel_case_split`` stage is currently disabled.)

    Args:
        text: The raw text.

    Returns:
        The processed text as a single space-separated string.
    """
    cleaned = process_text(text)
    cleaned = ' '.join(remove_stopwords(stop_words, cleaned.split()))
    for stage in (correct_spelling, stemm, lemmatize):
        cleaned = ' '.join(stage(cleaned))
    return cleaned

def cosine_similarity(v1, v2):
    """Cosine of the angle between two vectors.

    Args:
        v1: First vector.
        v2: Second vector.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0`` when either norm is zero.
    """
    norm_a, norm_b = np.linalg.norm(v1), np.linalg.norm(v2)
    if norm_a and norm_b:
        return np.dot(v1, v2) / (norm_a * norm_b)
    return 0

def test_similarity(text1, text2):
    """Embed two texts and return the cosine similarity of their vectors.

    Also prints the shape of the first embedding vector.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The value of ``cosine_similarity`` for the two embeddings.
    """
    first_vec, second_vec = (get_features(text)[0] for text in (text1, text2))
    print(first_vec.shape)
    return cosine_similarity(first_vec, second_vec)

def semantic_search(query, data, vectors):
    """Rank ``data`` entries by embedding similarity to ``query``.

    The query goes through ``process_all`` and ``get_features``; each entry
    ``data[i]`` is scored against ``vectors[i]``, so the two must be aligned.

    Args:
        query: Raw query text.
        data: Sequence of texts, used only for the returned snippets.
        vectors: Indexable collection of embeddings, one per ``data`` entry.

    Returns:
        A list of ``(similarity, first 100 characters of the entry, index)``
        tuples sorted by similarity, highest first.
    """
    cleaned_query = process_all(query)
    print("Extracting features...")
    query_vec = get_features(cleaned_query)[0].ravel()
    scored = [
        (cosine_similarity(query_vec, vectors[idx].ravel()), doc[:100], idx)
        for idx, doc in enumerate(data)
    ]
    scored.sort(key=lambda entry: entry[0], reverse=True)
    return scored

def unique_words(sentence):
    """Return the set of distinct lower-cased, whitespace-separated words."""
    lowered_words = sentence.lower().split()
    return set(lowered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# Sample issue / pull-request titles; the trailing comment on each entry is
# the dotnet/roslyn URL it was taken from. Besides being the search corpus,
# this list is read as a global by wikisuggestion().
data = [
    "Avoid crash on concat on structs with ToString member", #https://github.com/dotnet/roslyn/pull/38860/commits
    "Enum implicit cast to string fails when element is named ToString", #https://github.com/dotnet/roslyn/issues/40256
    "Enum with ToString member crashes in string concatenation", #https://github.com/dotnet/roslyn/issues/38858
    "Visual Studio 2019 crashing when click RMB on rule in Analyzers' dependencies",#https://github.com/dotnet/roslyn/issues/40720
    "Crash on right click a Analyze rule in Solution-Explorer", #https://github.com/dotnet/roslyn/issues/36304
    "Handle lazy loading of analyzer command handlers", #https://github.com/dotnet/roslyn/pull/36740
]

In [10]:
# Pre-process every title once, then embed the processed titles in one batch.
data_processed = list(map(process_all, data))
BASE_VECTORS = get_features(data_processed)
# Rank the processed titles against the query; the cell displays
# (similarity, processed title, index) tuples, best match first.
semantic_search("Handle lazy loading of analyzer command handler", data_processed, BASE_VECTORS)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Extracting features...
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


[(1.0, 'handl lazi load analyz command handler', 5),
 (0.57930964, 'crash right click analyz rule solut', 4),
 (0.55621374, 'enumer type java member crash string concaten', 2),
 (0.5314942,
  "visual studio 2019 crash click renminbi rule spectrum analyz ' depend",
  3),
 (0.5302365, 'avoid crash concaten record java member', 0),
 (0.45067155, 'enumer type implicit cast string fail element name java', 1)]

In [0]:
data_processed

In [36]:
# Query with the title stored at data[3], verbatim.
semantic_search("Visual Studio 2019 crashing when click RMB on rule in Analyzers' dependencies", data_processed, BASE_VECTORS)

visual studio 2019 crashing click rmb rule analyzers' dependencies
['visual', 'studio', '2019', 'crashing', 'click', 'Renminbi', 'rule', 'Spectrum analyzer', "'", 'dependencies']
Extracting features...
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


[(1.0000001,
  "visual studio 2019 crash click renminbi rule spectrum analyz ' depend",
  3),
 (0.7122575,
  'avoid crash concaten record ( comput scienc ) java ( program languag ) member',
  0),
 (0.63621086,
  'enumer type java ( program languag ) member crash string concaten',
  2),
 (0.6034917,
  'enumer type implicit cast string fail element name java ( program languag )',
  1),
 (0.58907, 'crash right click analyz rule solut', 4),
 (0.5314943, 'handl lazi load analyz command handler', 5)]

In [0]:
# Compare issue reports with code
# Build one query string from the distinct lower-cased words of data[3:6].
issuereport_uniquewordlist = unique_words(' '.join(data[3:6]))
# A set of strings iterates in a different order on each kernel start; sort
# the words so the encoder receives the same text on every run.
issuereport_cluser1 = ' '.join(sorted(issuereport_uniquewordlist))
codedata = ['project should other severity link get changed add item name items with hierarchy create checked show help initialize rule active context handler new folder enabled any controller menu diagnostics copy file update diagnostic analyzer open analyzers selected command remove set',
            'Analyzer Command Handler Tests Traits Features Diagnostics Diagnostic Context Menu Controller Solution Explorer']
codedata_processed = list(map(process_text, codedata))
# Embed the processed text so every vector matches the string displayed for it
# (the issue-title cell likewise embeds data_processed, not data).
code_BASE_VECTORS = get_features(codedata_processed)

print(issuereport_cluser1)
semantic_search(issuereport_cluser1, codedata_processed, code_BASE_VECTORS)