In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import nltk
import re
import wikipedia

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score


# Sample corpus: short issue / pull-request titles; the trailing comment on each
# entry is the dotnet/roslyn URL the title was taken from.
data = [
    "Visual Studio 2019 crashing when click RMB on rule in Analyzers' dependencies",#https://github.com/dotnet/roslyn/issues/40720
    "Avoid crash on concat on structs with ToString member", #https://github.com/dotnet/roslyn/pull/38860/commits
    "Enum implicit cast to string fails when element is named ToString", #https://github.com/dotnet/roslyn/issues/40256
    "Enum with ToString member crashes in string concatenation", #https://github.com/dotnet/roslyn/issues/38858
    "Crash on right click a Analyze rule in Solution-Explorer", #https://github.com/dotnet/roslyn/issues/36304
    "Handle lazy loading of analyzer command handlers", #https://github.com/dotnet/roslyn/pull/36740
    ]

# TensorFlow Hub module for the Universal Sentence Encoder (large, version 3).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 
# Module handle; get_features() calls it to build the embedding op.
embed = hub.Module(module_url)
# English stop words from NLTK, held in a set for membership tests.
stop_words = set(stopwords.words('english')) 

def get_features(texts):
    """Embed one or more texts with the module-level sentence encoder ``embed``.

    Args:
        texts: A single string or a list of strings. A lone string (including
            an instance of a ``str`` subclass) is wrapped in a one-element list.

    Returns:
        The value of ``sess.run(embed(texts))`` -- presumably one embedding row
        per input text (TODO confirm the shape against the hub module docs).
    """
    # isinstance (rather than an exact type comparison) also recognises str
    # subclasses, which would otherwise be passed to the encoder unwrapped.
    if isinstance(texts, str):
        texts = [texts]
    # A new session is opened on every call, so variables and lookup tables are
    # initialised each time before the embedding op is evaluated.
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))

def remove_stopwords(stop_words, tokens):
    """Filter stop words out of a token sequence.

    Args:
        stop_words: Container of words to discard (membership is tested with ``in``).
        tokens: Iterable of tokens to filter.

    Returns:
        A new list holding the tokens absent from ``stop_words``, in input order.
    """
    return [word for word in tokens if word not in stop_words]

def process_text(text):
    """Normalise raw text before tokenisation.

    Steps, in order: drop non-ASCII characters, lower-case, replace every
    ``http...`` run of non-whitespace with a space, replace every run of ``#``
    with a space, and trim leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (interior whitespace is not collapsed).
    """
    ascii_only = text.encode('ascii', errors='ignore').decode()
    without_urls = re.sub(r'http\S+', ' ', ascii_only.lower())
    without_hashes = re.sub(r'#+', ' ', without_urls)
    return without_hashes.strip()

def camel_case_split(tokens):
    """Split every camel-case word in a text into its component words.

    A new piece starts wherever a lower-case character is directly followed by
    an upper-case one, e.g. ``"ToString"`` becomes ``"To"`` and ``"String"``.

    The previous version walked the whole input string instead of the current
    token, discarded the result of every token but the last, and returned lists
    of characters rather than strings.

    Args:
        tokens: The text to process; it is tokenised with ``word_tokenize``.

    Returns:
        A flat list of strings, in input order, suitable for ``' '.join(...)``.
    """
    words = []
    for token in word_tokenize(tokens):
        pieces = [[token[0]]]
        for c in token[1:]:
            # lower -> upper transition marks the start of a new word
            if pieces[-1][-1].islower() and c.isupper():
                pieces.append([c])
            else:
                pieces[-1].append(c)
        words.extend(''.join(piece) for piece in pieces)
    return words

def lemmatize(tokens):
    """Lemmatise each word of a text with NLTK's WordNet lemmatizer.

    Every token is first lemmatised as a verb; if that leaves it unchanged, the
    lemmatizer's default part of speech is tried instead.

    Args:
        tokens: The text to process; it is tokenised with ``word_tokenize``.

    Returns:
        A list with one lemma per token, in input order.
    """
    wordnet = nltk.stem.WordNetLemmatizer()

    def _lemma_of(word):
        as_verb = wordnet.lemmatize(word, 'v')
        if as_verb != word:
            return as_verb
        return wordnet.lemmatize(word)

    return [_lemma_of(word) for word in word_tokenize(tokens)]

def stemm(tokens):
    """Reduce each word of a text to its Porter stem.

    Args:
        tokens: The text to process; it is tokenised with ``word_tokenize``.

    Returns:
        A list with one stem per token, in input order.
    """
    porter = PorterStemmer()
    return [porter.stem(word) for word in word_tokenize(tokens)]

def correct_spelling(tokens):
    """Replace words the spell checker does not know with a suggested word.

    Words that ``SpellChecker.unknown`` does not report are kept as they are.
    Every other word is passed to ``wikisuggestion``. If that yields nothing
    usable (``None`` or an empty string) the original word is kept, so the
    result always contains only strings and can be fed to ``' '.join(...)``.

    Args:
        tokens: The text to process; it is tokenised with ``word_tokenize``.

    Returns:
        A list with one (possibly replaced) word per token, in input order.
    """
    spell = SpellChecker()
    spellchecked_list = []
    for w in word_tokenize(tokens):
        if not spell.unknown([w]):
            spellchecked_list.append(w)
            continue
        suggestion = wikisuggestion(w)
        # Keep the original token when no suggestion is available.
        spellchecked_list.append(suggestion or w)
    return spellchecked_list

def wikisuggestion(token):
    """Suggest a replacement for an unknown word using Wikipedia search.

    Resolution order:
      1. No search results for ``token``: fall back to the spell checker's
         correction, or to ``token`` itself if the checker has none.
      2. Any search result that is also an entry of the module-level ``data``
         list is returned as-is. Every result is checked; the previous loop
         returned during its first iteration.
      3. Otherwise the first result is searched again and the top hit of that
         second search (or the first result itself when the second search is
         empty) is returned with bracketed/parenthesised parts removed.

    Args:
        token: The word to look up.

    Returns:
        The suggested replacement string.
    """
    wiki_list = wikipedia.search(token)
    if not wiki_list:
        # No suggestion from Wikipedia: use the spell checker instead. The
        # checker is only constructed on this path, where it is needed.
        return SpellChecker().correction(token) or token

    for wl in wiki_list:
        if wl in data:  # candidate already occurs in the corpus
            return wl

    related = wikipedia.search(wiki_list[0])
    result = related[0] if related else wiki_list[0]
    # Strip "(...)" / "[...]" qualifiers such as "(software)".
    return re.sub(r"[\(\[].*?[\)\]]", "", result).strip()
   
def process_all(text):
    """Run the full clean-up pipeline on one text.

    Pipeline: ``process_text`` -> stop-word removal on whitespace-split tokens
    -> ``correct_spelling``. Tokens are re-joined with single spaces between
    stages.

    Args:
        text: The raw text.

    Returns:
        The processed text as a single space-joined string.
    """
    cleaned = process_text(text)
    content_words = remove_stopwords(stop_words, cleaned.split())
    corrected = correct_spelling(' '.join(content_words))
    # Optional stages, currently disabled:
    #   camel_case_split (would run before spelling correction),
    #   stemm and lemmatize (would run after it).
    return ' '.join(corrected)

def unique_words(sentence):
    """Return the set of distinct lower-cased, whitespace-separated words."""
    lowered = sentence.lower()
    return {word for word in lowered.split()}

def feature_names(data):
    """Collect the distinct lower-cased words of a collection of sentences.

    Words are returned in order of first appearance. The previous version
    iterated over a per-sentence ``set``, so the order of the result could
    differ between interpreter runs, and it tested membership against a list,
    which is quadratic in the vocabulary size.

    Args:
        data: Iterable of sentences (strings).

    Returns:
        A list of unique words, each appearing once, in first-seen order.
    """
    # dict keys keep insertion order and give constant-time membership.
    first_seen = dict.fromkeys(
        word for sentence in data for word in sentence.lower().split()
    )
    return list(first_seen)


# Clean every title with the text pipeline, then embed the cleaned titles; the
# embeddings are the points clustered further down.
data_processed = [process_all(title) for title in data]
BASE_VECTORS = get_features(data_processed)


def input_fn():
    """Estimator input function: serve ``BASE_VECTORS`` for exactly one epoch."""
    single_epoch_vectors = tf.train.limit_epochs(BASE_VECTORS, num_epochs=1)
    return single_epoch_vectors

num_clusters = 2
# Seed for the random centroid initialisation. Without it, repeated runs of
# this notebook can produce different cluster assignments for the same data.
KMEANS_SEED = 42
cluster_centerlist = []  # centroids recorded after each training round
kmeans = tf.estimator.experimental.KMeans(
    num_clusters=num_clusters,
    initial_clusters='random',
    seed=KMEANS_SEED,
    use_mini_batch=False)

# train
num_iterations = 10
previous_centers = None
for _ in range(num_iterations):
    # input_fn serves the data for one epoch, so each call is one training round.
    kmeans.train(input_fn)
    cluster_centers = kmeans.cluster_centers()
    # Optional diagnostics:
    # if previous_centers is not None:
    #     print('delta:', cluster_centers - previous_centers)
    previous_centers = cluster_centers
    cluster_centerlist.append(cluster_centers)
    # print('score:', kmeans.score(input_fn))
# print('cluster centers:', cluster_centers)




In [22]:
# Show the centroids recorded after the first two training rounds.
for round_index in (0, 1):
    print(cluster_centerlist[round_index])

[[ 0.04058949 -0.04058962  0.0336156  ...  0.04999132  0.0534329
  -0.01757501]
 [ 0.01112611 -0.04777005  0.08043283 ...  0.08039761  0.03552148
  -0.03916642]]
[[ 0.03608557 -0.04016418  0.0241436  ...  0.06572434  0.05265355
  -0.03146098]
 [ 0.01434285 -0.0173102   0.05734048 ...  0.04390229 -0.00920612
   0.01444885]]


In [23]:
# Ask the trained estimator which cluster each input vector is assigned to.
cluster_indices = [index for index in kmeans.predict_cluster_index(input_fn)]
print(cluster_indices)





INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from /tmp/tmpq_muisz7/model.ckpt-19


INFO:tensorflow:Restoring parameters from /tmp/tmpq_muisz7/model.ckpt-19


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


[1, 1, 0, 0, 1, 1]


In [0]:
# Map the input points to their clusters and report each point's centroid.
cluster_indices = [index for index in kmeans.predict_cluster_index(input_fn)]
for i in range(len(BASE_VECTORS)):
    point = BASE_VECTORS[i]
    cluster_index = cluster_indices[i]
    center = cluster_centers[cluster_index]
    print('point:', point, 'is in cluster', cluster_index, 'centered at', center)


In [23]:
# Display the most recently computed cluster assignments.
cluster_indices

[1, 1, 1, 1, 0, 1]