In [0]:
pip install wikipedia 

In [0]:
pip install pyspellchecker

In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import nltk
import re
import wikipedia

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score


data = [
    "Visual Studio 2019 crashing when click RMB on rule in Analyzers' dependencies",#https://github.com/dotnet/roslyn/issues/40720
    "Avoid crash on concat on structs with ToString member", #https://github.com/dotnet/roslyn/pull/38860/commits
    "Enum implicit cast to string fails when element is named ToString", #https://github.com/dotnet/roslyn/issues/40256
    "Enum with ToString member crashes in string concatenation", #https://github.com/dotnet/roslyn/issues/38858   
    "Crash on right click a Analyze rule in Solution-Explorer", #https://github.com/dotnet/roslyn/issues/36304
    "Handle lazy loading of analyzer command handlers", #https://github.com/dotnet/roslyn/pull/36740
    ]

# tensroflow hub module for Universal sentence Encoder 
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 
embed = hub.Module(module_url)
stop_words = set(stopwords.words('english')) 

def get_features(texts):
    if type(texts) is str:
        texts = [texts]
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))

def remove_stopwords(stop_words, tokens):
    res = []
    for token in tokens:
        if not token in stop_words:
            res.append(token)
    return res

def process_text(text):
    text = text.encode('ascii', errors='ignore').decode()
    text = text.lower()
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'#+', ' ', text )
    text = text.strip() #Remove white space from beginning and ending
    return text

def camel_case_split(tokens): 
   words = []
   word_tokens = word_tokenize(tokens)
   for token in word_tokens:
        words = [[token[0]]]   
        for c in tokens[1:]: 
            if words[-1][-1].islower() and c.isupper(): 
                words.append(list(c)) 
            else: 
                words[-1].append(c)   
   return words

def lemmatize(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_list = []
    word_tokens = word_tokenize(tokens) 
    for token in word_tokens:
        lemma = lemmatizer.lemmatize(token, 'v')
        if lemma == token:
            lemma = lemmatizer.lemmatize(token)
        lemma_list.append(lemma)  
    return lemma_list

def stemm(tokens):  
  ps = PorterStemmer()
  stem_list = []
  word_tokens = word_tokenize(tokens) 
  for w in word_tokens:
      rootWord = ps.stem(w)
      stem_list.append(rootWord)
  return stem_list

def correct_spelling(tokens): 
    #print(tokens)
    spell = SpellChecker()
    spellchecked_list = []
    # find those words that may be misspelled
    word_tokens = word_tokenize(tokens) 
    for w in word_tokens:
      alist = []
      alist.append(w)
      if len(spell.unknown(alist)) == 0:
         spellchecked_list.append(w)
      else:
         spellchecked_list.append(wikisuggestion(w))
    #print(spellchecked_list)
    return spellchecked_list

def wikisuggestion(token):
    spell = SpellChecker()
    wiki_list = wikipedia.search(token)
    if len(wiki_list) == 0:  # No Suggested Word from Wiki, Correct Spelling with Python Spelling Checker
        return spell.correction(token)
    else:
        for wl in wiki_list:
           if wl in data: # Suggested the closest word based on the context. 
              return wl
           else:
              result = wikipedia.search(wl)[0]
              result = re.sub("[\(\[].*?[\)\]]", "", result)
              return result
   
def process_all(text):
    text = process_text(text)
    #text = ' '.join(remove_stopwords(stop_words, text.split()))
    #text = ' '.join(camel_case_split(text))
    text = ' '.join(correct_spelling(text))
    #text = ' '.join(stemm(text))
    #text = ' '.join(lemmatize(text))    
    return text

def unique_words(sentence):
    return set(sentence.lower().split())

def feature_names(data):
    uniquewords= []
    for s in data: 
       words = unique_words(s)
       for w in words:
         if w not in uniquewords: 
           uniquewords.append(w)
    return uniquewords


data_processed = list(map(process_all, data))
BASE_VECTORS = get_features(data_processed)

true_k = 2
model = KMeans(n_clusters=true_k, init='random', max_iter=100, n_init=1)
model.fit(BASE_VECTORS)


print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = feature_names(data_processed)

#print assign vector
i = 0
for l in model.labels_:
    print(l, " : ", data_processed[i])  
    i = i +  1


In [58]:
BASE_VECTORS.shape

(6, 512)

In [0]:
print("\n")
print("Prediction")

Y = process_all("Crash on right click a Analyze rule in Solution-Explorer")
Y = get_features(Y)
prediction = model.predict(Y)
print(prediction)

Y = process_all("Visual Studio 2019 crashing when click RMB on rule in Analyzers' dependencies")
Y = get_features(Y)
prediction = model.predict(Y)
print(prediction)

In [0]:
###Visualize 
 ##https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

X = np.array(BASE_VECTORS)
y_kmeans = model.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
centers = model.cluster_centers_
labels = model.labels_
print(centers.shape)
colours = ListedColormap(['r','b','g'])
plt.scatter(centers[:, 0], centers[:, 1], alpha=1,label=centers.shape[0], cmap=colours);

In [84]:
num_points = 100
dimensions = 2
points = np.random.uniform(0, 1000, [num_points, dimensions])

def input_fn():
  #return tf.train.limit_epochs(
  #    BASE_VECTORS, num_epochs=1)
  return tf.train.limit_epochs(
      tf.convert_to_tensor(points, dtype=tf.float32), num_epochs=1)


num_clusters = 2
kmeans = tf.estimator.experimental.KMeans(
    num_clusters=num_clusters, use_mini_batch=False)

# train
num_iterations = 10
previous_centers = None

def ProcessKMean():
  for _ in range(num_iterations):
    kmeans.train(input_fn())
    cluster_centers = kmeans.cluster_centers()
    if previous_centers is not None:
      print('delta:', cluster_centers - previous_centers)
    previous_centers = cluster_centers
    print('score:', kmeans.score(input_fn))
  print('cluster centers:', cluster_centers)

def TrainModel():
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(ProcessKMean())

def MapInputToClusters():
    # map the input points to their clusters
    cluster_indices = list(kmeans.predict_cluster_index(input_fn))
    for i, point in enumerate(points):
      cluster_index = cluster_indices[i]
      center = cluster_centers[cluster_index]
      print('point:', point, 'is in cluster', cluster_index, 'centered at', center)

print(ProcessKMean())


INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp096kjz42', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ffa68e2c5f8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp096kjz42', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ffa68e2c5f8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


TypeError: ignored