In [24]:
import pandas as pd
import multiprocessing
import pickle
import numpy as np
import difflib
import regex as re

from math import sqrt
from time import time
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec
from kneed import KneeLocator

In [2]:
# Number of CPUs reported by the OS; passed to Word2Vec as its worker count.
cpu_number = multiprocessing.cpu_count()

# Context window size handed to Word2Vec (`window=`).
w2v_window = 7

In [3]:
# Tabular solver error records; the row order is relied on later when
# cluster labels are attached positionally.
solver_total_errors_df = pd.read_csv('solver-error-clean-data.csv')

# Despite the .txt extension this file is a binary pickle.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# produced by a trusted preprocessing step.
with open("solver-errors-clean-logs.txt", "rb") as pickled_logs:
    tokenized_error_logs = pickle.load(pickled_logs)

In [4]:
def detect_embedding_size(tokens):
    """Pick an embedding dimension from the vocabulary size.

    The dimension is the number of distinct tokens raised to the power 2/3,
    rounded to the nearest integer and capped at 400.

    Parameters
    ----------
    tokens : iterable of iterables of hashable
        Tokenized documents (one inner iterable per document).

    Returns
    -------
    int
        Embedding size in the range [0, 400].
    """
    distinct_tokens = {token for document in tokens for token in document}
    suggested = round(len(distinct_tokens) ** (2/3))
    # Never go beyond 400 dimensions, however large the vocabulary is.
    return min(suggested, 400)

# Embedding dimension derived from the vocabulary of the loaded logs.
w2v_size = detect_embedding_size(tokenized_error_logs)

In [5]:
def tokens_vectorization(tokenized_error_logs, w2v_size, w2v_window, cpu_number, model_name, iterations=100):
    """Train a Word2Vec model on the tokenized logs and save it to disk.

    Parameters
    ----------
    tokenized_error_logs : iterable of list of str
        Training corpus, one token list per log.
    w2v_size : int
        Dimensionality of the word vectors.
    w2v_window : int
        Context window size.
    cpu_number : int
        Number of worker threads used for training.
    model_name : str
        Path the trained model is saved to.
    iterations : int, optional
        Number of training passes over the corpus (default 100, the value
        that used to be hard-coded here).

    Returns
    -------
    Word2Vec
        The trained model (also persisted at ``model_name``).

    Notes
    -----
    ``size=`` and ``iter=`` are the gensim 3.x keyword names; gensim 4 renamed
    them to ``vector_size=`` and ``epochs=``. They are kept as-is to match the
    gensim version this notebook was written against.
    """
    word2vec = Word2Vec(tokenized_error_logs,
                        size=w2v_size,
                        window=w2v_window,
                        min_count=1,  # keep every token, even ones seen only once
                        workers=cpu_number,
                        iter=iterations)
    word2vec.save(model_name)
    return word2vec

In [6]:
# Train the embedding model on the loaded logs and persist it as 'word2vec.model'.
word2vec = tokens_vectorization(
    tokenized_error_logs,
    w2v_size=w2v_size,
    w2v_window=w2v_window,
    cpu_number=cpu_number,
    model_name='word2vec.model',
)

In [25]:
def sentence_vectorization(tokenized_error_logs, word2vec):
    """Represent every tokenized log as the mean of its word vectors.

    Parameters
    ----------
    tokenized_error_logs : sequence of list of str
        One token list per log.
    word2vec : object
        Either a trained model exposing its vectors under ``.wv`` or any
        mapping-like object where ``obj[token]`` returns a vector and raises
        ``KeyError`` for unknown tokens.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tokenized_error_logs), dim)``, one row per log
        in input order. Logs with no known token (empty, or entirely out of
        vocabulary) get an all-zero row so the output stays rectangular and
        aligned with the input.
    """
    # Prefer the keyed-vectors view when present; indexing the model object
    # itself is the deprecated access path.
    vectors = getattr(word2vec, "wv", word2vec)

    means = []
    for sent in tokenized_error_logs:
        total = None
        found = 0
        for token in sent:
            try:
                vec = vectors[token]
            except KeyError:
                # Out-of-vocabulary token: skip it, but let other errors surface.
                continue
            total = vec if total is None else np.add(total, vec)
            found += 1
        means.append(np.asarray(total) / found if found else None)

    # Shape and dtype of the zero row come from a real sentence vector when
    # one exists, otherwise from the model's declared vector size.
    template = next((m for m in means if m is not None), None)
    if template is not None:
        zero_row = np.zeros_like(template)
    else:
        zero_row = np.zeros(getattr(vectors, "vector_size", None) or 0)

    rows = [zero_row if m is None else m for m in means]
    return np.array(rows).reshape(len(rows), zero_row.shape[0])

In [26]:
# One averaged word-vector per log, in the same order as tokenized_error_logs.
sent2vec = sentence_vectorization(tokenized_error_logs, word2vec)

  



In [40]:
# Group the sentence vectors into 100 clusters; random_state is pinned so the
# clustering is repeatable for the same input.
kmeans = KMeans(n_clusters=100, random_state=0)
kmeans.fit(sent2vec)

# Cluster index assigned to each row of sent2vec.
cluster_labels = kmeans.labels_

In [41]:
# Inspect the label array produced by KMeans.
cluster_labels

array([77,  6, 25, ..., 61, 48, 46], dtype=int32)

In [42]:
# Attach the cluster label and the token list of each log as new columns.
# NOTE(review): both assignments are positional, so this assumes the rows of
# the CSV and the entries of the pickled token lists are in the same order
# and of the same length -- confirm against the preprocessing step.
solver_total_errors_df['cluster'] = cluster_labels
solver_total_errors_df['clean-messages'] = tokenized_error_logs

In [43]:
# Display the frame with the newly added 'clean-messages' and 'cluster' columns.
solver_total_errors_df

Unnamed: 0,package_name,package_version,index_url,type,command,message,return_code,stderr,stdout,timeout,clean-messages,cluster
0,tryton,3.2.20,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting tryton==3.2.20\n Downloading https...,60.0,"[Command, exited, non-zero, status, code, ERRO...",77
1,pyobjc-framework-contacts,4.0b1,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting pyobjc-framework-contacts==4.0b1\n ...,60.0,"[Command, exited, non-zero, status, code, ERRO...",6
2,numpy,1.14.0,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit status ...,Collecting numpy==1.14.0\n Downloading https:...,60.0,"[Command, exited, non-zero, status, code, ERRO...",25
3,hachoir-core,1.3,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting hachoir-core==1.3\n Downloading ht...,60.0,"[Command, exited, non-zero, status, code, ERRO...",95
4,xattr,0.7.7,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting xattr==0.7.7\n Downloading https:/...,60.0,"[Command, exited, non-zero, status, code, ERRO...",61
...,...,...,...,...,...,...,...,...,...,...,...,...
1619,geventhttpclient,1.2.0,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit status ...,Collecting geventhttpclient==1.2.0\n Download...,60.0,"[Command, exited, non-zero, status, code, ERRO...",58
1620,buildbot-waterfall-view,0.9.0b9,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting buildbot-waterfall-view==0.9.0b9\n ...,60.0,"[Command, exited, non-zero, status, code, ERRO...",49
1621,cryptography,0.2.1,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting cryptography==0.2.1\n Downloading ...,60.0,"[Command, exited, non-zero, status, code, ERRO...",61
1622,jsonnet,v0.8.7-pre2,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit status ...,Collecting jsonnet==v0.8.7-pre2\n Downloading...,60.0,"[Command, exited, non-zero, status, code, ERRO...",48


In [44]:
def in_cluster(all_cluster_labels, cluster_label, messages=None):
    """Return the messages whose cluster label equals ``cluster_label``.

    Parameters
    ----------
    all_cluster_labels : iterable
        Cluster label of every message, in message order.
    cluster_label : int
        Label of the cluster to select.
    messages : sequence or pandas.Series, optional
        Messages aligned positionally with ``all_cluster_labels``. When
        omitted, the ``'clean-messages'`` column of the notebook-level
        ``solver_total_errors_df`` is used, which is what this function
        always read before the parameter existed.

    Returns
    -------
    list
        The selected messages, in their original order.
    """
    if messages is None:
        messages = solver_total_errors_df['clean-messages'].values
    elif isinstance(messages, pd.Series):
        # Use positional access regardless of the Series index.
        messages = messages.values

    # The column is resolved once above instead of once per matching row.
    return [
        messages[position]
        for position, label in enumerate(all_cluster_labels)
        if label == cluster_label
    ]

In [45]:
# cluster_errors: cluster label -> set of error-like tokens seen in that cluster.
# unique_errors:  union of those tokens over all clusters.
cluster_errors = {}
unique_errors = set()

# Sorted so the dictionary is built in ascending cluster order.
for cluster_no in sorted(set(cluster_labels)):
    # A fresh set per cluster: storing the shared `unique_errors` object under
    # every key would make all clusters report the same global union.
    errors_in_cluster = {
        token
        for log_err in in_cluster(solver_total_errors_df['cluster'], int(cluster_no))
        for token in log_err
        if re.match(r".*Error.*", token)
    }
    cluster_errors[cluster_no] = errors_in_cluster
    unique_errors |= errors_in_cluster

In [46]:
# Show the cluster -> error-token-set mapping built above.
cluster_errors

{0: {'AttributeError', 'FileNotFoundError', 'NameError'},
 1: set(),
 2: set(),
 3: set(),
 4: {'FileNotFoundError', 'KeyError', 'PermissionError', 'RuntimeError'},
 5: {'RuntimeError'},
 6: {'FileNotFoundError'},
 7: {'AttributeError'},
 8: set(),
 9: {'ModuleNotFoundError'},
 10: set(),
 11: set(),
 12: {'CalledProcessError',
  'DistutilsError',
  'EOFError',
  'ImportError',
  'RuntimeError'},
 13: {'SyntaxError'},
 14: set(),
 15: set(),
 16: {'Error'},
 17: {'Error', 'HTTPError', 'ImportError', 'urllib.error.HTTPError'},
 18: set(),
 19: {'CCompilerError',
  'DistutilsPlatformError',
  'IOError',
  'ImportError',
  'OSError',
  'SyntaxError'},
 20: set(),
 21: {'AttributeError', 'ModuleNotFoundError', 'ValueError'},
 22: set(),
 23: set(),
 24: set(),
 25: {'RuntimeError'},
 26: {'CalledProcessError', 'DistutilsError'},
 27: set(),
 28: set(),
 29: {'ASNLookupError',
  'AttributeError',
  'HostLookupError',
  'IPDefinedError',
  'ImportError',
  'KeyError',
  'ModuleNotFoundError'

In [37]:
# Show every error-like token collected across all clusters.
unique_errors

{'APIError',
 'ASNLookupError',
 'AssertionError',
 'AttributeError',
 'CCompilerError',
 'CalledProcessError',
 'CompileError',
 'ConfigurationError',
 'DistutilsError',
 'DistutilsPlatformError',
 'EOFError',
 'EnvironmentError',
 'Error',
 'FileNotFoundError',
 'HTTPError',
 'HostLookupError',
 'IOError',
 'IPDefinedError',
 'ImportError',
 'InvalidRequestError',
 'KeyError',
 'ModuleNotFoundError',
 'NameError',
 'NotImplementedError',
 'OSError',
 'OperationFailedError',
 'PaymentError',
 'PermissionError',
 'RuntimeError',
 'SchemaError',
 'SyntaxError',
 'TabError',
 'TypeError',
 'UnicodeDecodeError',
 'ValidationError',
 'ValueError',
 'VerificationError',
 'WhoisLookupError',
 'builder.BuildError',
 'cffi.VerificationError',
 'commands.CommandError',
 'exitIfError',
 'urllib.error.HTTPError',
 'zipimport.ZipImportError'}