In [1]:
import pandas as pd
import multiprocessing
import pickle
import numpy as np
import difflib
import regex as re

from math import sqrt
from time import time
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
from sklearn.neighbors import NearestNeighbors
from gensim.models import Word2Vec
from kneed import KneeLocator

In [2]:
# Number of CPUs reported by the OS; passed to Word2Vec as the worker count.
cpu_number = multiprocessing.cpu_count()

# Word2Vec context window size.
w2v_window= 7

In [3]:
# Load the solver error table from CSV.
solver_total_errors_df= pd.read_csv('solver-error-clean-data.csv')

# Load the pre-tokenized clustering data (displayed below as a Series of token lists).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open("solver-errors-clean-clustering-data.txt", "rb") as fp:
    clean_clustering_data = pickle.load(fp)

In [4]:
clean_clustering_data

0       [SyntaxError, Missing, parentheses, in, call, ...
1       [FileNotFoundError, Errno, No, such, file, or,...
2       [raise, RuntimeError, Broken, toolchain, can, ...
3                          [SyntaxError, invalid, syntax]
4       [distutils, errors, DistutilsExecError, comman...
                              ...                        
1619    [ERROR, Failed, building, wheel, for, geventht...
1620    [ModuleNotFoundError, No, module, named, build...
1621    [distutils, errors, DistutilsExecError, comman...
1622    [ERROR, Failed, building, wheel, for, jsonnet,...
1623                       [SyntaxError, invalid, syntax]
Name: clean_clustering_data, Length: 1624, dtype: object

In [44]:
def detect_embedding_size(tokens, max_size=400):
    """
    Choose an embedding dimension from the size of the vocabulary.

    The size is the number of distinct tokens raised to the power 2/3,
    rounded to the nearest integer and capped at ``max_size``.

    :param tokens: iterable of token sequences (one sequence per message)
    :param max_size: upper bound for the returned size (default 400,
                     the cap that used to be hard-coded)
    :return: embedding size as an int (0 when there are no tokens)
    """
    # Build the vocabulary directly; a flattened intermediate list is not needed.
    vocab = {token for row in tokens for token in row}
    return min(round(len(vocab) ** (2 / 3)), max_size)

# Derive the Word2Vec embedding dimension from the vocabulary of the loaded token lists.
w2v_size = detect_embedding_size(clean_clustering_data)
w2v_size

153

In [6]:
def tokens_vectorization(clustering_data, w2v_size, w2v_window, cpu_number, model_name, iterations=100):
    """
    Train a Word2Vec model on the tokenized messages and save it to disk.

    :param clustering_data: iterable of token lists (one list per message)
    :param w2v_size: dimensionality of the word vectors
    :param w2v_window: Word2Vec context window size
    :param cpu_number: number of worker threads used for training
    :param model_name: path the trained model is saved to
    :param iterations: number of training passes over the data (default 100,
                       the value that used to be hard-coded)
    :return: the trained Word2Vec model
    """
    # Local import: only needed to choose the keyword names for the installed release.
    import gensim

    # gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`;
    # the old names raise TypeError there, the new ones do on gensim 3.
    if int(gensim.__version__.split('.')[0]) >= 4:
        version_kwargs = {'vector_size': w2v_size, 'epochs': iterations}
    else:
        version_kwargs = {'size': w2v_size, 'iter': iterations}

    word2vec = Word2Vec(clustering_data,
                        window=w2v_window,
                        min_count=1,  # keep every token, even ones seen only once
                        workers=cpu_number,
                        **version_kwargs)
    word2vec.save(model_name)
    return word2vec

In [7]:
# Train Word2Vec on the token lists and save the model to 'word2vec.model'.
word2vec = tokens_vectorization(clean_clustering_data, 
                                 w2v_size = w2v_size, 
                                 w2v_window= w2v_window, 
                                 cpu_number = cpu_number, 
                                 model_name='word2vec.model')

In [8]:
def sentence_vectorization(clustering_data, word2vec):
    """
    Turn every tokenized message into one vector by averaging its word vectors.

    Words the model does not know are skipped. A message with no known words
    (including an empty message) is represented by a zero vector, so that every
    row has the same width and the result can be stacked into one matrix.

    :param clustering_data: iterable of token lists (one list per message)
    :param word2vec: trained Word2Vec model, or any mapping from word to vector
                     that raises KeyError for unknown words
    :return: 2-D numpy array with one row per message
    """
    # Look words up through `.wv` when the object has it: indexing the model
    # itself is deprecated in gensim 3 and removed in gensim 4.
    vectors = getattr(word2vec, 'wv', word2vec)

    sentence_means = []
    for sent in clustering_data:
        total = None
        known_words = 0
        for w in sent:
            try:
                vec = vectors[w]
            except KeyError:
                # Out-of-vocabulary word: leave it out of the average.
                continue
            total = vec if total is None else np.add(total, vec)
            known_words += 1
        sentence_means.append(None if total is None else np.asarray(total) / known_words)

    # Zero row used for messages without any known word; shaped like a real row
    # when one exists, otherwise sized from the model's vector_size if available.
    template = next((mean for mean in sentence_means if mean is not None), None)
    if template is not None:
        filler = np.zeros_like(template)
    else:
        filler = np.zeros(getattr(vectors, 'vector_size', 0))

    if not sentence_means:
        # np.vstack rejects an empty list; return an empty matrix instead.
        return np.empty((0, filler.shape[0]))
    return np.vstack([filler if mean is None else mean for mean in sentence_means])

In [9]:
# Build one averaged word-vector row per tokenized message.
sent2vec = sentence_vectorization(clean_clustering_data, word2vec)

  


In [10]:
sent2vec.shape

(1624, 153)

In [11]:
def kneighbors(sent2vec):
    """
    Compute averaged nearest-neighbour distances for the sentence vectors.

    The number of neighbours is the rounded square root of the number of rows.
    The neighbour-distance matrix is sorted along axis 0 (each column
    independently) and every resulting row is reduced to its mean.

    :param sent2vec: 2-D array with one sentence vector per row
    :return: list with one mean distance per row of the sorted matrix
    """
    n_neighbors = round(sqrt(len(sent2vec)))
    fitted = NearestNeighbors(n_neighbors=n_neighbors).fit(sent2vec)
    # The neighbour indices are not needed, only the distances.
    neighbor_distances, _ = fitted.kneighbors(sent2vec)
    column_sorted = np.sort(neighbor_distances, axis=0)
    averaged = []
    for sorted_row in column_sorted:
        averaged.append(np.mean(sorted_row))
    return averaged

# Averaged neighbour distances; used below to search for the clustering threshold.
avg_distances = kneighbors(sent2vec)

In [12]:
def epsilon_search(distances):
    """
    Pick a distance threshold from the elbows KneeLocator finds.

    KneeLocator is given the distances as x values and their positions
    (0..len-1) as y values.

    :param distances: list of distances
    :return: the largest elbow reported by KneeLocator, or 1 when none is found
    """
    positions = list(range(len(distances)))
    elbows = KneeLocator(distances, positions).all_elbows
    if len(elbows) > 0:
        return max(elbows)
    # No elbow detected: fall back to a threshold of 1.
    return 1


def hierarchical(epsilon, sent2vec):
    """
    Cluster the sentence vectors with agglomerative clustering.

    The number of clusters is not fixed (``n_clusters=None``); ``epsilon`` is
    passed as the ``distance_threshold`` instead.

    :param epsilon: distance threshold handed to AgglomerativeClustering
    :param sent2vec: 2-D array with one sentence vector per row
    :return: array with one cluster label per row
    """
    model = AgglomerativeClustering(n_clusters=None, distance_threshold=epsilon)
    return model.fit_predict(sent2vec)

In [13]:
# Pick the distance threshold from the averaged neighbour distances,
# then cluster the sentence vectors with it.
epsilon = epsilon_search(avg_distances)
cluster_labels = hierarchical(epsilon, sent2vec)

In [14]:
cluster_labels

array([ 48,  20,  42, ...,  51,  23, 220])

In [15]:
# Attach the cluster labels to the error table.
# NOTE(review): assumes the table rows are in the same order as the sentence vectors — confirm.
solver_total_errors_df['cluster'] = cluster_labels

In [16]:
solver_total_errors_df.head(10)

Unnamed: 0,index,package_name,package_version,index_url,type,command,message,return_code,stderr,stdout,timeout,Error_info,command_info,cwd,Complete_output,ERROR,specific_error,clustering_data,clean_clustering_data,cluster
0,0,tryton,3.2.20,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting tryton==3.2.20\n Downloading https...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -c \'...,['cwd: /tmp/pip-install-_gczw75i/tryton/'],"['Complete output (6 lines):', 'Traceback (mos...",['ERROR: Command errored out with exit status ...,['SyntaxError: Missing parentheses in call to ...,['SyntaxError: Missing parentheses in call to ...,"['SyntaxError', 'Missing', 'parentheses', 'in'...",48
1,1,pyobjc-framework-contacts,4.0b1,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting pyobjc-framework-contacts==4.0b1\n ...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -c \'...,['cwd: /tmp/pip-install-t57p8sf3/pyobjc-framew...,"['Complete output (15 lines):', 'Traceback (mo...",['ERROR: Command errored out with exit status ...,"[""FileNotFoundError: [Errno 2] No such file or...","[""FileNotFoundError: [Errno 2] No such file or...","['FileNotFoundError', 'Errno', 'No', 'such', '...",20
2,2,numpy,1.14.0,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit status ...,Collecting numpy==1.14.0\n Downloading https:...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -u -c...,"['cwd: /tmp/pip-install-vul_9fm4/numpy/', 'cwd...","['Complete output (306 lines):', 'Running from...","['ERROR: Failed building wheel for numpy', 'ER...","['raise RuntimeError(""Broken toolchain: cannot...","['raise RuntimeError(""Broken toolchain: cannot...","['raise', 'RuntimeError', 'Broken', 'toolchain...",42
3,3,hachoir-core,1.3,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting hachoir-core==1.3\n Downloading ht...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -c \'...,['cwd: /tmp/pip-install-b_cqxm9t/hachoir-core/'],"['Complete output (6 lines):', 'Traceback (mos...",['ERROR: Command errored out with exit status ...,['SyntaxError: invalid syntax'],['SyntaxError: invalid syntax'],"['SyntaxError', 'invalid', 'syntax']",220
4,4,xattr,0.7.7,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting xattr==0.7.7\n Downloading https:/...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -c \'...,['cwd: /tmp/pip-install-x7gd4535/xattr/'],"['Complete output (102 lines):', ""WARNING: The...",['ERROR: Command errored out with exit status ...,"[""distutils.errors.DistutilsExecError: command...","[""distutils.errors.DistutilsExecError: command...","['distutils', 'errors', 'DistutilsExecError', ...",51
5,5,spacy,2.1.1.dev0,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit status ...,Collecting spacy==2.1.1.dev0\n Downloading ht...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 /home...,['cwd: /tmp/pip-install-43ioejjp/spacy'],"['Complete output (727 lines):', 'running bdis...","['ERROR: Failed building wheel for spacy', 'ER...",,"['ERROR: Failed building wheel for spacy', 'ER...","['ERROR', 'Failed', 'building', 'wheel', 'for'...",25
6,6,fastavro,0.16.5,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit status ...,Collecting fastavro==0.16.5\n Downloading htt...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -u -c...,"['cwd: /tmp/pip-install-k1e0u355/fastavro/', '...","['Complete output (26 lines):', 'running bdist...","['ERROR: Failed building wheel for fastavro', ...",,"['ERROR: Failed building wheel for fastavro', ...","['ERROR', 'Failed', 'building', 'wheel', 'for'...",65
7,7,pyeclib,1.6.0,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit status ...,Collecting pyeclib==1.6.0\n Downloading https...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -u -c...,"['cwd: /tmp/pip-install-t1z6_8hy/pyeclib/', 'c...","['Complete output (14 lines):', ""/usr/lib64/py...","['ERROR: Failed building wheel for pyeclib', '...",,"['ERROR: Failed building wheel for pyeclib', '...","['ERROR', 'Failed', 'building', 'wheel', 'for'...",55
8,8,happybase,0.8,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting happybase==0.8\n Downloading https...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -c \'...,['cwd: /tmp/pip-install-xr2ydtx4/happybase/'],"['Complete output (5 lines):', 'Traceback (mos...",['ERROR: Command errored out with exit status ...,"[""NameError: name 'execfile' is not defined""]","[""NameError: name 'execfile' is not defined""]","['NameError', 'name', 'execfile', 'is', 'not',...",109
9,9,mahotas,0.6.1,https://pypi.org/simple,command_error,/home/solver/venv/bin/python3 -m pip install -...,Command exited with non-zero status code (1): ...,1.0,ERROR: Command errored out with exit statu...,Collecting mahotas==0.6.1\n Downloading https...,60.0,Command exited with non-zero status code (1): ...,['command: /home/solver/venv/bin/python3 -c \'...,['cwd: /tmp/pip-install-biqp6olf/mahotas/'],"['Complete output (9 lines):', 'Traceback (mos...",['ERROR: Command errored out with exit status ...,"[""SyntaxError: Missing parentheses in call to ...","[""SyntaxError: Missing parentheses in call to ...","['SyntaxError', 'Missing', 'parentheses', 'in'...",131


In [35]:
def clustered_output(solver_total_errors_df, mode='INDEX', target=None):
    """
    Group the rows of the error table by their 'cluster' label.

    :param solver_total_errors_df: DataFrame with a 'cluster' column
    :param mode: what to collect for every cluster
        'ALL'     - list of full row records (dicts)
        'INDEX'   - list of row index values
        'TARGET'  - list of values of the column named by ``target``
        'CLEANED' - list of values of the 'clean_clustering_data' column
    :param target: column name used by mode='TARGET' (required for that mode)
    :return: dict mapping the cluster label (as a string) to the collected list
    :raises ValueError: for an unknown mode, or mode='TARGET' without ``target``
    """
    if mode not in ('ALL', 'INDEX', 'TARGET', 'CLEANED'):
        # An unknown mode used to yield an empty dict without any hint.
        raise ValueError("Unknown mode %r; expected 'ALL', 'INDEX', 'TARGET' or 'CLEANED'" % (mode,))
    if mode == 'TARGET' and target is None:
        # This branch used to read `self.target`, which does not exist in a
        # free function and raised NameError.
        raise ValueError("mode='TARGET' requires the `target` column name")

    groups = {}
    # Group by the bare column name: a one-element list makes recent pandas
    # return 1-tuples as keys, which would turn the labels into '(48,)'.
    for key, value in solver_total_errors_df.groupby('cluster'):
        if mode == 'ALL':
            groups[str(key)] = value.to_dict(orient='records')
        elif mode == 'INDEX':
            groups[str(key)] = value.index.values.tolist()
        elif mode == 'TARGET':
            groups[str(key)] = value[target].values.tolist()
        else:  # 'CLEANED'
            groups[str(key)] = value['clean_clustering_data'].values.tolist()
    return groups

In [36]:
def find_matching_blocks(strings):
    """
    Reduce a list of sequences to the pieces they have in common.

    Starting from the first element, every following element is compared with
    the running pattern using difflib.SequenceMatcher; the matching blocks are
    joined with ',' and become the new running pattern.

    :param strings: non-empty list of strings (or other sliceable sequences)
    :return: the first element unchanged when it is the only one; otherwise the
             comma-joined common blocks, or the text
             'NO COMMON PATTERNS HAVE BEEN FOUND' when nothing is shared
    """
    curr = strings[0]
    if len(strings) == 1:
        return curr

    for i in range(1, len(strings)):
        matches = difflib.SequenceMatcher(None, curr, strings[i])
        # get_matching_blocks() always ends with a zero-size terminator block;
        # skipping it avoids a dangling ',' that would also leak into the
        # next comparison.
        common = [curr[match.a:match.a + match.size]
                  for match in matches.get_matching_blocks()
                  if match.size > 0]
        curr = ','.join(str(v) for v in common)

    if curr == '':
        # The message used to be a bare expression statement and was never returned.
        curr = 'NO COMMON PATTERNS HAVE BEEN FOUND'
    return curr

def get_similarity(rows):
    """
    Score every row against the first row with difflib.SequenceMatcher.

    :param rows: list of sequences (e.g. strings)
    :return: list of similarity ratios scaled to 0-100, one per row; the first
             entry compares the first row with itself. Empty for empty input.
    """
    return [
        difflib.SequenceMatcher(None, rows[0], candidate).ratio() * 100
        for candidate in rows
    ]

In [37]:
# Column order of the per-cluster statistics table.
STATISTICS = [
    "cluster_name",
    "cluster_size",
    "pattern",
    "mean_length",
    "mean_similarity",
    "std_length",
    "std_similarity",
]


def statistics(solver_total_errors_df, output_mode='frame'):
    """
    Build one row of summary statistics per cluster.

    Columns of the result:
    "cluster_name"    - cluster label (as a string)
    "cluster_size"    - number of messages in the cluster
    "pattern"         - common parts of the messages (find_matching_blocks)
    "mean_length"     - mean len() of the messages in the cluster
    "mean_similarity" - mean similarity of the messages to the first message
                        of the cluster (difflib ratio scaled to 0-100)
    "std_length"      - standard deviation of the lengths (0 for one message)
    "std_similarity"  - standard deviation of the similarities

    :param solver_total_errors_df: DataFrame with 'cluster' and
                                   'clean_clustering_data' columns
    :param output_mode: 'frame' returns a DataFrame; any other value returns
                        a list of row dicts
    :return: statistics rounded to 2 decimals, largest clusters first
    """
    messages_by_cluster = clustered_output(solver_total_errors_df, mode='CLEANED')

    table_rows = []
    for cluster_name, messages in messages_by_cluster.items():
        pattern = find_matching_blocks(messages)
        lengths = [len(message) for message in messages]
        similarities = get_similarity(messages)
        if len(messages) > 1:
            length_spread = np.std(lengths)
        else:
            length_spread = 0
        table_rows.append([
            cluster_name,
            len(messages),
            pattern,
            np.mean(lengths),
            np.mean(similarities),
            length_spread,
            np.std(similarities),
        ])

    summary = pd.DataFrame(table_rows, columns=STATISTICS)
    summary = summary.round(2)
    summary = summary.sort_values(by='cluster_size', ascending=False)
    if output_mode != 'frame':
        return summary.to_dict(orient='records')
    return summary

In [38]:
# Build the per-cluster statistics table (returned as a DataFrame).
stat = statistics(solver_total_errors_df, output_mode='frame')

In [39]:
# NOTE(review): `stat` is already a DataFrame when output_mode='frame',
# so this conversion looks redundant — confirm before removing.
stat_df = pd.DataFrame.from_dict(stat)

In [40]:
stat_df.sort_values(by='mean_similarity')

Unnamed: 0,cluster_name,cluster_size,pattern,mean_length,mean_similarity,std_length,std_similarity
76,76,13,"['ERROR', 'Failed', 'building', 'wheel', 'for'...",649.08,43.19,7.30,20.95
46,46,11,"['ERROR', 'Failed', 'building', 'wheel', 'for'...",657.55,48.07,11.10,24.10
55,55,13,"['ERROR', 'Failed', 'building', 'wheel', 'for'...",642.85,48.89,10.65,23.85
15,15,16,"['ERROR', 'Failed', 'building', 'wheel', 'for'...",651.75,51.23,10.55,16.89
50,50,7,"['ERROR', 'Failed', 'building', 'wheel', 'for'...",651.00,54.09,8.16,22.87
...,...,...,...,...,...,...,...
127,127,2,"['ModuleNotFoundError', 'No', 'module', 'named...",146.00,100.00,0.00,0.00
82,82,2,"['SyntaxError', 'Missing', 'parentheses', 'in'...",174.00,100.00,0.00,0.00
95,95,2,"['raise', 'RuntimeError', 'message', 'RuntimeE...",52.00,100.00,0.00,0.00
184,184,2,"['KeyError', 'getpwuid', 'uid', 'not', 'found'],",47.00,100.00,0.00,0.00


In [45]:
def in_cluster(all_cluster_labels, cluster_label, errors_df=None):
    """
    Collect the cleaned clustering data of every row that belongs to a cluster.

    Labels are matched to rows by position: the i-th label belongs to the
    i-th value of the 'clean_clustering_data' column.

    :param all_cluster_labels: iterable with one cluster label per table row
    :param cluster_label: the label to select
    :param errors_df: DataFrame with a 'clean_clustering_data' column; when
                      omitted, the notebook-level ``solver_total_errors_df``
                      is used (the previous, implicit behaviour)
    :return: list of 'clean_clustering_data' values of the selected rows
    """
    if errors_df is None:
        # Backward-compatible default: fall back to the notebook global.
        errors_df = solver_total_errors_df
    # Look the column up once instead of once per matching row.
    cleaned = errors_df['clean_clustering_data'].values
    return [cleaned[idx]
            for idx, label in enumerate(all_cluster_labels)
            if label == cluster_label]

In [54]:
in_cluster(solver_total_errors_df['cluster'], 88)

["['ModuleNotFoundError', 'No', 'module', 'named', 'Cython', 'raise', 'RuntimeError', 'No', 'cython', 'installed', 'Please', 'run', 'pip', 'install', 'cython', 'RuntimeError', 'No', 'cython', 'installed', 'Please', 'run', 'pip', 'install', 'cython']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'Cython', 'raise', 'RuntimeError', 'No', 'cython', 'installed', 'Please', 'run', 'pip', 'install', 'cython', 'RuntimeError', 'No', 'cython', 'installed', 'Please', 'run', 'pip', 'install', 'cython']"]

In [47]:
in_cluster(solver_total_errors_df['cluster'], 1)

["['ERROR', 'Failed', 'building', 'wheel', 'for', 'guillotina', 'ERROR', 'Command', 'errored', 'out', 'with', 'exit', 'status', 'ERROR', 'Command', 'errored', 'out', 'with', 'exit', 'status', 'import', 'sys', 'setuptools', 'tokenize', 'sys', 'argv', 'tmp/pip-install', 'vqp', 'guillotina/setup', 'py', 'file', 'tmp/pip-install', 'vqp', 'guillotina/setup', 'py', 'getattr', 'tokenize', 'open', 'open', 'file', 'code', 'read', 'replace', 'close', 'exec', 'compile', 'code', 'file', 'exec', 'install', 'record', 'tmp/pip-record', 'lefg', 'install-record', 'txt', 'compile', 'install-headers', 'guillotina', 'Check', 'the', 'logs', 'for', 'full', 'command', 'output']",
 "['ERROR', 'Failed', 'building', 'wheel', 'for', 'atari', 'py', 'ERROR', 'Command', 'errored', 'out', 'with', 'exit', 'status', 'ERROR', 'Command', 'errored', 'out', 'with', 'exit', 'status', 'import', 'sys', 'setuptools', 'tokenize', 'sys', 'argv', 'tmp/pip-install', 'xkdx', 'atari-py/setup', 'py', 'file', 'tmp/pip-install', 'xkdx

In [48]:
in_cluster(solver_total_errors_df['cluster'], 6)

["['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'click']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'ply', 'ModuleNotFoundError', 'No', 'module', 'named', 'ply']",
 "['ModuleNotFoundError', 'No', 'module', 'named', 'ply', 'ModuleNotFoundError', 'No', 'module', 'named', 'ply']"]

In [49]:
in_cluster(solver_total_errors_df['cluster'], 3)

["['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'mutagen']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'deap']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'vobject']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'robotframework']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'matplotlib']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'potsdb']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'potsdb']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'pathmatch']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'pathmatch']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'newrelic']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'pylint']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'asset']",
 "['ERROR', 'No', 'matching', 'distribution', 'found', 'for', 'deap']",
 "['ERROR', 'No', 'ma

In [50]:
in_cluster(solver_total_errors_df['cluster'], 22)

["['except', 'OSError', 'ex', 'SyntaxError', 'invalid', 'syntax']",
 "['except', 'OSError', 'IOError', 'err', 'SyntaxError', 'invalid', 'syntax']"]

In [51]:
in_cluster(solver_total_errors_df['cluster'], 44)

["['raise', 'ImportError', 'ImportError', 'raise', 'HTTPError', 'req', 'full_url', 'code', 'msg', 'hdrs', 'fp', 'urllib', 'error', 'HTTPError', 'HTTP', 'Error', 'SSL', 'is', 'required']",
 "['raise', 'ImportError', 'ImportError', 'raise', 'HTTPError', 'req', 'full_url', 'code', 'msg', 'hdrs', 'fp', 'urllib', 'error', 'HTTPError', 'HTTP', 'Error', 'SSL', 'is', 'required']",
 "['raise', 'ImportError', 'ImportError', 'raise', 'HTTPError', 'req', 'full_url', 'code', 'msg', 'hdrs', 'fp', 'urllib', 'error', 'HTTPError', 'HTTP', 'Error', 'SSL', 'is', 'required']",
 "['raise', 'ImportError', 'ImportError', 'raise', 'HTTPError', 'req', 'full_url', 'code', 'msg', 'hdrs', 'fp', 'urllib', 'error', 'HTTPError', 'HTTP', 'Error', 'SSL', 'is', 'required']",
 "['raise', 'ImportError', 'ImportError', 'raise', 'HTTPError', 'req', 'full_url', 'code', 'msg', 'hdrs', 'fp', 'urllib', 'error', 'HTTPError', 'HTTP', 'Error', 'SSL', 'is', 'required']",
 "['raise', 'ImportError', 'ImportError', 'raise', 'HTTPErro