In [14]:
## Primeira etapa - receber logs
log1 = "make deploy http://www.google.com"
log2 = "error http://www.google.com"
log3 = "make deploy http://www.microsoft.com"
log4 = "error http://www.microsoft.com"
log5 = "make deploy https://www.apple.com"
log6 = "error https://www.apple.com"

In [None]:
## Segunda etapa - identificar palavras em inglês? (facultativo - ver outro ipynb)

In [15]:
## Terceira etapa - vetorizar via Transformers

## Importa Sentence Transformers e compila o modelo
from sentence_transformers import SentenceTransformer
#model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('all-mpnet-base-v2')

## Junta as frases
sentences = [log1, log2, log3, log4, log5, log6]

## Faz encode nas frases
embeddings = model.encode(sentences)

## Printa os embeddings
'''
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
'''

## Verifica características dos embeddings
'''
print(type(embeddings[0]))
print(len(embeddings[0]))
print(embeddings[0].ndim)
'''

'\nprint(type(embeddings[0]))\nprint(len(embeddings[0]))\nprint(embeddings[0].ndim)\n'

In [19]:
## Clusteriza com UMAP e DBSCAN
import hdbscan
import umap

# Referência DBSCAN: https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html
# Referênia UMAP: https://umap-learn.readthedocs.io/en/latest/

## Parametriza clusterizador HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=2,
              min_samples=1,
              metric='euclidean',
              allow_single_cluster=False,                  
              cluster_selection_method='eom')

## Parametriza redutor UMAP
reducer = umap.UMAP(n_neighbors=2, n_components=1, spread=0.5, min_dist=0.0, metric='cosine')

## Roda clustering com UMAP
umap_data = reducer.fit_transform(embeddings)
hdb = clusterer.fit(umap_data)
## Roda clustering sem UMAP
#hdb = clusterer.fit(embeddings)

In [21]:
## Clusteriza com SKLearn
from sklearn.cluster import AgglomerativeClustering
import numpy as np

sk_clusterer = AgglomerativeClustering().fit(embeddings)
print(sk_clusterer)
print(sk_clusterer.labels_)

AgglomerativeClustering()
[1 0 1 0 1 0]


In [51]:
## Clusteriza com K-Means

from sklearn.cluster import KMeans

## Clusteriza com K-Means
num_clusters = 2
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(sentences[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['make deploy http://www.google.com', 'make deploy http://www.microsoft.com', 'make deploy https://www.apple.com']

Cluster  2
['error http://www.google.com', 'error http://www.microsoft.com', 'error https://www.apple.com']



In [20]:
## Imprime labels dos clusters
clusterer.labels_

## Acha número máximo de clusters
#clusterer.labels_.max()


array([0, 1, 0, 1, 0, 1], dtype=int64)

In [None]:
## Primeira hipótese: usar Prefix Tree para identificar campos comuns e diferentes

In [34]:
## Segunda hipótese: usar LCS, depois verificar tokens comuns - implementação com programação dinâmica de LCS - complexidade O(n*m)

def lcs(x, y):
  # Acha tamanho das strings
  m = len(x)
  n = len(y)

  # Declara array para guardar os valores da programação dinâmica
  vals = [[None]*(n + 1) for _ in range(m + 1)]

  # Compara strings em ordem bottom-up
  for i in range(m + 1):
    for j in range(n + 1):
      if i == 0 or j == 0 :
        vals[i][j] = 0
      elif x[i-1] == y[j-1]:
        vals[i][j] = vals[i-1][j-1]+1
      else:
        vals[i][j] = max(vals[i-1][j], vals[i][j-1])

  # vals[m][n] vai conter o tamanho do LCS entre x[0..n-1] e y[0..m-1]
  return vals[m][n]

# Teste de LCs
print("O tamanho do LCS é", lcs(log1, log3))

O tamanho do LCS é 29


In [35]:
## Métodos de tokenização

teste = "017-09-26 12:40:15, INFO impl.FsDatasetImpl - Time taken to scan block pool BP-805143380 on /home/data3/current 30ms"

## Teste usando split
print ("Teste usando split")
print(teste.split())

## Teste usando NLTK
print ("Teste usando NLTK")
from nltk.tokenize import word_tokenize
print(word_tokenize(teste))

## Teste usando Gensim
print ("Teste usando Gensim")
from gensim.utils import tokenize
print(list(tokenize(teste)))

## Teste usando Spacy
print ("Teste usando Spacy")
import spacy
nlp = spacy.load("en_core_web_trf")
doc = nlp(teste)
tokens = [token.text for token in doc]
print(tokens)

Teste usando split
['017-09-26', '12:40:15,', 'INFO', 'impl.FsDatasetImpl', '-', 'Time', 'taken', 'to', 'scan', 'block', 'pool', 'BP-805143380', 'on', '/home/data3/current', '30ms']
Teste usando NLTK
['017-09-26', '12:40:15', ',', 'INFO', 'impl.FsDatasetImpl', '-', 'Time', 'taken', 'to', 'scan', 'block', 'pool', 'BP-805143380', 'on', '/home/data3/current', '30ms']
Teste usando Gensim
['INFO', 'impl', 'FsDatasetImpl', 'Time', 'taken', 'to', 'scan', 'block', 'pool', 'BP', 'on', 'home', 'data', 'current', 'ms']
Teste usando Spacy




['017', '-', '09', '-', '26', '12:40:15', ',', 'INFO', 'impl', '.', 'FsDatasetImpl', '-', 'Time', 'taken', 'to', 'scan', 'block', 'pool', 'BP-805143380', 'on', '/home', '/', 'data3', '/', 'current', '30ms']


In [49]:
## Compara duas strings para tokens similares - não leva em conta posição

## Usando NLTK
from nltk.tokenize import word_tokenize

def compare_strings_as_list(x,y):
    tokens_x = word_tokenize(x)
    tokens_y = word_tokenize(y)
    compared = []
    has_token = False

    for i in tokens_x:
        for j in tokens_y:
            if i == j:
                has_token = True
        if has_token == True:
            compared.append(i)
        else:
            compared.append("VARIABLE")
        has_token = False
    
    return compared

#print(word_tokenize(log1))
#print(word_tokenize(log3))
#print(compare_strings_as_list(log1, log3))

def compare_strings_as_dict(x,y):
    tokens_x = word_tokenize(x)
    tokens_y = word_tokenize(y)
    compared = {}
    has_token = False

    for i in tokens_x:
        for j in tokens_y:
            if i == j:
                has_token = True
        compared[i] = "STATIC" if has_token == True else "VARIABLE"
        has_token = False

    return compared

print(word_tokenize(log1))
print(word_tokenize(log3))
print(compare_strings_as_dict(log1, log3))

['make', 'deploy', 'http', ':', '//www.google.com']
['make', 'deploy', 'http', ':', '//www.microsoft.com']
{'make': 'STATIC', 'deploy': 'STATIC', 'http': 'STATIC', ':': 'STATIC', '//www.google.com': 'VARIABLE'}
