In [7]:
from typing import *
import random
import h5py
import json

# TypedDict for Ciena's log data
class CienaLogData(TypedDict):
    """
    - dup_id: str, String that is the same for bugs that are the same
    - event_id: str, Unique string per bug_id
    - group_id: str, Group to put logs related together, 'true' class (found by CIENA) of algorithm
    - line_num: str, Plan id of the log: with log_name constitute the build_log
    - planid: str, Plan id of the log: with log_name constitute the build_log
    - log_name: str, Second part to make the build log
    - raw: str, Raw text of the bug
    - template: str, Template found by CIENA
    - variables: List[str], Variables found with the template of CIENA
    """

    dup_id: str
    event_id: str
    group_id: str
    line_num: str
    planid: str
    log_name: str
    raw: str
    template: str
    text: str
    variables: List[str]

EventId = str
SplitDict = Dict[str, List[EventId]]

# Create dict of splits
with open("CienaFiles/splitted_event_ids.json") as fp:
    splits: SplitDict = json.load(fp)
    print(type(splits)
    )

<class 'dict'>


In [47]:
# Runs over all the given file from Ciena

with h5py.File("CienaFiles/trat3_production_1650_1700_20231411_v1.hdf5") as fp:
    for logfile_name, list_event_ids in splits.items():
        for event_id in list_event_ids:
            dico_attrs: CienaLogData = {**fp[event_id].attrs}

In [13]:
## Gets random items from a given number of splits

log_list = []
num_splits = 1

for i in range(num_splits):
    logfile_names, event_list = random.choice(list(splits.items()))
    with h5py.File("CienaFiles/trat3_production_1650_1700_20231411_v1.hdf5") as fp:
        for event_id in event_list:
            dico_attrs: CienaLogData = {**fp[event_id].attrs}
            log_list.append(dico_attrs['template'])

print(log_list)

['fail', 'fail', 'cp cannot stat no such file or directory', 'rm cannot remove no such file or directory', 'error declaration of c function conflicts with', 'error previous declaration here', 'tput no value for term and no t specified', 'tput no value for term and no t specified', 'cat no such file or directory', 'cat no such file or directory', 'tput no value for term and no t specified', 'compile failed using', 'cat no such file or directory', 'error previous declaration here', 'mv cannot stat no such file or directory', 'error declaration of c function conflicts with', 'cp cannot stat no such file or directory', 'tput no value for term and no t specified', 'cp cannot stat no such file or directory', 'invalid suffix on integer constant', 'tput no value for term and no t specified', 'cp cannot stat no such file or directory', 'cp cannot stat no such file or directory', 'cat no such file or directory', 'cp cannot stat no such file or directory', 'tput no value for term and no t specifi

In [1]:
import scipy as sp

mat = sp.sparse.random(5, 5, density=0.25)

In [6]:
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

print(pairwise_distances(np.asarray(mat.todense()), metric='jaccard'))

[[0.         0.66666667 1.         1.         0.66666667]
 [0.66666667 0.         1.         1.         0.        ]
 [1.         1.         0.         1.         1.        ]
 [1.         1.         1.         0.         1.        ]
 [0.66666667 0.         1.         1.         0.        ]]




In [7]:
# Code for Jaccard Distance
def jaccard_distance(mat):
    rows_sum = mat.getnnz(axis=1)
    ab = mat * mat.T

    # For rows
    aa = np.repeat(rows_sum, ab.getnnz(axis=1))
    # For columns
    bb = rows_sum[ab.indices]

    # Calculates Jaccard similarity
    similarities = ab.copy()
    similarities.data = similarities.data/(aa + bb - ab.data)
    

    # Calculates Jaccard distance
    distance = 1 - similarities.todense()

    return distance

print(jaccard_distance(mat))

[[0.73339976 0.98318573 1.         1.         0.77182262]
 [0.98318573 0.99741624 1.         1.         0.97018277]
 [1.         1.         1.         1.         1.        ]
 [1.         1.         1.         0.75389868 1.        ]
 [0.77182262 0.97018277 1.         1.         0.51786284]]
