In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from causallearn.search.ConstraintBased.PC import pc
from causallearn.utils.cit import chisq
import numpy as np

def read_log_lines(log_path):
    """Return the non-blank lines of a text file with whitespace stripped.

    Args:
        log_path: Path of the log file; it is opened in text mode.

    Returns:
        A list with one entry per line that still has content after
        ``str.strip()``, in file order. Whitespace-only lines are dropped.
    """
    kept = []
    with open(log_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                kept.append(text)
    return kept

def build_windowed_dataset(log_lines, window_size=5):
    """Convert a sequence of log lines into a binary window-by-event matrix.

    A window of ``window_size`` consecutive lines slides over ``log_lines``
    one line at a time. Each window becomes one row and each distinct line
    becomes one column, holding 1 when that line occurs in the window.

    Args:
        log_lines: Sequence of log lines (hashable values, e.g. strings).
        window_size: Number of consecutive lines per window; must be >= 1
            and no larger than ``len(log_lines)``.

    Returns:
        A tuple ``(df, classes)`` where ``df`` is the binary DataFrame and
        ``classes`` is ``MultiLabelBinarizer.classes_`` (the column labels).

    Raises:
        ValueError: If ``window_size`` is below 1 or there are fewer lines
            than ``window_size``. Without this check the function would
            return an empty or zero-column matrix that only fails later.
    """
    if window_size < 1:
        raise ValueError(f"window_size deve ser >= 1 (recebido: {window_size}).")

    n_windows = len(log_lines) - window_size + 1
    if n_windows < 1:
        raise ValueError(
            f"Log com {len(log_lines)} linhas é insuficiente para janelas "
            f"de tamanho {window_size}."
        )

    # Sets drop repeats inside a window: only presence/absence matters.
    windows = [
        set(log_lines[start:start + window_size])
        for start in range(n_windows)
    ]

    mlb = MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(windows)
    df = pd.DataFrame(binary_matrix, columns=mlb.classes_)
    return df, mlb.classes_

def run_pc_and_export_csv(df, variable_names, output_path="pc_result.csv", alpha=0.01):
    """Run the PC algorithm on ``df`` and save the directed edges as CSV.

    Args:
        df: DataFrame whose columns are the variables; values are cast to int.
        variable_names: Labels for the columns of ``df``, indexed by position.
        output_path: Destination of the CSV (columns ``source``, ``target``).
            Missing parent directories are created.
        alpha: Forwarded to ``pc`` as the threshold of its chi-square
            independence tests.

    Returns:
        DataFrame with one row per directed edge ``source -> target``.
    """
    import os  # local import keeps the block self-contained

    X = df.to_numpy().astype(int)
    cg = pc(data=X, alpha=alpha, indep_test="chisq", uc_rule=0, verbose=False)

    # causal-learn encoding: graph[i, j] == -1 and graph[j, i] == 1 means i -> j.
    adjacency = cg.G.graph
    n_vars = len(variable_names)
    edges = [
        (variable_names[i], variable_names[j])
        for i in range(n_vars)
        for j in range(n_vars)
        if adjacency[i, j] == -1 and adjacency[j, i] == 1
    ]

    df_edges = pd.DataFrame(edges, columns=["source", "target"])

    # to_csv does not create directories; do it here so a path such as
    # "resultadosPC/causal_relations.csv" works on a fresh checkout.
    # File-like targets (also accepted by to_csv) are left alone.
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df_edges.to_csv(output_path, index=False)
    print(f"Arquivo salvo em: {output_path} ({len(df_edges)} relações causais)")
    return df_edges


# === Configuration ===
# Input log file, read line by line by read_log_lines().
log_path = "logs/logs_teste.log"  # example: "/home/user/meu_log.log"
# Number of consecutive log lines grouped into each sliding window.
window_size = 5
# Passed through to pc() as its alpha (threshold of the independence tests).
alpha = 0.01
# Destination of the CSV listing the directed edges found.
output_path = "resultadosPC/causal_relations.csv"

# === Processing ===
# Pipeline: raw lines -> binary window/event matrix -> PC algorithm -> CSV.
log_lines = read_log_lines(log_path)
df, variable_names = build_windowed_dataset(log_lines, window_size)
df_result = run_pc_and_export_csv(df, variable_names, output_path, alpha=alpha)

# Show the first rows (rendered as the cell output when run in a notebook)
df_result.head()





In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from causallearn.search.ConstraintBased.PC import pc
from causallearn.utils.cit import chisq
from collections import Counter
import numpy as np

def read_log_lines(log_path):
    """Load a log file as a list of stripped, non-blank lines.

    Args:
        log_path: Path of the log file; it is opened in text mode.

    Returns:
        The file's lines in order, each passed through ``str.strip()``;
        lines that are empty after stripping are omitted.
    """
    with open(log_path, 'r') as stream:
        # Consumed inside the ``with`` block, so the file is still open.
        stripped = map(str.strip, stream)
        return [entry for entry in stripped if entry]

def choose_window_size(log_lines, frequent_templates, min_samples_per_feature=5, max_window=30):
    """Pick the largest window size that still leaves enough windows.

    Candidate sizes are tried from ``max_window`` down to 2. A size is
    accepted when the number of sliding windows it yields
    (``len(log_lines) - size + 1``) is at least
    ``min_samples_per_feature * len(frequent_templates)``.

    Args:
        log_lines: Sequence of log lines the windows will slide over.
        frequent_templates: Collection of distinct events (the features).
        min_samples_per_feature: Required number of windows per feature.
        max_window: Largest window size to try.

    Returns:
        The first accepted size, or 2 when no candidate qualifies.

    Raises:
        ValueError: If ``frequent_templates`` is empty.
    """
    feature_count = len(frequent_templates)
    if not feature_count:
        raise ValueError("Nenhum evento com frequência suficiente.")

    total_lines = len(log_lines)
    required_windows = min_samples_per_feature * feature_count

    candidates = (
        size
        for size in range(max_window, 1, -1)
        if total_lines - size + 1 >= required_windows
    )
    # Fall back to the smallest window when nothing meets the requirement.
    return next(candidates, 2)

def build_windowed_dataset(log_lines, window_size=None, min_freq=5):
    """Build a binary window-by-event matrix from the frequent log lines.

    Lines occurring fewer than ``min_freq`` times are discarded. A window
    then slides over the remaining lines one line at a time; each window
    becomes a row and each distinct line a column holding 1 when the line
    occurs in that window.

    Args:
        log_lines: Sequence of log lines (hashable values, e.g. strings).
        window_size: Lines per window. When ``None`` it is selected by
            ``choose_window_size`` from the filtered lines.
        min_freq: Minimum number of occurrences for a line to be kept.

    Returns:
        A tuple ``(df, classes, window_size, n_samples, n_features, ratio,
        confidence)``: the binary DataFrame, its column labels
        (``MultiLabelBinarizer.classes_``), the window size used, the row
        and column counts, their ratio, and a label derived from the ratio:
        ``"ALTA"`` (>= 10), ``"MODERADA"`` (>= 5) or ``"BAIXA"``.

    Raises:
        ValueError: If no line reaches ``min_freq``, if ``window_size`` is
            below 1, or if fewer filtered lines than ``window_size`` remain.
            The last two would otherwise yield an empty or zero-column
            matrix that only fails later.
    """
    counts = Counter(log_lines)
    frequent_templates = {tpl for tpl, freq in counts.items() if freq >= min_freq}
    if not frequent_templates:
        raise ValueError("Nenhum template sobreviveu ao filtro de frequência.")

    filtered_lines = [line for line in log_lines if line in frequent_templates]

    if window_size is None:
        window_size = choose_window_size(filtered_lines, frequent_templates)

    if window_size < 1:
        raise ValueError(f"window_size deve ser >= 1 (recebido: {window_size}).")

    n_windows = len(filtered_lines) - window_size + 1
    if n_windows < 1:
        raise ValueError(
            f"Apenas {len(filtered_lines)} linhas após o filtro de frequência: "
            f"insuficiente para janelas de tamanho {window_size}."
        )

    # Sets drop repeats inside a window: only presence/absence matters.
    windows = [
        set(filtered_lines[start:start + window_size])
        for start in range(n_windows)
    ]

    mlb = MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(windows)
    df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

    n_samples, n_features = df.shape
    ratio = n_samples / n_features if n_features else 0

    # Label derived from how many windows are available per event column.
    if ratio >= 10:
        confidence = "ALTA"
    elif ratio >= 5:
        confidence = "MODERADA"
    else:
        confidence = "BAIXA"

    return df, mlb.classes_, window_size, n_samples, n_features, ratio, confidence

def run_pc_and_export(df, variable_names, csv_path="pc_result.csv", report_path="pc_report.txt", alpha=0.01):
    """Run the PC algorithm on ``df`` and export the edges plus a summary.

    Edges are classified from the causal-learn adjacency matrix ``g``:

    * ``g[i, j] == -1 and g[j, i] == 1``  -> directed ``i -> j``
    * ``g[i, j] == 1 and g[j, i] == 1``   -> bidirectional
    * ``g[i, j] == -1 and g[j, i] == -1`` -> undirected

    Bidirectional and undirected edges are symmetric, so each is recorded
    once (for ``i < j``). Recording both orientations would double the
    counts in the report and duplicate rows in the CSV.

    Args:
        df: DataFrame whose columns are the variables; values are cast to int.
        variable_names: Labels for the columns of ``df``, indexed by position.
        csv_path: Destination CSV with columns ``source``, ``target`` and
            ``relation``.
        report_path: Destination of the plain-text summary, written as UTF-8.
        alpha: Forwarded to ``pc`` as the threshold of its chi-square
            independence tests.

    Returns:
        DataFrame with one row per edge, ordered directed, bidirectional,
        undirected.
    """
    X = df.to_numpy().astype(int)
    cg = pc(data=X, alpha=alpha, indep_test="chisq", uc_rule=0, verbose=False)
    graph = cg.G.graph

    directed = []
    bidirectional = []
    undirected = []

    n_vars = len(variable_names)
    for i in range(n_vars):
        for j in range(n_vars):
            if i == j:
                continue
            a = graph[i, j]
            b = graph[j, i]
            pair = (variable_names[i], variable_names[j])
            if a == -1 and b == 1:
                directed.append(pair)
            elif i < j:
                # Symmetric marks: keep one orientation per unordered pair.
                if a == 1 and b == 1:
                    bidirectional.append(pair)
                elif a == -1 and b == -1:
                    undirected.append(pair)

    # Build the full edge table; labels follow the concatenation order.
    df_all = pd.DataFrame(directed + bidirectional + undirected,
                          columns=["source", "target"])
    df_all["relation"] = (["directed"] * len(directed) +
                          ["bidirectional"] * len(bidirectional) +
                          ["undirected"] * len(undirected))

    df_all.to_csv(csv_path, index=False)

    # Write the report. It contains non-ASCII text, so the encoding is fixed
    # rather than left to the platform default.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("=== Relatório de Causalidade ===\n")
        f.write(f"Total de variáveis analisadas: {n_vars}\n")
        f.write(f"Arestas direcionadas: {len(directed)}\n")
        f.write(f"Arestas bidirecionais: {len(bidirectional)}\n")
        f.write(f"Arestas não-direcionadas: {len(undirected)}\n")
        f.write(f"\nCSV salvo em: {csv_path}\n")

    print(f"💾 Arquivo CSV salvo em: {csv_path}")
    print(f"📝 Relatório salvo em: {report_path}")
    return df_all

# Configuration. log_path is a placeholder: point it at a real log file.
log_path = "/caminho/para/seu/logfile.log"
csv_path = "relacoes_causais.csv"
report_path = "relatorio.txt"
# Passed through to pc() as its alpha (threshold of the independence tests).
alpha = 0.01

log_lines = read_log_lines(log_path)

# Build the dataset. window_size=None lets build_windowed_dataset choose the
# window size; lines seen fewer than min_freq times are dropped.
df, variable_names, window_size, n_samples, n_features, ratio, confidence = build_windowed_dataset(
    log_lines, window_size=None, min_freq=5
)

# Summarize the dataset dimensions before running the causal search.
print(f"📏 Window size escolhido: {window_size}")
print(f"📊 {n_samples} janelas × {n_features} eventos únicos → razão amostra/feature = {ratio:.2f}")
print(f"🔎 Nível de confiança estatística: {confidence}")

# Run the PC algorithm and save the results
df_result = run_pc_and_export(
    df,
    variable_names,
    csv_path,
    report_path,
    alpha
)

# Preview the first rows (rendered as the cell output when run in a notebook).
df_result.head()
