In [4]:
import random
from datetime import datetime, timedelta

# Generate random timestamps with milliseconds difference
def random_timestamp(start, max_millisec_increment):
    """Return ``start`` moved forward by a random whole number of milliseconds.

    Args:
        start: ``datetime`` to advance.
        max_millisec_increment: Inclusive upper bound of the random step, in
            milliseconds. The step is drawn from ``1..max_millisec_increment``.

    Returns:
        A new ``datetime`` that is at least 1 ms later than ``start``.
    """
    step_ms = random.randint(1, max_millisec_increment)
    return start + timedelta(milliseconds=step_ms)

# Generate fictitious log lines with timestamps
def generate_log_line(i, timestamp):
    """Build one fictitious log line for entry ``i`` at ``timestamp``.

    The event text is picked at random from a fixed catalogue. The timestamp is
    rendered with millisecond precision (the microsecond field is cut to its
    first three digits).

    Returns:
        A string shaped like ``"YYYY-MM-DD HH:MM:SS.mmm - <event> [ID: <i>]"``.
    """
    catalogue = [
        "User logged in",
        "Session started successfully",
        "Connection error",
        "Reconnection attempt",
        "Database query",
        "Database updated",
        "Security alert issued",
        "Backup started",
        "Backup completed",
        "Configuration file loaded",
        "User logged out",
        "Apache server restarted",
        "RAM memory exceeding limit",
        "Low disk space",
        "Cache cleared successfully"
    ]
    message = random.choice(catalogue)
    # %f yields six microsecond digits; dropping the last three leaves milliseconds.
    stamp = timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    return "{} - {} [ID: {}]".format(stamp, message, i)

# Number of log lines to create and the instant the log starts from
num_lines = 200
start_time = datetime(2023, 1, 1, 0, 0, 0)

# Walk the clock forward one random step per line so timestamps only increase
current_time = start_time
logs = []
for i in range(num_lines):
    # each step is at most 5 seconds (5000 ms)
    current_time = random_timestamp(current_time, 5000)
    logs += [generate_log_line(i, current_time)]

# Overwrite selected lines with explicit cause/effect pairs.
# Every 20th line becomes "User logged in"; the matching
# "Session started successfully" lands 1 to 5 lines later.
# The first 23 characters of a line are its millisecond timestamp, which is kept.
for i in range(0, num_lines - 10, 20):
    causal_gap = random.randint(1, 5)
    logs[i] = "{} - User logged in [ID: {}]".format(logs[i][:23], i)
    logs[i + causal_gap] = "{} - Session started successfully [ID: {}]".format(
        logs[i + causal_gap][:23], i + causal_gap
    )

# Write the log to disk, one entry per line
with open("logs.log", "w") as f:
    for log_entry in logs:
        print(log_entry, file=f)

print("Log file successfully generated in logs.log")


Log file successfully generated in logs.log


In [1]:
import random
import string
NUM_SEQUENCES = 100  # number of sequences to write; scale up as needed
EVENT_POOL = [*string.ascii_uppercase]  # Events A-Z
# Causal chains: events of one chain belong together (A → B → C, ...)
causal_rules = [list("ABC"), list("DE"), list("FGH")]
# Common cause pattern: H → I and H → J (but no direct I ↔ J link)
common_causes = [('H', ['I', 'J'])]
# Reversed pair: stored as (K, L) but appended to a sequence as L, then K
reversed_pairs = [('K', 'L')]
# Independent noisy events
noise_events = list("XYZ")
def generate_sequence():
    """Assemble one shuffled event sequence from the module-level rule tables.

    Reads ``causal_rules``, ``common_causes``, ``reversed_pairs`` and
    ``noise_events``.

    Returns:
        A list of event names in random order.
    """
    events = []
    # Real causal chains: each link is dropped when its draw is <= 0.1 (about 10%).
    for chain in causal_rules:
        events.extend(link for link in chain if random.random() > 0.1)
    # Common cause: the cause is always added, each effect is dropped about 5% of the time.
    # The cause is added even if a causal chain already contributed it, so it can appear twice.
    for cause, effects in common_causes:
        events.append(cause)
        events.extend(effect for effect in effects if random.random() > 0.05)
    # Reversed pairs: the second element is appended ahead of the first.
    for first, second in reversed_pairs:
        events += [second, first]
    # Noise: 1 to 3 draws (with replacement) from noise_events.
    noise_count = random.randint(1, 3)
    events.extend(random.choice(noise_events) for _ in range(noise_count))
    # NOTE(review): the shuffle discards the insertion order built above, so only
    # co-occurrence survives in the output. Confirm this is intended.
    random.shuffle(events)
    return events
# Write one space-separated sequence per line
with open("synthetic_sequences_large.txt", "w") as f:
    for _ in range(NUM_SEQUENCES):
        seq = generate_sequence()
        print(*seq, file=f)

In [2]:
# With text simulating real data

import random
import string
from datetime import datetime, timedelta

NUM_SEQUENCES = 100  # Number of complete logs to write (change as needed)

# Event letter (A-Z) -> realistic build-log message
event_to_log = {
    # myapp target
    "A": "Scanning dependencies of target myapp",
    "B": "Building CXX object src/CMakeFiles/myapp.dir/main.cpp.o",
    "C": "Linking CXX executable bin/myapp",
    # utils target
    "D": "Scanning dependencies of target utils",
    "E": "Building CXX object src/CMakeFiles/utils.dir/utils.cpp.o",
    # logger target
    "F": "Scanning dependencies of target logger",
    "G": "Building CXX object src/CMakeFiles/logger.dir/logger.cpp.o",
    "H": "Linking CXX static library lib/liblogger.a",
    # extra objects
    "I": "Building CXX object src/CMakeFiles/extra.dir/extra1.cpp.o",
    "J": "Building CXX object src/CMakeFiles/extra.dir/extra2.cpp.o",
    # tests
    "K": "Building CXX object src/CMakeFiles/test.dir/test.cpp.o",
    "L": "Running tests...",
    # miscellaneous build steps
    "M": "Generating documentation with Doxygen",
    "N": "Packaging project into tar.gz",
    "O": "Copying resources to bin/",
    "P": "Building man pages",
    "Q": "Checking code style with clang-format",
    "R": "Building CXX object src/CMakeFiles/feature.dir/feature.cpp.o",
    "S": "Running static code analysis",
    "T": "Creating version header",
    "U": "Stripping binaries for size optimization",
    "V": "Archiving object files",
    "W": "Creating symlinks to shared libraries",
    # log-level prefixed messages
    "X": "[INFO] Build environment: gcc 12.1, Ubuntu 22.04",
    "Y": "[DEBUG] Cache hit for module config",
    "Z": "[WARN] Deprecated API used in utils.cpp:23",
}

# Causal chains: events of one chain belong together (A → B → C, ...)
causal_rules = [list("ABC"), list("DE"), list("FGH")]

# Common cause pattern: H → I and H → J (but no direct I → J)
common_causes = [("H", ["I", "J"])]

# Reversed pair: stored as (K, L) but appended to a sequence as L, then K
reversed_pairs = [("K", "L")]

# Independent noisy events
noise_events = list("XYZ")

# Creates a sequence of events according to the rules
def generate_sequence():
    """Assemble one shuffled sequence of event letters from the rule tables.

    Reads the module-level ``causal_rules``, ``common_causes``,
    ``reversed_pairs`` and ``noise_events``.

    Returns:
        A list of event letters in random order.
    """
    events = []
    # Causal chains: each link is dropped when its draw is <= 0.1 (about 10%).
    for chain in causal_rules:
        events.extend(link for link in chain if random.random() > 0.1)
    # Common cause: the cause is always added, each effect is dropped about 5% of the time.
    # The cause is added even if a causal chain already contributed it, so it can appear twice.
    for cause, effects in common_causes:
        events.append(cause)
        events.extend(effect for effect in effects if random.random() > 0.05)
    # Reversed pairs: the second element is appended ahead of the first.
    for first, second in reversed_pairs:
        events += [second, first]
    # Noise: 1 to 3 draws (with replacement) from noise_events.
    noise_count = random.randint(1, 3)
    events.extend(random.choice(noise_events) for _ in range(noise_count))
    # NOTE(review): the shuffle discards the insertion order built above, so only
    # co-occurrence survives in the output. Confirm this is intended.
    random.shuffle(events)
    return events

# Converts letters to logs with growing timestamps
def sequence_to_timestamped_logs(sequence, base_time=None):
    """Render event keys as log lines with strictly increasing timestamps.

    Args:
        sequence: Iterable of keys of the module-level ``event_to_log``.
        base_time: ``datetime`` the clock starts from. Defaults to the current
            time truncated to whole seconds.

    Returns:
        A list of ``"YYYY-MM-DD HH:MM:SS.mmm <message>"`` strings, one per event.
        Each timestamp is 1 to 100 ms after the previous one.

    Raises:
        KeyError: If an event is not a key of ``event_to_log``.
    """
    clock = datetime.now().replace(microsecond=0) if base_time is None else base_time
    rendered = []
    for event in sequence:
        clock = clock + timedelta(milliseconds=random.randint(1, 100))
        # %f yields six microsecond digits; dropping the last three leaves milliseconds.
        stamp = clock.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        rendered.append("{} {}".format(stamp, event_to_log[event]))
    return rendered

# Save to file
# One clock is carried across all sequences. If every call fell back to
# datetime.now(), each sequence would restart at (almost) the same instant and
# the file would be full of repeating, overlapping timestamps.
next_base_time = datetime.now().replace(microsecond=0)
with open("synthetic_sequences.txt", "w") as f:
    for _ in range(NUM_SEQUENCES):
        seq = generate_sequence()
        logs = sequence_to_timestamped_logs(seq, next_base_time)
        f.write('; '.join(logs) + '\n')
        if logs:
            # Resume from the last emitted timestamp: the first 23 characters
            # of a log line are "YYYY-MM-DD HH:MM:SS.mmm".
            next_base_time = datetime.strptime(logs[-1][:23], "%Y-%m-%d %H:%M:%S.%f")


In [None]:
import random
from datetime import datetime, timedelta

NUM_SEQUENCES = 100  # Number of complete logs (each is written as one output line)
# TODO: make the number of lines a parameter (initially 400 different templates)
# TODO: scale the number of lines up to 200k
# TODO: fix the repeating timestamps
# TODO: compare the recovered causalities with PC and other methods


# Probability parameters (each between 0 and 1)
p_causal = 0.9       # Real causal links (e.g. A → B → C); applied per chain and again per link
p_common = 0.95      # Common cause (H → I and H → J); applied to the cause and again per effect
p_spurious = 0.7     # Spurious causality (L → K); probability of including a reversed pair
p_noise = 0.5        # Probability of including noise event (per event)

# Event letter (A-Z) -> realistic build-log message
event_to_log = {
    # myapp target
    "A": "Scanning dependencies of target myapp",
    "B": "Building CXX object src/CMakeFiles/myapp.dir/main.cpp.o",
    "C": "Linking CXX executable bin/myapp",
    # utils target
    "D": "Scanning dependencies of target utils",
    "E": "Building CXX object src/CMakeFiles/utils.dir/utils.cpp.o",
    # logger target
    "F": "Scanning dependencies of target logger",
    "G": "Building CXX object src/CMakeFiles/logger.dir/logger.cpp.o",
    "H": "Linking CXX static library lib/liblogger.a",
    # extra objects
    "I": "Building CXX object src/CMakeFiles/extra.dir/extra1.cpp.o",
    "J": "Building CXX object src/CMakeFiles/extra.dir/extra2.cpp.o",
    # tests
    "K": "Building CXX object src/CMakeFiles/test.dir/test.cpp.o",
    "L": "Running tests...",
    # miscellaneous build steps
    "M": "Generating documentation with Doxygen",
    "N": "Packaging project into tar.gz",
    "O": "Copying resources to bin/",
    "P": "Building man pages",
    "Q": "Checking code style with clang-format",
    "R": "Building CXX object src/CMakeFiles/feature.dir/feature.cpp.o",
    "S": "Running static code analysis",
    "T": "Creating version header",
    "U": "Stripping binaries for size optimization",
    "V": "Archiving object files",
    "W": "Creating symlinks to shared libraries",
    # log-level prefixed messages
    "X": "[INFO] Build environment: gcc 12.1, Ubuntu 22.04",
    "Y": "[DEBUG] Cache hit for module config",
    "Z": "[WARN] Deprecated API used in utils.cpp:23",
}

# Rule tables consumed by generate_sequence
causal_rules = [list("ABC"), list("DE"), list("FGH")]  # causal chains
common_causes = [("H", ["I", "J"])]                    # (cause, [effects])
reversed_pairs = [("K", "L")]                          # appended as L, then K
noise_events = list("XYZ")                             # independent noise

def generate_sequence():
    """Assemble one shuffled event sequence using the module-level probabilities.

    Reads ``causal_rules``, ``common_causes``, ``reversed_pairs``,
    ``noise_events`` and the ``p_causal`` / ``p_common`` / ``p_spurious`` /
    ``p_noise`` parameters.

    Returns:
        A list of event names in random order (possibly empty).
    """
    events = []

    # Causal chains: the chain must pass a p_causal draw, then each link its own.
    for chain in causal_rules:
        if random.random() >= p_causal:
            continue
        events.extend(link for link in chain if random.random() < p_causal)

    # Common cause: the cause must pass a p_common draw, then each effect its own.
    for cause, effects in common_causes:
        if random.random() >= p_common:
            continue
        events.append(cause)
        events.extend(effect for effect in effects if random.random() < p_common)

    # Spurious reversed causality: the second element goes in ahead of the first.
    for first, second in reversed_pairs:
        if random.random() < p_spurious:
            events += [second, first]

    # Noise: every noise event is included independently with probability p_noise.
    events.extend(noise for noise in noise_events if random.random() < p_noise)

    # NOTE(review): the shuffle discards the insertion order built above, so only
    # co-occurrence survives in the output. Confirm this is intended.
    random.shuffle(events)
    return events

def sequence_to_timestamped_logs(sequence, base_time=None):
    """Render event keys as log lines with strictly increasing timestamps.

    Args:
        sequence: Iterable of keys of the module-level ``event_to_log``.
        base_time: ``datetime`` the clock starts from. Defaults to the current
            time truncated to whole seconds.

    Returns:
        A list of ``"YYYY-MM-DD HH:MM:SS.mmm <message>"`` strings, one per event.
        Each timestamp is 1 to 100 ms after the previous one.

    Raises:
        KeyError: If an event is not a key of ``event_to_log``.
    """
    clock = datetime.now().replace(microsecond=0) if base_time is None else base_time
    rendered = []
    for event in sequence:
        clock = clock + timedelta(milliseconds=random.randint(1, 100))
        # %f yields six microsecond digits; dropping the last three leaves milliseconds.
        stamp = clock.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        rendered.append("{} {}".format(stamp, event_to_log[event]))
    return rendered

# Save to file
# One clock is carried across all sequences. If every call fell back to
# datetime.now(), each sequence would restart at (almost) the same instant and
# the file would be full of repeating, overlapping timestamps.
next_base_time = datetime.now().replace(microsecond=0)
with open("synthetic_sequences.txt", "w") as f:
    for _ in range(NUM_SEQUENCES):
        seq = generate_sequence()
        logs = sequence_to_timestamped_logs(seq, next_base_time)
        f.write('; '.join(logs) + '\n')
        # A sequence can be empty here (every rule is probabilistic); keep the
        # clock unchanged in that case.
        if logs:
            # Resume from the last emitted timestamp: the first 23 characters
            # of a log line are "YYYY-MM-DD HH:MM:SS.mmm".
            next_base_time = datetime.strptime(logs[-1][:23], "%Y-%m-%d %H:%M:%S.%f")


In [4]:
import random
from datetime import datetime

# Number of sequences (each is written as one line of the output file)
NUM_SEQUENCES = 200000

# Probability parameters (compared against random.random() in generate_sequence)
p_causal   = 0.9   # causal chains: applied per chain and again per link
p_common   = 0.95  # common causes: applied to the cause and again per effect
p_spurious = 0.7   # probability of including each reversed pair
p_noise    = 0.5   # probability of including each noise event

# Define 100 distinct actions.
# Each action is the leading part of a log template; it is later combined with
# an artifact pattern as "<action> <artifact>".
actions = [
    "Scanning dependencies of target",
    "Building CXX object",
    "Linking CXX executable",
    "Linking CXX static library",
    "Running unit tests",
    "Running integration tests",
    "Generating documentation with Doxygen",
    "Packaging project into tar.gz",
    "Copying resources to bin/",
    "Checking code style with clang-format",
    "Running static code analysis",
    "Stripping binaries for size optimization",
    "Archiving object files",
    "Creating symlinks to shared libraries",
    "Cleaning build directory",
    "Deploying to staging environment",
    "Starting CI job",
    "Stopping services",
    "Initializing submodule",
    "Updating submodules",
    "Downloading artifacts",
    "Uploading logs",
    "Trying to compile",
    "Generating new compilation file for",
    "Checking linking errors",
    "Monitoring memory usage",
    "Starting performance profiling",
    "Stopping performance profiling",
    "Verifying digital signatures",
    "Signing build artifact",
    "Encrypting backup",
    "Decrypting configuration",
    "Validating JSON schemas",
    "Formatting source code",
    "Optimizing binary size",
    "Compressing output files",
    "Decompressing archives",
    "Installing dependencies",
    "Uninstalling old versions",
    "Registering components",
    "Deregistering services",
    "Synchronizing build cache",
    "Purging temporary files",
    "Uploading build metrics",
    "Reporting coverage statistics",
    "Generating code coverage report",
    "Running security scan",
    "Uploading to artifact repository",
    "Downloading container image",
    "Building Docker image",
    "Pushing Docker image",
    "Pulling base image",
    "Executing post-build script",
    "Checking environment variables",
    "Setting up build environment",
    "Tearing down build environment",
    "Merging build branches",
    "Rebasing changes",
    "Committing changes",
    "Pushing commits to remote",
    "Fetching updates from origin",
    "Cloning repository",
    "Checking out branch",
    "Creating pull request",
    "Merging pull request",
    "Closing pull request",
    "Running smoke tests",
    "Running regression tests",
    "Running performance tests",
    "Validating API responses",
    "Starting microservice",
    "Stopping microservice",
    "Connecting to database",
    "Migrating database schema",
    "Seeding database",
    "Backing up database",
    "Restoring database",
    "Validating database integrity",
    "Checking network connectivity",
    "Initializing Docker container",
    "Removing Docker container",
    "Starting virtual machine",
    "Stopping virtual machine",
    "Mounting file system",
    "Unmounting file system",
    "Checking disk space",
    "Cleaning package cache",
    "Updating package index",
    "Installing system packages",
    "Removing orphaned packages",
    "Generating changelog",
    "Tagging release",
    "Building release branch",
    "Publishing release notes",
    "Sending notification email",
    "Triggering webhook",
    "Restarting service",
    "Verifying checksum",
    "Calculating dependency tree",
    "Generating UML diagrams"
]

# 20 artifact patterns.
# "{n}" is a str.format placeholder; it is filled with the pattern's 1-based
# position in this list, so a given pattern always renders to the same name.
artifact_patterns = [
    "module_{n}",
    "component_{n}.dir/file_{n}.cpp.o",
    "output_{n}",
    "index_{n}.html",
    "lib_{n}.a",
    "config_{n}.yaml",
    "build_{n}.sh",
    "test_suite_{n}.xml",
    "pipeline_{n}.yml",
    "build_{n}.log",
    "header_{n}.h",
    "libcomponent_{n}.so",
    "env_{n}",
    "service_{n}",
    "service_{n}.log",
    "tool_{n}",
    "module_obj_{n}.o",
    "cache_{n}.tmp",
    "image_{n}",
    "package_{n}.tar.gz"
]

# Every (action, pattern, 1-based pattern position) combination, grouped by pattern
all_pairs = [
    (act, pattern, position + 1)
    for position, pattern in enumerate(artifact_patterns)
    for act in actions
]
# Keep 400 distinct combinations, chosen at random
selected = random.sample(all_pairs, 400)

# Keys "1".."400" -> rendered template "<action> <artifact>"
event_to_log = dict(
    (str(rank), "{} {}".format(act, pattern.format(n=art_idx)))
    for rank, (act, pattern, art_idx) in enumerate(selected, start=1)
)

# Expanded rules: template keys "1".."320" are split into four disjoint groups.
# Keys "321".."400" belong to no rule.

# Keys "1".."120": 40 causal chains of three consecutive keys.
causal_rules = [
    [str(i), str(i+1), str(i+2)]
    for i in range(1, 121, 3)
]
# Keys "121".."200": 20 common-cause groups, each one cause plus three effects.
# The stride equals the group size (4) so groups tile the range exactly. With a
# smaller stride the groups overlap each other and the last group's effects
# land on keys "201"/"202", which belong to reversed_pairs.
common_causes = [
    (str(i), [str(i+1), str(i+2), str(i+3)])
    for i in range(121, 201, 4)
]
# Keys "201".."280": 40 pairs that generate_sequence appends in swapped order.
reversed_pairs = [
    (str(i), str(i+1))
    for i in range(201, 281, 2)
]
# Keys "281".."320": 40 independent noise events.
noise_events = [str(i) for i in range(281, 321)]


def generate_sequence():
    """Assemble one shuffled sequence of template keys from the rule tables.

    Reads the module-level ``causal_rules``, ``common_causes``,
    ``reversed_pairs``, ``noise_events`` and the ``p_*`` probabilities.

    Returns:
        A list of template keys in random order (possibly empty).
    """
    events = []

    # causal chains: the chain must pass a p_causal draw, then each link its own
    for chain in causal_rules:
        if random.random() >= p_causal:
            continue
        events.extend(link for link in chain if random.random() < p_causal)

    # common cause: the cause must pass a p_common draw, then each effect its own
    for cause, effects in common_causes:
        if random.random() >= p_common:
            continue
        events.append(cause)
        events.extend(effect for effect in effects if random.random() < p_common)

    # spurious reversed: the second element goes in ahead of the first
    for first, second in reversed_pairs:
        if random.random() < p_spurious:
            events += [second, first]

    # noise: every noise event is included independently with probability p_noise
    events.extend(noise for noise in noise_events if random.random() < p_noise)

    # NOTE(review): the shuffle discards the insertion order built above, so only
    # co-occurrence survives in the output. Confirm this is intended.
    random.shuffle(events)
    return events


def sequence_to_timestamped_line(seq, timestamp=None):
    """Render a sequence as one log line: ``"<timestamp> <msg>; <msg>; ..."``.

    The sequence is first resized to a random target length between 10 and 20.
    Longer sequences are randomly subsampled; shorter ones are padded with
    random keys from ``event_to_log``. The caller's list is never modified.

    Args:
        seq: List of event keys (keys of the module-level ``event_to_log``).
        timestamp: Optional ``datetime`` to stamp the line with. Defaults to
            the current wall-clock time truncated to whole seconds, so lines
            generated within the same second share a timestamp unless the
            caller supplies one.

    Returns:
        The formatted line, without a trailing newline.
    """
    # dynamic length 10–20
    target_len = random.randint(10, 20)
    if len(seq) > target_len:
        events = random.sample(seq, target_len)
    else:
        keys = list(event_to_log.keys())
        # Pad a copy so the list passed in by the caller is left untouched.
        events = list(seq)
        while len(events) < target_len:
            events.append(random.choice(keys))

    if timestamp is None:
        timestamp = datetime.now().replace(microsecond=0)
    stamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    messages = '; '.join(event_to_log[e] for e in events)
    return f"{stamp} {messages}"

# Write one rendered sequence per line.
# NOTE(review): sequence_to_timestamped_line stamps each line with the
# wall-clock time (whole seconds) of the call, so lines produced within the
# same second carry identical timestamps.
with open("synthetic_sequences.txt", "w") as f:
    for _ in range(NUM_SEQUENCES):
        line = sequence_to_timestamped_line(generate_sequence())
        print(line, file=f)


In [3]:
import random
from datetime import datetime

# Number of <START>…<END> blocks written to the output file
NUM_SEQUENCES = 200000

# Probability parameters (compared against random.random() in generate_sequence)
p_causal   = 0.9   # causal chains: applied per chain and again per link
p_common   = 0.95  # common causes: applied to the cause and again per effect
p_spurious = 0.7   # probability of including each reversed pair
p_noise    = 0.5   # probability of including each noise event

# Detailed, realistic actions for greater variability.
# Each action is the leading part of a log template ("<action> <artifact>").
# The bracketed "[LEVEL] ..." entries at the end are log-level style messages.
actions = [
    "Cloning repository", "Checking out branch", "Installing dependencies", "Verifying dependency integrity",
    "Configuring build environment", "Building CXX object", "Building Java classes", "Compiling TypeScript files",
    "Linking CXX executable", "Linking shared libraries", "Running unit tests", "Running integration tests",
    "Generating code coverage report", "Reporting test results", "Checking code style", "Running linter checks",
    "Analyzing static code", "Creating version header", "Compressing log files", "Uploading logs to S3",
    "Triggering webhook", "Sending Slack notification", "Deploying to staging environment", "Deploying to production",
    "Validating deployment", "Running smoke tests", "Starting CI job", "Finalizing CI job",
    "Pushing Docker image", "Pulling Docker base image", "Tagging Docker image", "Cleaning Docker cache",
    "Encrypting artifacts", "Decrypting configuration files", "Backing up database", "Restoring database",
    "Migrating database schema", "Seeding database", "Checking database connectivity", "Restarting database service",
    "Verifying checksums", "Running security scan", "Patching vulnerabilities", "Reviewing dependencies",
    "Archiving build artifacts", "Publishing artifacts to repository", "Generating documentation with Doxygen",
    "Converting markdown to HTML", "Publishing site", "Notifying release manager", "Syncing with GitHub",
    "Merging pull request", "Rebasing branch", "Creating release notes", "Signing release packages",
    "Verifying digital signatures", "Pushing changes to remote", "Creating git tag", "Verifying git tag signature",
    "Running regression tests", "Running performance tests", "Measuring memory usage", "Analyzing CPU usage",
    "Formatting source code", "Optimizing assets", "Uploading sourcemaps", "Tracking build metrics",
    "Analyzing historical trends", "Creating Jira ticket", "Logging build statistics", "Sending report via email",
    "Scanning for secrets in code", "Checking disk space", "Checking available memory", "Monitoring build agents",
    "Syncing mirrors", "Rebooting build server", "Rebuilding failed jobs", "Creating new pipeline configuration",
    "Triggering downstream jobs", "Validating Kubernetes manifests", "Deploying Helm charts",
    "Verifying service health checks", "Logging health status", "Generating system diagnostics",
    "Uploading crash reports", "Restarting failed containers", "Rebalancing workloads",
    "[INFO] Build completed successfully", "[ERROR] Linker returned non-zero exit code",
    "[WARN] Deprecated function used", "[DEBUG] Entering compilation loop", "[FATAL] Out of memory during linking stage",
    "[TRACE] Dependency chain resolved"
]

# More artifact patterns to increase diversity.
# "{n}" is a str.format placeholder; it is filled with the pattern's 1-based
# position in this list, so a given pattern always renders to the same name.
artifact_patterns = [
    "module_{n}", "component_{n}.dir/file_{n}.cpp.o", "output_{n}", "index_{n}.html", "lib_{n}.a",
    "config_{n}.yaml", "build_{n}.sh", "test_suite_{n}.xml", "pipeline_{n}.yml", "build_{n}.log",
    "header_{n}.h", "libcomponent_{n}.so", "env_{n}", "service_{n}", "tool_{n}", "module_obj_{n}.o",
    "cache_{n}.tmp", "image_{n}", "package_{n}.tar.gz", "resource_{n}.json", "docs_{n}.md",
    "logfile_{n}.txt", "results_{n}.json", "job_{n}.status", "chart_{n}.yaml", "metrics_{n}.csv",
    "secrets_{n}.env", "manifest_{n}.yaml", "analysis_{n}.rpt", "coverage_{n}.html",
    "alert_{n}.json", "summary_{n}.txt", "buildspec_{n}.yml", "helm_{n}.tgz", "crashdump_{n}.log",
    "trace_{n}.xml", "snapshot_{n}.img", "release_notes_{n}.md", "tasklog_{n}.txt"
]

# Select 400 distinct (action, pattern, 1-based pattern position) combinations
all_pairs = [
    (act, pattern, position + 1)
    for position, pattern in enumerate(artifact_patterns)
    for act in actions
]
selected = random.sample(all_pairs, 400)

# Keys "1".."400" -> rendered template "<action> <artifact>"
event_to_log = dict(
    (str(rank), "{} {}".format(act, pattern.format(n=art_idx)))
    for rank, (act, pattern, art_idx) in enumerate(selected, start=1)
)

# Extended realistic causal chains, written as action names.
# NOTE(review): "Packaging project into tar.gz" does not appear in `actions`,
# so the chain that starts with it can never be resolved and is always skipped.
causal_chains_realistic = [
    ["Cloning repository", "Checking out branch", "Installing dependencies"],
    ["Installing dependencies", "Configuring build environment", "Building CXX object"],
    ["Building CXX object", "Linking CXX executable", "Running unit tests"],
    ["Running unit tests", "Generating code coverage report", "Reporting test results"],
    ["Running integration tests", "Validating deployment", "Running smoke tests"],
    ["Migrating database schema", "Seeding database", "Checking database connectivity"],
    ["Backing up database", "Restoring database", "Verifying checksums"],
    ["Deploying to staging environment", "Deploying to production", "Validating deployment"],
    ["Packaging project into tar.gz", "Uploading logs to S3", "Triggering webhook"],
    ["Starting CI job", "Finalizing CI job", "Notifying release manager"]
]

# Full template text -> key (matches only the complete "<action> <artifact>" string).
reverse_lookup = {v: k for k, v in event_to_log.items()}

# Action name -> key of the first selected template that uses that action.
# Every template is "<action> <artifact>", so a bare action name is never a key
# of reverse_lookup; resolving rules through it would leave causal_rules,
# common_causes and reversed_pairs permanently empty. `selected` is enumerated
# the same way as when event_to_log was built, so position i maps to key str(i).
action_to_key = {}
for idx, (act, _pattern, _art_idx) in enumerate(selected, start=1):
    action_to_key.setdefault(act, str(idx))

# Map the chains to event_to_log keys; a chain is skipped unless every one of
# its actions made it into the 400 selected templates.
causal_rules = []
for chain in causal_chains_realistic:
    if all(step in action_to_key for step in chain):
        causal_rules.append([action_to_key[step] for step in chain])

# Common cause: "Cloning repository" with up to four effects, taken from the
# first templates that mention installing, checking out or building.
common_causes = []
if "Cloning repository" in action_to_key:
    effect_keys = [
        k for k, v in event_to_log.items()
        if "Installing" in v or "Checking out" in v or "Building" in v
    ][:4]
    common_causes.append((action_to_key["Cloning repository"], effect_keys))

# NOTE(review): pairs are stored as (b, a) here and generate_sequence appends
# the second element before the first, so the two swaps cancel out. The order
# is lost in the final shuffle anyway; confirm which orientation is intended.
reversed_pairs = []
for a, b in [("Running integration tests", "Deploying to production"),
             ("Uploading logs to S3", "Triggering webhook"),
             ("Generating documentation with Doxygen", "Publishing site"),
             ("Migrating database schema", "Backing up database")]:
    if a in action_to_key and b in action_to_key:
        reversed_pairs.append((action_to_key[b], action_to_key[a]))

# Noise: every template whose text carries one of these log-level tags
# ("[INFO]" is not in the list, so [INFO] templates are not treated as noise).
noise_events = [k for k, v in event_to_log.items() if any(err in v for err in ["[ERROR]", "[WARN]", "[DEBUG]", "[FATAL]", "[TRACE]"])]

def generate_sequence():
    """Assemble one shuffled sequence of template keys from the rule tables.

    Reads the module-level ``causal_rules``, ``common_causes``,
    ``reversed_pairs``, ``noise_events`` and the ``p_*`` probabilities.

    Returns:
        A list of template keys in random order (possibly empty).
    """
    events = []
    # Causal chains: the chain must pass a p_causal draw, then each link its own.
    for chain in causal_rules:
        if random.random() >= p_causal:
            continue
        events.extend(link for link in chain if random.random() < p_causal)
    # Common cause: the cause must pass a p_common draw, then each effect its own.
    for cause, effects in common_causes:
        if random.random() >= p_common:
            continue
        events.append(cause)
        events.extend(effect for effect in effects if random.random() < p_common)
    # Reversed pairs: the second element goes in ahead of the first.
    for first, second in reversed_pairs:
        if random.random() < p_spurious:
            events += [second, first]
    # Noise: every noise event is included independently with probability p_noise.
    events.extend(noise for noise in noise_events if random.random() < p_noise)
    # NOTE(review): the shuffle discards the insertion order built above, so only
    # co-occurrence survives in the output. Confirm this is intended.
    random.shuffle(events)
    return events

def sequence_to_timestamped_line(seq):
    """Render a sequence as a multi-line ``<START>`` … ``<END>`` block.

    The sequence is first resized to a random target length between 10 and 20.
    Longer sequences are randomly subsampled; shorter ones are padded with
    random keys from ``event_to_log``. The caller's list is never modified.

    Args:
        seq: List of event keys (keys of the module-level ``event_to_log``).

    Returns:
        ``"<START>\\n<timestamp>\\n<one message per line>\\n<END>"``. The
        timestamp is currently the empty string, so the line after ``<START>``
        is blank.
    """
    target_len = random.randint(10, 20)
    if len(seq) > target_len:
        events = random.sample(seq, target_len)
    else:
        keys = list(event_to_log.keys())
        # Pad a copy so the list passed in by the caller is left untouched.
        events = list(seq)
        while len(events) < target_len:
            events.append(random.choice(keys))
    timestamp = ""
    messages = '\n'.join(event_to_log[e] for e in events)
    return f"<START>\n{timestamp}\n{messages}\n<END>"

# Write every <START>…<END> block, each followed by a newline
with open("synthetic_sequences.txt", "w") as f:
    for _ in range(NUM_SEQUENCES):
        line = sequence_to_timestamped_line(generate_sequence())
        print(line, file=f)
