# Data Exploration
Exploring the training data used for the log classifier.

In [None]:
import json
import pandas as pd
from collections import Counter

In [None]:
# Load the data.
# The encoding is stated explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode (or fail on) non-ASCII
# log text when the JSON file is UTF-8.
with open('../data/logs_with_embeddings.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame without embeddings: only the 'text' and 'label'
# fields of each record are carried over.
df = pd.DataFrame([{'text': d['text'], 'label': d['label']} for d in data])
print(f"Total samples: {len(df)}")

## Label Distribution

In [None]:
# Tally how many samples carry each label, then show the same tally as a
# percentage of the whole dataset.
label_counts = df['label'].value_counts()
print("Label distribution:", label_counts, sep="\n")
print("\nPercentages:", label_counts / len(df) * 100, sep="\n")

## Sample Examples by Label

In [None]:
# Print the first ten texts labelled 0, numbered from 1.
banner = "=" * 80
normal_texts = df.loc[df['label'] == 0, 'text'].head(10)

print(banner)
print("NORMAL (label=0) examples:")
print(banner)
for position, text in enumerate(normal_texts, start=1):
    print(f"\n[{position}] {text}")

In [None]:
# Print the first ten texts labelled 1, numbered from 1.
banner = "=" * 80
anomaly_texts = df.loc[df['label'] == 1, 'text'].head(10)

print(banner)
print("ANOMALY (label=1) examples:")
print(banner)
for position, text in enumerate(anomaly_texts, start=1):
    print(f"\n[{position}] {text}")

## Text Length Analysis

In [None]:
# Add two size features per sample: number of characters and number of
# whitespace-separated tokens.
text_column = df['text']
df['text_length'] = text_column.str.len()
df['word_count'] = text_column.str.split().str.len()

# Summarise each feature separately for every label.
per_label = df.groupby('label')
for heading, feature in (
    ("Text length statistics:", 'text_length'),
    ("\nWord count statistics:", 'word_count'),
):
    print(heading)
    print(per_label[feature].describe())

## Most Common Words by Label

In [None]:
def get_common_words(texts, n=20):
    """Return the ``n`` most frequent lowercase tokens across ``texts``.

    Args:
        texts: Iterable of strings; they are joined with single spaces
            before tokenising.
        n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent. Tokens are split on whitespace only, so punctuation
        stays attached to the word it touches.
    """
    corpus = ' '.join(texts).lower()
    frequencies = Counter(corpus.split())
    return frequencies.most_common(n)

# Report the most frequent tokens for each class in turn (label 0 first).
for heading, class_label in (
    ("Most common words in NORMAL logs:", 0),
    ("\nMost common words in ANOMALY logs:", 1),
):
    print(heading)
    class_texts = df.loc[df['label'] == class_label, 'text']
    for word, count in get_common_words(class_texts):
        print(f"  {word}: {count}")

## Check for Duplicate Texts

In [None]:
# keep=False marks every row whose text appears more than once, so the
# count below includes all copies rather than only the repeats.
is_repeated_text = df.duplicated(subset=['text'], keep=False)
duplicates = df[is_repeated_text]
print(f"Duplicate texts in dataset: {len(duplicates)}")
if not duplicates.empty:
    print("\nExamples of duplicates:")
    print(duplicates.head(10))

## Random Sample of Each Class

In [None]:
# Show up to five randomly chosen texts per class, with a fixed seed so the
# selection is reproducible between runs.
for heading, class_label in (
    ("Random NORMAL samples:", 0),
    ("Random ANOMALY samples:", 1),
):
    print(heading)
    print("-" * 80)
    class_rows = df[df['label'] == class_label]
    # DataFrame.sample raises ValueError when asked for more rows than are
    # available (sampling without replacement), so cap the request at the
    # class size; a class with fewer than five rows is then shown in full.
    sample_size = min(5, len(class_rows))
    for text in class_rows.sample(sample_size, random_state=42)['text']:
        print(f"  {text}\n")