# Data Exploration
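Exploring the training data used for the log classifier: label balance, example messages per class, text-length statistics, most common words, and a duplicate check.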

In [1]:
import json
import pandas as pd
from collections import Counter

In [2]:
# Load the raw records. JSON text is UTF-8, so the encoding is pinned
# explicitly instead of falling back to the platform's locale default
# (which is not UTF-8 on every system and can fail or mis-decode the file).
with open('../data/logs_with_embeddings.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build the DataFrame from the 'text' and 'label' fields only; every other
# field of a record (the embeddings) is deliberately left out of the frame.
df = pd.DataFrame([{'text': d['text'], 'label': d['label']} for d in data])
print(f"Total samples: {len(df)}")

Total samples: 100000


## Label Distribution

In [3]:
# Number of rows per label value, most frequent label first.
label_counts = df['label'].value_counts()
print("Label distribution:", label_counts, sep="\n")

# The same counts expressed as a percentage of all rows in df.
label_percentages = label_counts / len(df) * 100
print(f"\nPercentages:", label_percentages, sep="\n")

Label distribution:
label
0    90000
1    10000
Name: count, dtype: int64

Percentages:
label
0    90.0
1    10.0
Name: count, dtype: float64


## Sample Examples by Label

In [4]:
# Print a header, then the first ten messages whose label is 0,
# numbered starting from 1.
rule = "=" * 80
print(rule)
print("NORMAL (label=0) examples:")
print(rule)

first_normal = df.loc[df['label'] == 0, 'text'].head(10)
for i, text in enumerate(first_normal):
    print(f"\n[{i+1}] {text}")

NORMAL (label=0) examples:

[1] generating core.35495

[2] generating core.13236

[3] 152008532 double-hummer alignment exceptions

[4] 113327424 double-hummer alignment exceptions

[5] iar 003a9260 dear 012900b8

[6] generating core.37013

[7] Node card VPD check: U01 node in processor card slot J16 do not match. VPD ecid 04DD80A8152FFFFF020A1BD04EE3, found 0000000000000000000000000000

[8] Kernel detected 27092945 integer alignment exceptions (27092938) iar 0x00265594, dear 0x1feaa260 (27092939) iar 0x00265598, dear 0x1feaa280 (27092940) iar 0x0023f0e0, dear 0x1feaa1e0 (27092941) iar 0x0023f0e8, dear 0x1feaa200 (27092942) iar 0x0023f0f0, dear 0x1feaa220 (27092943) iar 0x0023f0f8, dear 0x1feaa240 (27092944) iar 0x0023f100, dear 0x1feaa260 (27092945) iar 0x0023f108, dear 0x1feaa280

[9] 1363399648 double-hummer alignment exceptions

[10] 1393946368 double-hummer alignment exceptions


In [5]:
# Print a header, then the first ten messages whose label is 1,
# numbered starting from 1.
rule = "=" * 80
print(rule)
print("ANOMALY (label=1) examples:")
print(rule)

first_anomalies = df.loc[df['label'] == 1, 'text'].head(10)
for i, text in enumerate(first_anomalies):
    print(f"\n[{i+1}] {text}")

ANOMALY (label=1) examples:

[1] ciod: Error reading message prefix on CioStream socket to 172.16.96.116:46094, Link has been severed

[2] ciod: Error reading message prefix on CioStream socket to 172.16.96.116:60157, Link has been severed

[3] ciod: Error reading message prefix after LOGIN_MESSAGE on CioStream socket to 172.16.96.116:39451: Link has been severed

[4] Lustre mount FAILED : bglio30 : block_id : location

[5] ciod: Error reading message prefix after LOAD_MESSAGE on CioStream socket to 172.16.96.116:56397: Link has been severed

[6] ciod: Error reading message prefix after LOGIN_MESSAGE on CioStream socket to 172.16.96.116:33281: Link has been severed

[7] ciod: Error reading message prefix after LOGIN_MESSAGE on CioStream socket to 172.16.96.116:43372: Link has been severed

[8] ciod: Error reading message prefix after LOGIN_MESSAGE on CioStream socket to 172.16.96.116:59887: Link has been severed

[9] ciod: Error reading message prefix after LOAD_MESSAGE on CioStream so

## Text Length Analysis

In [6]:
# Derive two size measures per message and store them as new columns on df:
# the character count and the number of whitespace-separated tokens.
text_column = df['text']
df['text_length'] = text_column.str.len()
df['word_count'] = text_column.str.split().str.len()

# Group once (after the columns exist) and summarise each measure per label.
per_label = df.groupby('label')

print("Text length statistics:")
print(per_label['text_length'].describe())
print("\nWord count statistics:")
print(per_label['word_count'].describe())

Text length statistics:
         count        mean        std   min    25%    50%    75%    max
label                                                                  
0      90000.0   74.027433  96.690044   2.0   26.0   44.0   61.0  838.0
1      10000.0  105.215700  16.327842  24.0  100.0  100.0  119.0  180.0

Word count statistics:
         count       mean        std  min   25%   50%   75%   max
label                                                            
0      90000.0   8.708467  11.645960  1.0   4.0   4.0  10.0  93.0
1      10000.0  14.410800   1.799324  3.0  14.0  14.0  16.0  21.0


## Most Common Words by Label

In [7]:
def get_common_words(texts, n=20):
    """Return the ``n`` most frequent lowercase tokens across ``texts``.

    All strings in ``texts`` are joined with single spaces, lowercased and
    split on whitespace. Punctuation is not stripped, so tokens such as
    ``"ciod:"`` are counted as-is.

    Args:
        texts: Iterable of strings.
        n: Number of (token, count) pairs to return. Defaults to 20.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least common.
    """
    corpus = ' '.join(texts)
    tokens = corpus.lower().split()
    tally = Counter(tokens)
    return tally.most_common(n)

# Print the top words for each class in turn. The second heading carries a
# leading newline so a blank line separates the two listings.
for heading, label in (
    ("Most common words in NORMAL logs:", 0),
    ("\nMost common words in ANOMALY logs:", 1),
):
    print(heading)
    class_texts = df[df['label'] == label]['text']
    for word, count in get_common_words(class_texts):
        print(f"  {word}: {count}")

Most common words in NORMAL logs:
  iar: 58317
  dear: 58316
  alignment: 43621
  exceptions: 43621
  double-hummer: 34425
  generating: 18992
  detected: 14419
  0x00544ea8,: 11582
  0x00544eb8,: 11582
  0x00544ee0,: 11582
  0x00544ef0,: 11582
  and: 7243
  corrected: 7067
  kernel: 6517
  integer: 6516
  error(s): 5758
  over: 5703
  seconds: 5689
  ddr: 5134
  on: 4789

Most common words in ANOMALY logs:
  to: 10763
  message: 9541
  on: 9489
  ciod:: 9482
  prefix: 9479
  socket: 9479
  error: 8209
  reading: 8195
  ciostream: 8195
  link: 7357
  has: 7357
  been: 7357
  severed: 7357
  after: 4091
  load_message: 2536
  failed: 1694
  control: 1285
  read: 1284
  stream: 1284
  (ciostream: 1284


## Check for Duplicate Texts

In [8]:
# keep=False marks every row of a repeated text (not only the later repeats),
# so the reported number counts all rows whose text occurs more than once.
is_repeated = df.duplicated(subset=['text'], keep=False)
duplicates = df[is_repeated]
print(f"Duplicate texts in dataset: {len(duplicates)}")

# Only show examples when at least one duplicated row exists.
if not duplicates.empty:
    print("\nExamples of duplicates:")
    print(duplicates.head(10))

Duplicate texts in dataset: 0


## Random Sample of Each Class

In [9]:
# Draw five rows per class with a fixed random_state so the same messages
# are shown on every run; each message is followed by a blank line.
for title, label in (
    ("Random NORMAL samples:", 0),
    ("Random ANOMALY samples:", 1),
):
    print(title)
    print("-" * 80)
    sampled = df[df['label'] == label].sample(5, random_state=42)
    for text in sampled['text']:
        print(f"  {text}\n")

Random NORMAL samples:
--------------------------------------------------------------------------------
  257652970 double-hummer alignment exceptions

  generating core.41652

  24:00adb1b8 25:00000015 26:00240000 27:00230000

  generating core.56773

  36879638 double-hummer alignment exceptions

Random ANOMALY samples:
--------------------------------------------------------------------------------
  ciod: Error reading message prefix after LOAD_MESSAGE on CioStream socket to 172.16.96.116:47497: Link has been severed

  ciod: failed to read message prefix on control stream (CioStream socket to 172.16.96.116:51695

  ciod: Error reading message prefix after LOAD_MESSAGE on CioStream socket to 172.16.96.116:58282: Link has been severed

  ciod: Error reading message prefix after LOGIN_MESSAGE on CioStream socket to 172.16.96.116:58968: Link has been severed

  ciod: Error reading message prefix on CioStream socket to 172.16.96.116:52742, Connection reset by peer

