In [1]:
import pandas as pd
import os
from datetime import datetime

# Input and output paths
folder_path = r'C:\Users\karun\OneDrive\Documents\RIK\data\TWOS-dataset\eventviewer_ano'
output_path = r'C:\Users\karun\OneDrive\Documents\RIK\outputs\twos_eventviewer_summary.csv'

# Function to safely parse multiple date formats
def try_parse_date(x):
    """Return the calendar date encoded in a log timestamp value.

    Two layouts are attempted in order: day-first (``%d/%m/%y %H:%M:%S``)
    and then month-first (``%m/%d/%y %H:%M:%S``). Because day-first is tried
    first, an ambiguous value such as ``01/02/20 00:00:00`` is read as
    1 February 2020.

    Args:
        x: Raw timestamp value; normally a string.

    Returns:
        A ``datetime.date`` on success, or ``pd.NaT`` when ``x`` is not a
        string or matches neither layout.
    """
    for fmt in ('%d/%m/%y %H:%M:%S', '%m/%d/%y %H:%M:%S'):
        try:
            return datetime.strptime(x, fmt).date()
        except (ValueError, TypeError):
            # ValueError: the text does not match this layout.
            # TypeError: the value is not a string (e.g. NaN from a blank cell).
            # Anything else (KeyboardInterrupt, ...) is allowed to propagate.
            continue
    return pd.NaT

# Collect parsed data from all .log files.
# Each element of all_data is a DataFrame with columns date / user / event_type.
all_data = []

for file in os.listdir(folder_path):
    if not file.endswith('.log'):
        continue
    file_path = os.path.join(folder_path, file)

    # Per-file boundary: a failure in one log is reported and the remaining
    # files are still processed.
    try:
        df = pd.read_csv(
            file_path,
            header=None,
            names=['timestamp', 'source', 'event_type', 'full_user'],
            engine='python',
        )

        # Clean quotes from string columns (non-string cells are left as-is)
        for col in df.select_dtypes(include='object').columns:
            df[col] = df[col].map(lambda x: x.strip('"') if isinstance(x, str) else x)

        # Parse date from timestamp
        df['date'] = df['timestamp'].apply(try_parse_date)

        # Warn if any dates failed to parse
        unparsed = df['date'].isna().sum()
        if unparsed > 0:
            print(f"[!] Warning: {unparsed} unparsed dates in {file}")

        # Extract user from domain\user string. Values that are not strings
        # (e.g. NaN from a blank field) are passed through unchanged; testing
        # `'\\' in x` on them would raise TypeError and discard the whole file.
        df['user'] = df['full_user'].apply(
            lambda x: x.rsplit('\\', 1)[-1] if isinstance(x, str) else x
        )

        all_data.append(df[['date', 'user', 'event_type']])

    except Exception as e:
        print(f"[ERROR] Failed processing {file}: {e}")

# Combine all log entries
if not all_data:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the cause.
    raise ValueError(f"No .log files could be parsed from {folder_path}")
combined = pd.concat(all_data, ignore_index=True)

# Group by user and date, then count each event_type.
# One row per (user, date); one column per event_type seen in the data.
# groupby's default dropna=True drops rows whose user, date or event_type is missing.
summary = combined.groupby(['user', 'date', 'event_type']).size().unstack(fill_value=0).reset_index()

# Calculate per-user daily stats
event_cols = summary.columns.difference(['user', 'date'])
summary['total_events'] = summary[event_cols].sum(axis=1)
summary['unique_event_types'] = (summary[event_cols] > 0).sum(axis=1)

# Normalize event counts into ratios.
# total_events is never 0 here: a (user, date) row only exists if at least
# one event was counted for it.
for col in event_cols:
    summary[f'{col}_ratio'] = summary[col] / summary['total_events']

# Drop raw event counts (keep only ratios + metadata)
summary = summary.drop(columns=event_cols)

# Guarantee a stable schema: every expected event type gets a ratio column,
# filled with 0.0 when that type never occurred in the data.
expected_types = [
    'LogInAttempt',
    'LogOff',
    'LogOnSuccess',
    'ProcessStart',
    'FileAccess',
    'AccessDenied',
    'FileModified',
]
for evt in expected_types:
    col_name = f'{evt}_ratio'
    if col_name in summary.columns:
        continue
    summary[col_name] = 0.0

# Final layout: identifiers first, ratio columns alphabetically, totals last
metadata_cols = ['user', 'date', 'total_events', 'unique_event_types']
ratio_cols = [c for c in summary.columns if c not in metadata_cols]
ratio_cols.sort()
summary = summary[['user', 'date', *ratio_cols, 'total_events', 'unique_event_types']]

# Save the processed dataset.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (skipped when output_path has no directory part).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
summary.to_csv(output_path, index=False)

# Report where the file went and the shape of what was written
print("✅ Event viewer summary saved.")
print(f"📁 Path: {output_path}")
print(f"📊 Rows: {summary.shape[0]} | Columns: {summary.shape[1]}")


✅ Event viewer summary saved.
📁 Path: C:\Users\karun\OneDrive\Documents\RIK\outputs\twos_eventviewer_summary.csv
📊 Rows: 150 | Columns: 13
