In [1]:
import sys
!{sys.executable} -m pip install python-evtx==0.8.1 lxml pandas -q
import importlib
import Evtx  # Force load
print("✅ Evtx imported:", Evtx.__file__)

✅ Evtx imported: /opt/anaconda3/lib/python3.13/site-packages/Evtx/__init__.py


In [2]:
# Install packages
!pip uninstall evtx python-evtx -y
!pip install evtx==0.8.9 pandas scikit-learn scipy seaborn matplotlib pyyaml

[0mFound existing installation: python-evtx 0.8.1
Uninstalling python-evtx-0.8.1:
  Successfully uninstalled python-evtx-0.8.1
Collecting evtx==0.8.9
  Using cached evtx-0.8.9-cp37-abi3-macosx_11_0_arm64.whl.metadata (3.1 kB)
Using cached evtx-0.8.9-cp37-abi3-macosx_11_0_arm64.whl (884 kB)
Installing collected packages: evtx
Successfully installed evtx-0.8.9


In [3]:
# MITRE Threads Table
import pandas as pd
import json
from evtx import PyEvtxParser  # ✅ CORRECT import

# MITRE table
threats = pd.DataFrame({
    'MITRE_ID': ['T1218.001', 'T1059.001'], 
    'Technique': ['rundll32 LOLBin', 'PowerShell Encoded'],
    'EVTX_File': ['rundll32.evtx', 'powershell.evtx'],
    'Sigma_Target': ['rundll32.*http', 'powershell -enc']
})
print("MITRE Threats:")
print(threats.to_markdown())

MITRE Threats:
|    | MITRE_ID   | Technique          | EVTX_File       | Sigma_Target    |
|---:|:-----------|:-------------------|:----------------|:----------------|
|  0 | T1218.001  | rundll32 LOLBin    | rundll32.evtx   | rundll32.*http  |
|  1 | T1059.001  | PowerShell Encoded | powershell.evtx | powershell -enc |


In [4]:
!pip install python-evtx lxml

from Evtx.Evtx import Evtx
import xml.etree.ElementTree as ET

def parse_evtx_safe(file_path):
    events = []
    with Evtx(file_path) as log:
        for record in log.records():
            try:
                xml = ET.fromstring(record.xml())
                events.append({
                    'EventID': xml.find('.//EventID').text,
                    'TimeCreated': xml.find('.//TimeCreated').text
                })
            except:
                continue
            if len(events) >= 10: break
    return events

logs = parse_evtx_safe('rundll32.evtx')
print(f"✅ Parsed {len(logs)} events!")
print(pd.DataFrame(logs).head())

Collecting python-evtx
  Using cached python_evtx-0.8.1-py3-none-any.whl.metadata (6.0 kB)
Using cached python_evtx-0.8.1-py3-none-any.whl (26 kB)
Installing collected packages: python-evtx
Successfully installed python-evtx-0.8.1
✅ Parsed 0 events!
Empty DataFrame
Columns: []
Index: []


In [5]:
import requests

# Download T1218.001 sample
url = "https://github.com/sbousseaden/EVTX-ATTACK-SAMPLES/raw/master/T1218_001/System/S-0001_rundll32_http.evtx"
response = requests.get(url)
with open("rundll32.evtx", "wb") as f:
    f.write(response.content)
print("Downloaded rundll32.evtx (T1218.001)")

# Extract process creation command lines (Event ID 4688)
cmdlines = []
with Evtx("rundll32.evtx") as log:
    for record in log.records():
        try:
            xml = ET.fromstring(record.xml())
            cmd_elem = xml.find('.//Data[@Name="NewProcessCommandLine"]')
            if cmd_elem is not None:
                cmdlines.append(cmd_elem.text)
        except:
            continue

df_cmd = pd.DataFrame({'CommandLine': cmdlines})
print(f"Extracted {len(df_cmd)} command lines:")
print(df_cmd.head().to_markdown())
df_cmd.to_csv('cmdlines.csv', index=False)


Downloaded rundll32.evtx (T1218.001)
Extracted 0 command lines:
| CommandLine   |
|---------------|


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest

# Vectorize command lines
vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
X = vectorizer.fit_transform(df_cmd['CommandLine'].fillna(''))

# Train anomaly detector
model = IsolationForest(contamination=0.3, random_state=42)
anomaly_scores = model.fit_predict(X)

# Results
df_results = pd.DataFrame({
    'CommandLine': df_cmd['CommandLine'],
    'AnomalyScore': anomaly_scores
})
print("Anomalous command lines (score = -1):")
print(df_results[df_results['AnomalyScore'] == -1].to_markdown())
df_results.to_csv('anomalies.csv', index=False)


ValueError: empty vocabulary; perhaps the documents only contain stop words