In [16]:
import warnings
from datetime import datetime
from elasticsearch import Elasticsearch
from sklearn.ensemble import IsolationForest
import pandas as pd
import os
import sys
from pathlib import Path

In [17]:
import warnings
import pandas as pd
from sklearn.ensemble import IsolationForest
from elasticsearch import Elasticsearch
from ipaddress import ip_network, ip_address
import numpy as np

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Configuration
# CIDR ranges that is_suspicious_ip() tests source addresses against.
# NOTE(review): 172.16.0.0/12 is not listed; confirm whether that range is
# meant to be covered as well.
SUSPICIOUS_SUBNETS = ["10.0.0.0/8", "192.168.0.0/16"]
# NOTE(review): the three thresholds below are not referenced by any code
# visible in this cell. Units are presumably a count of failed logins and
# seconds respectively - confirm before relying on them.
FAILURE_THRESHOLD = 5
MIN_SESSION_DURATION = 5
MAX_SESSION_DURATION = 1800

def safe_get(obj, key, default=""):
    """Look up ``key`` in a dict, or in the first element of a list.

    Args:
        obj: A dict, a list whose first element is expected to be a dict,
            or any other value.
        key: Key to look up.
        default: Value returned when the lookup cannot be performed or the
            key is absent.

    Returns:
        ``obj.get(key, default)`` for a dict; the same lookup on ``obj[0]``
        for a non-empty list whose first element is a dict; ``default`` in
        every other case.
    """
    if isinstance(obj, list):
        # Only the first element is consulted. An empty list, or a list
        # whose head is not a dict (e.g. a list of strings), falls through
        # to the default instead of raising AttributeError.
        obj = obj[0] if obj else None
    if isinstance(obj, dict):
        return obj.get(key, default)
    return default

def fetch_logs(es_client, index_pattern="cowrie-*", size=5000):
    """Run a match-all search and return the raw hits.

    Args:
        es_client: Object exposing ``search(index=..., body=...)``.
        index_pattern: Index name or pattern to query.
        size: Maximum number of hits requested in the search body.

    Returns:
        The list found at ``response['hits']['hits']``, or an empty list
        when the search or the response lookup raises. The failure is
        printed, not re-raised.
    """
    query = {"query": {"match_all": {}}, "size": size}
    try:
        result = es_client.search(index=index_pattern, body=query)
        hits = result['hits']['hits']
        print(f"Fetched {len(hits)} logs")
    except Exception as err:
        # Best-effort: any failure is reported and turned into "no logs".
        print(f"Elasticsearch error: {err}")
        return []
    return hits

def is_suspicious_ip(ip):
    """Return True when ``ip`` lies inside any network in SUSPICIOUS_SUBNETS.

    Args:
        ip: Address to test, in any form accepted by
            ``ipaddress.ip_address``.

    Returns:
        True if the address is contained in at least one configured subnet.
        False otherwise, including when ``ip`` or a configured subnet
        cannot be parsed.
    """
    try:
        # Parse the address once rather than once per subnet.
        addr = ip_address(ip)
        return any(addr in ip_network(subnet) for subnet in SUSPICIOUS_SUBNETS)
    except ValueError:
        # ipaddress reports unparseable input with ValueError (its
        # AddressValueError / NetmaskValueError are subclasses). Catching
        # only that lets unrelated errors and KeyboardInterrupt propagate,
        # which a bare ``except:`` would have hidden.
        return False

def calculate_features(logs):
    """Build the per-event feature table used for anomaly detection.

    Args:
        logs: Iterable of Elasticsearch hits; each hit must have a
            ``_source`` mapping. The sources must provide ``timestamp``,
            ``src_ip``, ``session`` and ``eventid``; ``input`` and
            ``message`` are used when present.

    Returns:
        A DataFrame with one row per hit containing the source fields plus
        these derived columns:
          * ``hour``, ``day_of_week`` - taken from the parsed timestamp
          * ``command`` - ``input`` (or its first element), else
            ``message``, else the stringified ``eventid``
          * ``command_count`` - occurrences of the row's command
          * ``is_private_ip`` - result of ``is_suspicious_ip(src_ip)``
          * ``ip_frequency`` - occurrences of the row's ``src_ip``
          * ``session_duration`` - seconds between the first and last
            timestamp of the row's session
          * ``is_failed`` - ``eventid == 'cowrie.login.failed'``
          * ``failures_per_ip`` - failed-login events for the row's
            ``src_ip``

    Raises:
        KeyError: If a hit lacks ``_source`` or a required column is
            missing from every source.
    """
    df = pd.DataFrame([hit['_source'] for hit in logs])

    # Debug: Show available columns
    print("\n[DEBUG] Available columns in logs:")
    print(df.columns.tolist())

    # Timestamp features
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek

    # Enhanced command extraction
    def get_command(x):
        if isinstance(x.get('input'), str):
            return x['input']
        elif isinstance(x.get('input'), list) and len(x['input']) > 0:
            return x['input'][0]
        elif isinstance(x.get('message'), str):
            return x['message']
        return str(x.get('eventid', ''))

    df['command'] = df.apply(get_command, axis=1)

    # Count each distinct command once and reuse the table. Recomputing
    # value_counts() inside a per-row lambda made this step quadratic.
    command_counts = df['command'].value_counts()
    print("\n[DEBUG] Top 10 commands found:")
    print(command_counts.head(10))

    # Command frequency (values that value_counts() drops, i.e. NaN, map to 0)
    df['command_count'] = (
        df['command'].map(command_counts).fillna(0).astype('int64'))

    # IP analysis
    df['src_ip'] = df['src_ip'].fillna('0.0.0.0')
    df['is_private_ip'] = df['src_ip'].apply(is_suspicious_ip)
    ip_counts = df['src_ip'].value_counts()
    df['ip_frequency'] = df['src_ip'].map(ip_counts).fillna(0).astype('int64')

    # Session analysis
    df['session'] = df['session'].fillna('no-session')
    session_groups = df.groupby('session')['timestamp']
    df['session_duration'] = session_groups.transform(
        lambda x: (x.max() - x.min()).total_seconds()
    ).fillna(0)

    # Failed logins: IPs with no failed login are absent from the counts
    # table, hence the fillna(0).
    df['is_failed'] = df['eventid'] == 'cowrie.login.failed'
    failed_counts = df[df['is_failed']]['src_ip'].value_counts()
    df['failures_per_ip'] = df['src_ip'].map(failed_counts).fillna(0)

    return df

def detect_anomalies(logs):
    """Fit an IsolationForest on the engineered features and flag outliers.

    Args:
        logs: Elasticsearch hits as accepted by ``calculate_features``.

    Returns:
        The feature DataFrame with two extra columns: ``anomaly_score``
        (the value returned by ``fit_predict``) and ``is_anomaly`` (True
        where that value is -1). Returns None when ``logs`` is empty or
        when any step raises; the reason is printed.
    """
    if not logs:
        print("No logs to analyze")
        return None

    feature_columns = [
        'hour',
        'day_of_week',
        'command_count',
        'ip_frequency',
        'failures_per_ip',
        'session_duration',
    ]

    try:
        events = calculate_features(logs)
        matrix = events[feature_columns].fillna(0).astype(float)

        # Aim for roughly ten flagged events, clamped to the 1%-10% range.
        n_events = len(events)
        rate = max(0.01, 10 / n_events)
        rate = min(0.1, rate)
        print(f"\nAnalyzing {n_events} events with contamination={rate:.2f}")

        forest = IsolationForest(
            n_estimators=200,
            contamination=rate,
            random_state=42,
        )
        events['anomaly_score'] = forest.fit_predict(matrix)
        events['is_anomaly'] = events['anomaly_score'] == -1
    except Exception as err:
        print(f"Detection failed: {err}")
        return None
    return events

# Script entry point: fetch Cowrie logs from a local Elasticsearch, run the
# anomaly detector, print a report, write the flagged rows to anomalies.csv
# and hand that file to AlertManager.
if __name__ == "__main__":
    es = Elasticsearch("http://localhost:9200")
    print("Fetching logs...")
    logs = fetch_logs(es)
    
    # Nothing is printed here when logs is empty or results is None;
    # fetch_logs / detect_anomalies print their own failure messages.
    if logs:
        print("\nDetecting anomalies...")
        results = detect_anomalies(logs)
        
        if results is not None:
            # Keep only the rows the model flagged.
            anomalies = results[results['is_anomaly']]
            if not anomalies.empty:
                print("\n=== SECURITY ALERTS ===")
                print(f"Detected {len(anomalies)} suspicious events")
                
                # The "malicious"/"attacking" labels below describe rows
                # flagged by the model; no further classification is done.
                print("\n[ATTACK PATTERNS]")
                print("Top malicious commands:")
                print(anomalies['command'].value_counts().head(10))
                
                print("\nTop attacking IPs:")
                print(anomalies['src_ip'].value_counts().head(5))
                
                print("\n[SAMPLE EVENTS]")
                print(anomalies[['timestamp', 'src_ip', 'command', 
                               'session_duration', 'failures_per_ip']].head())
                
                # Save anomalies before alerting: the CSV path (relative to
                # the current working directory) is what send_alert receives.
                anomalies_file = "anomalies.csv"
                anomalies.to_csv(anomalies_file, index=False)
                print(f"\nSaved {len(anomalies)} anomalies to {anomalies_file}")
                
                # Alerting integration: import AlertManager from a sibling
                # "alerting" directory and pass it the CSV written above.
                try:
                    import os
                    import sys
                    from pathlib import Path
                    
                    # Directory of this script; __file__ is undefined in a
                    # notebook, which raises NameError.
                    try:
                        script_dir = Path(__file__).absolute().parent
                    except NameError:
                        # Fallback for environments without __file__
                        script_dir = Path(os.getcwd())
                    
                    # The alerting package is looked up one level above the
                    # script directory: <script_dir>/../alerting
                    project_root = script_dir.parent
                    alerting_path = str(project_root / "alerting")
                    
                    # Make alert_manager importable without adding the path twice.
                    if alerting_path not in sys.path:
                        sys.path.insert(0, alerting_path)
                        
                    from alert_manager import AlertManager
                    
                    alert_manager = AlertManager()
                    # A truthy return value from send_alert is treated as success.
                    if alert_manager.send_alert(anomalies_file):
                        print("‚úÖ Security alerts sent successfully")
                    else:
                        print("‚ö†Ô∏è Alert sending failed")
                except ImportError as e:
                    # alert_manager (or one of its own imports) was not found.
                    print(f"‚ö†Ô∏è Alerting module import failed: {e}")
                    print("Please ensure:")
                    print(f"1. The 'alerting' directory exists at: {alerting_path}")
                    print("2. It contains 'alert_manager.py' with proper email configuration")
                except Exception as e:
                    # Any other alerting failure is reported but does not abort the run.
                    print(f"‚ö†Ô∏è Alerting failed: {str(e)}")
            else:
                print("No anomalies detected")

Fetching logs...
Fetched 667 logs

Detecting anomalies...

[DEBUG] Available columns in logs:
['@timestamp', 'agent', 'log', 'input', 'message', 'sensor', 'timestamp', 'src_ip', 'session', 'eventid', 'ecs', 'host', 'duration', 'ttylog', 'duplicate', 'shasum', 'size', 'src_port', 'protocol', 'dst_port', 'dst_ip', 'version', 'kexAlgs', 'encCS', 'compCS', 'keyAlgs', 'hasshAlgorithms', 'macCS', 'langCS', 'hassh', 'username', 'password', 'height', 'width', 'arch']

[DEBUG] Top 10 commands found:
command
Remote SSH version: SSH-2.0-OpenSSH_for_Windows_9.5               121
SSH client hassh fingerprint: 701158e75b508e76f0410d5d22ef9df0    121
Connection lost after 0.1 seconds                                  55
cowrie.session.params                                              24
Connection lost after 0.2 seconds                                  22
login attempt [root/anything] succeeded                            16
CMD: exit                                                          14
CMD: w

In [20]:
import warnings
import pandas as pd
from sklearn.ensemble import IsolationForest
from elasticsearch import Elasticsearch
from ipaddress import ip_network, ip_address
import numpy as np

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Configuration
# CIDR ranges that is_suspicious_ip() tests source addresses against.
# NOTE(review): 172.16.0.0/12 is not listed; confirm whether that range is
# meant to be covered as well.
SUSPICIOUS_SUBNETS = ["10.0.0.0/8", "192.168.0.0/16"]
# NOTE(review): the three thresholds below are not referenced by any code
# visible in this cell. Units are presumably a count of failed logins and
# seconds respectively - confirm before relying on them.
FAILURE_THRESHOLD = 5
MIN_SESSION_DURATION = 5
MAX_SESSION_DURATION = 1800

def safe_get(obj, key, default=""):
    """Fetch ``key`` from a dict, or from the dict at the head of a list.

    Returns ``default`` when ``obj`` is neither a dict nor a non-empty list
    whose first element is a dict, or when the key is absent.
    """
    container = obj
    if isinstance(container, list):
        # Lists are represented by their first element only.
        if not container or not isinstance(container[0], dict):
            return default
        container = container[0]
    if not isinstance(container, dict):
        return default
    return container.get(key, default)

def fetch_logs(es_client, index_pattern="cowrie-*", size=5000):
    """Run a match-all search and return the raw hits.

    Args:
        es_client: Object exposing ``search(index=..., body=...)``.
        index_pattern: Index name or pattern to query.
        size: Maximum number of hits requested in the search body.

    Returns:
        The list under ``hits.hits`` of the response (empty when either
        level is absent), or an empty list when the search raises. The
        failure is printed, not re-raised.
    """
    search_body = {"query": {"match_all": {}}, "size": size}
    try:
        reply = es_client.search(index=index_pattern, body=search_body)
        outer = reply.get('hits', {})
        found = outer.get('hits', [])
        print(f"Fetched {len(found)} logs")
    except Exception as exc:
        # Best-effort: any failure is reported and turned into "no logs".
        print(f"Elasticsearch error: {exc}")
        return []
    else:
        return found

def is_suspicious_ip(ip):
    """Tell whether ``ip`` falls inside any network in SUSPICIOUS_SUBNETS.

    Returns False when the address matches no subnet or when parsing the
    address or a subnet raises ValueError.
    """
    try:
        candidate = ip_address(ip)
        for cidr in SUSPICIOUS_SUBNETS:
            if candidate in ip_network(cidr):
                return True
    except ValueError:
        return False
    return False

def calculate_features(logs):
    """Turn raw Elasticsearch hits into the per-event feature table.

    Args:
        logs: Iterable of hits; the ``_source`` mapping of each hit becomes
            one row (a hit without ``_source`` contributes an empty row).

    Returns:
        A DataFrame holding the source fields plus ``hour``,
        ``day_of_week``, ``command``, ``command_count``, ``is_private_ip``,
        ``ip_frequency``, ``session_duration``, ``is_failed`` and
        ``failures_per_ip``. Rows whose timestamp cannot be parsed are
        dropped.

    Raises:
        ValueError: If the hits produce an empty DataFrame.
    """
    frame = pd.DataFrame([entry.get('_source', {}) for entry in logs])
    if frame.empty:
        raise ValueError("No log data available to process.")

    # --- time of day / day of week -------------------------------------
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], errors='coerce')
    frame = frame.dropna(subset=['timestamp'])  # unparseable timestamps are discarded
    stamps = frame['timestamp']
    frame['hour'] = stamps.dt.hour
    frame['day_of_week'] = stamps.dt.dayofweek

    # --- command text ---------------------------------------------------
    def extract_command(record):
        # Preference order: input string, first input list item, message,
        # and finally the event id as text.
        typed = record.get('input')
        if isinstance(typed, str):
            return typed
        if isinstance(typed, list) and typed:
            return typed[0]
        text = record.get('message')
        if isinstance(text, str):
            return text
        return str(record.get('eventid', ''))

    frame['command'] = frame.apply(extract_command, axis=1)
    frame['command_count'] = frame['command'].map(frame['command'].value_counts())

    # --- source address -------------------------------------------------
    frame['src_ip'] = frame['src_ip'].fillna('0.0.0.0')
    frame['is_private_ip'] = frame['src_ip'].apply(is_suspicious_ip)
    frame['ip_frequency'] = frame['src_ip'].map(frame['src_ip'].value_counts())

    # --- session length in seconds --------------------------------------
    def span_seconds(times):
        return (times.max() - times.min()).total_seconds()

    frame['session'] = frame['session'].fillna('no-session')
    per_session = frame.groupby('session')['timestamp']
    frame['session_duration'] = per_session.transform(span_seconds).fillna(0)

    # --- failed logins per address --------------------------------------
    frame['is_failed'] = frame['eventid'] == 'cowrie.login.failed'
    failures = frame.loc[frame['is_failed'], 'src_ip'].value_counts()
    # Addresses without a failed login are missing from the table -> 0.
    frame['failures_per_ip'] = frame['src_ip'].map(failures).fillna(0)

    return frame

def detect_anomalies(logs):
    """Score events with an IsolationForest and mark the outliers.

    Args:
        logs: Elasticsearch hits as accepted by ``calculate_features``.

    Returns:
        The feature DataFrame extended with ``anomaly_score`` (the value
        returned by ``fit_predict``) and ``is_anomaly`` (True where that
        value is -1), or None when ``logs`` is empty or any step raises.
        The reason is printed in both cases.
    """
    if logs:
        wanted = [
            'hour',
            'day_of_week',
            'command_count',
            'ip_frequency',
            'failures_per_ip',
            'session_duration',
        ]
        try:
            table = calculate_features(logs)
            inputs = table[wanted].fillna(0).astype(float)

            # Target about ten flagged rows, bounded to between 1% and 10%.
            total = len(table)
            share = min(0.1, max(0.01, 10 / total))
            print(f"\nAnalyzing {total} events with contamination={share:.2f}")

            detector = IsolationForest(
                random_state=42,
                n_estimators=200,
                contamination=share,
            )
            table['anomaly_score'] = detector.fit_predict(inputs)
            table['is_anomaly'] = table['anomaly_score'] == -1
            return table
        except Exception as problem:
            print(f"Detection failed: {problem}")
            return None

    print("No logs to analyze")
    return None

# Script entry point: fetch Cowrie logs from a local Elasticsearch, run the
# anomaly detector, print a report, write the flagged rows to anomalies.csv,
# start the Streamlit dashboard if it is not already up, and hand the CSV to
# AlertManager.
if __name__ == "__main__":
    es = Elasticsearch("http://localhost:9200")
    print("Fetching logs...")
    logs = fetch_logs(es)

    if logs:
        print("\nDetecting anomalies...")
        results = detect_anomalies(logs)

        if results is not None:
            # Keep only the rows the model flagged.
            anomalies = results[results['is_anomaly']]
            if not anomalies.empty:
                print("\n=== SECURITY ALERTS ===")
                print(f"Detected {len(anomalies)} suspicious events")

                print("\n[ATTACK PATTERNS]")
                print("Top malicious commands:")
                print(anomalies['command'].value_counts().head(10))

                print("\nTop attacking IPs:")
                print(anomalies['src_ip'].value_counts().head(5))

                print("\n[SAMPLE EVENTS]")
                print(anomalies[['timestamp', 'src_ip', 'command',
                               'session_duration', 'failures_per_ip']].head())

                # Save anomalies before alerting: the CSV path (relative to
                # the current working directory) is what send_alert receives.
                anomalies_file = "anomalies.csv"
                anomalies.to_csv(anomalies_file, index=False)
                print(f"\nSaved {len(anomalies)} anomalies to {anomalies_file}")

                import subprocess
                import psutil

                def is_streamlit_running(script_name="app.py"):
                    """Return True if some process looks like ``streamlit run <script_name>``.

                    The whole command line is searched, not just its first
                    element: when Streamlit is started through its console
                    script, argv[0] is usually the Python interpreter and
                    "streamlit" only appears in a later argument. This is a
                    substring heuristic, so any process whose command line
                    mentions both names is counted.
                    """
                    for proc in psutil.process_iter(attrs=["cmdline"]):
                        try:
                            cmdline = proc.info["cmdline"]
                        except (psutil.NoSuchProcess, psutil.AccessDenied):
                            continue
                        if not cmdline:
                            # Empty/None command line: nothing to inspect.
                            continue
                        joined = " ".join(cmdline)
                        if "streamlit" in joined and script_name in joined:
                            return True
                    return False

                # Launch Streamlit only if it's not already running
                try:
                    if not is_streamlit_running("app.py"):
                        print("\nüöÄ Launching Streamlit dashboard...")
                        # Fire-and-forget: the dashboard keeps running after
                        # this script finishes.
                        subprocess.Popen(["streamlit", "run", "app.py"])
                        print("‚úÖ Streamlit dashboard launched.")
                    else:
                        print("‚ÑπÔ∏è Streamlit dashboard is already running.")
                except Exception as e:
                    # A dashboard failure must not prevent alerting below.
                    print(f"‚ùå Failed to launch dashboard: {e}")

                # Alerting integration: import AlertManager from a sibling
                # "alerting" directory and pass it the CSV written above.
                try:
                    import os
                    import sys
                    from pathlib import Path

                    # __file__ is undefined in a notebook (NameError), so
                    # fall back to the current working directory.
                    try:
                        script_dir = Path(__file__).absolute().parent
                    except NameError:
                        script_dir = Path(os.getcwd())

                    # The alerting package is looked up one level above the
                    # script directory: <script_dir>/../alerting
                    project_root = script_dir.parent
                    alerting_path = str(project_root / "alerting")

                    if alerting_path not in sys.path:
                        sys.path.insert(0, alerting_path)

                    from alert_manager import AlertManager

                    alert_manager = AlertManager()
                    # A truthy return value from send_alert is treated as success.
                    if alert_manager.send_alert(anomalies_file):
                        print("‚úÖ Security alerts sent successfully")
                    else:
                        print("‚ö†Ô∏è Alert sending failed")
                except ImportError as e:
                    print(f"‚ö†Ô∏è Alerting module import failed: {e}")
                    print("Please ensure:")
                    print(f"1. The 'alerting' directory exists at: {alerting_path}")
                    print("2. It contains 'alert_manager.py' with proper email configuration")
                except Exception as e:
                    print(f"‚ö†Ô∏è Alerting failed: {str(e)}")
            else:
                print("No anomalies detected")
        else:
            print("Error during anomaly detection")
    else:
        print("No logs fetched from Elasticsearch")


Fetching logs...
Fetched 679 logs

Detecting anomalies...

Analyzing 679 events with contamination=0.01

=== SECURITY ALERTS ===
Detected 8 suspicious events

[ATTACK PATTERNS]
Top malicious commands:
command
Remote SSH version: SSH-2.0-OpenSSH_for_Windows_9.5               4
SSH client hassh fingerprint: 701158e75b508e76f0410d5d22ef9df0    4
Name: count, dtype: int64

Top attacking IPs:
src_ip
172.18.0.1    4
172.19.0.1    4
Name: count, dtype: int64

[SAMPLE EVENTS]
                           timestamp      src_ip  \
123 2025-05-23 13:39:29.738415+00:00  172.18.0.1   
124 2025-05-23 13:39:29.777265+00:00  172.18.0.1   
152 2025-05-23 13:39:29.777265+00:00  172.18.0.1   
190 2025-05-23 13:39:29.738415+00:00  172.18.0.1   
199 2025-05-25 16:11:53.715836+00:00  172.19.0.1   

                                               command  session_duration  \
123  Remote SSH version: SSH-2.0-OpenSSH_for_Window...          0.194360   
124  SSH client hassh fingerprint: 701158e75b508e76...        

In [19]:
!pip install psutil

