# Prerequisite

In [None]:
# Download the sample Zeek connection log and save it as /home/conn.log.gz,
# the path the loading cells read from.
!wget -O /home/conn.log.gz https://raw.githubusercontent.com/un5eeny0t/UMCS-notebook/main/conn.log.gz


# Zeek Log Analysis in Colab





**Install Dependencies**

---



In [None]:
# Install any necessary packages
!pip install pandas matplotlib



**This code defines a function load_zeek_log that loads Zeek log files (compressed or plain text) into a pandas DataFrame, extracting headers from the #fields line and data from non-comment lines.**

---



In [None]:
import pandas as pd
import gzip
from io import StringIO

def load_zeek_log(filepath):
    """Load a Zeek TSV log (plain text or gzip-compressed) into a DataFrame.

    Column names are taken from the first ``#fields`` header line; every
    other line starting with ``#`` is treated as metadata and skipped.

    Parameters
    ----------
    filepath : str
        Path to the log. A path ending in ``.gz`` is opened with gzip,
        anything else is opened as plain text.

    Returns
    -------
    pandas.DataFrame
        One row per data line and one column per ``#fields`` entry. A log
        that has a header but no data lines yields an empty frame that still
        carries the header's columns.

    Raises
    ------
    ValueError
        If the log contains no ``#fields`` line.
    """
    import csv  # local import: only the QUOTE_NONE constant is needed

    # Automatically handle .gz files
    open_func = gzip.open if filepath.endswith('.gz') else open

    headers = None
    data_lines = []
    with open_func(filepath, 'rt') as file:
        # Single pass over the file: pick up the header, keep the data lines.
        for line in file:
            if line.startswith('#'):
                if headers is None and line.startswith('#fields'):
                    # Split on tab, the same separator used for the data rows.
                    headers = line.rstrip('\r\n').split('\t')[1:]  # Remove '#fields'
                continue
            if not line.rstrip('\r\n'):
                continue  # ignore completely empty lines
            data_lines.append(line)

    if headers is None:
        raise ValueError("No #fields line found in Zeek log")

    if not data_lines:
        # read_csv raises EmptyDataError on empty input; return an empty frame instead.
        return pd.DataFrame(columns=headers)

    # Zeek does not quote fields, so a literal '"' inside a value must not be
    # interpreted as the start of a quoted CSV field.
    return pd.read_csv(
        StringIO(''.join(data_lines)),
        sep='\t',
        names=headers,
        quoting=csv.QUOTE_NONE,
    )


**This code loads the Zeek connection log (conn.log.gz) into a pandas DataFrame using the load_zeek_log function and displays the first few rows of the DataFrame with head().**

---



In [None]:
# Parse the downloaded connection log into a DataFrame, then preview the first five rows.
conn_df = load_zeek_log("/home/conn.log.gz")
conn_df.head(5)


# Basic analysis

---



# Top 10 source IPs

**Counts the occurrences of each unique value in the id.orig_h column (in Zeek, the IP address of the connection originator, i.e. the source host) of the conn_df DataFrame and displays the 10 most frequent values.**

---



In [None]:
# Frequency table of originating hosts, most frequent first; keep the first ten entries.
conn_df["id.orig_h"].value_counts().iloc[:10]


# Top Destination Ports

**Counts the occurrences of each unique value in the id.resp_p column (in Zeek, the responder's port, i.e. the destination port) of the conn_df DataFrame and displays the 10 most frequent values.**

---



In [None]:
# Frequency table of responder ports, most frequent first; keep the first ten entries.
conn_df["id.resp_p"].value_counts().iloc[:10]


# Most Frequent Protocols

**Counts the occurrences of unique values in the proto column (which represents the protocol used, such as TCP, UDP, etc.) of the conn_df DataFrame and displays the counts for each protocol.**

---



In [None]:
# Number of connections seen for each value of the proto column.
conn_df.loc[:, "proto"].value_counts()


# Plot Traffic Volume

 **Creates a bar chart that visualizes the frequency of each unique value in the proto column (representing network protocols like TCP, UDP, etc.) of the conn_df DataFrame using pandas' built-in plotting functionality.**


---



In [None]:
# Bar chart of how many connections were logged per protocol.
# Title and axis labels are set so the figure can be read on its own.
ax = conn_df['proto'].value_counts().plot(kind='bar')
ax.set(title='Connections by Protocol', xlabel='Protocol', ylabel='Count');


**Defines a function load_zeek_log to load a gzipped Zeek log file (conn.log.gz), read its content line by line, and return the lines as a list. Then, it loads the log file and prints the first 10 lines of the file for preview.**

---



In [None]:
import gzip

# Load gzipped Zeek log file
def load_zeek_log(file_path):
    """Read a Zeek log file and return its raw lines (newlines included).

    NOTE(review): this definition reuses the name of the DataFrame-returning
    ``load_zeek_log`` defined earlier in the notebook. Once this cell has run,
    that earlier function is shadowed, so re-running the earlier loading cell
    would get a list instead of a DataFrame. Consider giving one of the two a
    distinct name.

    Parameters
    ----------
    file_path : str
        Path to a gzip-compressed or plain-text Zeek log.

    Returns
    -------
    list of str
        Every line of the file, header/comment lines included.
    """
    # Sniff the two-byte gzip magic number rather than trusting the file
    # extension, so compressed and plain-text logs are both readable.
    with open(file_path, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'

    opener = gzip.open if is_gzip else open
    with opener(file_path, 'rt') as f:  # 'rt' = read text mode
        return f.readlines()

# Read the raw lines of the downloaded connection log.
log_lines = load_zeek_log("/home/conn.log.gz")

# Show the first ten lines (fewer if the file is shorter), without surrounding whitespace.
for position in range(min(10, len(log_lines))):
    print(log_lines[position].strip())


**Defines a function parse_zeek_log that parses a list of Zeek log lines, extracts the column names from the #fields line, skips comment lines, and creates a pandas DataFrame with the log data. It then parses the conn.log content (log_lines) and displays the first few rows of the resulting DataFrame.**

---



In [None]:
import pandas as pd

def parse_zeek_log(lines):
    """Build a DataFrame from the raw lines of a Zeek TSV log.

    Parameters
    ----------
    lines : iterable of str
        Lines of the log, with or without trailing newlines.

    Returns
    -------
    pandas.DataFrame
        One row per data line; columns are named after the ``#fields``
        header. All values are kept as strings (no type conversion).

    Raises
    ------
    ValueError
        If data lines are present but no ``#fields`` line was found.
    """
    columns = []
    data = []
    for line in lines:
        # Remove only the line terminator: a full strip() would also drop
        # trailing tabs and so lose empty fields at the end of a record.
        record = line.rstrip('\r\n')
        if record.startswith('#fields'):
            columns = record.split('\t')[1:]  # Skip '#fields'
        elif record.startswith('#') or not record:
            continue  # Skip other comment lines and blank lines
        else:
            data.append(record.split('\t'))

    if data and not columns:
        raise ValueError("No #fields line found in Zeek log")

    return pd.DataFrame(data, columns=columns)

# Convert the raw log lines into a DataFrame.
df = parse_zeek_log(log_lines)

# Preview the first five rows.
df.head(5)


**Generates a bar chart that visualizes the top 10 source IP addresses (id.orig_h) from the df DataFrame by their count. The chart is displayed with a title, axis labels, and rotated x-axis labels for better readability. The plot uses matplotlib to create and display the graph.**


---



In [None]:
import matplotlib.pyplot as plt

# Ten most frequent values of id.orig_h and how often each occurs.
top_ips = df['id.orig_h'].value_counts().head(10)

# Draw on an explicit figure/axes pair instead of pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(10, 6))
top_ips.plot.bar(ax=ax, color='skyblue')
ax.set_title('Top 10 Source IPs')
ax.set_xlabel('Source IP')
ax.set_ylabel('Count')
# Tilt the tick labels so long addresses do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


**Generates a bar chart that shows the distribution of connection states (conn_state) from the df DataFrame. It plots the count of each unique connection state, with a title, axis labels, and rotated x-axis labels for clarity. The plot uses matplotlib to display the results.**

---



In [None]:
# Bar chart of how often each conn_state value occurs.
state_counts = df['conn_state'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
state_counts.plot.bar(ax=ax, color='lightcoral')
ax.set_title('Distribution of Connection States')
ax.set_xlabel('Connection State')
ax.set_ylabel('Count')
# Tilt the tick labels for readability.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


# Threat Hunting


---



In [None]:
import requests
from zipfile import BadZipFile, ZipFile

# === Step 1: Download the ZIP file ===

# URL of the malware traffic capture
URL = "https://www.malware-traffic-analysis.net/2022/01/03/2022-01-03-three-days-of-server-probes-including-log4j-attempts.pcap.zip"

# Path to save the downloaded ZIP file
OUTPUT = "/home/malware-pcap.zip"

# Tracks whether the archive was written, so extraction is not attempted on a
# missing or stale file after a failed download.
downloaded = False

# Stream the body so the archive is never held in memory as a whole. The
# context manager releases the connection, and the timeout keeps the cell
# from hanging forever on an unresponsive server.
with requests.get(URL, stream=True, timeout=60) as response:
    if response.status_code == 200:
        # Content-Length may be absent; the size is then reported as 0.
        size_kb = int(response.headers.get("Content-Length", 0)) / 1024
        print(f"Download good, writing {size_kb:.0f} KBytes to {OUTPUT}")

        with open(OUTPUT, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Download complete.")
        downloaded = True
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

# === Step 2: Unzip the file with password ===

if downloaded:
    try:
        with ZipFile(OUTPUT, "r") as zip_ref:
            zip_ref.extractall(path="/home", pwd="infected_20220103".encode("utf-8"))
        print("Extraction complete.")
    except (RuntimeError, BadZipFile) as e:
        # RuntimeError: e.g. wrong password; BadZipFile: the payload is not a ZIP archive.
        print(f"Failed to extract ZIP file: {e}")
else:
    print("Skipping extraction because the download did not succeed.")


In [None]:
# Add the OISF stable PPA, refresh the package index, install Suricata,
# then print the installed version.
!sudo add-apt-repository ppa:oisf/suricata-stable -y
!sudo apt update
!sudo apt install -y suricata
!suricata --version

In [None]:
!suricata-update
!suricata --version
!suricata-update list-sources
!suricata-update enable-source tgreen/hunting

In [None]:
# Directory Suricata writes its logs to (passed via -l in the run cell) and
# that the shell cells below clear and recreate.
LOGDIR = "/tmp/logs"

In [None]:
!rm -rf $LOGDIR && $mkdir $LOGDIR && ls -lah $LOGDIR

In [None]:
!mkdir -p /tmp/logs

In [None]:
# Capture file handed to Suricata with -r in the run cell.
# Presumably this is the file the password-protected archive extracts into /home - verify after extraction.
LOG4J_PCAP = "/home/2022-01-03-three-days-of-server-probes-including-log4j-attempts.pcap"

In [None]:
# Locate the installed suricata binary; the run cell invokes it by absolute path.
!whereis suricata

In [None]:
# Replay the capture through Suricata offline:
#   -S  rule file to load, -l  output log directory, -r  pcap to read, -v  verbose output.
!/usr/bin/suricata -S /var/lib/suricata/rules/suricata.rules -l $LOGDIR -r $LOG4J_PCAP -v

In [None]:
import json

In [None]:
# Pretty-print the first alert event with severity 1 found in Suricata's EVE log.
# eve.json holds one JSON object per line and has no header row, so every
# line (including the first) is inspected.
with open("/tmp/logs/eve.json", "r") as handle:
    for line in handle:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        eve = json.loads(line)
        if eve.get("event_type", "") == "alert" and eve.get("alert", {}).get("severity") == 1:
            print(json.dumps(eve, indent=2))
            break
    else:
        # Loop finished without a match: say so rather than printing nothing.
        print("No severity 1 alert found in eve.json")

In [None]:
# Install pandas into the kernel's environment (it is also installed by the
# dependency cell near the top of the notebook).
%pip install pandas

In [None]:
import pandas as pd

# Switch off MathJax processing in pandas' HTML table output.
pd.set_option("display.html.use_mathjax", False)

# Tiny two-row frame to check how DataFrames render in this notebook.
pd.DataFrame({"src_ip": ["1.1.1.1", "2.2.2.2"], "flow_id": [123, 124]})

In [None]:
# Decode every line of the EVE log as JSON and flatten the nested objects
# into one table (nested keys become dotted column names).
with open("/tmp/logs/eve.json", "r") as handle:
    DF = pd.json_normalize(list(map(json.loads, handle)))

DF

In [None]:
# (rows, columns) of the flattened event table.
DF.shape

In [None]:
# Report the table size in words.
row_total, column_total = DF.shape
print("dataframe has %d rows and %d columns" % (row_total, column_total))

In [None]:
# Names of the columns whose name begins with "stats".
COLS_STATS = DF.columns[DF.columns.str.startswith("stats")].tolist()
len(COLS_STATS)

In [None]:
# Share of "stats" columns among all columns.
print("%d stats cols from total %d" % (len(COLS_STATS), DF.shape[1]))

In [None]:
# Everything that is not a "stats" column.
COLS_DATA = DF.columns[~DF.columns.str.startswith("stats")].tolist()
print("%d data columns" % len(COLS_DATA))

In [None]:
# Show the table restricted to the non-stats columns.
DF.loc[:, COLS_DATA]

In [None]:
# Summary statistics of the event table.
DF.describe()

In [None]:
# First five rows of the event table.
DF.head()

In [None]:
# Replace missing values with 0 and cast these numeric columns to integers.
DF = DF.assign(**{
    name: DF[name].fillna(0).astype(int)
    for name in ("src_port", "dest_port", "pcap_cnt")
})

In [None]:
# Convert the timestamp strings into pandas datetime values.
DF["timestamp"] = DF["timestamp"].pipe(pd.to_datetime)

In [None]:
# Display the table again after the integer and timestamp conversions.
DF

In [None]:
# Boolean mask: True for rows whose event_type is "alert".
DF["event_type"].eq("alert")

In [None]:
# Alert events only, excluding the "Generic Protocol Command Decode" category.
is_alert = DF["event_type"] == "alert"
is_protocol_decode = DF["alert.category"] == "Generic Protocol Command Decode"

# Oldest first; columns that are empty for every remaining row are dropped.
DF_ALERT = (
    DF[is_alert & ~is_protocol_decode]
    .sort_values("timestamp")
    .dropna(axis="columns", how="all")
)


In [None]:
# Store flow ids as strings: fill gaps with 0, go through int64, then convert to str.
flow_ids_filled = DF_ALERT["flow_id"].fillna(0)
DF_ALERT["flow_id"] = flow_ids_filled.astype("int64").astype(str)

In [None]:
# Report the size of the filtered alert table.
alert_rows, alert_cols = DF_ALERT.shape
print("dataframe has %d rows and %d columns" % (alert_rows, alert_cols))

In [None]:
# First five rows of the filtered alert table.
DF_ALERT.head()

In [None]:
# Compact view of the alerts: when, which flow, between which hosts, and what fired.
DF_ALERT.loc[
    :,
    ["timestamp", "flow_id", "flow.src_ip", "flow.dest_ip", "alert.signature", "alert.category"],
]

# Adding Malaysia Time

In [None]:
# Make sure the timestamp column holds datetimes, then derive a second column
# with the same instants shown in Malaysia time as plain text.
parsed_ts = pd.to_datetime(DF_ALERT["timestamp"])
DF_ALERT["timestamp"] = parsed_ts
DF_ALERT["timestamp_myt"] = (
    parsed_ts
    .dt.tz_convert("Asia/Kuala_Lumpur")
    .dt.strftime("%Y-%m-%d %H:%M:%S")
)
DF_ALERT.loc[
    :,
    ["timestamp", "timestamp_myt", "flow_id", "flow.src_ip", "flow.dest_ip", "alert.signature", "alert.category"],
]