In [None]:
import pandas as pd
import numpy as np
import ipaddress

# Read the conn.log file into a DataFrame.
# The separator is a regular expression, so it is written as a raw string:
# inside a normal string literal '\s' is an invalid escape sequence, which
# Python flags with a warning when the cell is compiled.
conn_log_file = 'data/zeek-logs/conn.log'
conn_log_df = pd.read_csv(conn_log_file, sep=r'\s+')
conn_log_df



In [None]:
# Keep only the rows whose originator address parses as IPv4.
# .copy() makes the result an independent DataFrame, so later cells that add
# or overwrite columns on conn_log_df do not write into a slice of the
# unfiltered frame.
is_ipv4_origin = conn_log_df['id.orig_h'].apply(lambda x: ipaddress.ip_address(x).version == 4)
conn_log_df = conn_log_df[is_ipv4_origin].copy()

# Convert the 'ts' values (seconds) to datetimes and put them in chronological
# order, so each difference below is taken between adjacent points in time
# even if the rows of the log are not already ordered by 'ts'.
timestamps = pd.to_datetime(conn_log_df['ts'], unit='s').sort_values()

# Gap between each timestamp and the one before it (the first has none)
time_diffs = timestamps.diff().dropna()

# Average the gaps inside each one-second bucket; the grouping key is matched
# to time_diffs through the shared row index
time_diffs_per_second = time_diffs.dt.total_seconds().groupby(timestamps.dt.floor('s')).mean()

# Mean of the per-second averages
mean_arrival_time_per_second = time_diffs_per_second.mean()

# The value is a duration, so it is reported in seconds
print("Mean arrival time of packets per second:", mean_arrival_time_per_second, "seconds")

In [None]:
# Number of rows currently held in conn_log_df
len(conn_log_df)

In [None]:
# How many distinct originator addresses appear in conn_log_df
# (dropna=False so a missing value would still count as one entry)
conn_log_df['id.orig_h'].nunique(dropna=False)

In [None]:
# Addresses treated as "local" in this analysis
local_ips = [
    "104.248.118.173", "104.248.19.10", "104.248.29.106", "107.170.61.18", "107.170.61.19", "137.184.22.231",
    "138.68.190.139", "142.93.143.204", "143.110.192.246", "152.42.169.228", "159.203.68.137", "159.203.77.200",
    "159.223.11.117", "159.223.95.229", "159.65.151.72", "159.65.155.240", "159.65.185.236", "159.65.64.177",
    "159.65.72.64", "159.89.113.129", "159.89.121.138", "159.89.160.101", "159.89.53.114", "162.243.14.30",
    "164.90.221.158", "164.92.104.204", "164.92.83.209", "165.227.40.189", "170.64.217.183", "170.64.217.184",
    "170.64.217.68", "174.138.15.14", "178.62.10.136", "188.166.252.40", "188.226.134.9", "188.226.192.6",
    "192.241.200.18", "192.241.201.11", "192.241.201.8", "209.97.185.211", "64.23.233.193", "82.196.3.14",
]

# Rows whose responder is one of the local addresses
responder_is_local = conn_log_df['id.resp_h'].isin(local_ips)
fdf = conn_log_df.loc[responder_is_local]

# Number of rows that were NOT addressed to a local responder
conn_log_df.shape[0] - fdf.shape[0]

In [None]:
#fdf['duration'].replace('-', np.nan, inplace=True)
#fdf['duration'] = fdf['duration'].astype(float)
#fdf.mean()

In [None]:
# Number of distinct originator addresses among the rows in fdf
len(fdf['id.orig_h'].unique())

In [None]:
# Share of rows per value of the 'proto' column, in percent of all rows.
total_connections = conn_log_df.shape[0]
protocol_counts = conn_log_df['proto'].value_counts()
protocol_percentages = protocol_counts.div(total_connections).mul(100)

# value_counts() already orders by descending count, so the leading rows are
# the most used protocols; widen the slice to show more of them.
top_protocols = protocol_percentages.iloc[:10]
print("Top used protocols and their percentages:")
print(top_protocols)

In [None]:
# Share of rows per responder port ('id.resp_p'), in percent of all rows.
total_connections = conn_log_df.shape[0]
destination_port_counts = conn_log_df['id.resp_p'].value_counts()
destination_port_percentages = destination_port_counts.div(total_connections).mul(100)

# The counts come back in descending order; keep the 20 busiest ports
# (widen the slice to show more).
top_destination_ports = destination_port_percentages.iloc[:20]
print("Top destination ports and their percentages:")
print(top_destination_ports)

In [None]:
import matplotlib.pyplot as plt

# Rank-frequency plot of responder ports.

# Number of rows per responder port
response_ports = conn_log_df['id.resp_p']
response_port_counts = response_ports.value_counts()

# Put every row into the one-second bucket of its 'ts' value.
# The bucket keeps the full date and time: reducing it to a time of day
# (.dt.time) would merge the same clock second of different days into a single
# bucket and distort the per-second averages whenever the data spans more
# than one day.
conn_log_df['ts'] = pd.to_datetime(conn_log_df['ts'], unit='s')
conn_log_df['time_seconds'] = conn_log_df['ts'].dt.floor('s')

# Per port: mean number of rows per second, taken over the seconds in which
# that port occurs at all
requests_per_second = conn_log_df.groupby(['time_seconds', 'id.resp_p']).size().groupby('id.resp_p').mean()

# Ports ordered from most to least rows; position in this order is the rank
sorted_response_ports = response_port_counts.sort_values(ascending=False)

# log10 of rank (1-based) and log10 of the per-second frequency, in rank order
log_rank = np.log10(np.arange(1, len(sorted_response_ports) + 1))
log_frequency = np.log10(requests_per_second[sorted_response_ports.index])

# Plot rank-frequency plot
plt.figure(figsize=(10, 6))
plt.scatter(log_rank, log_frequency, marker='.', color='blue')
plt.xlabel('log10 Rank of Port by Packets Received')
plt.ylabel('log10 Frequency of Requests per Second')
plt.title('Rank-Frequency Plot of Response Ports')
plt.grid(True)
plt.show()

In [None]:
# Rank-frequency plot of originator addresses.

# Number of rows per originator address
source_ips = conn_log_df['id.orig_h']
source_ip_counts = source_ips.value_counts()

# Put every row into the one-second bucket of its 'ts' value.
# The bucket keeps the full date and time: reducing it to a time of day
# (.dt.time) would merge the same clock second of different days into a single
# bucket and distort the per-second averages whenever the data spans more
# than one day.
conn_log_df['ts'] = pd.to_datetime(conn_log_df['ts'], unit='s')
conn_log_df['time_seconds'] = conn_log_df['ts'].dt.floor('s')

# Per address: mean number of rows per second, taken over the seconds in which
# that address occurs at all
requests_per_second = conn_log_df.groupby(['time_seconds', 'id.orig_h']).size().groupby('id.orig_h').mean()

# Addresses ordered from most to least rows; position in this order is the rank
sorted_source_ips = source_ip_counts.sort_values(ascending=False)

# log10 of rank (1-based) and log10 of the per-second frequency, in rank order
log_rank = np.log10(np.arange(1, len(sorted_source_ips) + 1))
log_frequency = np.log10(requests_per_second[sorted_source_ips.index])

# Plot rank-frequency plot
plt.figure(figsize=(10, 6))
plt.scatter(log_rank, log_frequency, marker='.', color='blue')
plt.xlabel('log10 Rank of IP by Packets Sent')
plt.ylabel('log10 Frequency of Requests per Second')
plt.title('Rank-Frequency Plot of Source IPs')
plt.grid(True)
plt.show()

In [None]:
# Hourly row counts, expressed as an average per-second rate.

# Derive calendar-date and hour-of-day columns from the timestamp
conn_log_df['ts'] = pd.to_datetime(conn_log_df['ts'], unit='s')
ts_parts = conn_log_df['ts'].dt
conn_log_df['date'] = ts_parts.date
conn_log_df['hour'] = ts_parts.hour

# One row per (date, hour) pair holding the number of log rows in that hour
hourly_groups = conn_log_df.groupby(['date', 'hour'])
traffic_per_hour = hourly_groups.size().reset_index(name='packets_per_hour')

# Spread each hourly total over the 3600 seconds of an hour
traffic_per_hour['packets_per_second'] = traffic_per_hour['packets_per_hour'] / 3600

# Scatter of the hourly rates. The x value is the date only, so all hourly
# points of one day share the same x position.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(traffic_per_hour['date'], traffic_per_hour['packets_per_second'])
ax.set_xlabel('Date')
ax.set_ylabel('Packets per Second (Aggregated per Hour)')
ax.set_title('Total Network Traffic Over Time')
ax.grid(True)
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of (source IP, destination IP, destination port) for a random
# sample of rows.

# Draw at most 5000 rows: sample() raises ValueError when asked for more rows
# than the frame holds. The fixed random_state makes the same sample (and so
# the same figure) come out on every run.
sample_size = min(5000, len(conn_log_df))
sample_data = conn_log_df.sample(n=sample_size, random_state=0)
# To plot every row instead of a sample, use: sample_data = conn_log_df.copy()


def ipv4_to_int(address):
    """Return the integer value of a dotted-quad IPv4 string.

    The octets are combined positionally (a.b.c.d -> a<<24 | b<<16 | c<<8 | d);
    this is a plain conversion, not a hash, so ordering along the axis follows
    the numeric order of the addresses.
    """
    octets = address.split('.')[::-1]  # least significant octet first
    return sum(int(octet) << (8 * position) for position, octet in enumerate(octets))


# Numeric axes for both endpoints of each row
sample_data['source_ip_numeric'] = sample_data['id.orig_h'].apply(ipv4_to_int)
sample_data['destination_ip_numeric'] = sample_data['id.resp_h'].apply(ipv4_to_int)

# Create a 3D plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# x = source address, y = destination address, z = destination port
ax.scatter(sample_data['source_ip_numeric'], sample_data['destination_ip_numeric'], sample_data['id.resp_p'])

# Labeling axes
ax.set_xlabel('Source IP')
ax.set_zlabel('Destination Port')
ax.set_ylabel('Destination IP')

plt.title('3D Plot of Source IP, Destination Port, and Destination IP')
plt.show()

In [None]:
# Number of rows per value of the 'conn_state' column, most frequent first
conn_log_df['conn_state'].value_counts()

In [None]:
# Path of the IP2Location CSV export used for the address lookups below
csv_file = 'data/ip2location/IPV6-COUNTRY-REGION-CITY-LATITUDE-LONGITUDE-ISP-DOMAIN-MOBILE-USAGETYPE.CSV'

# The file is read without a header row (header=None), so the column names
# are supplied here, in file order.
columns = [
    "ip_from", "ip_to",
    "country_code", "country_name", "region_name", "city_name",
    "latitude", "longitude",
    "isp", "domain",
    "mcc", "mnc", "mobile_brand",
    "usage_type",
]
df = pd.read_csv(csv_file, header=None, names=columns)

def numeric_to_ipv6(numeric_ip):
    """Render an address number as its textual IP form.

    The value is passed to ``ipaddress.ip_address`` and the resulting address
    object is converted to ``str``. ``ip_address`` chooses the address family
    from the value itself: integers below 2**32 come back in IPv4 notation,
    larger ones in IPv6 notation.
    """
    return str(ipaddress.ip_address(numeric_ip))

# Convert a dotted-quad IPv4 string to the ip-number format of ip2location,
# where an IPv4 address is represented as IPv4-mapped IPv6 (::ffff:a.b.c.d).
# see https://blog.ip2location.com/knowledge-base/ipv4-mapped-ipv6-address/
def ipstr_to_int(ip_str):
    """Return the IPv4-mapped IPv6 number for a dotted-quad IPv4 string.

    Args:
        ip_str: IPv4 address written as ``a.b.c.d``.

    Returns:
        int: ``281470681743360 + a*256**3 + b*256**2 + c*256 + d``.

    Raises:
        ValueError: if the string does not consist of exactly four
            dot-separated parts, or a part is not an integer.
    """
    parts = ip_str.split('.')
    if len(parts) != 4:
        raise ValueError(f"expected a dotted-quad IPv4 address, got {ip_str!r}")

    # 281470681743360 == 0xFFFF00000000, the number of ::ffff:0.0.0.0
    ipv4_mapped_base = 281470681743360

    # Fold all four octets into one number, most significant first. The last
    # octet comes from the input like the others; it must not be a constant.
    ipv4_number = 0
    for part in parts:
        ipv4_number = ipv4_number * 256 + int(part)
    return ipv4_mapped_base + ipv4_number

# Make both range bounds numeric; values that cannot be parsed become NaN
for bound in ('ip_from', 'ip_to'):
    df[bound] = pd.to_numeric(df[bound], errors='coerce')

# Remove, in place, every row in which either bound ended up as NaN
df.dropna(subset=['ip_from', 'ip_to'], inplace=True)

In [None]:
# Five most frequent originator addresses, printed as
# "address, row count, share of all rows in percent".
ips = conn_log_df['id.orig_h'].value_counts().iloc[:5]
l = len(conn_log_df)
for address, hits in ips.items():
    share = round(hits/l*100,2)
    print(f"{address} \t\t {hits} \t {share}")

In [None]:
# Look up each address of `ips` in the IP2Location table and print the first
# range that contains it.
for ip in ips.index:
    ipnum = ipstr_to_int(str(ip))
    in_range = (df['ip_from'] <= ipnum) & (df['ip_to'] >= ipnum)
    ip2loc = df[in_range]
    hit = ip2loc.iloc[0]  # first matching range; IndexError if none matches
    print(f"IP: {ip} in {hit['country_code']} City: {hit['city_name']} ISP: {hit['isp']} Domain: {hit['domain']} Usage: {hit['usage_type']}")

In [None]:
# Share of rows per ORIGINATOR port ('id.orig_p'), in percent of all rows.
# NOTE: the variable names say "destination" but the data here is the origin
# port; running this cell rebinds the destination_port_* / top_destination_ports
# names to origin-port results.
total_connections = conn_log_df.shape[0]
destination_port_counts = conn_log_df['id.orig_p'].value_counts()
destination_port_percentages = destination_port_counts.div(total_connections).mul(100)

# The counts come back in descending order; keep the 20 most used origin ports
# (widen the slice to show more).
top_destination_ports = destination_port_percentages.iloc[:20]
print("Top origin ports and their percentages:")
print(top_destination_ports)

In [None]:
# Ten responder ports with the highest row counts
conn_log_df['id.resp_p'].value_counts().nlargest(10)

In [None]:
# Rows whose conn_state is "SF".
# NOTE(review): the variable is called rstrh, but the filter selects "SF" —
# confirm which state is intended.
rstrh = conn_log_df[conn_log_df['conn_state'] == "SF"]
rstrh

In [None]:
# Responder ports of the rows in rstrh, most frequent first
rstrh['id.resp_p'].value_counts()

In [None]:
# 'history' values of the rows in rstrh, most frequent first
rstrh['history'].value_counts()

In [None]:
# 'proto' values of the rows in rstrh, most frequent first
rstrh['proto'].value_counts()

In [None]:
# Number of rows in rstrh
len(rstrh)

In [None]:
# 'service' values of the rows in rstrh, most frequent first
rstrh['service'].value_counts()

In [None]:
# Addresses treated as "local"; used below to separate local from external originators
local_ips = ["104.248.118.173","104.248.19.10","104.248.29.106","107.170.61.18","107.170.61.19","137.184.22.231","138.68.190.139","142.93.143.204","143.110.192.246","152.42.169.228","159.203.68.137","159.203.77.200","159.223.11.117","159.223.95.229","159.65.151.72","159.65.155.240","159.65.185.236","159.65.64.177","159.65.72.64","159.89.113.129","159.89.121.138","159.89.160.101","159.89.53.114","162.243.14.30","164.90.221.158","164.92.104.204","164.92.83.209","165.227.40.189","170.64.217.183","170.64.217.184","170.64.217.68","174.138.15.14","178.62.10.136","188.166.252.40","188.226.134.9","188.226.192.6","192.241.200.18","192.241.201.11","192.241.201.8","209.97.185.211","64.23.233.193","82.196.3.14"]

In [None]:
# Rows of rstrh whose originator is NOT in local_ips (note the ~ negation),
# despite the "localcons" name
localcons = rstrh[~rstrh['id.orig_h'].isin(local_ips)]

In [None]:
# Originator address of every row in localcons (one entry per row, so an
# address can appear more than once)
exips = localcons['id.orig_h']
exips

In [None]:
# Print the IP2Location details for the originator of every row in exips.
# exips has one entry per row, so the same address can occur many times; each
# distinct address is resolved against the table only once and its formatted
# line is reused for the repeats. The printed output still has one line per
# entry of exips, in the same order.
location_lines = {}
for ip in exips:
    if ip not in location_lines:
        ipnum = ipstr_to_int(str(ip))
        ip2loc = df[(df['ip_from'] <= ipnum) & (df['ip_to'] >= ipnum)]
        if ip2loc.empty:
            # No range contains this address: report it and carry on instead
            # of letting .iloc[0] raise IndexError and abort the whole loop.
            location_lines[ip] = f"IP: {ip} not found in IP2Location data"
        else:
            match = ip2loc.iloc[0]  # first range that contains the address
            location_lines[ip] = f"IP: {ip} in {match['country_code']} City: {match['city_name']} ISP: {match['isp']} Domain: {match['domain']} Usage: {match['usage_type']}"
    print(location_lines[ip])

In [None]:
# Five most frequent originator addresses, leaving out 192.241.201.11.
# The address is dropped BEFORE ranking: errors='ignore' keeps the cell working
# when the address is not present (popping it from a top-six dict raises
# KeyError in that case), and the result always holds the top five of the
# remaining addresses. When the address is among the top six, this yields the
# same five entries as before.
excluded_origin = "192.241.201.11"
origin_counts = conn_log_df['id.orig_h'].value_counts()
top5ips = origin_counts.drop(labels=[excluded_origin], errors='ignore').nlargest(5).to_dict()
print(top5ips.keys())

# All rows that originate from one of those five addresses
hist = conn_log_df[conn_log_df['id.orig_h'].isin(top5ips.keys())] #icmp message code?

In [None]:
#localcons = hist[hist['id.orig_h'].isin(local_ips)]
#localcons
# Total number of rows in conn_log_df
len(conn_log_df)

In [None]:
# Row count per originator address within hist
hist['id.orig_h'].value_counts()

In [None]:
# Row count per 'id.resp_p' value within hist
hist['id.resp_p'].value_counts() # icmp message type

In [None]:
#top2
t2 = hist[(hist['id.orig_h'] == "221.234.36.218")]
t2['id.resp_p'].value_counts()

In [None]:
# Responder addresses contacted in the t2 rows, most frequent first
t2['id.resp_h'].value_counts()

In [None]:
# Row count per 'conn_state' value within hist
hist['conn_state'].value_counts()

In [None]:
# Five most frequent 'history' values within hist
hist['history'].value_counts().nlargest(5)

In [None]:
# Row count per 'proto' value within hist
hist['proto'].value_counts()

In [None]:
# Row count per 'service' value within hist
hist['service'].value_counts()

In [None]:
#hist['id.resp_p'].value_counts().nlargest(30)

In [None]:
# Up to 100 most frequent responder addresses within hist.
# Note: this rebinds the name `ips` used by earlier cells.
ips = hist['id.resp_h'].value_counts().nlargest(100)
len(ips)

In [None]:
# Result lists, filled with one entry per address in top5ips
usages, domains, countries = [], [], []

# One-off lookup of a single address, printed in full
ip = "159.203.181.133"
ipnum = ipstr_to_int(str(ip))
ip2loc = df[df['ip_from'].le(ipnum) & df['ip_to'].ge(ipnum)]
first = ip2loc.iloc[0]  # first range containing the address
print(f"IP: {ip} in {first['country_code']} City: {first['city_name']} ISP: {first['isp']} Domain: {first['domain']} Usage: {first['usage_type']}")

# Iterating the dict yields its keys, i.e. the addresses
for ip in top5ips:
    ipnum = ipstr_to_int(str(ip))
    ip2loc = df[df['ip_from'].le(ipnum) & df['ip_to'].ge(ipnum)]
    first = ip2loc.iloc[0]  # first range containing the address
    usages.append(first['usage_type'])
    domains.append(first['domain'])
    countries.append(first['country_code'])

In [None]:
from collections import Counter

# Tally how often each usage type, domain and country code was collected
usage_counts, domain_counts, country_counts = (
    Counter(values) for values in (usages, domains, countries)
)

In [None]:
# Show up to ten most frequent entries of each tally as (value, count) pairs
print(usage_counts.most_common(10))
print(domain_counts.most_common(10))
print(country_counts.most_common(10))

In [None]:
# All rows that originate from 192.241.201.11.
# The filter runs on conn_log_df, not on hist: hist only contains the addresses
# kept in top5ips, from which 192.241.201.11 was explicitly removed, so
# filtering hist for this address can never return a row.
strange = conn_log_df[conn_log_df['id.orig_h'] == "192.241.201.11"]
strange

In [None]:
# Responder addresses contacted in the `strange` rows, most frequent first
strange['id.resp_h'].value_counts()

In [None]:
# 'history' values of the `strange` rows, most frequent first
strange['history'].value_counts()

In [None]:
# Bar chart of the most contacted responder ports between 10 and 1023.
colors = ["#440154", "#3b528b", "#21918c", "#5ec962", "#fde725"]

# Keep ports 10-1023. The lower bound is inclusive so that port 10 is not lost
# between this frame and the separate "< 10" low-port group of the port-range
# summary, which labels this frame as "10-1024".
filtered_df = conn_log_df[(conn_log_df['id.resp_p'] < 1024) & (conn_log_df['id.resp_p'] >= 10)]

# Row count per port as a two-column frame
port_counts = filtered_df['id.resp_p'].value_counts().reset_index()
port_counts.columns = ['id.resp_p', 'count']

# Select the 30 most frequent ports (the variable and file names still say
# "50"). Ports become strings so the bars are evenly spaced categories.
top_50_ports = port_counts.nlargest(30, 'count')
top_50_ports['id.resp_p'] = top_50_ports['id.resp_p'].astype(str)

# Create the bar graph
plt.figure(figsize=(12, 6))
plt.bar(top_50_ports['id.resp_p'], top_50_ports['count'], color=colors[0])

# Add labels; the y axis is logarithmic
plt.xlabel('Response Port')
plt.ylabel('Connections (log)')
plt.yscale('log')
#plt.title('Top 30 Most Common Response Ports')
plt.xticks(rotation=90)  # vertical tick labels so the port numbers do not overlap
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('plots/top_50_ports_bar_graph.pdf')
plt.show()

In [None]:
reg_ports = conn_log_df[(conn_log_df['id.resp_p'] >= 1024) & (conn_log_df['id.resp_p'] < 49151)] #49151

In [None]:
low_ports = conn_log_df[(conn_log_df['id.resp_p'] < 10)]

In [None]:
dyn_ports = conn_log_df[(conn_log_df['id.resp_p'] >= 49151)]

In [None]:
# Summary of how the rows split across the responder-port groups, in the
# order: low (<10), well-known, registered, dynamic.
port_groups = (low_ports, filtered_df, reg_ports, dyn_ports)
total_rows = len(conn_log_df)

# Absolute sizes: the whole frame first, then each group
print(total_rows)
for group in port_groups:
    print(len(group))

# Relative sizes in percent of all rows, rounded to two decimals
suffixes = (
    "% to low ports <10",
    "% to well-known ports 10-1024",
    "% to registered ports 1024-49151",
    " % to dynamic ports >49151",
)
for group, suffix in zip(port_groups, suffixes):
    print(f"{round( len(group) / total_rows*100,2 )}{suffix}")