In [74]:
import json
import pandas as pd
from pathlib import Path

In [75]:
# load all jsons
folder_path = Path("data/peerlists")
all_peer_lists = []
all_peers = []

for json_file in folder_path.glob("*.json"):
   with open(json_file, 'r') as f:
       data = json.load(f)
   
   # Collect peer lists (without peers column for now)
   for peer_list in data['peer_lists']:
       peer_list_meta = {k: v for k, v in peer_list.items() if k != 'peers'}
       peer_list_meta['file_source'] = json_file.name  # Track which file
       peer_list_meta['length'] = len(peer_list['peers'])
       all_peer_lists.append(peer_list_meta)
       
       # Collect individual peers with source context
       for peer in peer_list['peers']:
           peer_record = peer.copy()
           peer_record['source_ip'] = peer_list['source_ip']
           peer_record['timestamp'] = peer_list['timestamp']
           peer_record['list_identifier'] = peer_list['source_ip'] + "_" + peer_list['timestamp']
           all_peers.append(peer_record)

# Create DataFrames
peer_lists_df = pd.DataFrame(all_peer_lists)
peers_df = pd.DataFrame(all_peers)

# Convert numeric columns
peer_lists_df['cumulative_difficulty'] = pd.to_numeric(peer_lists_df['cumulative_difficulty'])
peer_lists_df['current_height'] = pd.to_numeric(peer_lists_df['current_height'])

In [76]:
print(peer_lists_df.info())
print(peers_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37864 entries, 0 to 37863
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   source_ip                    37864 non-null  object 
 1   timestamp                    37864 non-null  object 
 2   cumulative_difficulty        33315 non-null  object 
 3   cumulative_difficulty_top64  33268 non-null  object 
 4   current_height               33315 non-null  float64
 5   top_version                  33315 non-null  float64
 6   file_source                  37864 non-null  object 
 7   length                       37864 non-null  int64  
 8   pruning_seed                 12723 non-null  float64
 9   current_blockchain_height    1 non-null      object 
dtypes: float64(3), int64(1), object(6)
memory usage: 2.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578057 entries, 0 to 1578056
Data columns (total 15 columns):
 #   Column    

In [77]:
print(peer_lists_df['length'].mean())

41.67697549123178


In [78]:
# Group by source_ip and collect unique peer IPs
unique_peers_by_source = peers_df.groupby('source_ip')['ip'].apply(lambda x: x.unique().tolist()).reset_index()
unique_peers_by_source.columns = ['source_ip', 'unique_peer_ips']

# Optional: Add count of unique peers per source
unique_peers_by_source['peer_count'] = unique_peers_by_source['unique_peer_ips'].apply(len)

peerlists_unique_indexed_df = unique_peers_by_source.set_index('source_ip')

In [80]:
for n in range(255):
    try:
        print(peerlists_unique_indexed_df.loc[f"193.142.4.{n}",'unique_peer_ips'])
        print(n)
    except:
        continue

['209.222.252.189', '23.92.36.5', '209.222.252.132', '209.222.252.85', '162.218.65.87', '162.218.65.95', '209.222.252.175', '209.222.252.167', '209.222.252.45', '91.198.115.220', '162.218.65.89', '91.198.115.241', '209.222.252.96', '209.222.252.184', '162.218.65.27', '162.218.65.77', '91.198.115.125', '91.198.115.35', '162.218.65.101', '209.222.252.77', '23.92.36.32', '23.92.36.53', '91.198.115.34', '91.198.115.62', '209.222.252.139', '162.218.65.192', '91.198.115.32', '91.198.115.133', '91.198.115.37', '91.198.115.82', '162.218.65.52', '91.198.115.172', '91.198.115.42', '91.198.115.142', '209.222.252.187', '23.92.36.33', '162.218.65.44', '162.218.65.29', '162.218.65.137', '209.222.252.207', '209.222.252.95', '91.198.115.195', '162.218.65.217', '209.222.252.224', '209.222.252.217', '91.198.115.247', '91.198.115.167', '162.218.65.8', '162.218.65.171', '91.198.115.144', '209.222.252.53', '162.218.65.47', '162.218.65.109', '209.222.252.156', '209.222.252.136', '91.198.115.108', '162.218.6

In [81]:
# Analyze sizes
peer_counts = unique_peers_by_source.set_index('source_ip')['peer_count']
print("Peer list size statistics:")
print(peer_counts.describe())
print(f"\nTop 5 sources by peer count:")
print(peer_counts.nsmallest(5))

# Analyze overlap between sources
from itertools import combinations

# Convert to dict for easier access
peer_sets = {row['source_ip']: set(row['unique_peer_ips']) 
            for _, row in unique_peers_by_source.iterrows()}

# Calculate pairwise overlaps
overlaps = []
for source1, source2 in combinations(peer_sets.keys(), 2):
   intersection = len(peer_sets[source1] & peer_sets[source2])
   union = len(peer_sets[source1] | peer_sets[source2])
   jaccard = intersection / union if union > 0 else 0
   
   overlaps.append({
       'source1': source1, 'source2': source2,
       'intersection': intersection, 'jaccard_similarity': jaccard
   })

overlap_df = pd.DataFrame(overlaps)
print(f"\nHighest overlaps:")
print(overlap_df.nlargest(5, 'intersection')[['source1', 'source2', 'intersection', 'jaccard_similarity']])

Peer list size statistics:
count    1968.000000
mean      673.398882
std       401.814734
min         1.000000
25%       346.750000
50%       720.500000
75%       920.250000
max      3670.000000
Name: peer_count, dtype: float64

Top 5 sources by peer count:
source_ip
174.3.157.168     1
172.81.177.185    2
216.203.98.124    2
45.137.99.166     2
46.234.195.230    3
Name: peer_count, dtype: int64

Highest overlaps:
                 source1         source2  intersection  jaccard_similarity
100034   104.155.200.188   34.151.110.94          2231            0.531317
1508178     34.146.205.9   34.151.110.94          2195            0.541975
100278   104.155.200.188   52.73.168.104          2142            0.535902
1193184    193.168.143.9  194.208.127.43          2126            0.706547
482188     146.70.133.25   52.73.168.104          2080            0.455741


In [82]:
import ipaddress

def ip_to_subnet(ip):
   try:
       return str(ipaddress.IPv4Network(f"{ip}/24", strict=False).network_address) + "/24"
   except:
       return None

# Add subnet analysis to the dataframe
unique_peers_by_source['unique_subnets'] = unique_peers_by_source['unique_peer_ips'].apply(
   lambda ip_list: list(set([ip_to_subnet(ip) for ip in ip_list if ip_to_subnet(ip) is not None]))
)

unique_peers_by_source['subnet_count'] = unique_peers_by_source['unique_subnets'].apply(len)

# Calculate the ratio: unique peers / unique subnets
unique_peers_by_source['peers_per_subnet_ratio'] = (
   unique_peers_by_source['peer_count'] / unique_peers_by_source['subnet_count']
)

In [83]:
unique_peers_by_source.info()
print(unique_peers_by_source['peers_per_subnet_ratio'].max())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1968 entries, 0 to 1967
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   source_ip               1968 non-null   object 
 1   unique_peer_ips         1968 non-null   object 
 2   peer_count              1968 non-null   int64  
 3   unique_subnets          1968 non-null   object 
 4   subnet_count            1968 non-null   int64  
 5   peers_per_subnet_ratio  1968 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 92.4+ KB
205.25


In [84]:
all_peer_subnets = []
for _, row in unique_peers_by_source.iterrows():
   for ip in row['unique_peer_ips']:
       subnet = ip_to_subnet(ip)
       if subnet:
           all_peer_subnets.append({'ip': ip, 'subnet': subnet, 'source_ip': row['source_ip']})

peer_subnet_df = pd.DataFrame(all_peer_subnets)

# Count unique peers per subnet (across all sources)
peers_per_subnet = peer_subnet_df.groupby('subnet')['ip'].nunique().reset_index()
peers_per_subnet.columns = ['subnet', 'unique_peer_count']

# Sort by count to see most populated subnets
peers_per_subnet = peers_per_subnet.sort_values('unique_peer_count', ascending=False)

print("Top 10 most populated subnets:")
print(peers_per_subnet.head(10))

Top 10 most populated subnets:
                subnet  unique_peer_count
6868   91.198.115.0/24                254
1357   162.218.65.0/24                254
2916  209.222.252.0/24                254
2461    193.142.4.0/24                253
2685   199.116.84.0/24                253
22      100.42.27.0/24                253
3210     23.92.36.0/24                 60
5864     80.78.21.0/24                 15
7042    93.113.25.0/24                 14
6966    92.116.61.0/24                 13
