In [1]:
import duckdb
#import json

### Repair JSON input

In [2]:
# NOTE: this cell is intentionally disabled (every line is commented out).
# If re-enabled, it would read "nextcloud.log.json" one line at a time, keep
# the lines that parse as JSON, report the ones that do not, and write the
# surviving entries to "fixed_nextcloud.log.json" as a single compact JSON
# array. Re-enabling it also requires restoring the `import json` line that is
# commented out in the first cell.
#
# valid_entries = []
# with open("nextcloud.log.json", "r", encoding="utf-8") as f:
#     for line in f:
#         try:
#             # Process each line as a JSON object
#             valid_entries.append(json.loads(line))  
#         except json.JSONDecodeError:
#             print("Skipping malformed line")

# # Save the repaired JSON
# with open("fixed_nextcloud.log.json", "w", encoding="utf-8") as f:
#     json.dump(valid_entries, f, separators=(',', ':'))

### Ingest and group data

In [3]:
%%time

json_input = duckdb.read_json("fixed_nextcloud.log.json")

domain_errors = duckdb.sql("""
                       SELECT *
                       FROM json_input
                       WHERE message LIKE '%Trusted domain error%'
                       -- LIMIT 10
                           """)

gby_user_agent = duckdb.sql("""
                        SELECT CAST(remoteAddr AS INET) AS remoteAddr_inet,
                        COUNT(*) AS count
                        FROM domain_errors
                        WHERE userAgent LIKE '%Macintosh%'
                        GROUP BY remoteAddr
                        ORDER BY count DESC
                             """)

print(gby_user_agent)

┌─────────────────┬───────┐
│ remoteAddr_inet │ count │
│      inet       │ int64 │
├─────────────────┼───────┤
│ 156.38.245.18   │  7102 │
│ 103.121.39.54   │  1323 │
│ 3.236.57.159    │  1312 │
│ 185.231.154.40  │   760 │
│ 185.134.23.83   │   229 │
│ 199.127.60.228  │   148 │
│ 141.98.11.107   │   136 │
│ 78.153.140.179  │    90 │
│ 148.153.45.234  │    83 │
│ 45.149.241.33   │    70 │
│       ·         │     · │
│       ·         │     · │
│       ·         │     · │
│ 64.62.197.241   │     1 │
│ 64.62.197.161   │     1 │
│ 65.49.1.63      │     1 │
│ 123.160.221.133 │     1 │
│ 64.62.197.203   │     1 │
│ 134.209.126.46  │     1 │
│ 204.48.23.174   │     1 │
│ 107.150.117.103 │     1 │
│ 35.92.177.40    │     1 │
│ 44.234.45.97    │     1 │
├─────────────────┴───────┤
│  1073 rows (20 shown)   │
└─────────────────────────┘

CPU times: user 4.99 s, sys: 460 ms, total: 5.45 s
Wall time: 5.36 s


### Read the IP geolocation database and join it to the grouped addresses

In [4]:
%%time

ipv4_city = duckdb.sql("""
                SELECT * 
                FROM read_csv('dbip-city-lite-2025-02.csv',
                               columns = {
                               'ip_start': 'VARCHAR(15)',
                               'ip_end': 'VARCHAR(15)', 
                               'continent': 'VARCHAR(2)',
                               'country': 'VARCHAR(2)',
                               'stateprov': 'TEXT',
                               'city': 'TEXT',
                               'latitude': 'FLOAT',
                               'longitude': 'FLOAT'
                               }, 
                               header = False,
                               ignore_errors = true)
                WHERE ip_start LIKE '%.%'
                      """)

ipv4_city_subset = duckdb.sql("""
                        SELECT CAST(ip_start as INET) AS ip_start_inet, 
                               CAST(ip_end as INET) AS ip_end_inet,
                               country,
                               stateprov,
                               city
                        FROM ipv4_city
                        -- LIMIT 10
                            """)

gby_user_agent_city_join = duckdb.sql("""
                                SELECT gua.remoteAddr_inet,
                                       gua.count,
                                       ics.country,
                                       ics.stateprov,
                                       ics.city
                                FROM gby_user_agent gua
                                JOIN ipv4_city_subset ics
                                ON gua.remoteAddr_inet
                                BETWEEN ics.ip_start_inet AND ics.ip_end_inet
                                ORDER BY count DESC
                                """)

print(gby_user_agent_city_join)

┌─────────────────┬───────┬─────────┬───────────────────────┬───────────────────────────────┐
│ remoteAddr_inet │ count │ country │       stateprov       │             city              │
│      inet       │ int64 │ varchar │        varchar        │            varchar            │
├─────────────────┼───────┼─────────┼───────────────────────┼───────────────────────────────┤
│ 156.38.245.18   │  7102 │ ZA      │ Gauteng               │ Johannesburg                  │
│ 103.121.39.54   │  1323 │ BD      │ Chittagong            │ Chittagong (Chawk Bazar)      │
│ 3.236.57.159    │  1312 │ US      │ Virginia              │ Ashburn                       │
│ 185.231.154.40  │   760 │ RU      │ Moscow                │ Moscow                        │
│ 185.134.23.83   │   229 │ GB      │ England               │ London                        │
│ 141.98.11.107   │   136 │ LT      │ Kaunas                │ Kaunas                        │
│ 78.153.140.179  │    90 │ GB      │ England               

In [52]:
#print(ipv4_city_subset)