In [1]:
import json

import duckdb

### Repair JSON input

In [2]:
# Repair the raw Nextcloud log so it can be ingested as one well-formed JSON
# document. Every non-blank line of the input is parsed as a standalone JSON
# value; lines that fail to parse are reported with their line number and
# dropped, so a damaged entry can be located in the original file afterwards.
valid_entries = []
skipped_lines = 0
with open("nextcloud.log.json", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # Blank lines hold no log entry; ignore them instead of
            # reporting them as malformed.
            continue
        try:
            # Process each line as a JSON object
            valid_entries.append(json.loads(line))
        except json.JSONDecodeError as exc:
            skipped_lines += 1
            print(f"Skipping malformed line {line_no}: {exc.msg}")

# One-line summary so the reader can see how much of the log was dropped.
print(f"Kept {len(valid_entries)} entries, skipped {skipped_lines} malformed lines")

# Save the repaired JSON as a single compact array (no spaces after separators)
with open("fixed_nextcloud.log.json", "w", encoding="utf-8") as f:
    json.dump(valid_entries, f, separators=(',', ':'))

### Ingest and group data

In [29]:
%%time

# Read the repaired log file (fixed_nextcloud.log.json) with DuckDB.
# NOTE: the SQL below refers to the Python variables `json_input` and
# `domain_errors` by name, so these variable names must stay in sync with
# the table names used inside the query strings.
json_input = duckdb.read_json("fixed_nextcloud.log.json")

# Keep only the log entries whose message contains "Trusted domain error".
domain_errors = duckdb.sql("""
                       SELECT *
                       FROM json_input
                       WHERE message LIKE '%Trusted domain error%'
                       -- LIMIT 10
                           """)

# Count the trusted-domain errors per remote address, restricted to entries
# whose userAgent contains the substring "bot". LIKE is case-sensitive, so a
# user agent spelled with "Bot" is not matched by this filter.
# The address is cast to INET and exposed as `remoteAddr_inet`; the IP range
# join further down compares that column against INET range bounds.
gby_user_agent = duckdb.sql("""
                        SELECT CAST(remoteAddr AS INET) AS remoteAddr_inet,
                        COUNT(*) AS count
                        FROM domain_errors
                        WHERE userAgent LIKE '%bot%'
                        GROUP BY remoteAddr
                        ORDER BY count DESC
                             """)

# Show the per-address counts, highest count first.
print(gby_user_agent)

┌─────────────────┬───────┐
│ remoteAddr_inet │ count │
│      inet       │ int64 │
├─────────────────┼───────┤
│ 199.195.250.80  │    29 │
│ 83.147.52.42    │     4 │
│ 87.120.115.119  │     2 │
│ 179.43.143.42   │     2 │
│ 78.153.140.177  │     2 │
│ 78.153.140.179  │     2 │
│ 178.215.236.132 │     1 │
│ 88.214.26.9     │     1 │
│ 54.144.96.170   │     1 │
│ 3.215.185.65    │     1 │
│ 104.140.148.46  │     1 │
│ 87.251.66.220   │     1 │
│ 78.153.140.151  │     1 │
│ 18.212.3.249    │     1 │
│ 194.37.80.250   │     1 │
│ 3.82.53.117     │     1 │
│ 87.120.115.34   │     1 │
│ 94.156.64.214   │     1 │
│ 76.8.60.182     │     1 │
│ 174.138.62.1    │     1 │
│ 207.241.235.133 │     1 │
├─────────────────┴───────┤
│ 21 rows       2 columns │
└─────────────────────────┘

CPU times: user 4.94 s, sys: 486 ms, total: 5.43 s
Wall time: 5.42 s


In [32]:
# domain_errors = duckdb.sql("""
#                        SELECT time, remoteAddr, userAgent
#                        FROM json_input
#                        WHERE message LIKE '%Trusted domain error%'
#                        AND userAgent LIKE '%bot%'
#                        --- LIMIT 10
#                            """)

# print(domain_errors)

In [33]:
# login_errors = duckdb.sql("""
#                        SELECT time, remoteAddr, message, 
#                        FROM json_input
#                        WHERE message LIKE '%Login failed%'
#                        AND message NOT LIKE '%vnegi10%'
#                        AND message NOT LIKE '%mdash%'
#                        --- LIMIT 10
#                            """)

# print(login_errors)

### Read IP database and join to group

In [23]:
%%time

# Load the IP-to-city CSV with an explicit column schema; the file is read
# without a header row (header = False). `ignore_errors = true` makes the
# reader skip rows it cannot parse rather than fail.
# The WHERE clause keeps rows whose ip_start contains a dot -- presumably to
# retain only IPv4 ranges (IPv6 addresses use ':'); TODO confirm against the file.
ipv4_city = duckdb.sql("""
                SELECT * 
                FROM read_csv('dbip-city-lite-2025-02.csv',
                               columns = {
                               'ip_start': 'VARCHAR(15)',
                               'ip_end': 'VARCHAR(15)', 
                               'continent': 'VARCHAR(2)',
                               'country': 'VARCHAR(2)',
                               'stateprov': 'TEXT',
                               'city': 'TEXT',
                               'latitude': 'FLOAT',
                               'longitude': 'FLOAT'
                               }, 
                               header = False,
                               ignore_errors = true)
                WHERE ip_start LIKE '%.%'
                      """)

# Cast both range bounds to INET so they can be compared with
# `remoteAddr_inet`, and keep only the location columns used in the join
# (continent, latitude and longitude are dropped).
ipv4_city_subset = duckdb.sql("""
                        SELECT CAST(ip_start as INET) AS ip_start_inet, 
                               CAST(ip_end as INET) AS ip_end_inet,
                               country,
                               stateprov,
                               city
                        FROM ipv4_city
                        -- LIMIT 10
                            """)

# Attach a location to every counted address by finding the range that
# contains it (BETWEEN is inclusive on both bounds). This is a plain JOIN, so
# addresses that fall inside no range do not appear in the result.
# Depends on `gby_user_agent` from the "Ingest and group data" cell, which
# must have been executed before this one.
gby_user_agent_city_join = duckdb.sql("""
                                SELECT gua.remoteAddr_inet,
                                       gua.count,
                                       ics.country,
                                       ics.stateprov,
                                       ics.city
                                FROM gby_user_agent gua
                                JOIN ipv4_city_subset ics
                                ON gua.remoteAddr_inet
                                BETWEEN ics.ip_start_inet AND ics.ip_end_inet
                                ORDER BY count DESC
                                """)

# Show the located addresses, highest count first.
print(gby_user_agent_city_join)

┌─────────────────┬───────┬─────────┬──────────────────┬──────────────────────────────┐
│ remoteAddr_inet │ count │ country │    stateprov     │             city             │
│      inet       │ int64 │ varchar │     varchar      │           varchar            │
├─────────────────┼───────┼─────────┼──────────────────┼──────────────────────────────┤
│ 115.159.220.67  │  1610 │ CN      │ Shanghai         │ Shanghai                     │
│ 91.250.240.220  │   166 │ PT      │ Lisbon           │ Lisbon                       │
│ 20.163.106.32   │   148 │ US      │ Arizona          │ Phoenix                      │
│ 94.102.61.7     │   116 │ NL      │ North Holland    │ Amsterdam                    │
│ 185.242.226.109 │    67 │ NL      │ North Holland    │ Amsterdam                    │
│ 185.242.226.88  │    54 │ NL      │ North Holland    │ Amsterdam                    │
│ 130.211.54.158  │    38 │ BE      │ Brussels Capital │ Brussels                     │
│ 34.140.248.32   │    38 │ BE  

### Read ASN database and join to group

### Performance - Full query

In [28]:
# %%time

# json_input = duckdb.read_json("fixed_nextcloud.log.json")

# domain_errors = duckdb.sql("""
#                        SELECT *
#                        FROM json_input
#                        WHERE message LIKE '%Trusted domain error%'
#                        -- LIMIT 10
#                            """)

# gby_user_agent = duckdb.sql("""
#                         SELECT CAST(remoteAddr AS INET) AS remoteAddr_inet,
#                         COUNT(*) AS count
#                         FROM domain_errors
#                         WHERE userAgent LIKE '%Mozilla%'
#                         GROUP BY remoteAddr
#                         ORDER BY count DESC
#                              """)

# ipv4_city = duckdb.sql("""
#                 SELECT * 
#                 FROM read_csv('dbip-city-lite-2025-02.csv',
#                                columns = {
#                                'ip_start': 'VARCHAR(15)',
#                                'ip_end': 'VARCHAR(15)', 
#                                'continent': 'VARCHAR(2)',
#                                'country': 'VARCHAR(2)',
#                                'stateprov': 'TEXT',
#                                'city': 'TEXT',
#                                'latitude': 'FLOAT',
#                                'longitude': 'FLOAT'
#                                }, 
#                                header = False,
#                                ignore_errors = true)
#                 WHERE ip_start LIKE '%.%'
#                       """)

# ipv4_city_subset = duckdb.sql("""
#                         SELECT CAST(ip_start as INET) AS ip_start_inet, 
#                                CAST(ip_end as INET) AS ip_end_inet,
#                                country,
#                                stateprov,
#                                city
#                         FROM ipv4_city
#                         -- LIMIT 10
#                             """)

# gby_user_agent_city_join = duckdb.sql("""
#                                 SELECT gua.remoteAddr_inet,
#                                        gua.count,
#                                        ics.country,
#                                        ics.stateprov,
#                                        ics.city
#                                 FROM gby_user_agent gua
#                                 JOIN ipv4_city_subset ics
#                                 ON gua.remoteAddr_inet
#                                 BETWEEN ics.ip_start_inet AND ics.ip_end_inet
#                                 ORDER BY count DESC
#                                 """)

# print(gby_user_agent_city_join)