In [21]:
import boto3
from botocore.exceptions import ClientError
from collections import defaultdict
import csv
import json
import ollama
import os
from pathlib import Path
from pydantic import BaseModel
import sqlite3
import tldextract

In [22]:
# Fetch the "umbrella-static" top-1m domain list; -nc skips the download when the zip already exists.
!wget -nc http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip
# Extract top-1m.csv into the working directory; -o overwrites an earlier extraction without prompting.
!unzip -o top-1m.csv.zip

--2025-03-10 21:34:28--  http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 16.15.4.148, 52.219.220.128, 52.219.112.144, ...
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|16.15.4.148|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12395958 (12M) [application/zip]
Saving to: ‘top-1m.csv.zip’


2025-03-10 21:34:30 (7.03 MB/s) - ‘top-1m.csv.zip’ saved [12395958/12395958]

Archive:  top-1m.csv.zip
  inflating: top-1m.csv              


In [23]:
def ioc_validation_prompt(ioc, ioc_type, context):
    """Build the prompt asking whether ``ioc`` really is an ``ioc_type``.

    The prompt is the question, a blank line, a dashed separator and then
    ``context``; every non-blank line carries a four-space indent and the
    text starts with a newline and ends with an indent-only line.
    """
    pad = "    "
    question = (
        f"Based on the following text, is {ioc} an {ioc_type}? "
        'Answer "true" or "false", with a single sentence reason.'
    )
    pieces = ["", f"{pad}{question}", "", f"{pad}-----------", f"{pad}{context}", pad]
    return "\n".join(pieces)

def ioc_relevance_prompt(ioc, ioc_type, context):
    """Build the prompt asking whether ``ioc`` is an indicator of compromise.

    Same layout as the rest of the prompt text in this notebook: the
    question, a blank line, a dashed separator and then ``context``, with a
    four-space indent on every non-blank line.
    """
    pad = "    "
    question = (
        f"Based on the following text, is the {ioc_type} {ioc} an indicator of compromise? "
        'Answer "true" or "false", with a single sentence reason.'
    )
    pieces = ["", f"{pad}{question}", "", f"{pad}-----------", f"{pad}{context}", pad]
    return "\n".join(pieces)

In [24]:
# Structured verdict expected back from the model: Answer.model_json_schema() is
# passed to ollama.chat as `format`, and the reply is parsed with
# Answer.model_validate_json.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring would presumably be copied into the generated JSON schema — confirm
# before converting.
class Answer(BaseModel):
    answer: bool  # the true/false verdict the prompts ask for
    reason: str  # the single-sentence justification the prompts ask for

In [25]:
def is_valid_ioc(ioc, ioc_type, context, model='qwen2.5:14b'):
    """Ask the LLM whether ``ioc`` really is an ``ioc_type`` given ``context``.

    Args:
        ioc: Candidate indicator text.
        ioc_type: Type name inserted into the prompt (e.g. "IPv4 Address").
        context: Text excerpt shown to the model alongside the question.
        model: Ollama model tag to query. Defaults to the tag that was
            previously hard-coded, so existing three-argument calls behave
            exactly as before.

    Returns:
        Answer: the parsed verdict (``answer`` flag plus ``reason`` text).

    Raises:
        Nothing is caught here: errors from ``ollama.chat`` or from parsing
        the reply propagate to the caller.
    """
    response = ollama.chat(
        model=model,
        messages=[
            {
                'role': 'user',
                'content': ioc_validation_prompt(ioc, ioc_type, context),
            },
        ],
        # Constrain the reply to the Answer JSON schema so it can be parsed below.
        format=Answer.model_json_schema(),
    )
    return Answer.model_validate_json(response.message.content)

In [26]:
def is_relevant_ioc(ioc, ioc_type, context, model='qwen2.5:14b'):
    """Ask the LLM whether ``ioc`` is an indicator of compromise given ``context``.

    Args:
        ioc: Candidate indicator text.
        ioc_type: Type name inserted into the prompt (e.g. "Domain").
        context: Text excerpt shown to the model alongside the question.
        model: Ollama model tag to query. Defaults to the tag that was
            previously hard-coded, so existing three-argument calls behave
            exactly as before.

    Returns:
        Answer: the parsed verdict (``answer`` flag plus ``reason`` text).

    Raises:
        Nothing is caught here: errors from ``ollama.chat`` or from parsing
        the reply propagate to the caller.
    """
    response = ollama.chat(
        model=model,
        messages=[
            {
                'role': 'user',
                'content': ioc_relevance_prompt(ioc, ioc_type, context),
            },
        ],
        # Constrain the reply to the Answer JSON schema so it can be parsed below.
        format=Answer.model_json_schema(),
    )
    return Answer.model_validate_json(response.message.content)

In [27]:
def top_domains(path='top-1m.csv'):
    """Load the domain column of a ``rank,domain`` CSV into a set.

    Args:
        path: CSV file to read. Defaults to the ``top-1m.csv`` this notebook
            has always read, so existing zero-argument calls are unchanged.

    Returns:
        set[str]: every non-empty value found in the second column of rows
        whose first column is a numeric rank.
    """
    domains = set()
    # newline='' is what the csv module asks for so it can handle line endings itself.
    with open(path, newline='') as f:
        for row in csv.reader(f):
            # Blank lines, short rows and non-numeric ranks (e.g. a header) are
            # skipped instead of aborting the whole load with an unpacking or
            # int() error.
            if len(row) < 2 or not row[0].strip().isdigit():
                continue
            if row[1]:
                domains.add(row[1])

    return domains

def extract_tld_plus_one(url):
    """Return ``domain.suffix`` for ``url`` as split by ``tldextract.extract``.

    Args:
        url: URL or hostname to analyse.

    Returns:
        str: the extracted domain and suffix joined by a dot. When either part
        is empty (tldextract reports an empty suffix for IP-address hosts and
        bare hostnames) only the non-empty part is returned, so the result
        never carries a dangling leading or trailing dot; both empty gives "".
    """
    ext = tldextract.extract(url)
    # Join only the parts that exist: f"{domain}.{suffix}" produced "1.2.3.4."
    # for IP hosts because their suffix is empty.
    return ".".join(part for part in (ext.domain, ext.suffix) if part)

# Loaded once when this cell runs; the URL check in update_false_positives tests
# membership against this in-memory set.
common_domains = top_domains()

In [30]:
def context(ioc, body, window=1000):
    """Return the text surrounding the first occurrence of ``ioc`` in ``body``.

    Args:
        ioc: Indicator string to locate.
        body: Full text to search.
        window: Maximum number of characters kept on each side of the match
            (defaults to the previously hard-coded 1000).

    Returns:
        str: up to ``window`` characters before the match, the match itself,
        and up to ``window`` characters after it. If ``ioc`` does not occur in
        ``body`` (or is not a string), the whole ``body`` is returned.
    """
    try:
        idx = body.index(ioc)
    except (ValueError, TypeError):
        # ValueError: not found. TypeError: non-string ioc. Both fall back to
        # the full text, as the previous catch-all did for these cases.
        return body

    # Clamp at 0: a negative start index is interpreted relative to the END of
    # the string, which made the slice empty (or wrong) whenever the match sat
    # within the first `window` characters.
    start = max(0, idx - window)
    return body[start:idx + len(ioc) + window]

def split(iocs):
    """Break a comma-separated IOC string into a list; falsy input (None, "") gives []."""
    if not iocs:
        return []
    return iocs.split(",")

def _collect_ioc_verdicts(ipv4s, ipv6s, urls, body):
    """Classify one report's IOCs; returns ``(invalid_iocs, irrelevant_iocs)`` dicts of ioc -> reason."""
    invalid_iocs = {}
    irrelevant_iocs = {}

    for raw_list, ioc_type in ((ipv4s, "IPv4 Address"), (ipv6s, "IPv6 Address")):
        for ioc in split(raw_list):
            answer = is_valid_ioc(ioc, ioc_type, context(ioc, body))
            if not answer.answer:
                invalid_iocs[ioc] = answer.reason

    for url in split(urls):
        # Skip common domains
        if extract_tld_plus_one(url) in common_domains:
            irrelevant_iocs[url] = "Common domain"
            continue

        ctx = context(url, body)
        # A URL counts as valid when the model accepts it as an IP address OR a
        # domain, so the second (expensive) question is only asked when the
        # first one says no.
        if is_valid_ioc(url, "IP Address", ctx).answer:
            continue
        domain_answer = is_valid_ioc(url, "Domain", ctx)
        if not domain_answer.answer:
            # As before, the domain verdict's reason is the one stored for a
            # URL rejected on both counts.
            invalid_iocs[url] = domain_answer.reason

    return invalid_iocs, irrelevant_iocs


def update_false_positives(db_path="reports.db", schema_path="sqlite_schema.sql", data_dir="parseddata"):
    """Review every report's IOCs with the LLM and store the verdicts in SQLite.

    For each row of ``report`` whose text exists at
    ``<data_dir>/<source>/<id>.txt``, the IPv4, IPv6 and URL indicators are
    checked and the row's ``ai_invalid_iocs`` / ``ai_irrelevant_iocs`` columns
    are set to JSON objects mapping each flagged indicator to a reason. Rows
    without a text file are left untouched.

    Args:
        db_path: SQLite database file to open.
        schema_path: SQL script applied before processing. SQLite errors while
            applying it are printed and ignored (best effort).
        data_dir: Root directory holding the per-source report text files.
        All three default to the previously hard-coded locations.

    Raises:
        RuntimeError: if an UPDATE does not touch exactly one row.
        Errors from reading files or from the LLM calls propagate. All updates
        run in one transaction, so any failure rolls back the whole pass.
    """
    # Connect to sqlite database
    sq_conn = sqlite3.connect(db_path)
    try:
        # Initialize database (once; it used to be applied twice in a row)
        with open(schema_path) as f:
            schema_sql = f.read()
        try:
            with sq_conn:
                sq_conn.executescript(schema_sql)
        except sqlite3.Error as e:
            print(f"error applying schema: {e}")

        with sq_conn:
            # Read all rows up front: updating `report` while a SELECT over it
            # is still being stepped on the same connection has undefined
            # visibility in SQLite.
            rows = sq_conn.execute(
                "SELECT id, source, ipv4s, ipv6s, urls FROM report"
            ).fetchall()

            count = 0
            for row in rows:
                report_id, source, ipv4s, ipv6s, urls = row

                pagepath = os.path.join(data_dir, source, f"{report_id}.txt")
                if not os.path.exists(pagepath):
                    continue

                # Context manager so the file handle is released per report.
                with open(pagepath) as page:
                    body = page.read()

                print(row)

                invalid_iocs, irrelevant_iocs = _collect_ioc_verdicts(ipv4s, ipv6s, urls, body)

                update_cur = sq_conn.execute("""
                    UPDATE report SET
                        ai_invalid_iocs = ?,
                        ai_irrelevant_iocs = ?
                    WHERE id = ?
                """, (json.dumps(invalid_iocs),
                      json.dumps(irrelevant_iocs),
                      report_id,))
                if update_cur.rowcount != 1:
                    raise RuntimeError(f"update_cur.rowcount = {update_cur.rowcount}")

                count += 1
                print(f"Updated report {report_id} ({count})")
    finally:
        # `with sq_conn` only commits or rolls back; close explicitly.
        sq_conn.close()

In [None]:
# Entry point: start the full review pass when this code runs as the main module.
if __name__ == '__main__':
    update_false_positives()