In [15]:
!wget -nc https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json

File ‘enterprise-attack.json’ already there; not retrieving.



In [16]:
from bs4 import BeautifulSoup
import calendar
from collections import defaultdict
import datetime
from enum import Enum
from fake_useragent import UserAgent
import feedparser
import iocextract as ie
import json
from mitreattack.stix20 import MitreAttackData
import nltk
import os
import random
import re
import requests
import sqlite3
import time
import uuid
from whoosh import index
from whoosh import fields
from whoosh import qparser

In [17]:
# Fetch the NLTK resources needed by Report.parse_mitre, which calls
# nltk.tokenize.word_tokenize and reads nltk.corpus.words.
# NOTE(review): presumably 'punkt' is the tokenizer model behind word_tokenize —
# confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package punkt to /home/udp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/udp/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [18]:
def get_external_ids(elems):
    """Collect the ATT&CK external IDs referenced by a sequence of objects.

    Every element must expose ``external_references``, an iterable of objects
    with ``source_name`` and ``external_id`` attributes. Only references whose
    ``source_name`` is exactly ``'mitre-attack'`` contribute an ID.

    Returns a list of IDs in encounter order (duplicates are kept).
    """
    attack_ids = []
    for elem in elems:
        for ref in elem.external_references:
            if ref.source_name == 'mitre-attack':
                attack_ids.append(ref.external_id)
    return attack_ids
        
def get_names(elems):
    """Return the ``name`` attribute of every element, in input order."""
    names = []
    for elem in elems:
        names.append(elem.name)
    return names

# Parse the ATT&CK Enterprise STIX bundle (fetched by the wget cell at the top)
mitre_attack_data = MitreAttackData("enterprise-attack.json")

# For each ATT&CK object type: fetch the objects, then derive the lookup sets
# of external IDs (and names, where names are matched too).
tactics = mitre_attack_data.get_tactics()
tactics_ids = set(get_external_ids(tactics))

techniques = mitre_attack_data.get_techniques()
techniques_ids = set(get_external_ids(techniques))

groups = mitre_attack_data.get_groups()
groups_names = set(get_names(groups))
groups_ids = set(get_external_ids(groups))

software = mitre_attack_data.get_software()
software_names = set(get_names(software))
software_ids = set(get_external_ids(software))

campaigns = mitre_attack_data.get_campaigns()
campaign_names = set(get_names(campaigns))
campaign_ids = set(get_external_ids(campaigns))

datasources = mitre_attack_data.get_datasources()
datasources_ids = set(get_external_ids(datasources))

# Category -> set of strings that Report.parse_mitre intersects with the
# tokens of a report body.
# NOTE(review): software_names is built above but is not one of the keys
# below — confirm whether that omission is intentional.
mitre_data = dict(
    tactics=tactics_ids,
    techniques=techniques_ids,
    group_names=groups_names,
    group_ids=groups_ids,
    software_ids=software_ids,
    campaign_names=campaign_names,
    campaign_ids=campaign_ids,
    datasources=datasources_ids,
)

In [19]:
class ReportType(Enum):
    """Format of a report's original contents.

    persist_report stores the member *name* (``report_type.name``) in the
    database, not the integer value.
    """
    HTML = 1  # the type assigned by Report.from_url
    PDF = 2   # not referenced elsewhere in the visible cells
    TEXT = 3  # not referenced elsewhere in the visible cells

In [20]:
class Report:
    """A single threat report together with the indicators extracted from it.

    Attributes set by the constructor:
        contents:      the original, unmodified document (e.g. raw HTML).
        title:         report title (may be None).
        body:          plain text used for extraction and indexing.
        report_type:   a ReportType member.
        source:        name of the feed/site the report came from.
        publish_time:  publication time as passed by the caller.
        url:           web URL of the report, if any.
        upload_time:   ``time.time()`` at construction.

    Extraction results (empty until ``parse_iocs`` / ``parse_mitre`` run):
        summary, ipv4s, ipv6s, sha256s, md5s, sha1s, domains, yara_rules,
        cves, mitre.
    """

    # Matches CVE identifiers such as CVE-2021-44228, in any letter case.
    cve_re = re.compile(r"\bCVE-\d{4}-\d{4,}\b", re.IGNORECASE)

    # Lazily-built cache of the NLTK English word list. It is shared by all
    # reports so the large set is constructed once instead of on every
    # parse_mitre call. It is never mutated after creation.
    _english_words = None

    def __init__(self, contents, title, body, report_type, source, publish_time, url=None):
        self.contents = contents # original contents
        self.title = title
        self.body = body
        self.url = url
        self.report_type = report_type
        self.source = source
        self.publish_time = publish_time

        self.upload_time = time.time()

        # Extraction results live on the instance. They used to be class
        # attributes, which made the mutable containers (notably ``mitre``)
        # shared between every Report object.
        self.summary = None
        self.ipv4s = []
        self.ipv6s = []
        self.sha256s = []
        self.md5s = []
        self.sha1s = []
        self.domains = []
        self.yara_rules = []
        self.cves = []
        self.mitre = {}

    @classmethod
    def from_url(cls, url, source, publish_time):
        """Download ``url`` and build an HTML report from the response.

        The request uses a random User-Agent and a 10 second timeout.
        Exceptions raised by ``requests.get`` or ``raise_for_status`` are
        propagated to the caller.

        The title is the text of the page's <title> element (None when the
        page has none); the body is the text of <body>, falling back to the
        text of the whole document when there is no <body> element.
        """
        headers = {
            "User-Agent": UserAgent().random,
        }
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, 'html.parser')

        # soup.find returns an element (or None), not a string: store text so
        # the title can be persisted and indexed.
        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag is not None else None

        body_tag = soup.find('body')
        body = body_tag.text if body_tag is not None else soup.get_text()

        return cls(resp.text, title, body, ReportType.HTML, source, publish_time, url=url)

    def parse_iocs(self):
        """Extract indicators of compromise from ``self.body``.

        Populates ``domains`` (values returned by iocextract's URL
        extractor), ``ipv4s``, ``ipv6s``, ``cves``, ``sha1s``, ``md5s``,
        ``sha256s`` and ``yara_rules``. Each list is deduplicated.

        IPv4 addresses and URLs are refanged; the defanged spellings found in
        the body are replaced in ``self.body`` by their refanged forms, and
        the stored lists hold the refanged values. CVE identifiers are
        upper-cased so that differently-cased mentions collapse to one entry.
        """
        text = self.body

        found_domains = list(set(ie.extract_urls(text, no_scheme=True)))
        found_ipv4s = list(set(ie.extract_ipv4s(text)))
        self.ipv6s = list(set(ie.extract_ipv6s(text)))
        # dict.fromkeys dedupes while keeping first-seen order.
        self.cves = list(dict.fromkeys(cve.upper() for cve in self.cve_re.findall(text)))
        self.sha1s = list(set(ie.extract_sha1_hashes(text)))
        self.md5s = list(set(ie.extract_md5_hashes(text)))
        self.sha256s = list(set(ie.extract_sha256_hashes(text)))
        self.yara_rules = list(set(ie.extract_yara_rules(text)))

        ref_ipv4s = [ie.refang_ipv4(ipv4) for ipv4 in found_ipv4s]
        ref_domains = [ie.refang_data(domain, no_scheme=True) for domain in found_domains]

        # Rewrite the body so the indexed text contains the refanged values.
        for defanged, refanged in zip(found_ipv4s, ref_ipv4s):
            self.body = self.body.replace(defanged, refanged)

        for defanged, refanged in zip(found_domains, ref_domains):
            self.body = self.body.replace(defanged, refanged)

        # Two different defanged spellings can refang to the same value, so
        # dedupe again after refanging.
        self.ipv4s = list(dict.fromkeys(ref_ipv4s))
        self.domains = list(dict.fromkeys(ref_domains))

    @classmethod
    def _get_english_words(cls):
        """Return the cached NLTK English word set, building it on first use."""
        if Report._english_words is None:
            Report._english_words = frozenset(nltk.corpus.words.words())
        return Report._english_words

    def parse_mitre(self):
        """Find MITRE ATT&CK IDs and names mentioned in ``self.body``.

        The body is tokenized, tokens that appear in the NLTK English word
        list are discarded, and the remainder is intersected with each set in
        the module-level ``mitre_data``. ``self.mitre`` is replaced by a new
        dict mapping each ``mitre_data`` key to a sorted list of matches.

        NOTE(review): matching is per token, so entries of ``mitre_data``
        that the tokenizer would split (presumably multi-word names) cannot
        match — confirm whether that limitation is acceptable.
        """
        tokens = set(nltk.tokenize.word_tokenize(self.body)).difference(self._get_english_words())
        # Build a fresh dict per report; sorting gives a stable order for the
        # JSON that gets persisted.
        self.mitre = {
            key: sorted(tokens.intersection(values))
            for key, values in mitre_data.items()
        }

In [21]:
def persist_report(report, report_id, conn, conflict="REPLACE"):
    """Write a report's original contents to disk and its metadata to SQLite.

    The original contents go to ``pagedata/<source>/<report_id>.html`` unless
    that file already exists, in which case it is left untouched. One row is
    then inserted into the ``report`` table inside a transaction.

    Args:
        report: Report-like object providing the attributes read below.
        report_id: primary identifier for the row and the page-data file.
        conn: open ``sqlite3`` connection.
        conflict: SQLite conflict-resolution keyword used in
            ``INSERT OR <conflict>`` (ROLLBACK, ABORT, FAIL, IGNORE or
            REPLACE; case-insensitive).

    Raises:
        ValueError: if ``conflict`` is not one of the keywords above.
        Exception: any error from writing the file or executing the insert is
            re-raised. A page-data file created by this call is removed
            first; a file that already existed is kept, because an existing
            row may still reference it.
    """
    # The conflict clause is spliced into the SQL text (it cannot be a bound
    # parameter), so only SQLite's conflict-resolution keywords are accepted.
    allowed_conflicts = ("ROLLBACK", "ABORT", "FAIL", "IGNORE", "REPLACE")
    conflict_clause = str(conflict).upper()
    if conflict_clause not in allowed_conflicts:
        raise ValueError(
            f"invalid conflict clause {conflict!r}; expected one of {', '.join(allowed_conflicts)}"
        )

    r = report

    # NOTE(review): r.source is used verbatim as a directory name; a source
    # containing path separators would change where the file is written.
    report_data_url = f"pagedata/{r.source}/{report_id}.html"

    created_file = False
    try:
        if not os.path.exists(report_data_url):
            os.makedirs(os.path.dirname(report_data_url), exist_ok=True)
            created_file = True
            # Explicit encoding: the platform default may be unable to encode
            # characters found in arbitrary web pages.
            with open(report_data_url, "w", encoding="utf-8") as f:
                f.write(r.contents)

        data = {
            "id": report_id,
            "publish_time": r.publish_time,
            "title": r.title,
            "summary": r.summary,
            "upload_time": r.upload_time,
            "source": r.source,
            "ipv4s": ",".join(r.ipv4s),
            "ipv6s": ",".join(r.ipv6s),
            "urls": ",".join(r.domains),
            "yara_rules": json.dumps(list(r.yara_rules)),
            "cves": ",".join(r.cves),
            "sha256s": ",".join(r.sha256s),
            "md5s": ",".join(r.md5s),
            "sha1s": ",".join(r.sha1s),
            "mitre": json.dumps(r.mitre),
            "report_type": r.report_type.name,
            "web_url": r.url,
            "report_data_url": report_data_url,
        }

        # ``with conn`` commits on success and rolls back on error.
        with conn:
            conn.execute(f"""
                INSERT OR {conflict_clause} INTO report VALUES(
                    :id,
                    :publish_time,
                    :title,
                    :summary,
                    :upload_time,
                    :source,
                    :ipv4s,
                    :ipv6s,
                    :urls,
                    :yara_rules,
                    :cves,
                    :sha256s,
                    :md5s,
                    :sha1s,
                    :mitre,
                    :report_type,
                    :web_url,
                    :report_data_url
                )
            """, data)
    except Exception:
        # Undo only what this call created.
        if created_file:
            try:
                os.remove(report_data_url)
            except OSError:
                # Cleanup is best-effort; the original error matters more.
                pass
        raise

In [22]:
def index_report(report, report_id, ixwriter):
    """Add or replace the report's document through the given index writer.

    The document carries the report id, its title, and its body text as the
    ``content`` field; everything is handed to ``ixwriter.update_document``.
    """
    document = {
        "id": report_id,
        "title": report.title,
        "content": report.body,
    }
    ixwriter.update_document(**document)

In [23]:
def process_feed(feed, conn, ixwriter, skip_existing=True):
    """Fetch, analyse, persist and index every entry of a parsed feed.

    Args:
        feed: result of ``feedparser.parse``; ``feed.feed.title`` is used as
            the report source.
        conn: open ``sqlite3`` connection holding the ``report`` table.
        ixwriter: index writer passed on to ``index_report``.
        skip_existing: when true, entries whose link is already stored in
            ``report.web_url`` are skipped; when false they are fetched again
            and stored under their existing id.

    Entries whose page cannot be downloaded are reported and skipped. Errors
    raised while parsing, persisting or indexing are propagated.
    """
    source = feed.feed.title
    print(f"********** Reading {len(feed.entries)} posts from {source} **********")

    persisted = 0
    for entry in feed.entries:
        with conn:
            row = conn.execute("SELECT * FROM report WHERE web_url = ?", (entry.link,)).fetchone()
            if row and skip_existing:
                continue

        # feedparser normalizes *_parsed dates to UTC, so the struct_time must
        # be converted with calendar.timegm; time.mktime would interpret it as
        # local time and shift the timestamp by the local UTC offset.
        # .get() also covers entries that carry no publication date at all.
        published = entry.get("published_parsed")
        publish_time = calendar.timegm(published) if published else 0

        try:
            report = Report.from_url(url=entry.link, source=source, publish_time=publish_time)
        except Exception as e:
            print(f"Error reading {entry.link}, skipping: {e}")
            continue

        # Prefer the feed's own title/summary over what was scraped.
        if 'title' in entry:
            report.title = entry.title
        if 'summary' in entry:
            report.summary = entry.summary

        report.parse_iocs()
        report.parse_mitre()

        # Reuse the stored id (first column of the row) when re-processing.
        report_id = row[0] if row else str(uuid.uuid4())
        persist_report(report, report_id, conn)
        index_report(report, report_id, ixwriter)
        persisted += 1

    # Report what was actually stored, not the size of the feed.
    print(f"********** Persisted {persisted} posts from {source} **********")

In [24]:
def main():
    """Run the ingestion pipeline over every feed listed in ``feeds.txt``.

    Steps: open ``reports.db`` and apply ``sqlite_schema.sql``; open (or
    create) the ``report`` index under ``pageindex``; then parse each feed URL
    from ``feeds.txt`` and hand it to ``process_feed``. Blank lines and lines
    starting with ``#`` are ignored; feeds that feedparser flags as malformed
    are reported and skipped.

    The index writer is committed and the database connection is closed even
    when processing fails part-way through.
    """
    # Connect to sqlite database
    conn = sqlite3.connect("reports.db")
    try:
        # Initialize database. Errors while applying the schema are reported
        # but not fatal.
        with open("sqlite_schema.sql") as f:
            try:
                with conn:
                    conn.executescript(f.read())
            except Exception as e:
                print(f"error applying schema: {e}")

        # Open the existing index, or create it if the directory has none yet
        index_dir = "pageindex"
        os.makedirs(index_dir, exist_ok=True)

        ixschema = fields.Schema(id=fields.ID(stored=True, unique=True), title=fields.TEXT, content=fields.TEXT(stored=True))
        try:
            ix = index.open_dir(index_dir, schema=ixschema, indexname="report")
        except index.EmptyIndexError:
            ix = index.create_in(index_dir, schema=ixschema, indexname="report")

        ixwriter = ix.writer()
        try:
            with open("feeds.txt") as f:
                for line in f:
                    line = line.strip()
                    # Skip blank lines as well as comments.
                    if not line or line.startswith("#"):
                        continue
                    feed = feedparser.parse(line)
                    if feed.bozo:
                        print(f"BOZO {line}: {feed.bozo_exception}")
                        continue

                    process_feed(feed, conn, ixwriter, skip_existing=False)
                    print(f"Processed feed {line}")
        finally:
            # Commit whatever was indexed so far, matching the rows that
            # persist_report has already committed to the database.
            ixwriter.commit()
    finally:
        # Runs even if setup or the index commit fails.
        conn.close()

In [25]:
# Run the pipeline when this cell/script is executed directly, but not when
# the code is imported as a module.
if __name__ == '__main__':
    main()

********** Reading 100 posts from WeLiveSecurity **********
********** Persisted 100 posts from WeLiveSecurity **********
Processed feed https://www.welivesecurity.com/en/rss/feed/
********** Reading 16 posts from abuse.ch | IT-Security Blog **********
********** Persisted 16 posts from abuse.ch | IT-Security Blog **********
Processed feed https://abuse.ch/rss
********** Reading 15 posts from Check Point Research **********
********** Persisted 15 posts from Check Point Research **********
Processed feed https://research.checkpoint.com/feed/
********** Reading 10 posts from Sophos News **********
********** Persisted 10 posts from Sophos News **********
Processed feed https://news.sophos.com/en-us/feed/
********** Reading 10 posts from SentinelOne **********
********** Persisted 10 posts from SentinelOne **********
Processed feed https://www.sentinelone.com/feed/
********** Reading 15 posts from Unit 42 **********
********** Persisted 15 posts from Unit 42 **********
Processed feed htt