In [1]:
# Fetch the MITRE ATT&CK Enterprise bundle that is loaded below as
# "enterprise-attack.json"; -nc leaves an already-downloaded copy untouched.
!wget -nc https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json

File ‘enterprise-attack.json’ already there; not retrieving.



In [2]:
from bs4 import BeautifulSoup
from collections import defaultdict
import datetime
from enum import Enum
from fake_useragent import UserAgent
import feedparser
import iocextract as ie
import json
from mitreattack.stix20 import MitreAttackData
import nltk
import os
import random
import re
import requests
import sqlite3
import time
import uuid

In [3]:
# Download the NLTK resources needed by Report.parse_mitre, which calls
# nltk.tokenize.word_tokenize and nltk.corpus.words.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing -- confirm against the installed version.
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package punkt to /home/udp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/udp/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
def get_external_ids(elems):
    """Collect the ATT&CK external IDs carried by the given objects.

    Every object's ``external_references`` are scanned and the
    ``external_id`` of each reference whose ``source_name`` is
    ``'mitre-attack'`` is kept. The result is a list in input order.
    """
    attack_ids = []
    for elem in elems:
        for ref in elem.external_references:
            if ref.source_name == 'mitre-attack':
                attack_ids.append(ref.external_id)
    return attack_ids
        
def get_names(elems):
    """Return the ``name`` attribute of each object, in input order."""
    names = []
    for elem in elems:
        names.append(elem.name)
    return names

# Load the MITRE ATT&CK data from the local "enterprise-attack.json" file.
mitre_attack_data = MitreAttackData("enterprise-attack.json")

# Pull each ATT&CK object collection once; the lookup sets below are derived
# from these.
tactics = mitre_attack_data.get_tactics()
techniques = mitre_attack_data.get_techniques()
groups = mitre_attack_data.get_groups()
software = mitre_attack_data.get_software()
campaigns = mitre_attack_data.get_campaigns()
datasources = mitre_attack_data.get_datasources()

# Sets of external IDs / names, used for membership tests against report tokens.
tactics_ids = set(get_external_ids(tactics))
techniques_ids = set(get_external_ids(techniques))
groups_names = set(get_names(groups))
groups_ids = set(get_external_ids(groups))
software_names = set(get_names(software))
software_ids = set(get_external_ids(software))
campaign_names = set(get_names(campaigns))
campaign_ids = set(get_external_ids(campaigns))
datasources_ids = set(get_external_ids(datasources))

# Category -> set of known values. Report.parse_mitre intersects a report's
# tokens with each set and stores the hits under the same key.
# NOTE(review): software_names is computed above but has no entry here, unlike
# group_names and campaign_names -- confirm whether the omission is intentional.
mitre_data = {
    "tactics": tactics_ids,
    "techniques": techniques_ids,
    "group_names": groups_names,
    "group_ids": groups_ids,
    "software_ids": software_ids,
    "campaign_names": campaign_names,
    "campaign_ids": campaign_ids,
    "datasources": datasources_ids,
}

In [5]:
class ReportType(Enum):
    """Format of a report's original contents.

    persist_report() stores the member *name* (``report_type.name``) in the
    database, and Report.from_url() always produces ``HTML`` reports.
    """
    HTML = 1
    PDF = 2
    TEXT = 3

In [6]:
class Report:
    """A single report together with the indicators extracted from it.

    Attributes set by the constructor:
        contents: the original, unmodified document (e.g. raw HTML).
        title, body, url, report_type, source, publish_time: as passed in.
        upload_time: ``time.time()`` at construction.
        summary: ``None`` until a caller assigns one.
        ipv4s, ipv6s, sha256s, md5s, sha1s, domains, yara_rules, cves:
            empty sets until ``parse_iocs`` fills them (``yara_rules`` is
            never filled by this class).
        mitre: empty dict until ``parse_mitre`` fills it.

    All containers are created per instance; they used to be class-level
    objects, which meant ``parse_mitre`` wrote into one dict shared by every
    report.
    """

    # Matches CVE identifiers such as CVE-2021-44228, ignoring case.
    cve_re = re.compile(r"\bCVE-\d{4}-\d{4,}\b", re.IGNORECASE)

    # Cache of the NLTK English word list, built on first use by parse_mitre
    # and shared by all instances so the corpus is only loaded once.
    _english_words = None

    def __init__(self, contents, title, body, report_type, source, publish_time, url=None):
        self.contents = contents  # original contents
        self.title = title
        self.body = body
        self.url = url
        self.report_type = report_type
        self.source = source
        self.publish_time = publish_time

        self.upload_time = time.time()

        # Extraction results: fresh containers for every report.
        self.summary = None
        self.ipv4s = set()
        self.ipv6s = set()
        self.sha256s = set()
        self.md5s = set()
        self.sha1s = set()
        self.domains = set()
        self.yara_rules = set()
        self.cves = set()
        self.mitre = {}

    @classmethod
    def from_url(cls, url, source, publish_time):
        """Download ``url`` and build an HTML report from the page.

        Args:
            url: page to fetch (5 second timeout, random User-Agent).
            source: name of the feed/site the report belongs to.
            publish_time: publication time, stored unchanged.

        Returns:
            A new instance whose ``contents`` is the raw response text,
            ``body`` the text of the ``<body>`` element and ``title`` the
            text of the ``<title>`` element (``None`` if there is none).

        Raises:
            requests.RequestException: on network errors or a non-2xx status.
        """
        headers = {
            "User-Agent": UserAgent().random,
        }
        resp = requests.get(url, headers=headers, timeout=5)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, 'html.parser')
        body_tag = soup.find('body')
        title_tag = soup.find('title')

        # Pages without a <body> element fall back to the whole document's
        # text instead of failing on ``None.text``.
        body = body_tag.text if body_tag is not None else soup.get_text()
        # Keep the title as plain text: a bs4 Tag cannot be stored in SQLite.
        title = title_tag.get_text(strip=True) if title_tag is not None else None

        return cls(resp.text, title, body, ReportType.HTML, source, publish_time, url=url)

    def parse_iocs(self):
        """Extract indicators of compromise from ``self.body`` into sets.

        Note that ``domains`` holds the full URLs returned by
        ``iocextract.extract_urls``.
        """
        self.domains = set(ie.extract_urls(self.body))
        self.ipv4s = set(ie.extract_ipv4s(self.body))
        self.ipv6s = set(ie.extract_ipv6s(self.body))
        # The pattern is case-insensitive, so normalise to upper case before
        # de-duplicating; otherwise "cve-..." and "CVE-..." count twice.
        self.cves = {match.upper() for match in self.cve_re.findall(self.body)}
        self.sha1s = set(ie.extract_sha1_hashes(self.body))
        self.md5s = set(ie.extract_md5_hashes(self.body))
        self.sha256s = set(ie.extract_sha256_hashes(self.body))

    def parse_mitre(self):
        """Record which known MITRE ATT&CK names/IDs occur in ``self.body``.

        The body is tokenized, tokens found in the NLTK English word list are
        discarded, and the rest is intersected with every set in the
        module-level ``mitre_data``. ``self.mitre`` becomes a new dict mapping
        each ``mitre_data`` key to the sorted list of matching tokens.

        NOTE(review): matching is token-by-token, so names made of several
        words presumably never match -- confirm whether that is acceptable.
        """
        if Report._english_words is None:
            # Loading the corpus is expensive; do it once, not once per report.
            Report._english_words = frozenset(nltk.corpus.words.words())

        tokens = set(nltk.tokenize.word_tokenize(self.body)) - Report._english_words
        # Build a fresh dict so results never leak between reports; sorting
        # keeps the persisted JSON stable across runs.
        self.mitre = {
            key: sorted(tokens.intersection(values))
            for key, values in mitre_data.items()
        }

In [7]:
def persist_report(report, report_id, conn, conflict="REPLACE"):
    """Write a report's page snapshot to disk and its metadata to the database.

    The original contents are saved to ``pagedata/<source>/<report_id>.html``
    unless that file already exists, then one row is inserted into the
    ``report`` table. Values are bound positionally to the table's columns, so
    the order of the placeholders below must match the table definition.

    Args:
        report: object exposing the ``Report`` attributes read below.
        report_id: primary identifier of the row and snapshot file name.
        conn: open ``sqlite3`` connection.
        conflict: SQLite conflict-resolution keyword for the INSERT; one of
            ROLLBACK, ABORT, FAIL, IGNORE or REPLACE (case-insensitive).

    Raises:
        ValueError: if ``conflict`` is not a valid conflict-resolution keyword.
        Exception: any error raised by the INSERT is re-raised after a
            snapshot created by this call has been removed.
    """
    r = report

    # The keyword is spliced into the SQL text (it cannot be a bound
    # parameter), so only the fixed set SQLite accepts is allowed through.
    conflict_clause = str(conflict).upper()
    if conflict_clause not in ("ROLLBACK", "ABORT", "FAIL", "IGNORE", "REPLACE"):
        raise ValueError(f"unsupported conflict clause: {conflict!r}")

    report_data_url = f"pagedata/{r.source}/{report_id}.html"
    created_file = False
    if not os.path.exists(report_data_url):
        os.makedirs(os.path.dirname(report_data_url), exist_ok=True)
        # Explicit encoding: page text must not depend on the platform default.
        with open(report_data_url, "w", encoding="utf-8") as f:
            f.write(r.contents)
        created_file = True

    data = {
        "id": report_id,
        "publish_time": datetime.datetime.fromtimestamp(r.publish_time).strftime("%Y-%m-%d %H:%M:%S"),
        "title": r.title,
        "summary": r.summary,
        "upload_time": datetime.datetime.fromtimestamp(r.upload_time).strftime("%Y-%m-%d %H:%M:%S"),
        "source": r.source,
        "ipv4s": ",".join(r.ipv4s),
        "ipv6s": ",".join(r.ipv6s),
        "urls": ",".join(r.domains),
        "yara_rules": json.dumps(list(r.yara_rules)),
        "cves": ",".join(r.cves),
        "sha256s": ",".join(r.sha256s),
        "md5s": ",".join(r.md5s),
        "sha1s": ",".join(r.sha1s),
        "mitre": json.dumps(r.mitre),
        "report_type": r.report_type.name,
        "web_url": r.url,
        "report_data_url": report_data_url,
    }

    try:
        # ``with conn`` commits on success and rolls back on error.
        with conn:
            conn.execute(f"""
                INSERT OR {conflict_clause} INTO report VALUES(
                    :id,
                    :publish_time,
                    :title,
                    :summary,
                    :upload_time,
                    :source,
                    :ipv4s,
                    :ipv6s,
                    :urls,
                    :yara_rules,
                    :cves,
                    :sha256s,
                    :md5s,
                    :sha1s,
                    :mitre,
                    :report_type,
                    :web_url,
                    :report_data_url
                )
            """, data)
    except Exception:
        # Only undo what this call did: a snapshot that existed beforehand
        # belongs to an earlier, successfully stored report.
        if created_file:
            try:
                os.remove(report_data_url)
            except OSError:
                # Cleanup is best effort; the database error below is the
                # one the caller needs to see.
                pass
        raise

In [12]:
def process_feed(feed, conn, skip_existing=True):
    """Download, analyse and store every entry of a parsed feed.

    Args:
        feed: result of ``feedparser.parse``; ``feed.feed.title`` is used as
            the report source and ``feed.entries`` are the posts.
        conn: open ``sqlite3`` connection holding the ``report`` table.
        skip_existing: when true, entries whose link is already stored are
            skipped; otherwise they are processed again and stored under
            their existing id.

    Entries whose page cannot be downloaded are reported and skipped. Errors
    raised while parsing or persisting a report propagate to the caller.
    """
    source = feed.feed.title
    total = len(feed.entries)
    print(f"********** Reading {total} posts from {source} **********")

    persisted = 0
    for entry in feed.entries:
        with conn:
            row = conn.execute("SELECT * FROM report WHERE web_url = ?", (entry.link,)).fetchone()
        if row:
            print(f"{entry.link} already processed. UUID: {row[0]}")
            if skip_existing:
                continue

        # Not every feed supplies a publication date: fall back to the update
        # date and finally to "now" rather than aborting the whole feed.
        # NOTE(review): feedparser documents parsed dates as UTC while
        # time.mktime() interprets its argument as local time -- confirm
        # the resulting offset is intended.
        parsed_time = entry.get("published_parsed") or entry.get("updated_parsed")
        publish_time = time.mktime(parsed_time) if parsed_time else time.time()
        try:
            r = Report.from_url(url=entry.link, source=source, publish_time=publish_time)
        except Exception as e:
            print(f"Error reading {entry.link}, skipping: {e}")
            continue

        # Prefer the feed's own metadata over what was scraped from the page.
        if 'title' in entry:
            r.title = entry.title
        if 'summary' in entry:
            r.summary = entry.summary

        r.parse_iocs()
        r.parse_mitre()

        # Re-processed entries keep their id so the stored row is replaced.
        report_id = row[0] if row else str(uuid.uuid4())
        persist_report(r, report_id, conn)
        persisted += 1

    # Report what was actually stored, not the size of the feed.
    print(f"********** Persisted {persisted} of {total} posts from {source} **********")

In [13]:
def main():
    """Run the ingestion pipeline over every feed listed in ``feeds.txt``.

    Opens ``reports.db``, applies ``schema.sql`` (a failure is only reported,
    not raised), then parses each feed URL and hands it to ``process_feed``.
    Blank lines and lines starting with ``#`` in ``feeds.txt`` are ignored,
    and feeds that feedparser flags as malformed (``bozo``) are skipped.
    The database connection is closed even if processing fails.
    """
    # Connect to sqlite database
    conn = sqlite3.connect("reports.db")
    try:
        # Initialize database
        with open("schema.sql") as f:
            schema = f.read()
        try:
            with conn:
                conn.executescript(schema)
        except Exception as e:
            print(f"error applying schema: {e}")

        # Read the list up front so the file is not held open while the
        # (slow) network processing runs.
        with open("feeds.txt") as f:
            feed_urls = [line.strip() for line in f]

        for line in feed_urls:
            # Skip blank lines as well as comments; an empty string is not
            # a feed URL.
            if not line or line.startswith("#"):
                continue
            feed = feedparser.parse(line)
            if feed.bozo:
                print(f"BOZO {line}: {feed.bozo_exception}")
                continue

            process_feed(feed, conn)
            print(f"Processed feed {line}")
    finally:
        conn.close()

In [15]:
# Run the ingestion pipeline: reads feeds.txt and stores reports in reports.db.
main()

********** Reading 100 posts from WeLiveSecurity **********
https://www.welivesecurity.com/en/videos/fake-job-offers-target-coders-infostealers/ already processed. UUID: c99c7b4b-17cc-44fa-8630-d072c1d7fc4a
https://www.welivesecurity.com/en/scams/no-youre-not-fired-beware-job-termination-scams/ already processed. UUID: 75181982-1d55-43ee-aeee-f8fff78a4212
https://www.welivesecurity.com/en/we-live-science/katharine-hayhoe-most-important-climate-equation-starmus-highlights/ already processed. UUID: 631222db-af13-40c1-9f28-c20f4d6096af
https://www.welivesecurity.com/en/eset-research/deceptivedevelopment-targets-freelance-developers/ already processed. UUID: 5efef211-370e-4174-a2f7-2d14c9369a31
https://www.welivesecurity.com/en/kids-online/gaming-gambling-lifting-lid-in-game-loot-boxes/ already processed. UUID: e1e41d6a-3030-42ad-a1ea-14853b892764
https://www.welivesecurity.com/en/videos/what-is-penetration-testing-unlocked-403-cybersecurity-podcast-ep-10/ already processed. UUID: 3c037d15