In [17]:
!wget -nc https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json

File ‘enterprise-attack.json’ already there; not retrieving.



In [138]:
from bs4 import BeautifulSoup
from collections import defaultdict
import datetime
from enum import Enum
from fake_useragent import UserAgent
import feedparser
import iocextract as ie
import json
from mitreattack.stix20 import MitreAttackData
import nltk
import os
import random
import re
import requests
import sqlite3
import time
import uuid

In [19]:
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package punkt to /home/udp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/udp/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [27]:
class ReportType(Enum):
    HTML = 1
    PDF = 2
    TEXT = 3

In [86]:
class Report:
    cve_re = re.compile(r"\bCVE-\d{4}-\d{4,}\b", re.IGNORECASE)

    summary = None
    ipv4s = set()
    ipv6s = set()
    sha256s = set()
    md5s = set()
    sha1s = set()
    domains = set()
    yara_rules = set()
    cves = set()
    mitre = defaultdict(set)
        
    def __init__(self, contents, title, body, report_type, source, publish_time, url=None):
        self.contents = contents # original contents
        self.title = title
        self.body = body
        self.url = url
        self.report_type = report_type
        self.source = source
        self.publish_time = publish_time

        self.upload_time = time.time()

    @classmethod
    def from_url(cls, url, source, publish_time):
        headers = {
            "User-Agent": UserAgent().random,
        }
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, 'html.parser')
        body = soup.find('body')
        title = soup.find('title')

        return Report(resp.text, title, body.text, ReportType.HTML, source, publish_time, url=url)

    def parse_iocs(self):
        self.domains = set(ie.extract_urls(self.body))
        self.ipv4s = set(ie.extract_ipv4s(self.body))
        self.ipv6s = set(ie.extract_ipv6s(self.body))
        self.cves = self.cve_re.findall(self.body)
        self.sha1s = set(ie.extract_sha1_hashes(self.body))
        self.md5s = set(ie.extract_md5_hashes(self.body))
        self.sha256s = set(ie.extract_sha256_hashes(self.body))

    def parse_mitre(self):
        def get_external_ids(elems):
            return [r.external_id for e in elems for r in e.external_references if r.source_name == 'mitre-attack']
        
        def get_names(elems):
            return [e.name for e in elems]

        # Load mitre attack data
        mitre_attack_data = MitreAttackData("enterprise-attack.json")

        tactics = mitre_attack_data.get_tactics()
        techniques = mitre_attack_data.get_techniques()
        groups = mitre_attack_data.get_groups()
        software = mitre_attack_data.get_software()
        campaigns = mitre_attack_data.get_campaigns()
        datasources = mitre_attack_data.get_datasources()

        tactics_ids = set(get_external_ids(tactics))
        techniques_ids = set(get_external_ids(techniques))
        groups_names = set(get_names(groups))
        groups_ids = set(get_external_ids(groups))
        software_names = set(get_names(software))
        software_ids = set(get_external_ids(software))
        campaign_names = set(get_names(campaigns))
        campaign_ids = set(get_external_ids(campaigns))
        datasources_ids = set(get_external_ids(datasources))

        mitre_data = {
            "tactics": tactics_ids,
            "techniques": techniques_ids,
            "mitigations": mitigations_ids,
            "group_names": groups_names,
            "group_ids": groups_ids,
            "software_ids": software_ids,
            "campaign_names": campaign_names,
            "campaign_ids": campaign_ids,
            "datasources": datasources_ids,
        }

        # Tokenize contents
        words = set(nltk.corpus.words.words())
        tokens = set(nltk.tokenize.word_tokenize(self.body)).difference(words)
        for key, values in mitre_data.items():
            self.mitre[key] = tokens.intersection(values)

In [139]:
def persist_report(report, conn):
    r = report
    report_id = str(uuid.uuid4())

    report_data_url = f"pagedata/{report_id}.html"
    if not os.path.exists(report_data_url):
        with open(report_data_url, "w") as f:
            f.write(r.contents)
    
    data = {
        "id": report_id,
        "publish_time": datetime.datetime.fromtimestamp(r.publish_time).strftime("%Y-%m-%d %H:%M:%S"),
        "title": r.title,
        "summary": r.summary,
        "upload_time": datetime.datetime.fromtimestamp(r.upload_time).strftime("%Y-%m-%d %H:%M:%S"),
        "source": r.source,
        "ipv4s": ",".join(r.ipv4s),
        "ipv6s": ",".join(r.ipv6s),
        "urls": ",".join(r.domains),
        "yara_rules": json.dumps(list(r.yara_rules)),
        "cves": ",".join(r.cves),
        "sha256s": ",".join(r.sha256s),
        "md5s": ",".join(r.md5s),
        "sha1s": ",".join(r.sha1s),
        "mitre": json.dumps(r.mitre),
        "report_type": r.report_type.name,
        "web_url": r.url,
        "report_data_url": report_data_url,
    }

    with conn:
        conn.execute("""
            INSERT INTO report VALUES(
                :id,
                :publish_time,
                :title,
                :summary,
                :upload_time,
                :source,
                :ipv4s,
                :ipv6s,
                :urls,
                :yara_rules,
                :cves,
                :sha256s,
                :md5s,
                :sha1s,
                :mitre,
                :report_type,
                :web_url,
                :report_data_url
            )
        """, data)

In [35]:
feeds = []

with open("feeds.txt") as f:
    for line in f.readlines():
        data = feedparser.parse(line)
        if data.bozo:
            raise Exception(f"BOZO {line}")
            
        feeds.append(data)

In [126]:
# Connect to sqlite database
conn = sqlite3.connect("reports.db")
cur = conn.cursor()

# Initialize database
with open("schema.sql") as f:
    try:
        cur.executescript(f.read())
    except Exception as e:
        print(f"error applying schema: {e}")

In [140]:
for feed in feeds[:1]:
    source = feed.feed.title
    print(f"Reading posts from {source}")

    for entry in feed.entries[:10]:
        publish_time = time.mktime(entry.published_parsed)
        r = Report.from_url(url=entry.link, source=source, publish_time=publish_time)
        if 'title' in entry:
            r.title = entry.title
        if 'summary' in entry:
            r.summary = entry.summary

        r.parse_iocs()
        
        try:
            persist_report(r, conn)
        except sqlite3.IntegrityError as e:
            print(f"{entry.link} already present, skipping: {e}")
        
    print(f"Persisted posts from {source}")

Reading posts from WeLiveSecurity
https://www.welivesecurity.com/en/videos/fake-job-offers-target-coders-infostealers/ already present, skipping: UNIQUE constraint failed: report.web_url
https://www.welivesecurity.com/en/scams/no-youre-not-fired-beware-job-termination-scams/ already present, skipping: UNIQUE constraint failed: report.web_url
https://www.welivesecurity.com/en/we-live-science/katharine-hayhoe-most-important-climate-equation-starmus-highlights/ already present, skipping: UNIQUE constraint failed: report.web_url
https://www.welivesecurity.com/en/eset-research/deceptivedevelopment-targets-freelance-developers/ already present, skipping: UNIQUE constraint failed: report.web_url
https://www.welivesecurity.com/en/kids-online/gaming-gambling-lifting-lid-in-game-loot-boxes/ already present, skipping: UNIQUE constraint failed: report.web_url
https://www.welivesecurity.com/en/videos/what-is-penetration-testing-unlocked-403-cybersecurity-podcast-ep-10/ already present, skipping: UN