In [29]:
from __future__ import annotations # for python 3.7 only
import os
from os import listdir
from os.path import isfile, join
import pickle
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords') # must download at first execution

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Weber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# function for Pre-Process the documents
def preprocess(phrase: str) -> list:
    ''' pre-process raw text with multiple methods '''
    # lowerscase
    def lower(pharse: str) -> str:
        return pharse.lower()

    # thousands separators  # 1,222,333.00 > 1222333.00
    def replace_thousands(match):
        return match.group(0).replace(',', '')

    # no need this, we want to keep ip address
    # float separators      # 1,222,333.00 > 1,222,33300
    def replace_floats(match):
        return match.group(0).replace('.', '')

    # replace non-alpha/number charater with whitespace
    def text(phrase: str) -> str:
        # By Python definition '\W == [^a-zA-Z0-9_], which excludes all numbers, letters
        phrase = re.sub(r'[^a-zA-Z0-9]+', ' ', phrase)
        # phrase = re.sub(' +', ' ', phrase) # remove muti-space into single-space, we'll than apply spilt() so can skip it
        return phrase

    # tokenization
    def tokenize(phrase: str) -> list: 
        return phrase.split()

    def rm_period(token_lst: list) -> list: 
        return [t.rstrip('.') for t in token_lst]

    # remove stopwords
    def rm_stopwords(token_lst: list) -> list: 
        stop_words = stopwords.words('english')
        tokens_without_sw = [t for t in token_lst if not t in stop_words]
        return tokens_without_sw

    res = lower(phrase)
    res = re.sub(r"[\d]*,[\d]", replace_thousands, res)
    # res = text(res)
    res = tokenize(res)
    res = rm_period(res)
    res = rm_stopwords(res)
    return res

# unit test
preprocess('The malware was previously detected while being dropped by a malicious campaign that exploited \
    a critical server-side template injection. Indicators of Compromise (IOCs). 140.112.1.1 sending TCP SYN packets to port 2375.')

['malware',
 'previously',
 'detected',
 'dropped',
 'malicious',
 'campaign',
 'exploited',
 'critical',
 'server-side',
 'template',
 'injection',
 'indicators',
 'compromise',
 '(iocs)',
 '140.112.1.1',
 'sending',
 'tcp',
 'syn',
 'packets',
 'port',
 '2375']

In [16]:
# Utility functions
def get_all_filenames(dir: str='./') -> list:
    ''' traverse root directory, and list directories as dirs and files as files. Return filenames in rootdir. '''
    files = [f for f in listdir(dir) if isfile(join(dir, f))]
    files.sort()
    return files

def flatten(t):
    return [item for sublist in t for item in sublist]

Reports Extracetion Steps
1. preprocess into tokens
2. build tf of reports
3. store tf object

In [27]:
def calcTermFreq(filename: str, dir: str='./') -> dict:
    ''' return the term frequency for this document '''
    with open(dir+filename, 'r', encoding="utf-8") as f:
        content = f.read()
        token_lst = preprocess(content)
        del content
        terms = set(token_lst) # since df requres unique value

        tf = dict.fromkeys(terms, 0) # term frequency for this document
        for t in token_lst:
            tf[t] += 1
    return tf

In [38]:
reports = get_all_filenames()
reports = [r for r in reports if r.endswith('.txt')]
print(reports)

from pathlib import Path
Path("./term frequence pickle").mkdir(parents=True, exist_ok=True)

for r in reports:
    tf = calcTermFreq(r)
    r = r.rstrip('.txt')
    savefilename = f'./term frequence pickle/{r}.pkl'
    with open(savefilename, 'wb') as handle:
        pickle.dump(tf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    with open(savefilename, 'rb') as handle:
        load_test = pickle.load(handle)
    assert tf == load_test

['Dofloo-360InternetSecurityCenter.txt', 'Dofloo-BleepingComputer.txt', 'Dofloo-MalwareMustDie.txt', 'Dofloo-SyscallParty.txt', 'Dofloo-Trendmicro.txt']


In [39]:
with open('./term frequence pickle/Dofloo-SyscallParty.pkl', 'rb') as handle:
    load_test = pickle.load(handle)
load_test

{'brute': 1,
 'gethostbyname': 1,
 'speed;': 1,
 'achieved': 2,
 '1]': 1,
 '(apparently)': 1,
 'technically,': 1,
 'make': 1,
 'disassembly': 1,
 'programming': 2,
 'run': 1,
 'binaryninja:': 1,
 '32-bit': 1,
 'sed': 2,
 'body': 1,
 'we’ll': 1,
 'array': 2,
 '61.147.91.53': 1,
 'htons': 1,
 'point': 1,
 'come': 2,
 'arm': 4,
 'copied': 1,
 'kfts': 1,
 'file;': 1,
 'lrl': 1,
 'sz=688': 1,
 'systems)': 1,
 'starts': 1,
 'thence': 1,
 'daemons': 1,
 'onto': 4,
 'right': 2,
 'buffer': 4,
 'symlink': 2,
 'getting': 1,
 'forcing': 1,
 'compiled': 2,
 '|': 1,
 'many': 1,
 'bytes': 4,
 'windows': 1,
 'internally': 2,
 'explanatory': 1,
 'procedure': 2,
 'pascal': 1,
 'so,': 3,
 'image-title-here{:class="img-responsive"}': 8,
 'inherited': 1,
 'seeded': 1,
 'systems': 1,
 'done': 2,
 'operations': 2,
 'linked,': 1,
 'call': 6,
 'start': 1,
 'receives': 1,
 'auto_boot': 1,
 'architecture': 1,
 'embedded': 1,
 'address': 4,
 'user': 1,
 'another': 2,
 'system;': 1,
 'attention': 1,
 'successful,'