Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,28 @@ or
```python
df = read_csv("*.csv")
```
#### Post-Processing the Input Data
Both `read_json()` and `read_csv()` support an optional `post_function` parameter, which allows you to specify a function to post-process the data after each individual file is read in, before it is merged into the final returned DataFrame. For example, you might want to split or combine columns, or compute a new value from existing data.

Consult the Pandas documentation for information on supported options for `read_csv()` and `read_json()`.
Start by creating a post-processing function according to the following prototype:

```python
def my_post_processor(df, filename):
# do some stuff

return df
```

When called, the `df` parameter will be a DataFrame containing the chunk of data just read, and the `filename` parameter will be the name of the file it came from, which will be different for each chunk. **Important: your function must always return `df`, whether or not you modified the input DataFrame.**

Once you have defined the post-processor function, you can invoke it during your call to `read_json()` or `read_csv()` like so:

```python
df = read_csv("*.csv", post_function=my_post_processor)
```

#### Additional Read Options
Consult the Pandas documentation for information on other supported options for `read_csv()` and `read_json()`.

### Normalizing nesting dicts and lists

Expand Down
2 changes: 1 addition & 1 deletion huntlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from huntlib.decorators import future_warning
import warnings

__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance', 'flatten']
__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance']

@future_warning("The huntlib.entropy() function has been moved to huntlib.util.entropy(). Please update your code. This compatibility will be removed in a future release.")
def entropy(*args, **kwargs):
Expand Down
27 changes: 13 additions & 14 deletions huntlib/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,36 @@

__all__ = ['read_json', 'read_csv', 'flatten']

def _read_multi(func=None, path_or_buf=None, *args, **kwargs):
def _read_multi(read_function=None, path_or_buf=None, post_function=None, *args, **kwargs):
"""
Given a wildcard filename pattern (which may be just a single static
filename), expand the wildcard and read all the files into a single
pandas DataFrame() object.

:param func: Reference to the function which will read an individual data file (e.g., pd.read_csv)
:param read_function: Reference to the function which will read an individual data file (e.g., pd.read_csv)
:param path_or_buf: A wildcard specifying which file(s) to read
:type func: A reference to a valid function which returns a pd.DataFrame() object
:type read_function: A reference to a valid function which returns a pd.DataFrame() object
:type path_or_buf: A `str`, `bytes` or os.PathLike object
"""

# Make sure we have specified a read function. This should never
# be called by an end user, so our code should always include one,
# but you never know.
if not func:
raise ValueError("Must specify a read function in the `func` arg.")
if not read_function:
raise ValueError("Must specify a read function in the `read_function` arg.")

# Make sure we have a valid type of data for `path_or_buf` in glob(),
# otherwise raise the same exception the original pandas function
# would
if not type(path_or_buf) in [str, bytes, os.PathLike]:
raise ValueError(f"Invalid file path or buffer object type: {type(path_or_buf)}")

combined_df = pd.concat(
[
func(f, *args, **kwargs)
for f in glob(path_or_buf)
],
ignore_index=True
)
combined_df = pd.DataFrame()
for f in glob(path_or_buf):
temp_df = read_function(f, *args, **kwargs)
if post_function:
temp_df = post_function(temp_df, f)
combined_df = combined_df.append(temp_df, ignore_index=True)

return combined_df

Expand All @@ -48,7 +47,7 @@ def read_json(path_or_buf=None, *args, **kwargs):
"""

return _read_multi(
func=pd.read_json,
read_function=pd.read_json,
path_or_buf=path_or_buf,
*args,
**kwargs
Expand All @@ -61,7 +60,7 @@ def read_csv(path_or_buf=None, *args, **kwargs):
"""

return _read_multi(
func=pd.read_csv,
read_function=pd.read_csv,
path_or_buf=path_or_buf,
*args,
**kwargs
Expand Down
14 changes: 13 additions & 1 deletion huntlib/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,18 @@ def benfords(numbers):
'''

def _first_digit(i: float):
# This doesn't apply to zeros!
if i == 0:
return np.nan
# Make negative numbers positive
if i < 0:
i = abs(i)
# If the number is between 0 and 1, multiply by 10 until it becomes > 1
# so the repeated divisions will work
elif i < 1:
while i < 1:
i *= 10

while i >= 10:
i //= 10
return trunc(i)
Expand All @@ -157,7 +169,7 @@ def _first_digit(i: float):
numbers = numbers.values

numbers = pd.DataFrame(numbers, columns=['numbers'])
numbers['digits'] = numbers['numbers'].apply(_first_digit)
numbers['digits'] = numbers['numbers'].apply(_first_digit).dropna()

counts = numbers['digits'].value_counts()

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
long_description = fh.read()

setup(name='huntlib',
version='0.5.1',
version='0.5.3',
description='A Python library to help with some common threat hunting data analysis operations',
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
57 changes: 57 additions & 0 deletions test-infra.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash

# Start / Stop the Docker infrastructure (Splunk Enterprise and
# Elasticsearch containers, plus Elastic TLS certs) needed to run the
# huntlib test suite.
#
# Usage: ./test-infra.sh {start|stop}

case "$1" in
    "start")
        # Call ourselves to stop any running containers and reset the test environment
        "$0" stop

        echo "****** Sleeping to allow containers to stop ******"
        sleep 10

        echo "****** Creating Elastic TLS Certs ******"
        rm -rf support/certs
        mkdir support/certs
        docker run -it --name create_elastic_certs -e CERTS_DIR=/usr/share/elasticsearch/config/certificates -v "$(pwd)/support/certs:/certs" -v "$(pwd)/support/certificates:/usr/share/elasticsearch/config/certificates" docker.elastic.co/elasticsearch/elasticsearch:7.6.2 bash -c "yum install -y -q -e 0 unzip; ls -la /certs ; ls -la /usr/share/elasticsearch/config/certificates ;if [[ ! -f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip ; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"

        echo "****** Starting Splunk Enterprise via Docker ******"
        docker run -it -d --name splunk_test -e SPLUNK_START_ARGS=--accept-license -e SPLUNK_LICENSE_URI=/tmp/splunk.lic -e SPLUNK_PASSWORD=testpass -p 8000:8000 -p 8089:8089 -v "$(pwd)/support/Splunk.License:/tmp/splunk.lic" -v "$(pwd)/support/test-data.json:/tmp/test-data.json" -v "$(pwd)/support/test-data-large.json:/tmp/test-data-large.json" splunk/splunk:latest
        echo "****** Starting Elastic via Docker ******"
        docker run -d -it --name elastic_test -e node.name=es01 -e cluster.initial_master_nodes=es01 -e xpack.license.self_generated.type=trial -e xpack.security.enabled=true -e xpack.security.http.ssl.enabled=true -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt -v "$(pwd)/support/certs:/usr/share/elasticsearch/config/certificates" -p 9200:9200 docker.elastic.co/elasticsearch/elasticsearch:7.6.2
        echo "****** Sleeping to allow containers to start ******"
        sleep 60

        echo "****** Loading Splunk data ******"
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata

        echo "****** Loading Elastic data ******"
        # Capture the auto-generated elastic password, then reset it to the
        # well-known test password the test suite expects.
        docker exec elastic_test bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt
        echo \{\"password\": \"testpass\"\} | curl -u elastic:`cat /tmp/elastic_pass.txt` --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null
        ;;
    "stop")
        echo "****** Stopping any previous Splunk container ******"
        docker kill splunk_test
        docker stop splunk_test
        docker rm splunk_test
        echo "****** Stopping any previous Elastic containers ******"
        docker kill create_elastic_certs
        docker stop create_elastic_certs
        docker rm create_elastic_certs
        docker kill elastic_test
        docker stop elastic_test
        docker rm elastic_test
        echo "****** Cleaning up artifacts ******"
        rm -rf support/certs
        rm -f /tmp/elastic_pass.txt
        ;;
    *)
        echo "Unknown command: $1"
        # POSIX exit statuses are 0-255; `exit -1` is invalid/non-portable.
        exit 1
        ;;
esac

18 changes: 18 additions & 0 deletions tests/test_benfords_law.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,21 @@ def test_benfords_benfords(self):
0.99,
f"Chi square p-value was too low."
)

def test_benfords_floats_and_negatives(self):
    """benfords() should treat floats, sub-1 values and negatives by their
    first significant digit, so this Benford-shaped sample must conform."""
    samples = [
        1, 1.0, 0.001, 1, 1, 1, 1, 1, 0.02, 2, 2, 2.99,
        3, 3.14159, 3, 4, 4, -5, 5, 6, 6, 7, 7, -8, 9,
    ]

    chi_sq, p_value, digit_counts = benfords(samples)

    self.assertLessEqual(
        chi_sq,
        0.05,
        f"The chosen distribution did not conform to Benford's law, but should have. (chisquare={chi_sq})"
    )

    self.assertGreaterEqual(
        p_value,
        0.99,
        f"Chi square p-value was too low."
    )
13 changes: 13 additions & 0 deletions tests/test_imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python

# Smoke test: wildcard-import every module in the package. A stale or
# misspelled entry in a module's __all__ makes `from module import *`
# raise, so simply importing this file catches exported names that no
# longer exist (e.g. a function that was moved or removed).

from huntlib import *
from huntlib.data import *
from huntlib.util import *
from huntlib.elastic import *
from huntlib.splunk import *
from huntlib.exceptions import *
from huntlib.decorators import *

38 changes: 38 additions & 0 deletions tests/test_multi_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,42 @@ def test_read_csv(self):
self.assertEqual(rows, 6, "The resulting DataFrame had the wrong number of rows.")
self.assertEqual(df.index.nunique(), 6, "DataFrame index values are not unique.")

def test_read_json_post_process(self):
    """Verify that read_json() applies the post_function to each chunk
    as it is read, before the chunks are merged."""

    def _tag_and_trim(chunk, source):
        # Drop the timestamp column (if present) and record the source file.
        if 'ts' in chunk.columns:
            chunk = chunk.drop('ts', axis='columns')
        chunk['filename'] = source
        return chunk

    df = huntlib.data.read_json(
        "support/*.json",
        lines=True,
        post_function=_tag_and_trim
    )

    rows, cols = df.shape

    self.assertEqual(cols, 6, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 3000015, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 3000015, "DataFrame index values are not unique.")
    self.assertNotIn('ts', df.columns, "The 'ts' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")

def test_read_csv_post_process(self):
    """Verify that read_csv() applies the post_function to each chunk
    as it is read, before the chunks are merged."""

    def _post_process(df, filename):
        # Drop the 'c' column and record which file each row came from.
        if 'c' in df.columns:
            df = df.drop('c', axis='columns')
        # Assign the actual source filename, not the literal string
        # 'filename' (the original assigned the literal, so the column
        # never recorded the source file).
        df['filename'] = filename
        return df

    df = huntlib.data.read_csv(
        "support/*.csv",
        post_function=_post_process
    )
    (rows, cols) = df.shape

    self.assertEqual(cols, 3, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 6, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 6, "DataFrame index values are not unique.")
    self.assertNotIn('c', df.columns, "The 'c' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")
43 changes: 3 additions & 40 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -21,50 +21,13 @@ whitelist_externals =
mkdir

commands_pre =
echo "****** Stopping any previous Splunk container ******"
bash -c 'docker kill splunk_test ; docker stop splunk_test ; docker rm splunk_test ; true'
echo "****** Stopping any previous Elastic containers ******"
bash -c 'docker kill create_elastic_certs; docker stop create_elastic_certs; docker rm create_elastic_certs; true'
bash -c 'docker kill elastic_test; docker stop elastic_test; docker rm elastic_test; true'
sleep 10

echo "****** Creating Elastic TLS Certs ******"
rm -rf support/certs
mkdir support/certs
bash -c 'docker run -it --name create_elastic_certs -e CERTS_DIR=/usr/share/elasticsearch/config/certificates -v `pwd`/support/certs:/certs -v `pwd`/support/certificates:/usr/share/elasticsearch/config/certificates docker.elastic.co/elasticsearch/elasticsearch:7.6.2 bash -c "yum install -y -q -e 0 unzip; ls -la /certs ; ls -la /usr/share/elasticsearch/config/certificates ;if [[ ! -f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"'

echo "****** Starting Splunk Enterprise via Docker ******"
bash -c 'docker run -it -d --name splunk_test -e SPLUNK_START_ARGS=--accept-license -e SPLUNK_LICENSE_URI=/tmp/splunk.lic -e SPLUNK_PASSWORD=testpass -p 8000:8000 -p 8089:8089 -v `pwd`/support/Splunk.License:/tmp/splunk.lic -v `pwd`/support/test-data.json:/tmp/test-data.json -v `pwd`/support/test-data-large.json:/tmp/test-data-large.json splunk/splunk:latest'
echo "****** Starting Elastic via Docker ******"
bash -c 'docker run -d -it --name elastic_test -e node.name=es01 -e cluster.initial_master_nodes=es01 -e xpack.license.self_generated.type=trial -e xpack.security.enabled=true -e xpack.security.http.ssl.enabled=true -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt -v `pwd`/support/certs:/usr/share/elasticsearch/config/certificates -p 9200:9200 docker.elastic.co/elasticsearch/elasticsearch:7.6.2'
echo "****** Sleeping to allow containers to start ******"
sleep 60

echo "****** Loading Splunk data ******"
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass'
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata'
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main'
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata'

echo "****** Loading Elastic data ******"
bash -c 'docker exec elastic_test bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt'
bash -c 'echo \{\"password\": \"testpass\"\} | curl -u elastic:`cat /tmp/elastic_pass.txt` --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-'
bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null'
bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null'

bash -c './test-infra.sh start'
echo "****** Sleeping again to allow time for indexing ******"
sleep 20

commands =
python setup.py test

commands_post =
echo "****** Stopping any previous Splunk container ******"
bash -c 'docker kill splunk_test ; docker stop splunk_test ; docker rm splunk_test ; true'
echo "****** Stopping any previous Elastic containers ******"
bash -c 'docker kill create_elastic_certs; docker stop create_elastic_certs; docker rm create_elastic_certs; true'
bash -c 'docker kill elastic_test; docker stop elastic_test; docker rm elastic_test; true'
echo "****** Cleaning up artifacts ******"
rm -rf support/certs
rm -f /tmp/elastic_pass.txt

bash -c './test-infra.sh stop'