diff --git a/README.md b/README.md index 0b65362..21088d5 100755 --- a/README.md +++ b/README.md @@ -362,8 +362,28 @@ or ```python df = read_csv("*.csv") ``` +#### Post-Processing the Input Data +Both `read_json()` and `read_csv()` support an optional `post_function` parameter, which allows you to specify a function to post-process the data after each individual file is read in, before it is merged into the final returned DataFrame. For example, you might want to split or combine columns, or compute a new value from existing data. -Consult the Pandas documentation for information on supported options for `read_csv()` and `read_json()`. +Start by creating a post-processing function according to the following prototype: + +```python +def my_post_processor(df, filename): + # do some stuff + + return df +``` + +When called, the `df` parameter will be a DataFrame containing the chunk of data just read, and the `filename` parameter will be the name of the file it came from, which will be different for each chunk. **IT IS IMPORTANT THAT YOU RETURN `df` no matter whether you modified the input DataFrame or not.** + +Once you have defined the post-processor function, you can invoke it during your call to `read_json()` or `read_csv()` like so: + +```python +df = read_csv("*.csv", post_function=my_post_processor) +``` + +#### Additional Read Options +Consult the Pandas documentation for information on other supported options for `read_csv()` and `read_json()`. ### Normalizing nesting dicts and lists diff --git a/huntlib/__init__.py b/huntlib/__init__.py index 0acb2db..807325d 100644 --- a/huntlib/__init__.py +++ b/huntlib/__init__.py @@ -4,7 +4,7 @@ from huntlib.decorators import future_warning import warnings -__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance', 'flatten'] +__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance'] @future_warning("The huntlib.entropy() function has been moved to huntlib.util.entropy(). 
def _read_multi(read_function=None, path_or_buf=None, post_function=None, *args, **kwargs):
    """
    Given a wildcard filename pattern (which may be just a single static
    filename), expand the wildcard and read all the files into a single
    pandas DataFrame() object.

    :param read_function: Reference to the function which will read an individual data file (e.g., pd.read_csv)
    :param path_or_buf: A wildcard specifying which file(s) to read
    :param post_function: Optional callable invoked as ``post_function(df, filename)`` on each
        file's DataFrame before it is merged. Its return value is what gets merged.
    :param args: Extra positional arguments forwarded to `read_function` for every file
    :param kwargs: Extra keyword arguments forwarded to `read_function` for every file
    :type read_function: A reference to a valid function which returns a pd.DataFrame() object
    :type path_or_buf: A `str`, `bytes` or os.PathLike object
    :type post_function: A callable taking `(pd.DataFrame, filename)` and returning a pd.DataFrame, or None
    :return: One DataFrame holding the rows of every matched file, with a fresh
        0..n-1 index. An empty DataFrame if the pattern matched no files.
    :raises ValueError: If `read_function` is missing or `path_or_buf` is not a path-like value
    """

    # Make sure we have specified a read function. This should never
    # be called by an end user, so our code should always include one,
    # but you never know.
    if not read_function:
        raise ValueError("Must specify a read function in the `read_function` arg.")

    # Make sure we have a valid type of data for `path_or_buf` in glob().
    # isinstance() is required here: os.PathLike is an abstract base class, so
    # an exact type() comparison can never match a real path object such as
    # pathlib.Path.
    if not isinstance(path_or_buf, (str, bytes, os.PathLike)):
        raise ValueError(f"Invalid file path or buffer object type: {type(path_or_buf)}")

    # glob() only understands str/bytes patterns, so unwrap path objects first.
    pattern = os.fspath(path_or_buf)

    frames = []
    # Sorted so the row order of the result is reproducible across platforms.
    for filename in sorted(glob(pattern)):
        frame = read_function(filename, *args, **kwargs)
        if post_function:
            frame = post_function(frame, filename)
        frames.append(frame)

    # pd.concat() refuses an empty list, so keep the "no files matched" case
    # returning an empty DataFrame.
    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of DataFrame.append() per file: append()
    # re-copied the accumulated data on every iteration and no longer exists
    # in pandas 2.x.
    return pd.concat(frames, ignore_index=True)
def _first_digit(i: float):
    """
    Return the leading (most significant) non-zero decimal digit of `i`.

    The sign is ignored, and values between 0 and 1 are scaled up first, so
    -0.0042 yields 4.

    :param i: The number to inspect
    :type i: An `int` or `float`
    :return: An `int` from 1 to 9, or `np.nan` when `i` has no leading digit
        (zero, NaN or infinity)
    """
    # Take the magnitude first so negative fractions also reach the
    # scale-up loop below.
    magnitude = abs(i)

    # This doesn't apply to zeros! NaN (the only value unequal to itself) and
    # infinity have no leading digit either, and would otherwise make
    # trunc() raise.
    if magnitude == 0 or magnitude != magnitude or magnitude == float("inf"):
        return np.nan

    # If the number is between 0 and 1, multiply by 10 until it becomes >= 1
    # so the repeated divisions will work
    while magnitude < 1:
        magnitude *= 10

    # Strip trailing digits until only the leading one is left
    while magnitude >= 10:
        magnitude //= 10

    return trunc(magnitude)
#!/bin/bash

# Start / Stop the infra needed to do tests.
#
# Usage: ./test-infra.sh start|stop
#
#   start  Reset any previous test environment, then create the Elastic TLS
#          certs, start the Splunk and Elastic containers and load test data.
#   stop   Kill and remove the test containers and clean up artifacts.
#
# Paths under support/ are resolved against the current working directory,
# so run this from the directory that contains support/.

# Captured once and quoted everywhere so a checkout path containing spaces
# does not split the docker -v arguments.
WORK_DIR="$(pwd)"

case "$1" in
    "start")
        # Call ourselves to stop any running containers and reset the test environment
        "$0" stop

        echo "****** Sleeping to allow containers to stop ******"
        sleep 10

        echo "****** Creating Elastic TLS Certs ******"
        rm -rf support/certs
        mkdir -p support/certs
        docker run -it --name create_elastic_certs \
            -e CERTS_DIR=/usr/share/elasticsearch/config/certificates \
            -v "$WORK_DIR/support/certs:/certs" \
            -v "$WORK_DIR/support/certificates:/usr/share/elasticsearch/config/certificates" \
            docker.elastic.co/elasticsearch/elasticsearch:7.6.2 \
            bash -c "yum install -y -q -e 0 unzip; ls -la /certs ; ls -la /usr/share/elasticsearch/config/certificates ;if [[ ! -f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip ; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"

        echo "****** Starting Splunk Enterprise via Docker ******"
        docker run -it -d --name splunk_test \
            -e SPLUNK_START_ARGS=--accept-license \
            -e SPLUNK_LICENSE_URI=/tmp/splunk.lic \
            -e SPLUNK_PASSWORD=testpass \
            -p 8000:8000 -p 8089:8089 \
            -v "$WORK_DIR/support/Splunk.License:/tmp/splunk.lic" \
            -v "$WORK_DIR/support/test-data.json:/tmp/test-data.json" \
            -v "$WORK_DIR/support/test-data-large.json:/tmp/test-data-large.json" \
            splunk/splunk:latest

        echo "****** Starting Elastic via Docker ******"
        docker run -d -it --name elastic_test \
            -e node.name=es01 \
            -e cluster.initial_master_nodes=es01 \
            -e xpack.license.self_generated.type=trial \
            -e xpack.security.enabled=true \
            -e xpack.security.http.ssl.enabled=true \
            -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key \
            -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt \
            -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt \
            -v "$WORK_DIR/support/certs:/usr/share/elasticsearch/config/certificates" \
            -p 9200:9200 \
            docker.elastic.co/elasticsearch/elasticsearch:7.6.2

        echo "****** Sleeping to allow containers to start ******"
        sleep 60

        echo "****** Loading Splunk data ******"
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata

        echo "****** Loading Elastic data ******"
        # Generate random built-in passwords and keep only the one for the
        # "elastic" user; it is needed once, to reset that password below.
        docker exec elastic_test bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt
        echo '{"password": "testpass"}' | curl -u "elastic:$(cat /tmp/elastic_pass.txt)" --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null
        ;;
    "stop")
        # The docker commands below report errors when a container does not
        # exist. The script does not abort on them, and the final rm -f
        # leaves the exit status at zero.
        echo "****** Stopping any previous Splunk container ******"
        docker kill splunk_test
        docker stop splunk_test
        docker rm splunk_test
        echo "****** Stopping any previous Elastic containers ******"
        docker kill create_elastic_certs
        docker stop create_elastic_certs
        docker rm create_elastic_certs
        docker kill elastic_test
        docker stop elastic_test
        docker rm elastic_test
        echo "****** Cleaning up artifacts ******"
        rm -rf support/certs
        rm -f /tmp/elastic_pass.txt
        ;;
    *)
        # Exit statuses must be in 0-255; "exit -1" is not portable.
        echo "Unknown command: $1" >&2
        echo "Usage: $0 start|stop" >&2
        exit 1
        ;;
esac
def test_read_json_post_process(self):
    """
    read_json() must run `post_function` on each file's DataFrame before
    merging: the 'ts' column is dropped and a 'filename' column is added.
    """
    def _post_process(df, filename):
        if 'ts' in df.columns:
            df = df.drop('ts', axis='columns')
        df['filename'] = filename
        return df

    df = huntlib.data.read_json(
        "support/*.json",
        lines=True,
        post_function=_post_process
    )
    (rows, cols) = df.shape

    self.assertEqual(cols, 6, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 3000015, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 3000015, "DataFrame index values are not unique.")
    self.assertNotIn('ts', df.columns, "The 'ts' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")
    # Every row must carry the name of a file matched by the wildcard
    self.assertTrue(
        df['filename'].str.endswith('.json').all(),
        "The 'filename' field did not contain the name of the source file."
    )

def test_read_csv_post_process(self):
    """
    read_csv() must run `post_function` on each file's DataFrame before
    merging: the 'c' column is dropped and a 'filename' column is added.
    """
    def _post_process(df, filename):
        if 'c' in df.columns:
            df = df.drop('c', axis='columns')
        # Store the argument, not the literal string 'filename', so the
        # test checks that the file name is really passed through.
        df['filename'] = filename
        return df

    df = huntlib.data.read_csv(
        "support/*.csv",
        post_function=_post_process
    )
    (rows, cols) = df.shape

    self.assertEqual(cols, 3, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 6, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 6, "DataFrame index values are not unique.")
    self.assertNotIn('c', df.columns, "The 'c' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")
    # Every row must carry the name of a file matched by the wildcard
    self.assertTrue(
        df['filename'].str.endswith('.csv').all(),
        "The 'filename' field did not contain the name of the source file."
    )
-f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"' - - echo "****** Starting Splunk Enterprise via Docker ******" - bash -c 'docker run -it -d --name splunk_test -e SPLUNK_START_ARGS=--accept-license -e SPLUNK_LICENSE_URI=/tmp/splunk.lic -e SPLUNK_PASSWORD=testpass -p 8000:8000 -p 8089:8089 -v `pwd`/support/Splunk.License:/tmp/splunk.lic -v `pwd`/support/test-data.json:/tmp/test-data.json -v `pwd`/support/test-data-large.json:/tmp/test-data-large.json splunk/splunk:latest' - echo "****** Starting Elastic via Docker ******" - bash -c 'docker run -d -it --name elastic_test -e node.name=es01 -e cluster.initial_master_nodes=es01 -e xpack.license.self_generated.type=trial -e xpack.security.enabled=true -e xpack.security.http.ssl.enabled=true -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt -v `pwd`/support/certs:/usr/share/elasticsearch/config/certificates -p 9200:9200 docker.elastic.co/elasticsearch/elasticsearch:7.6.2' - echo "****** Sleeping to allow containers to start ******" - sleep 60 - - echo "****** Loading Splunk data ******" - bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass' - bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata' - bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main' - bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata' - - echo "****** Loading Elastic data ******" - bash -c 'docker exec elastic_test 
bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt' - bash -c 'echo \{\"password\": \"testpass\"\} | curl -u elastic:`cat /tmp/elastic_pass.txt` --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-' - bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null' - bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null' - + bash -c './test-infra.sh start' echo "****** Sleeping again to allow time for indexing ******" sleep 20 @@ -59,12 +29,5 @@ commands = python setup.py test commands_post = - echo "****** Stopping any previous Splunk container ******" - bash -c 'docker kill splunk_test ; docker stop splunk_test ; docker rm splunk_test ; true' - echo "****** Stopping any previous Elastic containers ******" - bash -c 'docker kill create_elastic_certs; docker stop create_elastic_certs; docker rm create_elastic_certs; true' - bash -c 'docker kill elastic_test; docker stop elastic_test; docker rm elastic_test; true' - echo "****** Cleaning up artifacts ******" - rm -rf support/certs - rm -f /tmp/elastic_pass.txt - + bash -c './test-infra.sh stop' +