Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 486 lines (412 sloc) 16.9 KB
"""
Plugin to find Python strings within process heaps.
"""
import os
import re
import struct
from itertools import groupby
from volatility import debug as debug
from volatility import obj as obj
from volatility.plugins.linux import common as linux_common
from volatility.plugins.linux import pslist as linux_pslist
from volatility.renderers import TreeGrid
from volatility import utils
# Note: It doesn't actually matter if Py_TRACE_REF is defined, that just means
# there are more structures at the beginning, which we don't care about
# Volatility vtype table describing the 64-bit layouts of the Python string
# object and of a dict entry. Each member maps to
# [byte offset, [type description]].
# NOTE(review): this table is incomplete as it stands -- the per-struct size
# entries, the braces around each member dict, the contents of the
# `ob_sstate` choices mapping and all closing brackets are missing, so it
# does not parse. Restore them from the upstream plugin before relying on it.
pyobjs_vtype_64 = {
'_PyStringObject': [
'ob_refcnt': [0, ['long long']], # Py_ssize_t = ssize_t
'ob_type': [8, ['pointer', ['void']]], # struct _typeobject *
'ob_size': [16, ['long long']], # Py_ssize_t = ssize_t
'ob_shash': [24, ['long long']],
'ob_sstate': [32, ['Enumeration',
dict(target='int', choices={
'ob_sval': [36, ['array', 1, ['char']]]
'_PyDictEntry': [
'me_hash': [0, ['long long']], # Py_ssize_t = ssize_t
'me_key': [8, ['pointer', ['_PyStringObject']]],
'me_value': [16, ['pointer', ['_PyStringObject']]]
class _PyStringObject(obj.CType):
    r"""
    A class for python string objects.

    Layout being modelled (stringobject.h)::

        typedef struct {
            PyObject_VAR_HEAD
            long ob_shash;
            int ob_sstate;
            char ob_sval[1];

            /* Invariants:
             *     ob_sval contains space for 'ob_size+1' elements.
             *     ob_sval[ob_size] == 0.
             *     ob_shash is the hash of the string or -1 if not computed yet.
             *     ob_sstate != 0 iff the string object is in stringobject.c's
             *       'interned' dictionary; in this case the two references
             *       from 'interned' to this object are *not counted* in
             *       ob_refcnt.
             */
        } PyStringObject;

    object.h - note that _PyObject_HEAD_EXTRA is empty if
    Py_TRACE_REFs is not defined::

        /* PyObject_HEAD defines the initial segment of every PyObject. */
        #define PyObject_HEAD                   \
            _PyObject_HEAD_EXTRA                \
            Py_ssize_t ob_refcnt;               \
            struct _typeobject *ob_type;

        #define PyObject_VAR_HEAD               \
            PyObject_HEAD                       \
            Py_ssize_t ob_size; /* Number of items in variable part */
    """

    def is_valid(self):
        """
        Determine whether the Python string struct is valid - an easy way to
        check is to calculate the hash of the string, and see if it matches
        the `ob_shash`.

        On Python 2.7, the hash function used is FNV.

        This assumes that the python version volatility is using matches the
        python version of the memory dump, because it uses the `hash()`
        function to compute the hash.

        :return: True when every check passes, False otherwise.
        """
        # The checks run cheapest-first so that garbage candidates are
        # rejected before any string data is read from the image.
        if not self.ob_type.is_valid():
            return False

        # skip empty strings and strings that are too big
        if not (self.ob_size > 0 and self.ob_size <= 1e6):
            return False

        # state must be one of the valid states
        if self.ob_sstate.v() not in self.ob_sstate.choices:
            return False

        # the string should be null-terminated
        ob_sval_offset, _ = self.members['ob_sval']
        terminator_address = self.obj_offset + ob_sval_offset + self.ob_size
        if self.obj_vm.zread(terminator_address, 1) != '\x00':
            return False

        # the hash may not have been computed (-1), but otherwise
        # it should be correct
        return bool(self.ob_shash == -1 or
                    self.ob_shash == hash(self.string))

    @property
    def string(self):
        """
        Read the string from memory, because `ob_sval` is a
        :class:`volatility.obj.NativeType.Array` object, which is slow to
        iterate through to turn into a string.

        :return: the `ob_size` bytes starting at `ob_sval`.
        """
        sval_offset, _ = self.members['ob_sval']
        return self.obj_vm.zread(self.obj_offset + sval_offset,
                                 self.ob_size)
class _StringStringPyDictEntry(obj.CType):
    """
    A dictionary entry whose key and value are both Python strings.

    Layout being modelled::

        typedef struct {
            Py_ssize_t me_hash;
            PyObject *me_key;
            PyObject *me_value;
        } PyDictEntry;

        /* Nothing is actually declared to be a PyObject, but every pointer to
         * a Python object can be cast to a PyObject*.  This is inheritance
         * built by hand.  Similarly every pointer to a variable-size Python
         * object can, in addition, be cast to PyVarObject*.
         */
        typedef struct _object {
            PyObject_HEAD
        } PyObject;
    """

    def is_valid(self):
        """
        Determine whether the {Python string key: Python string val}
        PyDictEntry struct is valid.

        Both pointers should be valid, and the hash of the entry should be
        the same as the hash of the key.

        :return: a truthy value when the entry looks genuine, else False.
        """
        # Check the raw pointers first: dereferencing an unmapped pointer
        # would only produce an object that fails validation more slowly.
        if not (self.me_key.is_valid() and self.me_value.is_valid()):
            return False

        key = self.key
        if key.is_valid() and key.ob_shash == self.me_hash:
            return self.value.is_valid()
        return False

    @property
    def key(self):
        """The string object that `me_key` points to."""
        return self.me_key.dereference()

    @property
    def value(self):
        """The string object that `me_value` points to."""
        return self.me_value.dereference()
class PythonStringTypes(obj.ProfileModification):
    """
    Profile modifications for Python string types.  Only Linux and Mac OS,
    on 64-bit systems, are supported right now.
    """
    # Only applied to profiles whose metadata satisfies both predicates.
    conditions = {"os": lambda x: x in ["linux", "mac"],
                  "memory_model": lambda x: x == "64bit"}

    def modification(self, profile):
        """
        Add python string overlays to the profile's vtypes.

        :param profile: the profile being modified in place.
        """
        # Register the struct layouts, then bind each layout name to the
        # class that adds validation and accessors on top of it.
        profile.vtypes.update(pyobjs_vtype_64)
        profile.object_classes.update({
            "_PyStringObject": _PyStringObject,
            "_PyDictEntry": _StringStringPyDictEntry
        })
def brute_force_search(addr_space, obj_type_string, start, end, step_size=1):
    """
    Brute-force search an area of memory for a given object type.  Returns
    valid types as a generator.

    :param addr_space: address space the candidate objects are read from.
    :param obj_type_string: name of the object type to instantiate.
    :param start: first offset to try.
    :param end: offset at which to stop (exclusive).
    :param step_size: how far to advance after a candidate fails validation.

    The skip after a successful match uses ``ob_size``, so the object type
    is expected to be a variable-sized Python object that has that member.
    """
    offset = start
    while offset < end:
        found_object = obj.Object(obj_type_string,
                                  offset=offset,
                                  vm=addr_space)
        if found_object.is_valid():
            yield found_object
            # Jump over the whole object (fixed part plus variable payload)
            # instead of re-testing offsets that lie inside it.
            offset += found_object.size() + found_object.ob_size
        else:
            offset += step_size
# NOTE(review): incomplete as it stands -- the docstring quotes, the
# remaining xrange() arguments, the statement that (presumably) fills
# `bfed_strings` (only its `end=` argument survives), and whatever is
# returned when fewer than 5 strings are found are all missing. Restore from
# the upstream plugin before use.
def _brute_force_5_strings(addr_space, heaps):
Search the heaps 5K at a time until 5 strings are found. Why 5?
Arbitrary. Just so long as it's not 1, which may be a false positive.
bfed_strings = []
chunk_size = 1024 * 5
for heap_vma in heaps:
for chunk_start in xrange(heap_vma.vm_start,
end=chunk_start + chunk_size - 1,
if len(bfed_strings) >= 5:
return bfed_strings
def find_python_strings(task):
    """
    Attempt to find python strings.  Brute-force search is pretty slow, so we
    are going to optimize a bit.

    The `ob_type` of a PyObjString is a pretty involved struct, so we are not
    searching on that pattern, but all Python strings should point to the
    same type in memory.

    We will brute-force search the heaps only until a couple of strings are
    found.  We want to make sure that they all point to the same type in
    memory.  Once we have a good guess at where that type resides in memory,
    we can search specifically for that address value and use that as a hint
    as to where there might be a PyObjString.

    We want to search the rest of memory though

    :param task: the task whose memory is searched.
    :return: generator of valid `_PyStringObject` instances.
    """
    addr_space = task.get_process_address_space()

    # get_heaps_and_anon() is a generator and the VMAs are walked twice
    # (brute force, then pattern scan).  Without materialising it, the scan
    # would only see the VMAs the brute-force pass had not consumed yet.
    heaps_and_anon = list(get_heaps_and_anon(task))

    likely_strings = _brute_force_5_strings(addr_space, heaps_and_anon) or []

    # Group on the pointer *value*: groupby() only merges adjacent items, and
    # pointer objects read from different offsets are distinct dict keys even
    # when they hold the same address.
    likely_strings_by_type = {}
    for pystr in likely_strings:
        likely_strings_by_type.setdefault(
            pystr.ob_type.v(), []).append(pystr)

    debug.debug("Found {0} possible str _typeobject pointer(s): {1}".format(
        len(likely_strings_by_type),
        ", ".join([
            "0x{0:012x} ({1})".format(type_address, len(strings))
            for type_address, strings in likely_strings_by_type.items()])))

    if not likely_strings_by_type:
        # Nothing to look for; avoid reading every VMA for no patterns.
        return

    memory_model = addr_space.profile.metadata.get('memory_model', '32bit')
    pack_format = "I" if memory_model == '32bit' else "Q"

    offset = addr_space.profile.get_obj_offset("_PyStringObject", "ob_type")
    str_types_as_bytes = [struct.pack(pack_format, type_address)
                          for type_address in likely_strings_by_type]

    for address in search_vmas(str_types_as_bytes, heaps_and_anon, task):
        # We will find the likely_strings again, but that's ok
        # `address` is where the ob_type field sits; step back to the start
        # of the enclosing string object.
        py_string = obj.Object("_PyStringObject",
                               offset=address - offset,
                               vm=addr_space)
        if py_string.is_valid():
            yield py_string
def search_vmas(s, vmas, task):
    """
    Searches VMAs for lists of strings.

    The task's own `search_process_memory` (used elsewhere in this file)
    could be used, but we want to search more than the heap and less than all
    of process memory.

    This code is mostly copied from there.

    :param s: iterable of byte strings to look for.
    :param vmas: the VMAs to scan, each with `vm_start` and `vm_end`.
    :param task: task providing the process address space.
    :return: generator of addresses at which one of the strings starts.
    """
    # Allow for some overlap in case objects are
    # right on page boundaries
    overlap = 1024
    scan_blk_sz = 1024 * 1024 * 10

    addr_space = task.get_process_address_space()

    for vma in vmas:
        offset = vma.vm_start
        out_of_range = vma.vm_end
        while offset < out_of_range:
            # Read some data and match it.
            to_read = min(scan_blk_sz + overlap, out_of_range - offset)
            data = addr_space.zread(offset, to_read)
            if not data:
                break

            advance = min(to_read, scan_blk_sz)
            for x in s:
                for hit in utils.iterfind(data, x):
                    # A hit that starts in the overlap tail also starts
                    # inside the next block; report it only there so each
                    # address is yielded once.
                    if hit < advance:
                        yield offset + hit
            offset += advance
def get_heaps_and_anon(task):
    """
    Given a task, return the mapped sections corresponding to that task's
    heaps and anonymous mappings (since CPython sometimes mmaps things).

    :param task: task whose process maps are inspected.
    :return: generator of matching VMAs, in process-map order.  It can only
        be iterated once; wrap it in `list()` to walk it repeatedly.
    """
    for vma in task.get_proc_maps():
        # A mapping that spans the program break range is the heap.
        is_heap = (vma.vm_start <= task.mm.start_brk and
                   vma.vm_end >= task.mm.brk)
        if is_heap or vma.vm_name(task) == "Anonymous Mapping":
            yield vma
def _is_python_task(task):
    """
    Return true if this is a python task (as per the executable name, not
    necessarily by task name), false otherwise.

    :param task: task whose process maps are inspected.
    :return: True or False (never an empty list).
    """
    for vma in task.get_proc_maps():
        # The first mapping that contains the task's code segment is the
        # executable; only that mapping's name decides the answer.
        if (task.mm.start_code >= vma.vm_start and
                task.mm.end_code <= vma.vm_end):
            return 'python' in vma.vm_name(task)
    return False
# Plugin class, built on linux_pslist, that yields (task, python string)
# pairs for tasks accepted by _is_python_task(), optionally filtered by a
# regex and written to per-task files.
# NOTE(review): this class is incomplete as it stands -- docstring quotes and
# indentation are missing, and so are: the call(s) that register the two
# options in `__init__`, the second operand of the regex test in
# `calculate`, the final argument of TreeGrid(...) in `unified_output`, the
# elided operands in `generator` (`.format(, task.comm)`, `if not in files`,
# `files[]`, `int(,`), and the body of the final `for file_handle` loop.
# Restore from the upstream plugin before use.
# NOTE(review): both options are declared with default='None' (a string),
# while the code tests `is None` / `is not None`. With that default,
# `_validate_config` would compile the literal pattern "None" and the
# DUMP_DIR isdir() check would fail. Presumably default=None was intended --
# confirm.
class linux_python_strings(linux_pslist.linux_pslist):
Pull python strings from a process's heap.
def __init__(self, config, *args, **kwargs):
Add a configuration for checking strings, basically a regex to check
linux_pslist.linux_pslist.__init__(self, config, *args, **kwargs)
'REGEX', default='None', type='string',
help='Provide a regex: only return strings that match the regex.')
'DUMP-DIR', default='None', type='string',
help='Output strings to file(s) in this dump directory.')
def _validate_config(self):
Check the config values, and converts them to the right value.
if self._config.REGEX:
self._config.REGEX = re.compile(self._config.REGEX)
if (self._config.DUMP_DIR is not None and
not os.path.isdir(os.path.expanduser(self._config.DUMP_DIR))):
debug.error(self._config.DUMP_DIR + " is not a directory")
self._config.DUMP_DIR = None
def calculate(self):
Find the tasks that are actually python processes. May not
necessarily be called "python", but the executable is python.
Then find all python strings in that process's heap.
tasks = [task for task in linux_pslist.linux_pslist.calculate(self)
if _is_python_task(task)]
for task in tasks:
for py_string in find_python_strings(task):
if (self._config.REGEX is None or
yield task, py_string
def unified_output(self, data):
Return a TreeGrid with data to print out.
return TreeGrid([("Pid", int),
("Name", str),
("Size", int),
("String", str)],
def generator(self, data):
If writing to a file is desired, write to a file. Also generate data
that may be formatted for printing.
files = {}
for task, py_string in data:
if self._config.DUMP_DIR is not None:
filename = "{0}.{1}.strings".format(, task.comm)
if not in files:
files[] = open(os.path.expanduser(os.path.join(
self._config.DUMP_DIR, filename)), 'wb')
yield (0, [int(,
for file_handle in files.values():
def render_text(self, outfd, data):
self.table_header(outfd, [("Pid", "15"),
("Name", "10"),
("Size", "10"),
("String", "50")])
for _, output in self.generator(data):
self.table_row(outfd, *[str(o) for o in output])
# Plugin class, built on linux_pslist, that yields (task, dict entry) pairs.
# For each string found by find_python_strings() it builds the byte pattern
# "hash + address of the string object" and validates a _PyDictEntry at
# every place the pattern occurs.
# NOTE(review): this class is incomplete as it stands -- docstring quotes and
# indentation are missing, and so are: the default passed to
# `metadata.get('memory_model', ...)`, the closing bracket of `bytestrings`,
# the remaining arguments of `task.search_process_memory(...)` and of
# `obj.Object("_PyDictEntry", ...)`, the final argument of TreeGrid(...),
# and the row contents in `generator`. Restore from the upstream plugin
# before use.
class linux_python_str_dict_entry(linux_pslist.linux_pslist):
Pull {python-strings: python-string} dictionary entries from a process's
def calculate(self):
Get all the python strings for a task, and assume those strings
might be keys of a dictionary entry. Return the valid dictionary
entries from that pool of maybes.
This repeats a lot of linux_python_strings's code, but we want to get
python strings per task, so we can optimize the bytstring search.
tasks = [task for task in linux_pslist.linux_pslist.calculate(self)
if _is_python_task(task)]
for task in tasks:
addr_space = task.get_process_address_space()
memory_model = addr_space.profile.metadata.get('memory_model',
pack_format = "I" if memory_model == '32bit' else "Q"
bytestrings = [
# the hash as bytes
# (lower-case struct code = signed, since the hash can be -1 or negative)
struct.pack(pack_format.lower(), py_string.ob_shash) +
# the pointer the PyStringObject as bytes
struct.pack(pack_format, py_string.obj_offset)
for py_string in find_python_strings(task)
for address in task.search_process_memory(bytestrings,
py_dict_entry = obj.Object("_PyDictEntry",
if py_dict_entry.is_valid():
yield task, py_dict_entry
def unified_output(self, data):
Return a TreeGrid with data to print out.
return TreeGrid([("Pid", int),
("Name", str),
("Key", str),
("Value", str)],
def generator(self, data):
Generate data that may be formatted for printing.
for task, py_dict_entry in data:
yield (0, [int(,
You can’t perform that action at this time.