Skip to content

Commit

Permalink
Unicorn Preload (#1356)
Browse files Browse the repository at this point in the history
Modifies the Unicorn emulator module to allow it to be used for "preloading" large binaries. It uses Unicorn to execute x64 instructions in bulk while Manticore handles IO and syscalls. State changes are aggressively written from Manticore back to Unicorn, and lazily written from Unicorn to Manticore before a syscall. Can be used to concretely execute entire binaries, but should not be used once symbolic data has been introduced. 

The following script demonstrates this. The user can register a plugin at startup that tells Manticore to use Unicorn to quickly execute the initialization instructions leading up to the start of `main`. The binary in question is [`multiple-styles`](https://gist.github.com/ehennenfent/a5ad9746615d1490c618a88b98769c10) from an old example.

```python
from manticore.native import Manticore
from manticore.core.plugin import Plugin

# Address of `main` in the target binary; everything before it runs in Unicorn.
MAIN_ADDRESS = 0x4009ae


class concretePlugin(Plugin):
    """Plugin that asks the CPU to execute concretely up to `main`."""

    def will_start_run_callback(self, state, *_args):
        """Request Unicorn emulation until MAIN_ADDRESS when the run starts."""
        cpu = state.cpu
        cpu.emulate_until(MAIN_ADDRESS)


engine = Manticore("multiple-styles", concrete_start='coldlikeminisoda')
preload_plugin = concretePlugin()
engine.register_plugin(preload_plugin)
engine.run()
```

Once `main` is reached, Manticore takes over and continues execution as normal. In the case of this example, the full solution is `coldlikeminisodas`, so Manticore generates two different test cases for the remaining byte. The performance improvement for this example is only marginal because the bulk of the time is taken up by the solver, and relatively little initialization is required before executing `main`. However, more complex binaries can see very significant speedups.
  • Loading branch information
ehennenfent committed Feb 26, 2019
1 parent cdae2bc commit bc77660
Show file tree
Hide file tree
Showing 13 changed files with 865 additions and 237 deletions.
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ os:
- linux
language: python
python:
- 3.6.5
- 3.6.6

stages:
- prepare
Expand All @@ -30,8 +30,8 @@ branches:
cache:
pip: true
directories:
- $HOME/virtualenv/python3.6.5/lib/python3.6/site-packages
- $HOME/virtualenv/python3.6.5/bin/
- $HOME/virtualenv/python3.6.6/lib/python3.6/site-packages
- $HOME/virtualenv/python3.6.6/bin/

jobs:
include:
Expand Down
3 changes: 3 additions & 0 deletions manticore/core/smtlib/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,9 @@ def array(self):
def index(self):
return self.operands[1]

def __repr__(self):
# Debug representation: the index expression on the first line, then the
# repr of the array being selected from on the following line(s).
return f"<ArraySelect obj with index={self.index}:\n{self.array}>"


class BitVecSignExtend(BitVecOperation):
def __init__(self, operand, size_dest, *args, **kwargs):
Expand Down
169 changes: 146 additions & 23 deletions manticore/native/cpu/abstractcpu.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,27 @@
import inspect
import io
import logging
import struct
from functools import wraps
from itertools import islice

import io
import struct
import unicorn
from functools import wraps

from .disasm import init_disassembler
from ..memory import ConcretizeMemory, InvalidMemoryAccess, LazySMemory
from ...core.smtlib import BitVec, Operators, Constant, visitors
from ..memory import (
ConcretizeMemory, InvalidMemoryAccess, FileMap, AnonMap
)
from ..memory import LazySMemory
from ...core.smtlib import Expression, BitVec, Operators, Constant
from ...core.smtlib import visitors
from ...core.smtlib.solver import solver
from ...utils.emulate import UnicornEmulator
from ...utils.emulate import ConcreteUnicornEmulator
from ...utils.event import Eventful
from ...utils.fallback_emulator import UnicornEmulator
from ...utils.helpers import issymbolic

from capstone.x86 import X86_REG_ENDING

logger = logging.getLogger(__name__)
register_logger = logging.getLogger(f'{__name__}.registers')

Expand Down Expand Up @@ -139,6 +146,9 @@ def _reg_name(self, reg_id):
:param int reg_id: Register ID
'''
if reg_id >= X86_REG_ENDING:
logger.warning("Trying to get register name for a non-register")
return None
cs_reg_name = self.cpu.instruction.reg_name(reg_id)
if cs_reg_name is None or cs_reg_name.lower() == '(invalid)':
return None
Expand Down Expand Up @@ -442,7 +452,7 @@ class Cpu(Eventful):
'''

_published_events = {'write_register', 'read_register', 'write_memory', 'read_memory', 'decode_instruction',
'execute_instruction'}
'execute_instruction', 'set_descriptor', 'map_memory', 'protect_memory', 'unmap_memory'}

def __init__(self, regfile, memory, **kwargs):
assert isinstance(regfile, RegisterFile)
Expand All @@ -453,6 +463,9 @@ def __init__(self, regfile, memory, **kwargs):
self._instruction_cache = {}
self._icount = 0
self._last_pc = None
self._concrete = kwargs.pop("concrete", False)
self.emu = None
self._break_unicorn_at = None
if not hasattr(self, "disasm"):
self.disasm = init_disassembler(self._disasm, self.arch, self.mode)
# Ensure that regfile created STACK/PC aliases
Expand All @@ -466,15 +479,19 @@ def __getstate__(self):
state['icount'] = self._icount
state['last_pc'] = self._last_pc
state['disassembler'] = self._disasm
state['concrete'] = self._concrete
state['break_unicorn_at'] = self._break_unicorn_at
return state

def __setstate__(self, state):
Cpu.__init__(self, state['regfile'],
state['memory'],
disasm=state['disassembler'])
disasm=state['disassembler'], concrete=state['concrete'])
self._icount = state['icount']
self._last_pc = state['last_pc']
self._disasm = state['disassembler']
self._concrete = state['concrete']
self._break_unicorn_at = state['break_unicorn_at']
super().__setstate__(state)

@property
Expand Down Expand Up @@ -563,6 +580,18 @@ def __setattr__(self, name, value):
except AttributeError:
object.__setattr__(self, name, value)

def emulate_until(self, target: int):
"""
Tells the CPU to set up a concrete unicorn emulator and use it to execute instructions
until target is reached.
Marks this CPU as concrete and records target as the address to break at. If an
emulator object already exists, its stop address is updated to target as well.
:param target: Where Unicorn should hand control back to Manticore. Set to 0 for all instructions.
"""
self._concrete = True
self._break_unicorn_at = target
# Keep an already-created emulator in sync with the new stop address.
if self.emu:
self.emu._stop_at = target

#############################
# Memory access
@property
Expand All @@ -589,6 +618,40 @@ def write_int(self, where, expression, size=None, force=False):

self._publish('did_write_memory', where, expression, size)

def _raw_read(self, where: int, size=1) -> bytes:
"""
Selects bytes from memory. Attempts to do so faster than via read_bytes.
Looks up the map containing `where` and picks a read strategy based on the
map's exact type (FileMap, AnonMap, or anything else).
:param where: address to read from
:param size: number of bytes to read
:return: the bytes in memory
"""
# NOTE(review): the local name `map` shadows the builtin inside this method.
map = self.memory.map_containing(where)
start = map._get_offset(where)
mapType = type(map)
# File-backed map: slice the backing data, zero-fill what is missing, then
# splice in the entries stored in the map's overlay.
if mapType is FileMap:
end = map._get_offset(where + size)

# Part of the requested range lies past the mapped portion of the file.
if end > map._mapped_size:
logger.warning(f"Missing {end - map._mapped_size} bytes at the end of {map._filename}")

raw_data = map._data[map._get_offset(where): min(end, map._mapped_size)]
# NOTE(review): this pads up to `end` (an offset into the map) rather than up
# to `size`; when start > 0 that looks like it produces more than `size`
# bytes and would trip the assert below - confirm against FileMap.
if len(raw_data) < end:
raw_data += b'\x00' * (end - len(raw_data))

data = b''
# NOTE(review): overlay keys are used as indices into raw_data, which begins at
# `start`; presumably the keys are map offsets, in which case this only
# lines up when start == 0 - verify against FileMap._overlay.
for offset in sorted(map._overlay.keys()):
data += raw_data[len(data):offset]
data += map._overlay[offset]
data += raw_data[len(data):]

# Anonymous map: copy the requested range straight out of the backing buffer.
elif mapType is AnonMap:
data = bytes(map._data[start:start + size])
# Any other map type: slice the memory object and join the resulting pieces.
else:
data = b''.join(self.memory[where:where + size])
# Sanity check that exactly `size` bytes were produced.
assert len(data) == size, 'Raw read resulted in wrong data read which should never happen'
return data

def read_int(self, where, size=None, force=False):
'''
Reads int from memory
Expand Down Expand Up @@ -620,8 +683,28 @@ def write_bytes(self, where, data, force=False):
:type data: str or list
:param force: whether to ignore memory permissions
'''
for i in range(len(data)):
self.write_int(where + i, Operators.ORD(data[i]), 8, force)

mp = self.memory.map_containing(where)
# TODO (ehennenfent) - fast write can have some yet-unstudied unintended side effects.
# At the very least, using it in non-concrete mode will break the symbolic strcmp/strlen models. The 1024 byte
# minimum is intended to minimize the potential effects of this by ensuring that if there _are_ any other
# issues, they'll only crop up when we're doing very large writes, which are fairly uncommon.
can_write_raw = type(mp) is AnonMap and \
isinstance(data, (str, bytes)) and \
(mp.end - mp.start + 1) >= len(data) >= 1024 and \
not issymbolic(data) and \
self._concrete

if can_write_raw:
logger.debug("Using fast write")
offset = mp._get_offset(where)
if isinstance(data, str):
data = bytes(data.encode('utf-8'))
mp._data[offset:offset + len(data)] = data
self._publish('did_write_memory', where, data, 8 * len(data))
else:
for i in range(len(data)):
self.write_int(where + i, Operators.ORD(data[i]), 8, force)

def read_bytes(self, where, size, force=False):
'''
Expand Down Expand Up @@ -778,7 +861,7 @@ def decode_instruction(self, pc):
policy='INSTRUCTION')
text += c

#Pad potentially incomplete instruction with zeroes
# Pad potentially incomplete instruction with zeroes
code = text.ljust(self.max_instr_width, b'\x00')

try:
Expand Down Expand Up @@ -840,17 +923,25 @@ def execute(self):
register_logger.debug(l)

try:
implementation = getattr(self, name, None)

if implementation is not None:
implementation(*insn.operands)

else:
text_bytes = ' '.join('%02x' % x for x in insn.bytes)
logger.warning("Unimplemented instruction: 0x%016x:\t%s\t%s\t%s",
insn.address, text_bytes, insn.mnemonic, insn.op_str)
if self._concrete and 'SYSCALL' in name:
self.emu.sync_unicorn_to_manticore()
if self._concrete and 'SYSCALL' not in name:
self.emulate(insn)
if self.PC == self._break_unicorn_at:
logger.debug("Switching from Unicorn to Manticore")
self._break_unicorn_at = None
self._concrete = False
else:
implementation = getattr(self, name, None)

if implementation is not None:
implementation(*insn.operands)

else:
text_bytes = ' '.join('%02x' % x for x in insn.bytes)
logger.warning("Unimplemented instruction: 0x%016x:\t%s\t%s\t%s",
insn.address, text_bytes, insn.mnemonic, insn.op_str)
self.backup_emulate(insn)
except (Interruption, Syscall) as e:
e.on_handled = lambda: self._publish_instruction_as_executed(insn)
raise e
Expand All @@ -868,16 +959,48 @@ def _publish_instruction_as_executed(self, insn):
self._publish('did_execute_instruction', self._last_pc, self.PC, insn)

def emulate(self, insn):
"""
Pick the right emulate function (maintains API compatibility)
Dispatches to concrete_emulate when this CPU is marked concrete, and to
backup_emulate otherwise.
:param insn: single instruction to emulate/start emulation from
"""

if self._concrete:
self.concrete_emulate(insn)
else:
self.backup_emulate(insn)

def concrete_emulate(self, insn):
"""
Start executing in Unicorn from this point until we hit a syscall or reach break_unicorn_at
:param capstone.CsInsn insn: The instruction object to emulate
:raises InstructionEmulationError: carrying the text of a caught unicorn.UcError
"""

# Create the ConcreteUnicornEmulator on first use; later calls reuse self.emu.
# NOTE(review): indentation was lost in this rendering, so it is unclear whether
# the _stop_at assignment runs only on creation or on every call - confirm.
if not self.emu:
self.emu = ConcreteUnicornEmulator(self)
self.emu._stop_at = self._break_unicorn_at
try:
self.emu.emulate(insn)
except unicorn.UcError as e:
# For invalid-instruction errors, log the instruction's address, raw bytes,
# mnemonic and operand string.
if e.errno == unicorn.UC_ERR_INSN_INVALID:
text_bytes = ' '.join('%02x' % x for x in insn.bytes)
logger.error("Unimplemented instruction: 0x%016x:\t%s\t%s\t%s",
insn.address, text_bytes, insn.mnemonic, insn.op_str)
# NOTE(review): presumably every UcError is re-raised as
# InstructionEmulationError, not only the invalid-instruction case - confirm.
raise InstructionEmulationError(str(e))

def backup_emulate(self, insn):
'''
If we could not handle emulating an instruction, use Unicorn to emulate
it.
:param capstone.CsInsn instruction: The instruction object to emulate
'''

emu = UnicornEmulator(self)
if not hasattr(self, 'backup_emu'):
self.backup_emu = UnicornEmulator(self)
try:
emu.emulate(insn)
self.backup_emu.emulate(insn)
except unicorn.UcError as e:
if e.errno == unicorn.UC_ERR_INSN_INVALID:
text_bytes = ' '.join('%02x' % x for x in insn.bytes)
Expand All @@ -888,7 +1011,7 @@ def emulate(self, insn):
# We have been seeing occasional Unicorn issues with it not clearing
# the backing unicorn instance. Saw fewer issues with the following
# line present.
del emu
del self.backup_emu

def render_instruction(self, insn=None):
try:
Expand Down
4 changes: 3 additions & 1 deletion manticore/native/cpu/cpufactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ class CpuFactory:

@staticmethod
def get_cpu(mem, machine):
return CpuFactory._cpus[machine](mem)
cpu = CpuFactory._cpus[machine](mem)
mem.cpu = cpu
return cpu

@staticmethod
def get_function_abi(cpu, os, machine):
Expand Down
6 changes: 4 additions & 2 deletions manticore/native/cpu/x86.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,9 @@ def set_descriptor(self, selector, base, limit, perms):
assert base >= 0 and base < (1 << self.address_bit_size)
assert limit >= 0 and limit < 0xffff or limit & 0xfff == 0
# perms is not used yet. Also, it is not really permissions but rather a bunch of attributes.
self._publish('will_set_descriptor', selector, base, limit, perms)
self._segments[selector] = (base, limit, perms)
self._publish('did_set_descriptor', selector, base, limit, perms)

def get_descriptor(self, selector):
return self._segments.setdefault(selector, (0, 0xfffff000, 'rwx'))
Expand Down Expand Up @@ -4604,7 +4606,7 @@ def PMAXUB(cpu, dest, src):
PMAXUB: returns maximum of packed unsigned byte integers in the dest operand
Performs a SIMD compare of the packed unsigned byte in the second source operand
and the first source operand and returns the maximum value for each pair of
and the first source operand and returns the maximum value for each pair of
integers to the destination operand.
Example :
Expand Down Expand Up @@ -4981,7 +4983,7 @@ def PSLLD(cpu, op0, op1):
"""
PSLLD: Packed shift left logical with double words
Shifts the destination operand (first operand) to the left by the number of bytes specified
Shifts the destination operand (first operand) to the left by the number of bytes specified
in the count operand (second operand). The empty low-order bytes are cleared (set to all 0s).
If the value specified by the count operand is greater than 15, the destination operand is
set to all 0s. The count operand is an 8-bit immediate.
Expand Down
Loading

0 comments on commit bc77660

Please sign in to comment.