<a href="https://colab.research.google.com/github/tony-wade/Reverse-Engineering/blob/main/Find_the_MCU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal:
將 ROM 檔或截錄的燒錄資料轉換成 .hex，再由此程式
大致比對出晶片的來源（依 ISA 的 opcode 規則）。

注意：第三方廠商可能會自行定義 ISA 中未定義的 opcode（例如 8051 的 A5）。

現成工具: https://dogbolt.org/?id=9cc1776c-a067-4752-bf50-6c457724252e#angr=14&Ghidra=9&Relyze=1231

In [None]:
import os
import numpy as np
import requests
import pandas as pd
from bs4 import BeautifulSoup
import openpyxl

In [None]:
# Web-scraping demo: build a table keyed by 8051 opcode from the Keil page
import requests
from bs4 import BeautifulSoup

# Page that lists the 8051 opcodes
url = 'https://www.keil.com/support/man/docs/is51/is51_opcodes.asp?bhcp=1'

# Download the page and hand the HTML to BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Only <tr class="kt"> rows are inspected
rows = soup.find_all('tr', class_='kt')

# opcode text -> parsed value of the second cell
data_dict = {}

for row in rows:
    # Cells of interest carry class "ktC"
    td_elements = row.find_all('td', class_='ktC')

    # Skip rows that do not have both a key cell and a value cell
    if len(td_elements) < 2:
        continue

    key = td_elements[0].text.strip()
    data_str = td_elements[1].text.strip()

    if not data_str:
        # Empty cell is stored as None
        data = None
    else:
        # Keep the raw text when the cell is not a plain integer
        try:
            data = int(data_str)
        except ValueError:
            data = data_str

    data_dict[key] = data

# Show the result
print(data_dict)

{'00': 1, '01': 2, '02': 3, '03': 1, '04': 1, '05': 2, '06': 1, '07': 1, '08': 1, '09': 1, '0A': 1, '0B': 1, '0C': 1, '0D': 1, '0E': 1, '0F': 1, '10': 3, '11': 2, '12': 3, '13': 1, '14': 1, '15': 2, '16': 1, '17': 1, '18': 1, '19': 1, '1A': 1, '1B': 1, '1C': 1, '1D': 1, '1E': 1, '1F': 1, '20': 3, '21': 2, '22': 1, '23': 1, '24': 2, '25': 2, '26': 1, '27': 1, '28': 1, '29': 1, '2A': 1, '2B': 1, '2C': 1, '2D': 1, '2E': 1, '2F': 1, '30': 3, '31': 2, '32': 1, '33': 1, '34': 2, '35': 2, '36': 1, '37': 1, '38': 1, '39': 1, '3A': 1, '3B': 1, '3C': 1, '3D': 1, '3E': 1, '3F': 1, '40': 2, '41': 2, '42': 2, '43': 3, '44': 2, '45': 2, '46': 1, '47': 1, '48': 1, '49': 1, '4A': 1, '4B': 1, '4C': 1, '4D': 1, '4E': 1, '4F': 1, '50': 2, '51': 2, '52': 2, '53': 3, '54': 2, '55': 2, '56': 1, '57': 1, '58': 1, '59': 1, '5A': 1, '5B': 1, '5C': 1, '5D': 1, '5E': 1, '5F': 1, '60': 2, '61': 2, '62': 2, '63': 3, '64': 2, '65': 2, '66': 1, '67': 1, '68': 1, '69': 1, '6A': 1, '6B': 1, '6C': 1, '6D': 1, '6E': 1, 

In [None]:
def dict_8051(url, timeout=30):
    """
    Scrape an opcode table from ``url`` and return it as a dictionary.

    Every ``<tr class="kt">`` row that has at least two ``<td class="ktC">``
    cells contributes one entry: the stripped text of the first cell is the
    key (the opcode) and the second cell is the value.

    Args:
        url (str): Address of the page holding the opcode table.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        dict: Maps opcode text to an ``int`` when the second cell parses as
        an integer, to the raw cell text when it does not, or to ``None``
        when the cell is empty.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (instead of silently returning an empty
            table built from an error page).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx; an error page would otherwise parse to {}.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    data_dict = {}
    for row in soup.find_all('tr', class_='kt'):
        cells = row.find_all('td', class_='ktC')
        if len(cells) < 2:
            # Not a data row (too few matching cells).
            continue

        key = cells[0].text.strip()
        data_str = cells[1].text.strip()

        if not data_str:
            data_dict[key] = None   # empty cell
            continue

        try:
            data_dict[key] = int(data_str)
        except ValueError:
            # Keep non-numeric text as-is so the caller can decide.
            data_dict[key] = data_str

    return data_dict

In [None]:
def parser_8051(opcode, dictionary):
    """
    Look up an opcode in the 8051 table and report how to handle it.

    Args:
        opcode (str): Opcode text used as the lookup key.
        dictionary (dict): Maps opcode text to the value stored for it
            (an int is returned as the byte count of the instruction).

    Returns:
        tuple: ``('execute', n)`` when the table holds an integer ``n`` for
        the opcode, or ``('undefined', None)`` for opcode ``'A5'`` whose
        table entry is not an integer.

    Raises:
        ValueError: If no dictionary is supplied, the opcode is missing from
            it, or a non-``'A5'`` opcode has a non-integer entry.
    """
    if dictionary is None:
        raise ValueError("8051's dictionary is not provided")

    if opcode not in dictionary:
        raise ValueError(f"Opcode {opcode} not found.")

    vacancy = dictionary[opcode]

    # Integer entry: a regular instruction of that many bytes.
    if isinstance(vacancy, int):
        return 'execute', vacancy

    # A5 is the one opcode allowed to have a non-integer entry.
    if opcode == 'A5':
        return 'undefined', None

    raise ValueError(f"Opcode {opcode}'s vacancy {vacancy} is invalid.")

In [None]:
# https://onlinedocs.microchip.com/oxy/GUID-99287649-615D-4D29-913C-BC89F4A22253-en-US-7/GUID-1E155BE7-BCF6-4047-B40D-347D2F061AEE.html#GUID-1E155BE7-BCF6-4047-B40D-347D2F061AEE__ID-42120-00000180
# https://en.wikipedia.org/wiki/PIC_instruction_listings#PIC18_high_end_core_devices_(16_bit)
def parser_PIC18(opcode, rule=None, expect_commend=None):
    """
    Match a PIC18 opcode byte and return how the caller should consume it.

    The function is called once for the first byte of an instruction and,
    when it answers ``'multi execute: X'``, again for the following byte
    with ``expect_commend='X'`` so the continuation can be validated.

    Args:
        opcode (str): Two-character uppercase hex text of the current byte.
        rule: Unused; kept so all parsers share one call signature.
        expect_commend (str): Pattern announced by the previous
            ``'multi execute: ...'`` result. Either a full byte (``'F0'``)
            or a single leading nibble (``'F'`` or ``'0'``).

    Returns:
        tuple: ``(action, count)`` where ``action`` is one of

        - ``'execute'``: consume ``count`` bytes, instruction complete;
        - ``'multi execute: X'``: consume ``count`` bytes, then the next
          byte must match ``X``;
        - ``'undefined'``: byte is not accepted as an instruction start
          (``count`` is ``None``);
        - ``'error'``: the continuation did not match ``expect_commend``
          (``count`` is ``None``).
    """
    if expect_commend is not None:
        if opcode == expect_commend:
            return 'execute', 2   # exact match, e.g. 'F0'
        if opcode[0] == expect_commend:
            # Fixed: the original one-liner lacked parentheses and therefore
            # always returned a 3-tuple, which the caller could not unpack.
            if opcode[0] == 'F':
                return 'execute', 2              # expected nibble 'F'
            return 'multi execute: F0', 1        # expected nibble '0'
        return 'error', None

    if opcode[0] == 'F':
        # A leading 'F' byte is only accepted as a continuation word.
        return 'undefined', None
    elif opcode[0] == 'C' or opcode in ('E0', 'E1', 'EB', 'EF'):
        # Two-word form: the word after these two bytes must start with F.
        return 'multi execute: F', 2
    elif opcode == 'EE':
        # Next byte must start with 0, and the word after that with F0.
        return 'multi execute: 0', 1
    else:
        return 'execute', 2

In [None]:
# https://free-pdk.github.io/instruction-sets/PDK16
def parser_PDK16(opcode, rule=None, prev_commend=None):
    """
    Match a PDK16 opcode byte and return how the caller should consume it.

    Args:
        opcode (str): Two-character uppercase hex text of the current byte.
        rule: Unused; kept so all parsers share one call signature.
        prev_commend (str): Byte announced by a preceding
            ``'multi execute: ...'`` result (``'00'`` or ``'10'``); any
            other value is ignored and the byte is treated as a fresh start.

    Returns:
        tuple: ``(action, count)`` with ``action`` one of ``'execute'``,
        ``'undefined'`` or ``'multi execute: XX'``; ``count`` is the number
        of bytes to consume, or ``None`` for ``'undefined'``.

    NOTE(review): after a '00'/'10' prefix (1 byte consumed) the follow-up
    still answers ``('execute', 2)``; confirm this 3-byte total is intended.
    """
    undefined = ('undefined', None)
    two_bytes = ('execute', 2)

    # Second byte of an instruction whose first byte was '00'.
    if prev_commend == '00':
        high = opcode[0]
        if high == '2' or opcode == '01':
            return undefined
        if opcode[1] != '0' and high in ('6', '7'):
            return undefined
        return two_bytes

    # Second byte of an instruction whose first byte was '10'.
    if prev_commend == '10':
        if opcode[0] in ("8", "9", "A", "B", "C", "D", "E", "F"):
            return undefined
        return two_bytes

    # First byte of an instruction (or an unrecognised prev_commend).
    if opcode == '00':
        return 'multi execute: 00', 1
    if opcode in ('01', '0C', '0D', '11', '12', '13'):
        return undefined
    if opcode == '10':
        return 'multi execute: 10', 1
    return two_bytes

In [None]:
def generate_hex_values():
    """
    Return the 256 byte values 0..255 as two-digit uppercase hex strings
    (no ``0x`` prefix), in ascending order.
    """
    return [f'{byte:02X}' for byte in range(0x100)]


def xor(data, hex_value):
    """
    XOR two hexadecimal strings.

    Args:
        data (str): First operand, hexadecimal text.
        hex_value (str): Second operand, hexadecimal text.

    Returns:
        str: The XOR result as uppercase hex, zero-padded to at least two
        digits.
    """
    left = int(data, 16)
    right = int(hex_value, 16)
    return f'{left ^ right:02X}'


def process_cell_value(input_value, hex_value=None, mode=None):
    """
    Normalise one cell to uppercase hex text and optionally transform it.

    Normalisation removes all whitespace and any ``0x``/``0X`` prefix,
    upper-cases the digits and left-pads a single digit to two characters
    so that plain values have the same shape as transformed ones.

    Args:
        input_value: Cell content; converted with ``str()``.
        hex_value (str): Hex mask used by mode ``'xor'``.
        mode (str): ``None`` for no transformation, ``'xor'`` to XOR the
            value with ``hex_value``, or ``'not'`` for a bitwise NOT of the
            byte (XOR with ``FF``; ``hex_value`` is ignored).

    Returns:
        str: Uppercase hex text, at least two digits for non-empty input.

    Raises:
        ValueError: If ``hex_value`` is given with an unsupported mode, if
            mode ``'xor'`` is requested without ``hex_value``, or if a
            transformation is requested on text that is not valid hex.
    """
    if hex_value and mode not in ('xor', 'not'):
        raise ValueError("Invalid mode. Mode must be 'xor' or 'not'")
    if mode == 'xor' and not hex_value:
        # Previously surfaced as an opaque TypeError from int(None, 16).
        raise ValueError("Mode 'xor' requires hex_value")

    # Upper-case BEFORE stripping the prefix so '0X1F' is handled like '0x1f'.
    compact = ''.join(str(input_value).split())
    uniform_value = compact.upper().replace('0X', '')

    if mode == 'xor':
        mask = hex_value
    elif mode == 'not':
        mask = 'FF'   # XOR with 0xFF is a bitwise NOT of one byte
    else:
        mask = None

    if mask is not None:
        return format(int(uniform_value, 16) ^ int(mask, 16), '02X')

    if len(uniform_value) == 1:
        # e.g. a numeric cell 5 read as '5' -> '05', matching the XOR path.
        return '0' + uniform_value
    return uniform_value


def parse_function(mcu):
    """
    Return the opcode parser that belongs to the given MCU family.

    Args:
        mcu (str): One of ``'8051'``, ``'PIC18'`` or ``'PDK16'``.

    Returns:
        callable: ``parser_8051``, ``parser_PIC18`` or ``parser_PDK16``.

    Raises:
        ValueError: If ``mcu`` is not one of the supported names.
    """
    if mcu == '8051':
        return parser_8051
    if mcu == 'PIC18':
        return parser_PIC18
    if mcu == 'PDK16':
        return parser_PDK16
    # Build ONE message string; passing two arguments made the exception
    # render as the tuple ('Unsupported MCU:', mcu).
    raise ValueError(f"Unsupported MCU: {mcu}")


def process_row_data(row, mcu, rule=None, worksheet=None, hex_value=None, mode=None):
    """
    Walk one row of bytes and check that it decodes as whole instructions.

    A row is expected to consist of one or more complete instructions. The
    non-empty cells are read left to right; for each instruction the MCU
    parser says how many bytes to consume.

    Args:
        row (pandas.Series): One row of cell values; empty cells are dropped.
        mcu (str): MCU family name understood by ``parse_function``.
        rule: Passed to the parser as its second argument (the opcode
            dictionary for the 8051 parser).
        worksheet: Optional object with an ``append(list)`` method; when
            given, the unconsumed remainder of a rejected row is appended.
        hex_value (str): Mask forwarded to ``process_cell_value``.
        mode (str): Mode forwarded to ``process_cell_value``.

    Returns:
        str or None: ``'error'`` if bytes run out mid-instruction or a
        continuation byte cannot be parsed, the parser's verdict (e.g.
        ``'undefined'``) if it rejects a byte, or ``None`` when the whole
        row was consumed cleanly.
    """
    # Work on a plain list with a cursor: this ignores gaps left by dropna()
    # (the original indexed by label 0 and crashed when the first cell was
    # empty) and avoids rebuilding the Series after every instruction.
    cells = row.dropna().tolist()
    total = len(cells)
    parser_func = parse_function(mcu)
    # Failures that mean "this row does not decode", not "the program is broken".
    recoverable = (IndexError, KeyError, TypeError, ValueError)

    pos = 0
    while pos < total:
        opcode = process_cell_value(cells[pos], hex_value=hex_value, mode=mode)
        judication, vacancy = parser_func(opcode, rule)

        while judication.startswith('multi execute'):
            # Multi-part instruction: skip the bytes accepted so far and let
            # the parser validate the continuation byte.
            try:
                following_commend = judication.split(': ')[1]
                pos += vacancy
                # cells[pos] raises IndexError if the continuation is missing.
                opcode = process_cell_value(cells[pos], hex_value=hex_value, mode=mode)
                judication, vacancy = parser_func(opcode, rule, following_commend)
            except recoverable:
                return 'error'

        if judication == 'execute':
            # Consume one complete instruction; a non-positive or oversized
            # length cannot be satisfied by the remaining bytes.
            if not isinstance(vacancy, int) or vacancy < 1 or pos + vacancy > total:
                return 'error'
            pos += vacancy
        else:
            # Parser rejected the byte: optionally record what is left of the
            # row, then report the verdict.
            # (8051 only: to treat A5 as a defined 1-byte instruction, skip
            # one byte here instead of returning.)
            if worksheet is not None:
                worksheet.append(cells[pos:])
            return judication

    return None


def create_file(hex_value):
    """
    Create an empty workbook file under ``outputs/`` and open it.

    The file is named ``<hex_value>undefined_lines.xlsx``, or just
    ``undefined_lines.xlsx`` when ``hex_value`` is empty or ``None``.

    Args:
        hex_value (str): Optional prefix for the file name.

    Returns:
        tuple: ``(workbook, path)``: the workbook loaded back from disk
        and the path it was saved to.
    """
    output_dir = 'outputs'
    os.makedirs(output_dir, exist_ok=True)

    prefix = f'{hex_value}' if hex_value else ''
    path = os.path.join(output_dir, prefix + 'undefined_lines.xlsx')

    # Write the empty file first so it exists on disk, then reopen it.
    openpyxl.Workbook().save(path)
    return openpyxl.load_workbook(path), path


def Error_Counter(row_data, mcu, rule=None, record_or_not=False, hex_value=None, mode=None):
    """
    Count rows that fail to decode and rows that hit undefined opcodes,
    then print a one-line summary.

    The summary is suppressed entirely as soon as any row reports
    ``'error'``; in that case the recording file (if any) is deleted.

    Args:
        row_data (pandas.DataFrame): Rows to analyse, one capture per row.
        mcu (str): MCU family name.
        rule: Opcode rule passed through to the parser.
        record_or_not (bool): When exactly ``True``, rejected rows are
            written to a workbook created by ``create_file``.
        hex_value (str, optional): Hex mask for the cell transformation.
        mode (str, optional): Transformation mode, e.g. ``'xor'``.

    Returns:
        None
    """
    recording = record_or_not is True

    worksheet = None
    if recording:
        workbook, file_name = create_file(hex_value)
        worksheet = workbook.active

    err = 0
    undefined = 0
    for _, row in row_data.iterrows():
        outcome = process_row_data(row, mcu, rule, worksheet, hex_value, mode)
        if outcome == 'error':
            err += 1
        elif outcome == 'undefined':
            undefined += 1

        # Any error disqualifies this candidate: clean up and stay silent.
        if err > 0:
            if recording:
                os.remove(file_name)
            return

    if recording:
        workbook.save(file_name)

    # Prefix the summary with the XOR mask when one was applied; otherwise
    # emit an empty line before it.
    if hex_value and mode == 'xor':
        print(f'XOR with: {hex_value}', end=' | ')
    else:
        print()
    print(f'{mcu} Error: {err}; Undefined: {undefined}')


def test_mcu(file, mcu, record_or_not=False):
    """
    Check whether the captured bytes decode as code for the given MCU.

    The data is analysed once as-is and then once per byte value 00..FF
    with every cell XOR-ed by that value, to see through simple
    single-byte XOR obfuscation.

    Parameters:
        file (DataFrame): Table of opcode bytes, one capture per row.
        mcu (str): MCU family, e.g. 'PIC18', '8051', 'PDK16'.
        record_or_not (bool): Forwarded to ``Error_Counter``.
    """
    # Only the 8051 parser needs an opcode table; it is scraped on demand.
    opcode_dict = (
        dict_8051('https://www.keil.com/support/man/docs/is51/is51_opcodes.asp?bhcp=1')
        if mcu == '8051'
        else None
    )

    # Ignore rows that are completely empty.
    non_empty_rows = file.dropna(how='all')

    # Pass 1: raw data.
    Error_Counter(non_empty_rows, mcu, opcode_dict, record_or_not)

    # Pass 2: every single-byte XOR mask.
    for value in generate_hex_values():
        Error_Counter(non_empty_rows, mcu, opcode_dict, record_or_not, value, mode='xor')


In [None]:
def scan_excel(input_folder, mcu, record_undefined=False):
    """
    Run the MCU check on every ``.xlsx`` file directly inside a folder.

    Args:
        input_folder (str): Folder to scan (not recursive).
        mcu (str): '8051', 'PIC18', 'PDK16'
        record_undefined (bool): Forwarded to ``test_mcu`` as its
            ``record_or_not`` argument.
    """
    for filename in os.listdir(input_folder):
        # Anything that is not an xlsx workbook is ignored.
        if not filename.endswith(".xlsx"):
            continue

        file_path = os.path.join(input_folder, filename)
        print("Opening file:", file_path)

        # No header row; every cell is read as text.
        df = pd.read_excel(file_path, header=None, dtype=str)

        # Use plain 0..n-1 column labels.
        df.columns = range(df.shape[1])

        test_mcu(df, mcu, record_undefined)


# Folder that scan_excel() searches for .xlsx input files
input_folder = "."   # current working directory

In [None]:
# Run assuming A5 is a 1-to-3-byte operation.
# NOTE(review): parser_8051 as shown reports A5 as 'undefined'; this run
# presumably used a locally modified parser. Confirm before comparing.
scan_excel(input_folder=input_folder, mcu='8051')

Opening file: ./aa_55.xlsx

8051 Error: 0; Undefined: 0
XOR with: 00 | 8051 Error: 0; Undefined: 0
XOR with: 01 | 8051 Error: 0; Undefined: 0
XOR with: 02 | 8051 Error: 0; Undefined: 1
XOR with: 04 | 8051 Error: 0; Undefined: 0
XOR with: 05 | 8051 Error: 0; Undefined: 0
XOR with: 06 | 8051 Error: 0; Undefined: 1
XOR with: 08 | 8051 Error: 0; Undefined: 1
XOR with: 09 | 8051 Error: 0; Undefined: 1
XOR with: 0A | 8051 Error: 0; Undefined: 1
XOR with: 0B | 8051 Error: 0; Undefined: 1
XOR with: 0C | 8051 Error: 0; Undefined: 0
XOR with: 0D | 8051 Error: 0; Undefined: 0
XOR with: 0E | 8051 Error: 0; Undefined: 1
XOR with: 0F | 8051 Error: 0; Undefined: 1
XOR with: 10 | 8051 Error: 0; Undefined: 1
XOR with: 12 | 8051 Error: 0; Undefined: 1
XOR with: 13 | 8051 Error: 0; Undefined: 1
XOR with: 14 | 8051 Error: 0; Undefined: 1
XOR with: 15 | 8051 Error: 0; Undefined: 1
XOR with: 16 | 8051 Error: 0; Undefined: 0
XOR with: 17 | 8051 Error: 0; Undefined: 1
XOR with: 18 | 8051 Error: 0; Undefined: 

In [None]:
# Run assuming A5 is a 3-byte operation.
# NOTE(review): requires a modified A5 rule; not what parser_8051 shows.
scan_excel(input_folder=input_folder, mcu='8051')

Opening file: ./hex_10%區段 _SPI.xlsx
XOR with: 03 | 8051 Error: 0; Undefined: 0
XOR with: 06 | 8051 Error: 0; Undefined: 0
XOR with: 25 | 8051 Error: 0; Undefined: 0
XOR with: 26 | 8051 Error: 0; Undefined: 0
XOR with: 50 | 8051 Error: 0; Undefined: 0
XOR with: 51 | 8051 Error: 0; Undefined: 0
XOR with: 53 | 8051 Error: 0; Undefined: 0
XOR with: 54 | 8051 Error: 0; Undefined: 0
XOR with: 55 | 8051 Error: 0; Undefined: 0
XOR with: 56 | 8051 Error: 0; Undefined: 0
XOR with: 57 | 8051 Error: 0; Undefined: 0
XOR with: 58 | 8051 Error: 0; Undefined: 0
XOR with: 5A | 8051 Error: 0; Undefined: 0
XOR with: 5B | 8051 Error: 0; Undefined: 0
XOR with: 7E | 8051 Error: 0; Undefined: 0
XOR with: 8D | 8051 Error: 0; Undefined: 0
XOR with: AD | 8051 Error: 0; Undefined: 0
XOR with: BD | 8051 Error: 0; Undefined: 0
XOR with: E9 | 8051 Error: 0; Undefined: 0
XOR with: EC | 8051 Error: 0; Undefined: 0
XOR with: FC | 8051 Error: 0; Undefined: 0
XOR with: FE | 8051 Error: 0; Undefined: 0


In [None]:
# Run assuming A5 is a 1- or 2-byte operation.
# NOTE(review): requires a modified A5 rule; not what parser_8051 shows.
scan_excel(input_folder=input_folder, mcu='8051')

Opening file: ./hex_10%區段 _SPI.xlsx
XOR with: 03 | 8051 Error: 0; Undefined: 0
XOR with: 06 | 8051 Error: 0; Undefined: 0
XOR with: 0A | 8051 Error: 0; Undefined: 0
XOR with: 25 | 8051 Error: 0; Undefined: 0
XOR with: 26 | 8051 Error: 0; Undefined: 0
XOR with: 50 | 8051 Error: 0; Undefined: 0
XOR with: 51 | 8051 Error: 0; Undefined: 0
XOR with: 52 | 8051 Error: 0; Undefined: 0
XOR with: 53 | 8051 Error: 0; Undefined: 0
XOR with: 54 | 8051 Error: 0; Undefined: 0
XOR with: 55 | 8051 Error: 0; Undefined: 0
XOR with: 56 | 8051 Error: 0; Undefined: 0
XOR with: 57 | 8051 Error: 0; Undefined: 0
XOR with: 58 | 8051 Error: 0; Undefined: 0
XOR with: 5A | 8051 Error: 0; Undefined: 0
XOR with: 5B | 8051 Error: 0; Undefined: 0
XOR with: 7E | 8051 Error: 0; Undefined: 0
XOR with: 8D | 8051 Error: 0; Undefined: 0
XOR with: AD | 8051 Error: 0; Undefined: 0
XOR with: BD | 8051 Error: 0; Undefined: 0
XOR with: E9 | 8051 Error: 0; Undefined: 0
XOR with: EC | 8051 Error: 0; Undefined: 0
XOR with: FC | 805

In [None]:
# Cross-check: scan the same folder with the PDK16 rules
scan_excel(input_folder=input_folder, mcu='PDK16')

Opening file: ./hex_10% _SPI.xlsx

PDK16 Error: 0; Undefined: 9
XOR with: 00 | PDK16 Error: 0; Undefined: 9
XOR with: 01 | PDK16 Error: 0; Undefined: 9
XOR with: 02 | PDK16 Error: 0; Undefined: 9
XOR with: 03 | PDK16 Error: 0; Undefined: 9
XOR with: 04 | PDK16 Error: 0; Undefined: 9
XOR with: 05 | PDK16 Error: 0; Undefined: 9
XOR with: 06 | PDK16 Error: 0; Undefined: 9
XOR with: 07 | PDK16 Error: 0; Undefined: 9
XOR with: 08 | PDK16 Error: 0; Undefined: 9
XOR with: 09 | PDK16 Error: 0; Undefined: 9
XOR with: 0A | PDK16 Error: 0; Undefined: 9
XOR with: 0B | PDK16 Error: 0; Undefined: 9
XOR with: 0C | PDK16 Error: 0; Undefined: 9
XOR with: 0D | PDK16 Error: 0; Undefined: 9
XOR with: 0E | PDK16 Error: 0; Undefined: 9
XOR with: 0F | PDK16 Error: 0; Undefined: 9
XOR with: 10 | PDK16 Error: 0; Undefined: 9
XOR with: 11 | PDK16 Error: 0; Undefined: 9
XOR with: 12 | PDK16 Error: 0; Undefined: 9
XOR with: 13 | PDK16 Error: 0; Undefined: 9
XOR with: 14 | PDK16 Error: 0; Undefined: 9
XOR with: 15

In [None]:
# Cross-check: scan the same folder with the PIC18 rules
scan_excel(input_folder=input_folder, mcu='PIC18')

Opening file: ./hex_10% _SPI.xlsx
XOR with: 20 | PIC18 Error: 0; Undefined: 9
XOR with: 21 | PIC18 Error: 0; Undefined: 9
XOR with: 22 | PIC18 Error: 0; Undefined: 9
XOR with: 23 | PIC18 Error: 0; Undefined: 9
XOR with: 24 | PIC18 Error: 0; Undefined: 9
XOR with: 25 | PIC18 Error: 0; Undefined: 9
XOR with: 26 | PIC18 Error: 0; Undefined: 9
XOR with: 27 | PIC18 Error: 0; Undefined: 9
XOR with: 28 | PIC18 Error: 0; Undefined: 9
XOR with: 29 | PIC18 Error: 0; Undefined: 9
XOR with: 2A | PIC18 Error: 0; Undefined: 9
XOR with: 2B | PIC18 Error: 0; Undefined: 9
XOR with: 2C | PIC18 Error: 0; Undefined: 9
XOR with: 2D | PIC18 Error: 0; Undefined: 9
XOR with: 2E | PIC18 Error: 0; Undefined: 9
XOR with: 2F | PIC18 Error: 0; Undefined: 9
XOR with: 40 | PIC18 Error: 0; Undefined: 9
XOR with: 41 | PIC18 Error: 0; Undefined: 9
XOR with: 42 | PIC18 Error: 0; Undefined: 9
XOR with: 43 | PIC18 Error: 0; Undefined: 9
XOR with: 44 | PIC18 Error: 0; Undefined: 9
XOR with: 45 | PIC18 Error: 0; Undefined: 