In [1]:
import re  

# Collect the leading token of every tab-indented line in a file.
def extract_first_word_from_indented_lines(filename):
    """Return the set of first tokens found on tab-indented lines.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A set holding the first whitespace-delimited token of every line
        that begins with a tab character. Tab-indented lines that contain
        nothing but whitespace contribute no token.
    """
    with open(filename, 'r') as handle:
        # Only lines starting with a tab are considered; each is tokenised once.
        tokenised = (row.split() for row in handle if row.startswith('\t'))
        # The set de-duplicates; an empty token list (blank line) is skipped.
        return {tokens[0] for tokens in tokenised if tokens}
  
# The .s assembly files to scan.
s_files = ['att2_8.1.0.s', 'att2_13.2.0.s']

# File that receives the de-duplicated tokens, one per line.
out_filename = 'lan.txt'

# Union of the first tokens of all files; the set removes duplicates across files.
all_words = set()
for s_file in s_files:
    all_words.update(extract_first_word_from_indented_lines(s_file))

# Write the tokens in sorted order so the output file is deterministic.
with open(out_filename, 'w') as out_file:
    for word in sorted(all_words):
        out_file.write(word + '\n')

# Build the message from out_filename so it always names the file actually written.
print(f"Words have been extracted, deduplicated, and written to {out_filename}.")

Words have been extracted, deduplicated, and written to ang.txt.


In [6]:
import re  
from collections import defaultdict  
  
# Load the list of assembly instructions to count (e.g. from lan.txt).
def read_instructions(filename):
    """Read a one-entry-per-line file into a set of strings.

    Args:
        filename: Path of the text file to read.

    Returns:
        A set with every line of the file, stripped of surrounding
        whitespace. Lines that are empty after stripping are ignored.
    """
    mnemonics = set()
    with open(filename, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                mnemonics.add(entry)
    return mnemonics
  
# Tally, per labelled section of a .s file, how often each known instruction appears.
def count_instructions_per_section(s_file, instructions):
    """Count occurrences of known instructions under each label of a file.

    A line that starts with word characters followed by ':' opens a new
    section named after that label. Labels starting with another character
    (for example '.L2:') do not match and therefore stay inside the current
    section. Lines before the first label are ignored.

    Args:
        s_file: Path of the assembly file to read.
        instructions: Container of instruction names; only words found in
            it are counted.

    Returns:
        A nested defaultdict mapping section name -> instruction -> count.
        A section appears only once at least one instruction was counted
        in it.
    """
    label_re = re.compile(r'^(\w+):')
    # Simplified matching: every word on the line is a candidate,
    # not only the first token.
    word_re = re.compile(r'\b(\w+)\b')

    counts = defaultdict(lambda: defaultdict(int))
    active_label = None

    with open(s_file, 'r') as handle:
        for text in handle:
            header = label_re.match(text)
            if header is not None:
                # A label line only switches the section; it is not scanned.
                active_label = header.group(1)
            elif active_label:
                for token in word_re.findall(text):
                    if token in instructions:
                        counts[active_label][token] += 1

    return counts
  

In [7]:
def compare_section_instructions(s_files, instructions):
    """Build one difference feature vector and one label per assembly file.

    The first entry of ``s_files`` is the baseline; every file (including
    the baseline itself) is compared against it.

    Feature vector layout, with S = number of baseline sections and
    I = number of distinct baseline instructions (both in sorted order):

    * indices [0, S): 1 when the file lacks that baseline section.
    * indices [S, S + I): 1 when the file lacks that instruction entirely,
      or when its count differs from the baseline in any section present
      in both files (an instruction absent from a section counts as 0).
    * indices [S + I, 2 * (S + I)): always 0. The vector keeps the length
      2 * (S + I) so the shape of the returned data is unchanged.

    Labels: 3 when any instruction flag is set, 2 when only section flags
    are set on a non-baseline file, 1 otherwise.

    Args:
        s_files: Sequence of .s file names; ``s_files[0]`` is the baseline.
        instructions: Collection of instruction names, forwarded unchanged
            to ``count_instructions_per_section``.

    Returns:
        A ``(features, labels)`` tuple of two lists aligned with
        ``s_files``. Both lists are empty when ``s_files`` is empty.
    """
    if not s_files:
        return [], []

    # Per-file mapping: section name -> instruction -> count.
    section_instructions = {
        s_file: count_instructions_per_section(s_file, instructions)
        for s_file in s_files
    }

    # The first file is the baseline; sort once so feature indices are stable.
    base_section_instructions = section_instructions[s_files[0]]
    base_sections = sorted(base_section_instructions)
    base_instructions = sorted({
        instruction
        for counts in base_section_instructions.values()
        for instruction in counts
    })
    n_sections = len(base_sections)
    vector_length = 2 * (n_sections + len(base_instructions))

    features = []
    labels = []

    for i, s_file in enumerate(s_files):
        current = section_instructions[s_file]
        current_instructions = {
            instruction
            for counts in current.values()
            for instruction in counts
        }
        # Counts are only compared where both files have the section, so a
        # missing section is reported by its own flag and nothing else.
        shared_sections = [name for name in base_sections if name in current]

        feature_vec = [0] * vector_length

        # Section flags.
        for j, section in enumerate(base_sections):
            if section not in current:
                feature_vec[j] = 1

        # Instruction flags. `start` already applies the section offset, so
        # j is the final index. `.get` is used so that a defaultdict input
        # is not mutated by the lookups.
        for j, instruction in enumerate(base_instructions, start=n_sections):
            if instruction not in current_instructions:
                feature_vec[j] = 1
            elif any(
                current[name].get(instruction, 0)
                != base_section_instructions[name].get(instruction, 0)
                for name in shared_sections
            ):
                feature_vec[j] = 1

        features.append(feature_vec)

        has_section_diff = any(feature_vec[:n_sections])
        has_instruction_diff = any(feature_vec[n_sections:])
        if has_instruction_diff:
            label = 3
        elif has_section_diff and i > 0:
            label = 2
        else:
            label = 1
        labels.append(label)

    return features, labels

In [9]:
import pandas as pd  
from sklearn.tree import DecisionTreeClassifier  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import accuracy_score 
import numpy as np  
  
# 假设我们有以下训练数据（实际情况中，你需要根据具体情况准备这些数据）  
# features_train: 特征向量，每一行代表一个文件的特征  
# labels_train: 类别标签，与features_train中的文件一一对应  
  
# features_train = np.array([...]) # 示例：[是否有不同部分, 是否有不同指令计数]  
# labels_train = np.array([...]) # 示例：["1", "2", "3", ...]  
  
# Classify each file with an already-trained decision tree.
def classify_files_with_tree(s_files, different_sections, different_instructions, clf):
    """Predict a category for every file from two binary difference flags.

    Args:
        s_files: Iterable of file names to classify.
        different_sections: Mapping of file name -> section differences; a
            truthy value for a file sets the first feature to 1.
        different_instructions: Mapping of file name -> instruction-count
            differences; a truthy value sets the second feature to 1.
        clf: Object with a ``predict`` method that accepts a (1, 2) array.

    Returns:
        Dict mapping each file name to the first element of the prediction
        returned by ``clf.predict`` for that file.
    """
    def _flag(diff_map, name):
        # 1 when the file has a truthy entry in the mapping, otherwise 0.
        return 1 if (name in diff_map and diff_map[name]) else 0

    classifications = {}
    for s_file in s_files:
        # One row, two columns: [section flag, instruction flag].
        row = np.array([
            _flag(different_sections, s_file),
            _flag(different_instructions, s_file),
        ]).reshape(1, -1)
        classifications[s_file] = clf.predict(row)[0]

    return classifications

In [20]:
# Entry point: print per-section instruction counts, then train and evaluate a decision tree.
def main():
    """Print instruction counts per file and fit a decision tree on the differences.

    Steps:
        1. Read the instruction list from lan.txt.
        2. Print, for every .s file, the per-section instruction counts.
        3. Build difference features and labels with
           ``compare_section_instructions``.
        4. Split them, fit a ``DecisionTreeClassifier`` and print the
           predictions and accuracy on the test split.
    """
    lan_filename = 'lan.txt'
    assembly_instructions = read_instructions(lan_filename)

    # The .s files to analyse; the first one is the comparison baseline.
    s_files = ['att2_8.1.0.s', 'att2_13.2.0.s']

    for s_file in s_files:
        section_instructions = count_instructions_per_section(s_file, assembly_instructions)
        print(f"Instruction counts in {s_file}:")
        for section, instruction_counts in section_instructions.items():
            print(f"Section '{section}':")
            for instruction, count in instruction_counts.items():
                print(f"  {instruction}: {count}")
            print()

    # Pass the full instruction set read from lan.txt, not a loop variable
    # left over from the printing loop above.
    features, labels = compare_section_instructions(s_files, assembly_instructions)

    # NOTE(review): with only two files the split presumably leaves one sample
    # for training and one for testing, so the accuracy says little — confirm
    # whether more files are meant to be listed in s_files.
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # Predict on the held-out split and report both predictions and accuracy.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"cata: {y_pred}")
    print(f"accuracy: {accuracy}")

if __name__ == '__main__':
    main()

Instruction counts in att2_8.1.0.s:
Section '_Z4initv':
  pushq: 1
  movq: 2
  subq: 1
  movl: 12
  cmpl: 4
  jg: 4
  cltq: 3
  movslq: 1
  salq: 2
  addq: 3
  leaq: 3
  movb: 4
  addl: 6
  jmp: 4
  nop: 1
  popq: 1
  ret: 1

Section '_Z7displayv':
  pushq: 1
  movq: 13
  subq: 1
  leaq: 6
  call: 9
  movl: 5
  cmpl: 2
  jg: 2
  cltq: 1
  movslq: 1
  salq: 1
  addq: 3
  movzbl: 1
  movsbl: 1
  addl: 2
  jmp: 2
  nop: 1
  popq: 1
  ret: 1

Section '_Z5placeb':
  pushq: 2
  subq: 1
  leaq: 23
  movl: 6
  movb: 5
  movq: 25
  call: 24
  movzbl: 24
  xorl: 2
  testb: 2
  je: 5
  cmpb: 8
  jg: 3
  jle: 1
  movsbl: 13
  leal: 5
  subl: 5
  cltq: 5
  movslq: 5
  salq: 5
  addq: 11
  jne: 1
  jmp: 1
  nop: 1
  popq: 2
  ret: 1

Section '_Z7computev':
  pushq: 1
  movq: 1
  subq: 1
  movl: 162
  cmpl: 5
  jg: 4
  cltq: 74
  movslq: 74
  salq: 74
  addq: 149
  leaq: 74
  movzbl: 74
  cmpb: 42
  je: 1
  jne: 42
  addl: 29
  jmp: 13
  leal: 24
  popq: 1
  ret: 1

Section 'main':
  pushq: 1
  movq: