In [12]:
import json
import subprocess
import difflib
import argparse
import os
import shlex
import difflib

import difflib

def get_diff_lines(original_file, debloated_file,folder,out="removed.json"):

    """Returns a list of (line_number, removed_line) from original_file that are missing in debloated_file."""
    with open(folder+original_file, "r") as f1, open(folder+debloated_file, "r") as f2:
        original_lines = f1.readlines()
        debloated_lines = f2.readlines()

    differ = difflib.Differ()
    diff = list(differ.compare(original_lines, debloated_lines))

    removed_lines = []
    original_line_number = 0  # 1-based index

    for line in diff:
        if line.startswith("  "):  # Unchanged line, increment line counter
            original_line_number += 1
        elif line.startswith("- "):  # Line removed from the original file
            removed_lines.append((original_line_number + 1, line[2:].strip()))  # Store 1-based line number
            original_line_number += 1  # Increase to match actual file structure
    lines= [l[0] for l in removed_lines]
    
    with open(folder+out, "w") as outfile:
        json.dump(lines, outfile)
    return removed_lines
    return removed_lines


def compile_and_run(file, folder,arg=""):
    """Compiles and runs the original program with coverage tracking using Clang, passing an optional argument and printing output."""
    executable = "./binary"

    # Compile the program with coverage flags using Clang
    print(f"Compiling {file} with Clang...")
    compile_process = subprocess.run(

        ["clang-12", "-fprofile-arcs", "-ftest-coverage", "-o", executable, file],

        check=True, capture_output=True, text=True,cwd=folder
    )
    print(compile_process.stdout)
    print(compile_process.stderr)
    #run_process = subprocess.run(f"chmod +x {executable}" , check=True, capture_output=False, text=True,cwd=folder)
    
    
    args = [executable] + shlex.split(arg)
    
    # Run the compiled program with the provided argument
    print(f"Running the program with argument: {arg}...")
    run_process = subprocess.run(args, check=True, capture_output=False, text=True,cwd=folder)
    print(run_process.stdout)
    print(run_process.stderr)
    
    # Generate coverage report (Clang uses gcov-compatible output when using GNU coverage flags)
    gcov_file = file.split("/")[-1]
    print(f"Generating coverage report for {gcov_file}...")
    gcov_process = subprocess.run(["llvm-cov-12", "gcov", gcov_file], check=True, capture_output=True, text=True,cwd=folder)
    print(gcov_process.stdout)
    print(gcov_process.stderr)

def get_executed_lines(gcov_file,folder,out="executed_lines.json",):
    """Extracts executed line numbers from gcov output."""
    executed_lines = set()
    with open(folder+gcov_file, "r") as f:
        for line in f:
            parts = line.strip().split(":")
            if len(parts) > 1 and parts[0].strip().isdigit() :
            #parts[0].strip() # Check if the line number is a digit ie if it was executed
                executed_lines.add((parts[1].strip(),parts[2].strip()))
    lines= [l[0] for l in executed_lines]

    with open(folder+out, "w") as outfile:
        json.dump(lines, outfile)

    return executed_lines

def filter_executed_diff(diff_lines, executed_lines,folder, output_file="filtered_diff.txt"):

    executed_lines = set([line[0] for line in executed_lines])

    """Filters removed lines to only those that were executed in the original program and saves them to a file."""
    filtered_lines = [diff_line for diff_line in diff_lines if str(diff_line[0]) in executed_lines]
    
    with open(folder+output_file, "w") as f:
        f.writelines(f" {line[0]}-{line[1]} + \n" for line in filtered_lines)
    
    return filtered_lines


def prepend_lines(input_file, lines_to_prepend, output_file,folder ,sep="[EXEC]"):
    # Read the content of the input file
    with open(folder+input_file, 'r') as infile:
        content = infile.readlines()
    
    # Create a new list with prepended lines
    modified_content = []
    i=1
    for line in content:
        if i in lines_to_prepend:
            modified_content.append(f"{sep}{line} ")
        else:
            modified_content.append(line)
        i=i+1
    # Save the modified content to the output file
    with open(folder+output_file, 'w') as outfile:
        outfile.writelines(modified_content)


def run(original_file,debloated_file,command,dir):
    print("Computing diff")
    diff_lines = get_diff_lines(original_file, debloated_file,dir)
    print("Compiling / coverage of original")

    compile_and_run(original_file, dir,command)
    print("Compiling / coverage of debloated")

    compile_and_run(debloated_file, dir,command)
    print("Get executed line")
    executed_lines = get_executed_lines(original_file+".gcov",dir,original_file+"_executed.json")
    print("Get chisel executed line")

    executed_lines_chisel = get_executed_lines(debloated_file+".gcov",dir,debloated_file+"_executed.json")
    executed_lines_chisel
    d=set([x[0] for x in filter_executed_diff(diff_lines, executed_lines,dir)])
    executed_line_nb_chisel =set([int(x[0]) for x in executed_lines_chisel ])
    executed_line_nb = set([int(x[0]) for x in executed_lines ])

    executed_line_nb_chisel_filtered = []
    executed_lines_str =set([x[1] for x in executed_lines ])
    for x in executed_lines_chisel:
        if x[1] not in executed_lines_str :
            executed_line_nb_chisel_filtered.append(int(x[0]))
    annotated_original ="annotated_"+original_file
    annotated_debloated="annotated_"+debloated_file
    prepend_lines(original_file, d, annotated_original,dir,"[EXEC_IN_ORI_BUT DELETED_IN_DEBLOATED]")
    prepend_lines(debloated_file, executed_line_nb_chisel,annotated_debloated ,dir,"[EXEC]")
    prepend_lines(annotated_debloated, executed_line_nb_chisel_filtered, annotated_debloated,dir,"[EXEC_IN_DEBLOAT_BUT_NOT_EXEC_IN_ORI]")
    prepend_lines(annotated_original, executed_line_nb, annotated_original,dir,"[EXEC]")


In [13]:
dir="./grep-2.4.2/"
original_file =  "grep-2.4.2.c"
debloated_file = "chisel_grep-2.4.2_p0.2train.c"
command = "-E 'Include|n{1}.lude' ./input/grep0.dat ./input/grep1.dat ./input/grepNull.dat"    
run(original_file,debloated_file,command,dir)
    

Computing diff
Compiling / coverage of original
Compiling grep-2.4.2.c with Clang...

    printf((char const   * __restrict  )tmp___3);
           ^                            ~~~~~~~
grep-2.4.2.c:1934:12: note: treat the string as an argument to avoid this
    printf((char const   * __restrict  )tmp___3);
           ^
           "%s", 
    printf((char const   * __restrict  )tmp___4);
           ^                            ~~~~~~~
grep-2.4.2.c:1936:12: note: treat the string as an argument to avoid this
    printf((char const   * __restrict  )tmp___4);
           ^
           "%s", 
    printf((char const   * __restrict  )tmp___5);
           ^                            ~~~~~~~
grep-2.4.2.c:1938:12: note: treat the string as an argument to avoid this
    printf((char const   * __restrict  )tmp___5);
           ^
           "%s", 
    printf((char const   * __restrict  )tmp___6);
           ^                            ~~~~~~~
grep-2.4.2.c:1940:12: note: treat the string as an argume

In [14]:
dir="gzip-1.3/"
original_file =  "gzip-1.3.c"
debloated_file = "chisel_gzip-1.3_p0.2train.c"
command = "-c ./input/34file "
run(original_file,debloated_file,command,dir)

Computing diff
Compiling / coverage of original
Compiling gzip-1.3.c with Clang...

  outbuf[tmp] = (uch )*("\037\213" + 0);
                        ~~~~~~~~~~~^~~
  outbuf[tmp] = (uch )*("\037\213" + 0);
                                   ^
                        &          [  ]
  outbuf[tmp___0] = (uch )*("\037\213" + 1);
                            ~~~~~~~~~~~^~~
  outbuf[tmp___0] = (uch )*("\037\213" + 1);
                                       ^
                            &          [  ]
  o = -1L - (-1L << (sizeof(off_t ) * 8UL - 1UL));
             ~~~ ^
    trunc = strrchr((char const   *)name, (int )*("." + 0));
                                                  ~~~~^~~
    trunc = strrchr((char const   *)name, (int )*("." + 0));
                                                      ^
                                                  &   [  ]

Running the program with argument: -c ./input/34file ...
None
None
Generating coverage report for gzip-1.3.c...
File 'gzip-1.3.c'
Line