In [304]:
import argparse
import itertools
import os
import pathlib
import subprocess
from collections import defaultdict
import json

from tqdm import tqdm
import pandas as pd

In [224]:
def get_bc_dirs(labeled_bc_dir):
    for sub_folder in ['mibench-cov', 'simple-cov']:
        for data_dir in (labeled_bc_dir / sub_folder).iterdir():
            yield data_dir


def get_all_bc_files(labeled_bc_dir):
    for data_dir in get_bc_dirs(labeled_bc_dir):
        for file in data_dir.iterdir():
            if file.name.endswith('.bc'):
                yield file


def read_instructions(file_path):
    with open(file_path) as inp:
        return [line.strip() for line in inp if line.strip() != '']


def split_blocks(g_instructions):
    buff = []
    for instruction in g_instructions:
        buff.append(instruction)
        if instruction.startswith('br') or instruction.startswith('ret') or instruction.startswith('unreachable'):
            yield buff
            buff = []


def parse_ir_instructions(block_text):
    res = [x.strip() for x in block_text.strip().split('|.|') if x.strip() != '']
    res = [x for x in res if not (x.startswith('%.reg2mem') or x.startswith('%"reg2mem'))]
    return res


def map_general_instructions(bc_dir, df):
    new_column = []
    for bc_name, p_df in df.groupby('program'):
        res = []
        g_instructions = read_instructions(bc_dir / bc_name.replace('.bc', '.gin'))
        block_g_instructions = list(split_blocks(g_instructions))
        block_ir_instructions = list(map(parse_ir_instructions, p_df['w_63']))
        ii = 0
        ig = 0
        while ii < len(block_ir_instructions):
            ir_instr = block_ir_instructions[ii]
            g_instr = block_g_instructions[ig]
            if ir_instr[0] == 'unreachable':
                res.append(ir_instr)
                ii += 1
                ir_instr = block_ir_instructions[ii]
                while len(ir_instr) != len(g_instr):
                    ig += 1
                    g_instr = block_g_instructions[ig]

            assert len(ir_instr) - len(g_instr) <= 1

            res.append(g_instr)
            ii += 1
            ig += 1
        new_column.append(res)
    res = flatten(new_column)
    return res


def flatten(a_list):
    return list(itertools.chain.from_iterable(a_list))


def update_block_df(bc_dir, df):
    df['w_64'] = map_general_instructions(bc_dir, df)


def update_block_files(labeled_bc_dir):
    for bc_dir in get_bc_dirs(labeled_bc_dir):
        blocks_file = bc_dir / 'blocks.csv'
        column_names = ['uid'] + ['w_{}'.format(ii) for ii in range(64)] + ['program'] + ['subject']
        df = pd.read_csv(
            blocks_file, lineterminator='\r', sep=';', header=None,
            index_col=False, dtype={'uid': object}, names=column_names
        ).reset_index()
        update_block_df(bc_dir, df)

In [237]:
blocks_file = pathlib.Path('/media/nika/TOSHIBA EXT/paperback/LABELED-BCs/simple-cov/NONE/blocks.csv')

In [238]:
column_names = ['uid'] + ['w_{}'.format(ii) for ii in range(64)] + ['program'] + ['subject']

df = pd.read_csv(
    blocks_file, lineterminator='\r', sep=';', header=None,
    index_col=False, dtype={'uid': object}, names=column_names
)

In [243]:
a_none_df = df[df['program'] == 'anagram-NONE.bc']
update_block_df(blocks_file.parent, a_none_df)
anagram_none_ir, anagram_none_gin = list(a_none_df['w_63']), list(a_none_df['w_64'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['w_64'] = map_general_instructions(bc_dir, df)


In [247]:
a_oh_df = df[df['program'] == 'anagram-OH-NONE.bc']
update_block_df(blocks_file.parent, a_oh_df)
anagram_oh_ir, anagram_oh_gin = list(a_oh_df['w_63']), list(a_oh_df['w_64'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['w_64'] = map_general_instructions(bc_dir, df)


In [248]:
len(anagram_oh_ir), len(anagram_oh_gin)

(108, 108)

In [269]:
def read_vocab(file_path):
    with open(file_path) as inp:
        res = {}
        for line in map(str.strip, inp):
            if line != '':
                key, val = line.split(':')
                res[key] = json.loads(val.strip(','))
        return res

In [270]:
vocab = read_vocab('seedEmbeddingVocab-300-llvm10.txt')

In [292]:
def parse_ir_cmd(text):
    if ' = ' in text:
        text = text.split(' = ')[1]
    return text.split()[0]

In [308]:
def get_test_samples():
    for irs, gis in zip(map(parse_ir_instructions, anagram_oh_ir), anagram_oh_gin):
        for i, gi in zip(irs, gis):
            yield i, gi
    
    for irs, gis in zip(map(parse_ir_instructions, anagram_none_ir), anagram_none_gin):
        for i, gi in zip(irs, gis):
            yield i, gi

In [314]:
all_commands = []

for i, gi in get_test_samples():
    ir_cmd = parse_ir_cmd(i)
    all_commands.append(ir_cmd)
    if ir_cmd != gi.split()[0]:
        print(ir_cmd, '------', gi.split()[0])

print(set(all_commands))

br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
br ------ unreachable
{'ret', 'store', 'br', 'icmp', 'unreachable', 'getelementptr', 'call', 'xor', 'sext', 'zext', 'bitcast', 'add', 'load', 'alloca', 'sub'}


In [315]:
dt = []

for i, gi in get_test_samples():
    if ' = ' in i:
        i = i.split(' = ')[1]
    print(i, '------', gi)

alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
alloca i64*, align 8 ------ alloca  pointerTy  constant
alloca i64, align 8 ------ alloca  pointerTy  constant
a

load i64, i64* %20, align 8 ------ load  integerTy  pointer
load i64, i64* %2, align 8 ------ load  integerTy  pointer
add nsw i64 %21, %22 ------ add  integerTy  variable  variable
load i64*, i64** %1, align 8 ------ load  pointerTy  pointer
store i64 %23, i64* %24, align 8 ------ store  voidTy  variable  pointer
load i64, i64* %10 ------ load  integerTy  pointer
store i64 %25, i64* @1 ------ store  voidTy  variable  pointer
store i64* @1, i64** %5, align 8 ------ store  voidTy  pointer  pointer
store i64 2000000000000, i64* %6, align 8 ------ store  voidTy  constant  pointer
load i64*, i64** %5, align 8 ------ load  pointerTy  pointer
load i64, i64* %26, align 8 ------ load  integerTy  pointer
load i64, i64* %6, align 8 ------ load  integerTy  pointer
icmp ne i64 %27, %28, !oh_verify !39 ------ icmp  integerTy  variable  variable
br i1 %29, label %30, label %assert.exit ------ br  voidTy  variable  label  label
call void @exit(i32 1) #5 ------ call  voidTy  constant  function
unreach

sext i32 %36 to i64, !data_dep_instr !6, !argument_dep_instr !8, !control_dep_instr !7 ------ sext  integerTy  variable
getelementptr inbounds i8, i8* %35, i64 %37, !data_dep_instr !6, !argument_dep_instr !8, !control_dep_instr !7 ------ getelementptr  pointerTy  pointer  variable
load i8, i8* %38, align 1, !data_dep_instr !6, !argument_dep_instr !8, !control_dep_instr !7 ------ load  integerTy  pointer
sext i8 %39 to i32, !data_dep_instr !6, !argument_dep_instr !8, !control_dep_instr !7 ------ sext  integerTy  variable
icmp ne i32 %40, 0, !data_dep_instr !6, !argument_dep_instr !8, !control_dep_instr !7 ------ icmp  integerTy  variable  constant
br i1 %41, label %42, label %56, !data_dep_instr !6, !argument_dep_instr !8, !control_dep_instr !7 ------ br  voidTy  variable  label  label
load i8*, i8** %5, align 8, !data_dep_instr !6, !argument_dep_instr !8, !input_dep_block !11, !control_dep_instr !7 ------ load  pointerTy  pointer
load i32, i32* %8, align 4, !data_dep_instr !6, !argumen

In [317]:
with open('ir.txt', 'w', encoding='utf-8') as ir_out:
    with open('gir.txt', 'w', encoding='utf-8') as gir_out:
        for i, gi in get_test_samples():
            ir_out.write(i)
            ir_out.write('\n')
            gir_out.write(gi)
            gir_out.write('\n')