In [3]:
import difflib
from ast import parse, NodeTransformer, copy_location, Name
import ast
import json
from tqdm.auto import tqdm

class NormIdentifiers(NodeTransformer):
    """AST transformer that renames every ``Name`` node to a placeholder.

    Each distinct identifier is replaced by ``id_<n>``, where ``n`` is the
    order in which that identifier was first visited (``id_0``, ``id_1``, ...).
    Two snippets that differ only in variable naming therefore produce the
    same sequence of placeholder names.

    Only ``ast.Name`` nodes are rewritten; no other node type has a visitor
    here, so everything else goes through ``NodeTransformer``'s default
    traversal unchanged.
    """

    def __init__(self):
        # Maps original identifier -> placeholder; shared across the whole
        # tree visited by this instance.
        self.identifiers = {}
        super().__init__()

    def visit_Name(self, node):
        """Return a replacement ``Name`` node carrying the placeholder id.

        The replacement keeps the original node's ``ctx`` and source location.
        """
        # The default argument is evaluated before insertion, so a new
        # identifier is numbered by how many identifiers were seen before it.
        placeholder = self.identifiers.setdefault(
            node.id, f'id_{len(self.identifiers)}'
        )
        # copy_location only transfers position attributes, so ctx must be
        # passed explicitly; otherwise the Store/Load/Del information of the
        # original node is lost and the tree can no longer be compiled.
        return copy_location(Name(id=placeholder, ctx=node.ctx), node)


def parse_code(code):
    """Return the ``ast.dump`` string of ``code`` with identifiers normalized.

    ``code`` is parsed with ``ast.parse`` (so it must be valid Python source,
    otherwise ``SyntaxError`` propagates), run through a fresh
    ``NormIdentifiers`` transformer, and dumped to a string.

    Function-name normalization (``NormFunctions``) is currently disabled.
    """
    normalizer = NormIdentifiers()
    normalized_tree = normalizer.visit(parse(code))
    return ast.dump(normalized_tree)


def get_code_sim_score(code1, code2):
    """Score how similar two code snippets are.

    Both snippets are converted to identifier-normalized AST dump strings via
    ``parse_code``; the score is ``difflib.SequenceMatcher(...).ratio()`` over
    those two strings, with ``code1``'s dump as the first sequence.
    """
    dump1, dump2 = parse_code(code1), parse_code(code2)
    matcher = difflib.SequenceMatcher(None, dump1, dump2)
    return matcher.ratio()

In [2]:
# Three small snippets that differ mainly in identifier names and the final
# arithmetic step. They are only parsed, never executed.
a = (
    'a=1\n'
    'b=2\n'
    'k=a+b\n'
    'd=3\n'
    'c=k+d\n'
    'ans=c*0.5\n'
    'print(ans)'
)
b = (
    'a=1\n'
    'r=2\n'
    'k=a+r\n'
    'd=3\n'
    'cder=k+d\n'
    'ans=c/4\n'
    'print(ans)'
)
c = (
    'apple=1\n'
    'r=2\n'
    'k=apple+r\n'
    'd=3\n'
    'c=k+d\n'
    'ans=c*2\n'
    'print(ans)'
)

# Pairwise similarity scores: (a, b), (a, c), (b, c).
print(get_code_sim_score(a, b))
print(get_code_sim_score(a, c))
print(get_code_sim_score(b, c))

In [4]:
# Load the two JSON files used by the cells below. Later cells index both
# objects by position and read the 'diff' / 'msg' keys of each record.
with open('../data/final_preprocessed_data/js_rag_db_data_300.json', encoding='UTF-8') as f:
    db_data = json.load(f)

with open('../data/final_preprocessed_data/js_baseline_test_data_300.json', encoding='UTF-8') as f:
    test_data = json.load(f)

In [8]:
# Inspect the 'diff' field of the first DB record (shown as the cell output).
db_data[0]['diff']

In [9]:
# Similarity between the 'diff' of the second DB record and the first test
# record.
# NOTE(review): get_code_sim_score parses its inputs with ast.parse, so this
# raises SyntaxError unless both 'diff' values are valid Python source; the
# data file names start with "js_" — confirm the diffs are parseable.
print(get_code_sim_score(db_data[1]['diff'], test_data[0]['diff']))

In [None]:

# For every test record, find the DB record with the highest similarity score.
#
# NOTE(review): this compares the 'msg' fields, whereas the one-off check in
# the earlier cell compares 'diff'. Both go through ast.parse, so the text
# must be valid Python source — confirm which field is intended here.

# Parse and normalize every DB record once up front; the dumps do not depend
# on the test record, so recomputing them inside the loop is wasted work.
db_dumps = [parse_code(db_item['msg']) for db_item in db_data]

similar_item = []
for test_item in tqdm(test_data, total=len(test_data), desc="Processing documents"):
    # SequenceMatcher caches its analysis of the second sequence, so set the
    # test dump once as seq2 and only swap seq1 for each DB record. This
    # yields the same ratio as SequenceMatcher(None, db_dump, test_dump).
    matcher = difflib.SequenceMatcher(None)
    matcher.set_seq2(parse_code(test_item['msg']))

    max_score = 0
    max_item = None  # stays None if no DB record scores above 0
    for db_item, db_dump in zip(db_data, db_dumps):
        matcher.set_seq1(db_dump)
        cur_score = matcher.ratio()
        # Strict '>' keeps the first DB record among equal top scores.
        if cur_score > max_score:
            max_item = db_item
            max_score = cur_score
    similar_item.append(max_item)