# Import libraries

In [22]:
# # Import libraries
# !pip install py_stringsimjoin
# !pip install py_stringmatching
import json, copy, os, time
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
from itertools import product

# Define measure and tokenizer

In [23]:
# JOIN_FUNCTIONS = {'COSINE': cosine_join,
#                   'DICE': dice_join,
#                   'EDIT_DISTANCE': edit_distance_join,
#                   'JACCARD': jaccard_join,
#                   'OVERLAP': overlap_join,
#                   'OVERLAP_COEFFICIENT': overlap_coefficient_join}

# Tokenizers selectable by name from a benchmark script.
# The N_GRAM entries are q-gram tokenizers without padding that return sets;
# the N_GRAM_BAG entries use QgramTokenizer's defaults for everything but qval.
TOKENIZERS = {'SPACE_DELIMITER': DelimiterTokenizer(delim_set={' '}, return_set=True)}
TOKENIZERS.update(
    (str(q) + '_GRAM', QgramTokenizer(qval=q, padding=False, return_set=True))
    for q in (2, 3, 4, 5)
)
TOKENIZERS.update(
    (str(q) + '_GRAM_BAG', QgramTokenizer(qval=q))
    for q in (2, 3)
)

# Define filters

In [26]:
# py_stringsimjoin filter classes, keyed by the names used in benchmark scripts.
# The values are the classes themselves, not instances.
FILTERS = dict(
    OVERLAP_FILTER=ssj.OverlapFilter,
    SIZE_FILTER=ssj.SizeFilter,
    PREFIX_FILTER=ssj.PrefixFilter,
    POSITION_FILTER=ssj.PositionFilter,
    SUFFIX_FILTER=ssj.SuffixFilter,
)

# Define Sim_function

In [25]:
# Similarity functions selectable by name from a benchmark script.
# Every value is a bound `get_sim_score` method, i.e. a callable that takes the
# two inputs to compare and returns their score.
SIM_FUNC = {
    'COSINE': sm.Cosine().get_sim_score,
    'DICE': sm.Dice().get_sim_score,
    'JACCARD': sm.Jaccard().get_sim_score,
    'OVERLAP_COEFFICIENT': sm.OverlapCoefficient().get_sim_score,
    'LEVENSHTEIN': sm.Levenshtein().get_sim_score,
    # Use the bound method like the other entries: the bare class `sm.TfIdf`
    # would construct a TfIdf object when called instead of returning a score.
    # NOTE(review): no corpus_list is supplied here; presumably the two inputs
    # then act as the corpus -- confirm against the py_stringmatching docs.
    'TF-IDF': sm.TfIdf().get_sim_score,
}

# Set directory and load script

In [27]:

# Folder for per-run CSV output.
BENCHMARK_DIRECTORY = 'benchmark_results'
# Name of the benchmark script file.
SCRIPTS_FILE = 'scripts_0.json'
# "dataset" folder directly under the current working directory.
DATA_PATH = os.getcwd() + os.sep + 'dataset'

In [28]:
def load_scripts(scripts_file_name):
    """Load the benchmark script definitions from a JSON file.

    Args:
        scripts_file_name: Path of a JSON file whose top-level object has a
            'scripts' key.

    Returns:
        The value stored under the 'scripts' key.

    Raises:
        KeyError: If the JSON object has no 'scripts' key.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(scripts_file_name, 'r', encoding='utf-8') as js_file:
        return json.load(js_file)['scripts']

# Read data from file DBLP-ACM perfectMapping

In [29]:
import numpy as np
def caculate_true_positive(joined_table, true_label_table):
    """Count the distinct (l_id, r_id) pairs that occur in both tables.

    Args:
        joined_table: DataFrame with 'l_id' and 'r_id' columns (join output).
        true_label_table: DataFrame with 'l_id' and 'r_id' columns (labels).

    Returns:
        Number of distinct id pairs present in both tables. Duplicate pairs
        within a table are counted once.
    """
    # Hash-based intersection of id pairs: needs only equality/hashing, so it
    # does not fail on ids that cannot be ordered against each other (e.g. a
    # column mixing str and int), unlike sorting structured object arrays.
    joined_pairs = set(zip(joined_table["l_id"], joined_table["r_id"]))
    true_pairs = set(zip(true_label_table["l_id"], true_label_table["r_id"]))
    return len(joined_pairs & true_pairs)
# DBLP-ACM perfect mapping, used as the table of true labels.
# header=0 together with names= relabels the file's two columns as l_id / r_id.
true_label_table = pd.read_csv(
    "./dataset/library/DBLP-ACM_perfectMapping.csv",
    names=['l_id', 'r_id'],
    header=0,
    encoding="ISO-8859-1",
)
true_label_table

Unnamed: 0,l_id,r_id
0,conf/sigmod/SlivinskasJS01,375678
1,conf/sigmod/ChaudhuriDN01,375694
2,conf/sigmod/RinfretOO01,375669
3,conf/sigmod/BreunigKKS01,375672
4,conf/sigmod/JagadishJOT01,375687
...,...,...
2219,journals/sigmod/Scholl01,604275
2220,journals/sigmod/Rosneblatt94,190649
2221,journals/sigmod/Winslett02b,601871
2222,journals/sigmod/Labrinidis01,604283


# Test and compare with above table

In [30]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def test(script, l_table, r_table, idx_script, scripts_file_name):
    """Run every parameter combination of one benchmark script and score it.

    For each combination of the script's 'sim_funcs', 'sim_measure_types',
    'tokenizers', 'scale_filters', 'thresholds' and 'n_jobs', the selected
    filter produces a candidate set, `ssj.apply_matcher` is applied to it, the
    matcher output is written to a CSV file under
    BENCHMARK_DIRECTORY/<idx_script>/, and the output is scored against the
    module-level `true_label_table`.

    Args:
        script: Dict holding the lists named above plus 'l_id_attr',
            'r_id_attr', 'l_join_attr' and 'r_join_attr'.
        l_table: Left input DataFrame.
        r_table: Right input DataFrame.
        idx_script: Index of the script; used for the output sub-folder and
            file name, and stored in each result.
        scripts_file_name: Script file name; used as the CSV file-name prefix.

    Returns:
        A list with one dict per executed combination, with keys
        'idx_script', 'table', 'time', 'num_candidate', 'true_positive',
        'false_positive', 'false_negative', 'recall', 'precision' and 'F1'.
        A ratio whose denominator is zero is reported as 0.0.
    """
    result = []
    out_dir = os.sep.join([BENCHMARK_DIRECTORY, str(idx_script)])
    total_info_obj = product(script['sim_funcs'], script['sim_measure_types'],
                             script['tokenizers'], script['scale_filters'],
                             script['thresholds'], script['n_jobs'])
    for sim_funcs, sim_measure_type, tokenizer, scale_filter, threshold, n_jobs in total_info_obj:
        # NOTE(review): this skips SPACE_DELIMITER for every measure except
        # EDIT_DISTANCE -- confirm that this is the intended direction.
        if tokenizer in ["SPACE_DELIMITER"] and sim_measure_type != 'EDIT_DISTANCE':
            continue
        sim_func = SIM_FUNC[sim_funcs]
        tok = TOKENIZERS[tokenizer]
        if scale_filter == "OVERLAP_FILTER":
            # The overlap filter is configured with a fixed overlap size and
            # does not take the similarity measure or threshold.
            s_filter = FILTERS[scale_filter](tok, overlap_size=1, comp_op='>=', allow_missing=False)
        else:
            s_filter = FILTERS[scale_filter](tok, sim_measure_type, threshold, allow_empty=True, allow_missing=False)
        # Print before starting the clock so console I/O is not timed.
        print(s_filter)
        start_time = time.time()
        candidate_set = s_filter.filter_tables(
            l_table, r_table,
            script['l_id_attr'], script['r_id_attr'],
            script['l_join_attr'], script['r_join_attr'],
            l_out_attrs=None, r_out_attrs=None,
            l_out_prefix='l_', r_out_prefix='r_',
            n_jobs=n_jobs, show_progress=False)
        # NOTE(review): the tokenizer is always passed, so token collections
        # (not raw strings) presumably reach sim_func -- verify this suits
        # string-based measures such as LEVENSHTEIN.
        output_table = ssj.apply_matcher(candidate_set,
                                         'l_' + script['l_id_attr'], 'r_' + script['r_id_attr'], l_table, r_table,
                                         script['l_id_attr'], script['r_id_attr'],
                                         script['l_join_attr'], script['r_join_attr'],
                                         tokenizer=tok, sim_function=sim_func, threshold=threshold,
                                         comp_op='>=', allow_missing=True,
                                         l_out_attrs=[script['l_join_attr']], r_out_attrs=[script['r_join_attr']],
                                         l_out_prefix='l_', r_out_prefix='r_',
                                         out_sim_score=True, n_jobs=n_jobs, show_progress=False)
        elapsed = time.time() - start_time
        cand_set_size = len(candidate_set)

        # exist_ok creates BENCHMARK_DIRECTORY and the per-script folder in one
        # call and is a no-op when they already exist.
        os.makedirs(out_dir, exist_ok=True)
        output_table.to_csv(os.sep.join([out_dir,
            scripts_file_name + sim_measure_type + '_' + tokenizer + str(threshold) + '_' + str(n_jobs) + scale_filter + '_' + sim_funcs + '_' + str(idx_script) + '.csv']))

        obj_res = {"idx_script" : idx_script, "table": copy.deepcopy(output_table), "time" : elapsed, "num_candidate": cand_set_size}
        true_positive = caculate_true_positive(output_table, true_label_table)
        false_positive = len(output_table) - true_positive
        false_negative = len(true_label_table) - true_positive
        obj_res["true_positive"] = true_positive
        obj_res["false_positive"] = false_positive
        obj_res["false_negative"] = false_negative
        # Guarded divisions: an empty join output, an empty label table or zero
        # true positives would otherwise raise ZeroDivisionError and abort the
        # remaining combinations.
        obj_res["recall"] = _safe_ratio(true_positive, true_positive + false_negative)
        obj_res["precision"] = _safe_ratio(true_positive, true_positive + false_positive)
        obj_res["F1"] = _safe_ratio(2 * obj_res["precision"] * obj_res["recall"],
                                    obj_res["precision"] + obj_res["recall"])
        result.append(obj_res)
    return result

In [18]:
def load_data_and_test(scripts_file_name):
    """Load each script's input tables and run the benchmark on them.

    Args:
        scripts_file_name: Path of the JSON script file given to
            `load_scripts`; also forwarded to `test`.

    Returns:
        A tuple (result, l_table, r_table). `result` is two-dimensional: the
        first dimension is the index of the script in the script list, the
        second holds one dict per measurement made within that script.
        `l_table` and `r_table` are the tables loaded for the LAST script, or
        None when the script list is empty.
    """
    result = []
    # Bound up front so an empty script list returns (result, None, None)
    # instead of raising UnboundLocalError at the return statement.
    l_table = r_table = None
    scripts = load_scripts(scripts_file_name)
    for idx, script in enumerate(scripts):
        # 'ltable' / 'rtable' hold path components relative to DATA_PATH.
        l_path = os.sep.join([DATA_PATH, *script['ltable']])
        r_path = os.sep.join([DATA_PATH, *script['rtable']])
        l_table = pd.read_csv(l_path, encoding=script['ltable_encoding'])
        r_table = pd.read_csv(r_path, encoding=script['rtable_encoding'])
        result.append(test(script, l_table, r_table, idx, scripts_file_name))
    return result, l_table, r_table

In [31]:
# Run the benchmark for the script file named by the SCRIPTS_FILE constant
# rather than repeating its literal value here.
result, l_table, r_table = load_data_and_test(SCRIPTS_FILE)

<py_stringsimjoin.filter.position_filter.PositionFilter object at 0x000001FC3FFC3C70>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.size_filter.SizeFilter object at 0x000001FC3FFC29B0>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.prefix_filter.PrefixFilter object at 0x000001FC3D0920B0>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.suffix_filter.SuffixFilter object at 0x000001FC3D092050>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.prefix_filter.PrefixFilter object at 0x000001FC3FFC3F40>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.prefix_filter.PrefixFilter object at 0x000001FC3D092050>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.prefix_filter.PrefixFilter object at 0x000001FC3FFC2DA0>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.position_filter.PositionFilter object at 0x000001FC3FFC3D60>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.position_filter.PositionFilter object at 0x000001FC3D29E740>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


<py_stringsimjoin.filter.position_filter.PositionFilter object at 0x000001FC3D29DD20>


  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


# Serialize the result to a pkl file

In [35]:
import pickle
# Persist the benchmark results together with the last loaded input tables.
pickle_obj = {'result': result, 'l_table' : l_table, 'r_table':r_table}
# The context manager flushes and closes the file even if pickling fails.
with open("script_0.pkl", "wb") as pkl_file:
    pickle.dump(pickle_obj, pkl_file)

In [36]:
# Reload the saved results. Unpickling executes arbitrary code, so only load
# files this notebook produced itself.
with open("script_0.pkl", "rb") as pkl_file:
    load_obj = pickle.load(pkl_file)
temp_result = load_obj['result']

# Show the metrics from the scripts

In [38]:
# Print one summary table per script, one row per measurement.
for script_idx, runs in enumerate(temp_result):
    time_list = [run["time"] for run in runs]
    if script_idx > 0:
        # Later scripts are reported by their accuracy metrics and run time.
        data_frame = pd.DataFrame({
            "precision": [run["precision"] for run in runs],
            "recall": [run["recall"] for run in runs],
            "f1": [run["F1"] for run in runs],
            "time": time_list,
        })
    else:
        # The first script is reported by run time and candidate-set size only.
        data_frame = pd.DataFrame({
            "time": time_list,
            "num_candidate": [run["num_candidate"] for run in runs],
        })
    print(data_frame)
    print("\n")


         time  num_candidate
0    3.833089           4515
1   38.655310        2806383
2    1.698787          55697
3  209.562787         860760


   precision    recall        f1       time
0   0.851468  0.665018  0.746781  23.681954
1   0.845070  0.458633  0.594579  15.376251
2   0.912451  0.421763  0.576876   5.057908


   precision    recall        f1      time
0   0.911005  0.428058  0.582441  1.246634
1   0.912366  0.421313  0.576438  0.927507
2   0.912745  0.418615  0.573983  0.930105




* The first table compares the performance of 4 filters: position, prefix, size, and suffix. The similarity measure is Jaccard with a threshold of 0.7.


* The second table compares time and accuracy metrics as the threshold changes from 0.7 to 0.8 to 0.9.

* The third table compares time and accuracy metrics when using the 3-gram, 4-gram, and 5-gram tokenizers with the Jaccard similarity measure and a threshold of 0.8.