# Import libraries

In [2]:
# # Import libraries
# !pip install py_stringsimjoin
# !pip install py_stringmatching
import json
import os
import time
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as ssj
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
from itertools import product

# Define measure and tokenizer

In [3]:
# JOIN_FUNCTIONS = {'COSINE': cosine_join,
#                   'DICE': dice_join,
#                   'EDIT_DISTANCE': edit_distance_join,
#                   'JACCARD': jaccard_join,
#                   'OVERLAP': overlap_join,
#                   'OVERLAP_COEFFICIENT': overlap_coefficient_join}

# Tokenizers that a benchmark script can select by name (the names listed under
# a script's 'tokenizers' key are looked up here).
# The first five entries are created with return_set=True and, for the q-gram
# ones, padding=False. The *_BAG entries rely on QgramTokenizer's defaults for
# padding and return_set (presumably padded bags of q-grams — TODO confirm
# against the py_stringmatching documentation).
TOKENIZERS = {'SPACE_DELIMITER': DelimiterTokenizer(delim_set={' '}, return_set=True),
              '2_GRAM': QgramTokenizer(qval=2, padding=False, return_set=True),
              '3_GRAM': QgramTokenizer(qval=3, padding=False, return_set=True),
              '4_GRAM': QgramTokenizer(qval=4, padding=False, return_set=True),
              '5_GRAM': QgramTokenizer(qval=5, padding=False, return_set=True),
              '2_GRAM_BAG': QgramTokenizer(qval=2),
              '3_GRAM_BAG': QgramTokenizer(qval=3)
              }

# Define filters

In [4]:
# Filter classes (not instances) keyed by the names used in a script's
# 'scale_filters' list. The benchmark instantiates the selected class once per
# configuration, because the constructor arguments (tokenizer, similarity
# measure type, threshold) change from run to run.
FILTERS = {
    "OVERLAP_FILTER": ssj.OverlapFilter,
    "SIZE_FILTER": ssj.SizeFilter,
    "PREFIX_FILTER": ssj.PrefixFilter,
    "POSITION_FILTER": ssj.PositionFilter,
    "SUFFIX_FILTER": ssj.SuffixFilter
}

# Define Sim_function

In [5]:
# Similarity functions keyed by the names used in a script's 'sim_funcs' list.
# Every value must be a callable taking the two tokenized inputs and returning
# a score, because it is passed directly as `sim_function` to
# ssj.apply_matcher.
# NOTE(review): Levenshtein.get_sim_score is documented as operating on
# strings rather than token collections, and TF-IDF is defined on bags, so
# both presumably need a compatible tokenizer choice — TODO confirm.
SIM_FUNC = {
    'COSINE': sm.Cosine().get_sim_score,
    'DICE': sm.Dice().get_sim_score,
    'JACCARD': sm.Jaccard().get_sim_score,
    'OVERLAP_COEFFICIENT': sm.OverlapCoefficient().get_sim_score,
    'LEVENSHTEIN': sm.Levenshtein().get_sim_score,
    # Previously this entry was the TfIdf class itself; calling it with two
    # token lists would construct a TfIdf object instead of returning a score.
    'TF-IDF': sm.TfIdf().get_sim_score,
}

# Set directory and load script

In [6]:

# Root directory of the input tables, resolved against the working directory
# at the time this cell runs. os.path.join avoids the doubled separator that
# plain string joining produces when the working directory is the filesystem
# root.
DATA_PATH = os.path.join(os.getcwd(), 'dataset')
# Default benchmark script file.
SCRIPTS_FILE = 'scripts_0.json'
# Directory (relative to the working directory) that receives the result CSVs.
BENCHMARK_DIRECTORY = 'benchmark_results'

In [84]:
def load_scripts(scripts_file_name):
    """Read the benchmark scripts from a JSON file.

    Args:
        scripts_file_name: Path of a JSON file whose top-level object has a
            'scripts' entry.

    Returns:
        The value stored under the 'scripts' key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no 'scripts' key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding.
    with open(scripts_file_name, 'r', encoding='utf-8') as js_file:
        return json.load(js_file)['scripts']

In [86]:
def test(script, l_table, r_table, idx_script, scripts_file_name):
    """Benchmark every filter + matcher configuration described by one script.

    The Cartesian product of the script's 'sim_funcs', 'sim_measure_types',
    'tokenizers', 'scale_filters', 'thresholds' and 'n_jobs' lists is
    enumerated. For each combination the selected filter builds a candidate
    set, ssj.apply_matcher verifies it with the selected similarity function,
    the matched table is written as a CSV under
    BENCHMARK_DIRECTORY/<idx_script>/, and the elapsed time is recorded.

    Args:
        script: Mapping with the list-valued keys named above plus
            'l_id_attr', 'r_id_attr', 'l_join_attr' and 'r_join_attr'.
        l_table: Left input table passed to the filter and the matcher.
        r_table: Right input table passed to the filter and the matcher.
        idx_script: Index of the script; used as the output sub-directory
            name and as part of the CSV file name.
        scripts_file_name: Name of the script file; used as the CSV file
            name prefix.

    Returns:
        A list with one dictionary per executed configuration, holding the
        keys 'script', 'idx_script', 'table' (matcher output), 'time'
        (seconds spent in filtering plus matching) and 'num_candidate'
        (size of the candidate set produced by the filter).
    """
    result = []
    output_dir = os.path.join(BENCHMARK_DIRECTORY, str(idx_script))
    total_info_obj = product(script['sim_funcs'], script['sim_measure_types'],
                             script['tokenizers'], script['scale_filters'],
                             script['thresholds'], script['n_jobs'])
    for sim_funcs, sim_measure_type, tokenizer, scale_filter, threshold, n_jobs in total_info_obj:
        # The space-delimiter tokenizer is only benchmarked together with the
        # EDIT_DISTANCE measure type; all other pairings are skipped.
        # NOTE(review): confirm this pairing is the intended one.
        if tokenizer == "SPACE_DELIMITER" and sim_measure_type != 'EDIT_DISTANCE':
            continue
        sim_func = SIM_FUNC[sim_funcs]
        tok = TOKENIZERS[tokenizer]
        # The overlap filter is configured with a fixed overlap size; the other
        # filters are parameterised by measure type and threshold.
        if scale_filter == "OVERLAP_FILTER":
            s_filter = FILTERS[scale_filter](tok, overlap_size=1, comp_op='>=', allow_missing=False)
        else:
            s_filter = FILTERS[scale_filter](tok, sim_measure_type, threshold, allow_empty=True, allow_missing=False)

        # Only filtering and matching are timed; writing the CSV is excluded.
        # perf_counter is monotonic, so the measurement cannot be distorted by
        # wall-clock adjustments.
        start_time = time.perf_counter()
        candidate_set = s_filter.filter_tables(
            l_table, r_table,
            script['l_id_attr'], script['r_id_attr'],
            script['l_join_attr'], script['r_join_attr'],
            l_out_attrs=None, r_out_attrs=None,
            l_out_prefix='l_', r_out_prefix='r_',
            n_jobs=n_jobs, show_progress=False)
        output_table = ssj.apply_matcher(
            candidate_set,
            'l_' + script['l_id_attr'], 'r_' + script['r_id_attr'], l_table, r_table,
            script['l_id_attr'], script['r_id_attr'],
            script['l_join_attr'], script['r_join_attr'],
            tokenizer=tok, sim_function=sim_func, threshold=threshold,
            comp_op='>=', allow_missing=True,
            l_out_attrs=[script['l_join_attr']], r_out_attrs=[script['r_join_attr']],
            l_out_prefix='l_', r_out_prefix='r_',
            out_sim_score=True, n_jobs=n_jobs, show_progress=False)
        elapsed = time.perf_counter() - start_time

        # exist_ok=True creates missing parents and tolerates an existing
        # directory in a single call, with no check-then-create race.
        os.makedirs(output_dir, exist_ok=True)
        csv_name = (f"{scripts_file_name}{sim_measure_type}_{tokenizer}{threshold}"
                    f"_{n_jobs}{scale_filter}_{sim_funcs}_{idx_script}.csv")
        output_table.to_csv(os.path.join(output_dir, csv_name))
        result.append({"script": script, "idx_script": idx_script, "table": output_table,
                       "time": elapsed, "num_candidate": len(candidate_set)})
    return result

In [87]:
def load_data_and_test(scripts_file_name):
    """Load each script's input tables and benchmark the script.

    Args:
        scripts_file_name: Path of the JSON file describing the scripts.
            Each script provides 'ltable'/'rtable' (path components relative
            to DATA_PATH) and 'ltable_encoding'/'rtable_encoding'.

    Returns:
        A tuple (result, l_table, r_table). `result` has one entry per script,
        each being the list of per-configuration dictionaries returned by
        `test`. `l_table` and `r_table` are the tables of the LAST script
        processed, or None when the file contains no scripts.
    """
    result = []
    # Pre-bind so an empty script list returns (result, None, None) instead of
    # raising UnboundLocalError at the return statement.
    l_table = r_table = None
    scripts = load_scripts(scripts_file_name)
    for idx, script in enumerate(scripts):
        l_path = os.sep.join([DATA_PATH, *script['ltable']])
        r_path = os.sep.join([DATA_PATH, *script['rtable']])
        l_table = pd.read_csv(l_path, encoding=script['ltable_encoding'])
        r_table = pd.read_csv(r_path, encoding=script['rtable_encoding'])
        result.append(test(script, l_table, r_table, idx, scripts_file_name))
    return result, l_table, r_table
# result is a two-level list: the first level is indexed by the position of the
# script in the script list;
# the second level holds one dictionary per measurement (configuration) of that script.

In [89]:
# Use the SCRIPTS_FILE constant instead of repeating its literal value, so the
# script file is configured in a single place.
result, l_table, r_table = load_data_and_test(SCRIPTS_FILE)

  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dr

# Compare result with file DBLP-ACM_perfectMapping.csv

In [90]:
# Ground-truth pairs: the file's own header row is consumed (header=0) and the
# two columns are renamed to l_id / r_id.
true_label_table = pd.read_csv(
    "./dataset/library/DBLP-ACM_perfectMapping.csv",
    encoding="ISO-8859-1",
    header=0,
    names=['l_id', 'r_id'],
)
true_label_table

Unnamed: 0,l_id,r_id
0,conf/sigmod/SlivinskasJS01,375678
1,conf/sigmod/ChaudhuriDN01,375694
2,conf/sigmod/RinfretOO01,375669
3,conf/sigmod/BreunigKKS01,375672
4,conf/sigmod/JagadishJOT01,375687
...,...,...
2219,journals/sigmod/Scholl01,604275
2220,journals/sigmod/Rosneblatt94,190649
2221,journals/sigmod/Winslett02b,601871
2222,journals/sigmod/Labrinidis01,604283


In [91]:
import numpy as np
def caculate_true_positive(joined_table, true_label_table):
    """Count the distinct (l_id, r_id) pairs present in both tables.

    Args:
        joined_table: DataFrame of predicted matches with 'l_id' and 'r_id'
            columns.
        true_label_table: DataFrame of true matches with 'l_id' and 'r_id'
            columns.

    Returns:
        Number of distinct pairs that appear in both tables.
    """
    pair_dtype = [('l_id', 'O'), ('r_id', 'O')]

    def as_pair_records(table):
        # Both sides are converted to the same object-typed record layout so
        # the record arrays can be intersected directly.
        records = table[["l_id", "r_id"]].to_records(index=False)
        return records.astype(dtype=pair_dtype)

    shared_pairs = np.intersect1d(as_pair_records(joined_table),
                                  as_pair_records(true_label_table))
    return len(shared_pairs)

In [92]:
import copy
# Work on a deep copy: cal_metrics writes its metric keys into each per-run
# dictionary in place, and the original `result` should stay unmodified.
temp_result = copy.deepcopy(result)

# Calculate metrics

In [97]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def cal_metrics(result, true_label_table):
    """Add confusion counts and precision/recall/F1 to every benchmark record.

    Each record is updated in place with the keys 'true_positive',
    'false_positive', 'false_negative', 'recall', 'precision' and 'F1'.

    Args:
        result: List (one entry per script) of lists of record dictionaries;
            each record's 'table' entry holds the predicted matching pairs.
        true_label_table: Table of all pairs that actually match.

    Returns:
        None; the records in `result` are modified in place.
    """
    num_true_pairs = len(true_label_table)
    for result_script in result:
        for obj_res in result_script:
            # obj_res["table"]: all pairs predicted to match.
            # true_label_table: all pairs that really match.
            predicted = obj_res["table"]
            true_positive = caculate_true_positive(predicted, true_label_table)
            false_positive = len(predicted) - true_positive
            false_negative = num_true_pairs - true_positive
            # A metric whose denominator is zero (no predictions, no true
            # pairs, or no true positives at all) is reported as 0.0 instead
            # of raising ZeroDivisionError.
            recall = _safe_ratio(true_positive, true_positive + false_negative)
            precision = _safe_ratio(true_positive, true_positive + false_positive)
            obj_res["true_positive"] = true_positive
            obj_res["false_positive"] = false_positive
            obj_res["false_negative"] = false_negative
            obj_res["recall"] = recall
            obj_res["precision"] = precision
            obj_res["F1"] = _safe_ratio(2 * precision * recall, precision + recall)

In [98]:
# The metrics function defined in this notebook is `cal_metrics`; the former
# name `caculate_metrics` is not defined anywhere and raised NameError.
cal_metrics(temp_result, true_label_table)

# Serialize result to a pkl file

In [80]:
import pickle
# Bundle the raw benchmark results with the tables returned alongside them.
pickle_obj = {'result': result, 'l_table' : l_table, 'r_table':r_table}
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way.
with open("script_0.pkl", "wb") as pkl_file:
    pickle.dump(pickle_obj, pkl_file)

# Load the pkl file

In [81]:
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this notebook.
# The context manager closes the file handle once loading finishes.
with open("script_0.pkl", "rb") as pkl_file:
    load_obj = pickle.load(pkl_file)

In [82]:
# NOTE(review): the pickled 'result' list is the one saved before metrics were
# attached to the deep copy, so when the cells run top to bottom the records
# loaded here presumably lack 'precision'/'recall'/'F1' until cal_metrics is
# run on them again — TODO confirm the intended cell order.
temp_result = load_obj['result']
# temp_result[i] is the i-th script in the script list of the file scripts_0.json.
# temp_result[i][j] is the record for measurement j of the i-th script in that list.
# If a script is meant to test filters, each measurement corresponds to a different filter.
# Each record has several entries giving the script, the time, the accuracy metrics, the script index, ...

In [100]:
# Print one summary table per script, with one row per benchmarked
# configuration. Iterating the records directly replaces the index-based
# double lookup and the five manually maintained accumulator lists.
for script_runs in temp_result:
    data_frame = pd.DataFrame({
        "precision": [run["precision"] for run in script_runs],
        "recall": [run["recall"] for run in script_runs],
        "f1": [run["F1"] for run in script_runs],
        "time": [run["time"] for run in script_runs],
        "num_candidate": [run["num_candidate"] for run in script_runs],
    })
    print(data_frame)
    print("\n")


   precision    recall        f1        time  num_candidate
0   0.912351  0.411871  0.567534    5.553507           1388
1   0.912351  0.411871  0.567534   12.397349         900225
2   0.912351  0.411871  0.567534  194.641770           2620
3   0.912351  0.411871  0.567534    1.426078           4392


   precision    recall        f1       time  num_candidate
0   0.909615  0.425360  0.579657  29.948070        2732612
1   0.912745  0.418615  0.573983  19.113021        1799221
2   0.914000  0.410971  0.566998   8.767128         873796


   precision    recall        f1      time  num_candidate
0   0.911005  0.428058  0.582441  0.971348           7203
1   0.912366  0.421313  0.576438  0.891311           2125
2   0.912745  0.418615  0.573983  0.798014           1353


