# Import thư viện

In [1]:
# # Import libraries
# !pip install py_stringsimjoin
# !pip install py_stringmatching
import py_stringsimjoin as ssj
import py_stringmatching as sm
import pandas as pd
import os, sys, time

In [2]:
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
import pandas as pd

from py_stringsimjoin.join.cosine_join import cosine_join
from py_stringsimjoin.join.dice_join import dice_join
from py_stringsimjoin.join.edit_distance_join import edit_distance_join
from py_stringsimjoin.join.jaccard_join import jaccard_join
from py_stringsimjoin.join.overlap_coefficient_join import overlap_coefficient_join
from py_stringsimjoin.join.overlap_join import overlap_join

# Download dataset

In [1]:
!pip install gdown
!gdown https://github.com/anhaidgroup/py_stringsimjoin/raw/master/benchmarks/example_datasets.tar.gz

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1


Downloading...
From: https://github.com/anhaidgroup/py_stringsimjoin/raw/master/benchmarks/example_datasets.tar.gz
To: d:\BTL DM\Project\example_datasets.tar.gz

  0%|          | 0.00/15.8M [00:00<?, ?B/s]
  3%|▎         | 524k/15.8M [00:00<00:08, 1.71MB/s]
  7%|▋         | 1.05M/15.8M [00:00<00:09, 1.50MB/s]
 10%|▉         | 1.57M/15.8M [00:00<00:08, 1.61MB/s]
 13%|█▎        | 2.10M/15.8M [00:01<00:08, 1.63MB/s]
 17%|█▋        | 2.62M/15.8M [00:01<00:06, 1.88MB/s]
 20%|█▉        | 3.15M/15.8M [00:01<00:06, 1.93MB/s]
 23%|██▎       | 3.67M/15.8M [00:02<00:06, 1.87MB/s]
 27%|██▋       | 4.19M/15.8M [00:02<00:06, 1.80MB/s]
 30%|██▉       | 4.72M/15.8M [00:02<00:06, 1.69MB/s]
 33%|███▎      | 5.24M/15.8M [00:02<00:05, 1.85MB/s]
 37%|███▋      | 5.77M/15.8M [00:03<00:05, 1.67MB/s]
 40%|███▉      | 6.29M/15.8M [00:03<00:05, 1.79MB/s]
 43%|████▎     | 6.82M/15.8M [00:03<00:05, 1.66MB/s]
 47%|████▋     | 7.34M/15.8M [00:04<00:06, 1.41MB/s]
 50%|████▉     | 7.86M/15.8M [00:04<00:04, 1.67MB/s]


In [4]:
import tarfile
# open file
file = tarfile.open('example_datasets.tar.gz')
# extracting file
file.extractall('./example_datasets')
file.close()

In [6]:
scenarios_list = [
        {
            "dataset_name": "anime",
            "ltable": ["anime", "A.csv"],
            "rtable": ["anime", "B.csv"],
            "ltable_encoding": "utf-8",
            "rtable_encoding": "utf-8",
            "l_id_attr": "ID",
            "r_id_attr": "ID",
            "l_join_attr": "Title",
            "r_join_attr": "Title",
            "tokenizers": ["2_GRAM"],                                  
            "sim_measure_types": ["JACCARD"],
            "thresholds": [0.7],
            "n_jobs": [1],
            "scale_filters": ["POSITION_FILTER", "OVERLAP_FILTER", "PREFIX_FILTER", "SIZE_FILTER", "SUFFIX_FILTER"],
            "sim_funcs": ["JACCARD"]
    }]

# Define measure and tokenizer

In [7]:
JOIN_FUNCTIONS = {'COSINE': cosine_join,
                  'DICE': dice_join,
                  'EDIT_DISTANCE': edit_distance_join,
                  'JACCARD': jaccard_join,
                  'OVERLAP': overlap_join,
                  'OVERLAP_COEFFICIENT': overlap_coefficient_join}

TOKENIZERS = {'SPACE_DELIMITER': DelimiterTokenizer(delim_set=[' '],
                                                    return_set=True),
              '2_GRAM': QgramTokenizer(qval=2, padding=False, return_set=True),
              '3_GRAM': QgramTokenizer(qval=3, padding=False, return_set=True),
              '2_GRAM_BAG': QgramTokenizer(qval=2),
              '3_GRAM_BAG': QgramTokenizer(qval=3)
            }

# Define filters

In [8]:
FILTERS = {
    "OVERLAP_FILTER": ssj.OverlapFilter,
    "SIZE_FILTER": ssj.SizeFilter,
    "PREFIX_FILTER": ssj.PrefixFilter,
    "POSITION_FILTER": ssj.PositionFilter,
    "SUFFIX_FILTER": ssj.SuffixFilter
}

# Define Sim_function

In [9]:
SIM_FUNC = {
    'COSINE': sm.Cosine().get_sim_score,
    'DICE': sm.Dice().get_sim_score,
    'JACCARD': sm.Jaccard().get_sim_score,
    'OVERLAP_COEFFICIENT': sm.OverlapCoefficient().get_sim_score,
    'LEVENSHTEIN': sm.Levenshtein().get_sim_score,
    'TF-IDF': sm.TfIdf,
}

# Set directory

In [10]:
# path where datasets are present                 
BASE_PATH = os.sep.join([os.getcwd(), 'example_datasets', 'example_datasets'])

# join scenarios json file. If you need to perform benchmark on a new dataset,
# add a entry for that dataset in the json file.
JOIN_SCENARIOS_FILE = 'join_scenarios.json'

# datasets that need to be skipped from benchmarking
EXCLUDE_DATASETS = []

# number of times to run each benchmark
NUMBER_OF_EXECUTIONS = 1

# benchmark output directory
OUTPUT_DIR = '_benchmark_results'

In [11]:
BASE_PATH

'/work/example_datasets/example_datasets'

# Class JoinScenario

In [12]:
class JoinScenario:
    def __init__(self, dataset_name, ltable, rtable,
                 ltable_encoding, rtable_encoding, l_id_attr, r_id_attr,
                 l_join_attr, r_join_attr, tokenizers,
                 sim_measure_types, thresholds, n_jobs, scale_filters, sim_funcs):
        self.dataset_name = dataset_name 
        self.ltable = os.sep.join(ltable)
        self.rtable = os.sep.join(rtable)
        self.ltable_encoding = ltable_encoding
        self.rtable_encoding = rtable_encoding
        self.l_id_attr = l_id_attr
        self.r_id_attr = r_id_attr
        self.l_join_attr = l_join_attr
        self.r_join_attr = r_join_attr
        self.tokenizers = tokenizers
        self.sim_measure_types = sim_measure_types
        self.thresholds = thresholds
        self.n_jobs = n_jobs
        self.scale_filters = scale_filters
        self.sim_funcs = sim_funcs

In [13]:
import json
def load_join_scenarios():

    #!!!!!!!!!!!!!!!
    # fp = open(JOIN_SCENARIOS_FILE, 'r')
    # scenarios = json.load(fp)['scenarios']
    # fp.close()
    scenarios = scenarios_list

    join_scenarios = []
    for sc in scenarios:
        join_scenario = JoinScenario(sc['dataset_name'], 
                                     sc['ltable'], sc['rtable'],
                                     sc['ltable_encoding'], sc['rtable_encoding'], 
                                     sc['l_id_attr'], sc['r_id_attr'],
                                     sc['l_join_attr'], sc['r_join_attr'],
                                     sc['tokenizers'], sc['sim_measure_types'],
                                     sc['thresholds'], sc['n_jobs'],
                                     sc['scale_filters'],
                                     sc['sim_funcs']
                                     )
        join_scenarios.append(join_scenario)
    return join_scenarios

In [14]:
def load_data_and_test():
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    # load scenarios
    scenarios = load_join_scenarios()
    output_header = ','.join(['left join attr', 'right join attr', 
                              'similarity measure type', 'tokenizer',
                        'threshold', 'n_jobs', 'candset size', 'avg time']) 
    for scenario in scenarios:
        if scenario.dataset_name in EXCLUDE_DATASETS: continue
        ltable_path = os.sep.join([BASE_PATH, scenario.ltable])
        rtable_path = os.sep.join([BASE_PATH, scenario.rtable])
        out_file_path = os.sep.join([OUTPUT_DIR, scenario.dataset_name])
        add_header = not os.path.exists(out_file_path)
        output_file = open(out_file_path, 'a')        
        if add_header:
            output_file.write('%s\n' % output_header)
        # load input tables for the scenario
        ltable = pd.read_csv(ltable_path, encoding=scenario.ltable_encoding)
        rtable = pd.read_csv(rtable_path, encoding=scenario.rtable_encoding)

        test(scenario, output_file, ltable, rtable)
    
    output_file.close()

In [15]:
from itertools import product
def test(scenario, output_file, ltable, rtable):
    total_info_obj = product(
        scenario.sim_funcs,
        scenario.sim_measure_types, 
        scenario.tokenizers,
        scenario.scale_filters,
        scenario.thresholds,
        scenario.n_jobs
        )
    for sim_funcs, sim_measure_type, tokenizer, sclale_filter, threshold, n_jobs in total_info_obj:
        if tokenizer in ["SPACE_DELIMITER"] and sim_measure_type != 'EDIT_DISTANCE': continue

        sim_func = SIM_FUNC[sim_funcs]
        # tok = TOKENIZERS['tokenizer']
        # sclale_filter = "SIZE_FILTER"
        # join_fn = JOIN_FUNCTIONS[sim_measure_type]
        tok = TOKENIZERS['3_GRAM']
        if sclale_filter == "OVERLAP_FILTER":
            s_filter = FILTERS[sclale_filter](tok, overlap_size=1, comp_op='>=', allow_missing=False)
        else: 
            s_filter = FILTERS[sclale_filter](tok, sim_measure_type, threshold, \
                    allow_empty=True, allow_missing=False)
        
        if sim_measure_type ==  'EDIT_DISTANCE':
            args = (threshold, '<=', False, None, None, 'l_', 'r_', True,
                                    n_jobs, tok)
        elif sim_measure_type == 'OVERLAP':
            args = (tok, threshold, '>=', False, None, None, 'l_', 'r_',
                                    True, n_jobs)     
        else:
            args = (tok, threshold, '>=', True, False, None, None, 'l_', 'r_',
                                    True, n_jobs)
        print(tokenizer)
        ####Caculate time
        cumulative_time = 0
        candset_size = 0
        for i in range(NUMBER_OF_EXECUTIONS):
            start_time = time.time()
            ###Trước khi join phải apply filter, Apply chỗ này thay join fn bằng apply_matcher
            candidate_set = s_filter.filter_tables(
                ltable, rtable, 
                scenario.l_id_attr, scenario.r_id_attr,
                scenario.l_join_attr, scenario.r_join_attr,
                l_out_attrs=None, r_out_attrs=None, 
                l_out_prefix='l_', r_out_prefix='r_', 
                n_jobs=n_jobs, show_progress=True)
            C = ssj.apply_matcher(candidate_set, 
                'l_'+scenario.l_id_attr, 'r_'+scenario.r_id_attr, \
                ltable, rtable, 
                scenario.l_id_attr, scenario.r_id_attr, 
                scenario.l_join_attr, scenario.r_join_attr, 
                tokenizer = tok, 
                sim_function = sim_func, 
                threshold = threshold,
                comp_op='>=', allow_missing=False, 
                l_out_attrs=[scenario.l_join_attr], r_out_attrs=[scenario.r_join_attr], 
                l_out_prefix='l_', r_out_prefix='r_', 
                out_sim_score=True, n_jobs=n_jobs, show_progress=True) 
            cumulative_time += (time.time() - start_time)
            candset_size = len(C)
            avg_time_elapsed = float(cumulative_time) / float(NUMBER_OF_EXECUTIONS)      
            output_record = ','.join([str(scenario.l_join_attr), str(scenario.r_join_attr), 
                                                  str(sim_measure_type), str(tokenizer),
                                                  str(threshold), str(n_jobs),
                                                  str(candset_size), str(avg_time_elapsed)])
            print(C[["_id", 'l_'+scenario.l_join_attr, 'r_'+scenario.r_join_attr, "_sim_score"]])
            output_file.write('%s\n' % output_record)
                
   


In [16]:
load_data_and_test()

2_GRAM
        _id                                                l_Title  \
0         6                Hidamari Sketch: Sae Hiro Sotsugyou-hen   
1         9                             To Aru Kagaku no Railgun S   
2        12                                      Tamako Love Story   
3        15                                             Working!!!   
4        16                                              Working!!   
...     ...                                                    ...   
2721  11832  Wellber no Monogatari: Sisters of Wellber Dai ni Maku   
2722  11857                                          Dragon Ball Z   
2723  11858                                            Dragon Ball   
2724  11866                         Hentai Ouji to Warawanai Neko.   
2725  11896                            Otome wa Boku ni Koishiteru   

                                          r_Title  _sim_score  
0         Hidamari Sketch: Sae Hiro Sotsugyou-hen    1.000000  
1                       

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c080558f-d0ef-4c01-aae2-20b2147b87a6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>