In [None]:
import py_entitymatching as em
import pandas as pd
import os
from utils.utils import *

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

<h3>0. Prelude.</h3>
This notebook covers the blocking and sampling steps of the entity-matching (EM) pipeline. <br>
In this notebook, we instantiate two tables and load them into Magellan's framework. <br>
Thereafter we introduce blockers. Some blockers might be more prevalent than others. <br>
Lastly, we save a sample of the candidate set to be labelled manually by the developer. <br>
<br>

In [None]:
# Configuration for this pipeline.
# The dataset directory defaults to the original author's local path. Set the
# EM_DATASETS_DIR environment variable to run the notebook on another machine
# without editing this cell (an unset or empty variable falls back to the default).
datasets_dir = os.environ.get("EM_DATASETS_DIR") or r'C:\Users\aleks\Desktop\Master Thesis\Py_Magellan\DataSets\movies1'
name_of_table_A = "imdb2.csv"             # file name of table A inside datasets_dir
name_of_table_B = "rotten_tomatoes2.csv"  # file name of table B inside datasets_dir
name_of_sample_set = "sample_set.csv"     # file name the sampled candidate pairs are written to
sample_set_size = 450                     # number of candidate pairs to sample for labelling


<h3>1. Load datasets.</h3>
First, load two datasets from our dataset path. <br>
We also include an ID generator for each line, as some of the datasets come without indices.

In [None]:
# Full path of each input table inside the dataset directory.
path_A = os.sep.join([datasets_dir, name_of_table_A])
path_B = os.sep.join([datasets_dir, name_of_table_B])

# Some datasets ship without a numerical ID, so one is created where needed
# (create_index_as_id is star-imported from utils.utils).
for _csv_path in (path_A, path_B):
    create_index_as_id(_csv_path)

# Read both tables together with their metadata, using `ID` as the key attribute.
A_meta = em.read_csv_metadata(path_A, key='ID')
B_meta = em.read_csv_metadata(path_B, key='ID')

# Belt and braces: register `ID` as the key a second time. `ID` qualifies as a key
# because it holds unique values and none of them is missing.
for _meta_table in (A_meta, B_meta):
    em.set_key(_meta_table, 'ID')

<h3>2. Down-sizing.</h3>
In case the datasets are too large, we can down-sample them before a production run.
The down-sampling call below is commented out; uncomment it for quick experiments and keep it disabled for the production stage.

In [None]:
# Optional down-sampling for quick experiments: uncomment the line below to shrink
# both tables before blocking; leave it commented out for a full production run.
# A_meta, B_meta = em.down_sample(A_meta, B_meta, size=1000, y_param=1, show_progress=False)

<h3>3. Blockers!</h3>
Next, we apply blockers to rule out entity pairs that clearly cannot match.<br>
We will do: <br>
Name - Rule-Based Blocker <br>
Genre - Overlap Blocker <br>
Year R - BlackBox Blocker <br>
Director - Attribute Blocker <br>
<br><br>
To start off, we ensure that the attributes we want to compare are of the same data type. <br> <br>

(In hindsight, if we use a Deep Learning method for our Matcher, most blocking steps become redundant.)

In [None]:
# Let Magellan infer the attribute types of both tables and generate the blocking
# features from them. Validation of the inferred types is switched off so the cell
# runs without asking for confirmation.
feature_table = em.get_features_for_blocking(
    A_meta,
    B_meta,
    validate_inferred_attr_types=False,
)

In [None]:
# Attributes whose inferred types are inspected below.
_inspected_attrs = ('Name', 'Genre', 'Year', 'Director')


def _print_inferred_types():
    """Print the inferred type of each inspected attribute, one line per table (A, then B)."""
    for _atypes in (em._atypes1, em._atypes2):
        print(*[_atypes[_attr] for _attr in _inspected_attrs])


# Types as inferred by Magellan.
_print_inferred_types()

# Force Genre and Year to the same short-string type in both tables so that the
# two sides are comparable.
# NOTE(review): feature_table was already generated in the previous cell, so these
# overrides presumably do not affect it unless the features are regenerated - confirm.
for _attr in ('Genre', 'Year'):
    em._atypes1[_attr] = 'str_bt_1w_5w'
    em._atypes2[_attr] = 'str_bt_1w_5w'

# Types after the override.
_print_inferred_types()

# Finally, show which columns of the two tables are considered comparable.
print(em._block_c['corres'])

<h4>3.1 Rule-Based blocker on "Name" (currently replaced by an Overlap blocker)</h4>

In [None]:
# A rule-based blocker on "Name" was the original plan but proved too slow
# (author's timing note: >1h for the full dataset, "1m 50m" for 1000 rows),
# so for now we only use an Overlap blocker. The rule-based variant is kept
# here for reference:
#
# rb = em.RuleBasedBlocker()
# rb.add_rule(['Name_Name_lev_sim(ltuple, rtuple) < 0.6'], feature_table)
# canditate_set_RB = rb.block_tables(A_meta, B_meta,
#                     l_output_attrs=['Name', 'Genre', 'Year', 'Director'],
#                     r_output_attrs=['Name', 'Genre', 'Year', 'Director'],
#                     show_progress=False)

# Columns carried over from each table into the candidate set.
_name_output_attrs = ['Name', 'Genre', 'ReleaseDate', 'Director', 'Creator']

# Word-level overlap on "Name" with an overlap size of 3.
ob = em.OverlapBlocker()
canditate_set_RB = ob.block_tables(
    A_meta,
    B_meta,
    l_overlap_attr='Name',
    r_overlap_attr='Name',
    word_level=True,
    overlap_size=3,
    l_output_attrs=list(_name_output_attrs),
    r_output_attrs=list(_name_output_attrs),
    show_progress=False,
)


<h4>3.2 Overlap blocker on "Genre"</h4>

In [None]:
# Refine the name-based candidate set with a word-level overlap on "Genre"
# (overlap size of 2).
ob = em.OverlapBlocker()
canditate_set_OB = ob.block_candset(
    canditate_set_RB,
    l_overlap_attr='Genre',
    r_overlap_attr='Genre',
    word_level=True,
    overlap_size=2,
    show_progress=False,
)

It seems perfectly fine to stop here, but we add more blockers for show...

<h4>3.3 Blackbox blocker on "Year"</h4>

In [None]:
# Black-box blocker: the decision for each pair is delegated to is_year_year
# (star-imported from utils.utils).
# NOTE(review): the input here is canditate_set_RB, so the result of the Genre
# blocker (canditate_set_OB) is not used by this step - confirm this is intended.
bb = em.BlackBoxBlocker()
bb.set_black_box_function(is_year_year)
canditate_set_BB = bb.block_candset(candset=canditate_set_RB)

<h4>3.4 Attribute blocker on "Director"</h4>

In [None]:
# Attribute-equivalence blocker on "Director", applied to the output of the
# black-box blocker.
ab = em.AttrEquivalenceBlocker()
canditate_set_AB = ab.block_candset(
    canditate_set_BB,
    l_block_attr='Director',
    r_block_attr='Director',
)

<h3>4. Sample data to be labelled</h3>
For our next step, we need to sample some data and label it accordingly for our matcher. <br>
The sample set size can vary, but the labelled sample will be our ground truth going forward. <br> <br> 
As of 2022, Magellan does not offer a proper GUI for labelling, so we save the sample set to a file and label it manually.  <br>
Afterwards, we load the labelled dataset back from the path it was saved to.

In [None]:
# NOTE: Sample from the candidate set produced by the most recent blocker you
# want to keep.
# NOTE(review): canditate_set_BB is sampled here, not canditate_set_AB - confirm
# this is the intended candidate set.
sample_set = em.sample_table(canditate_set_BB, sample_set_size)

# Write the sample next to the input tables so it can be labelled by hand.
# NOTE(review): this presumably overwrites an existing file of the same name,
# including any labels already added to it - verify before re-running.
_sample_set_path = os.sep.join([datasets_dir, name_of_sample_set])
sample_set.to_csv(_sample_set_path)