In [1]:
# Import packages
import py_entitymatching as em
import os
import pandas as pd

# Seed passed to the matchers and to matcher selection for reproducibility
seed = 42

In [2]:
# Input files: the two cleaned source tables and the labeled candidate pairs
source1 = 'source1_cleaned.csv'
source2 = 'source2_cleaned.csv'
labeled_data = 'candidate_set.csv'

# Load both source tables, registering 'ID' as the key of each
A = em.read_csv_metadata(source1, key='ID')
B = em.read_csv_metadata(source2, key='ID')

# Load the labeled pairs and link them to A and B through their foreign keys
S = em.read_csv_metadata(
    labeled_data,
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_ID',
    fk_rtable='rtable_ID',
)

# Split S into I (train) and J (test), half of the pairs each.
# NOTE: the split uses its own fixed random_state (0), not `seed`.
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I, J = IJ['train'], IJ['test']

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [3]:
# Candidate ML matchers to compare. Every matcher that is given a
# random_state uses the shared seed; LinReg is created without one.
dt = em.DTMatcher(random_state=seed, name='DecisionTree')
svm = em.SVMMatcher(random_state=seed, name='SVM')
rf = em.RFMatcher(random_state=seed, name='RF')
lg = em.LogRegMatcher(random_state=seed, name='LogReg')
ln = em.LinRegMatcher(name='LinReg')

In [4]:
# Build the feature table for matching tuples of A against tuples of B,
# without validating the inferred attribute types
F = em.get_features_for_matching(
    A,
    B,
    validate_inferred_attr_types=False,
)

In [5]:
# Show the names of the generated features together with their row index
F['feature_name']

0                                           ID_ID_exm
1                                           ID_ID_anm
2                                      ID_ID_lev_dist
3                                       ID_ID_lev_sim
4                           Name_Name_jac_qgm_3_qgm_3
5                       Name_Name_cos_dlm_dc0_dlm_dc0
6                                       Name_Name_mel
7                                  Name_Name_lev_dist
8                                   Name_Name_lev_sim
9                       Author_Author_jac_qgm_3_qgm_3
10                  Author_Author_cos_dlm_dc0_dlm_dc0
11                  Author_Author_jac_dlm_dc0_dlm_dc0
12                                  Author_Author_mel
13                             Author_Author_lev_dist
14                              Author_Author_lev_sim
15                                  Author_Author_nmw
16                                   Author_Author_sw
17                Publisher_Publisher_jac_qgm_3_qgm_3
18            Publisher_Publ

In [6]:
# Drop features by their row index in F: the ID comparison features (0-3)
# plus the publishing date and rating related features (25-30, 35-38).
F = F.drop([
    0, 1, 2, 3,               # ID_ID_* features
    25, 26, 27, 28, 29, 30,
    35, 36, 37, 38,
])

In [7]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='match',
                            show_progress=False)

In [8]:
# Preview the first five training feature vectors
H.head(n=5)

Unnamed: 0,_id,ltable_ID,rtable_ID,Name_Name_jac_qgm_3_qgm_3,Name_Name_cos_dlm_dc0_dlm_dc0,Name_Name_mel,Name_Name_lev_dist,Name_Name_lev_sim,Author_Author_jac_qgm_3_qgm_3,Author_Author_cos_dlm_dc0_dlm_dc0,...,Publisher_Publisher_mel,Publisher_Publisher_lev_dist,Publisher_Publisher_lev_sim,Publisher_Publisher_nmw,Publisher_Publisher_sw,Pages_Pages_exm,Pages_Pages_anm,Pages_Pages_lev_dist,Pages_Pages_lev_sim,match
68,68,401,2995,0.565217,0.645497,0.918841,28.0,0.594203,1.0,1.0,...,0.563636,8.0,0.272727,3.0,3.0,1.0,1.0,0.0,1.0,0.0
89,89,547,1743,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.259259,17.0,0.055556,-5.0,1.0,,,,,1.0
12,12,29,2154,0.9375,0.923077,0.960049,1.0,0.989011,0.0,0.0,...,1.0,0.0,1.0,21.0,21.0,1.0,1.0,0.0,1.0,0.0
95,95,595,2851,0.523077,0.5,0.794799,10.0,0.791667,0.033333,0.0,...,1.0,0.0,1.0,9.0,9.0,0.0,0.96875,2.0,0.6,0.0
113,113,687,1834,0.571429,0.845154,0.922222,21.0,0.611111,1.0,1.0,...,0.558834,20.0,0.130435,1.0,3.0,0.0,0.875,2.0,0.6,1.0


In [9]:
# Count the missing values in each column of H
H.isna().sum()

_id                                         0
ltable_ID                                   0
rtable_ID                                   0
Name_Name_jac_qgm_3_qgm_3                   0
Name_Name_cos_dlm_dc0_dlm_dc0               0
Name_Name_mel                               0
Name_Name_lev_dist                          0
Name_Name_lev_sim                           0
Author_Author_jac_qgm_3_qgm_3               1
Author_Author_cos_dlm_dc0_dlm_dc0           1
Author_Author_jac_dlm_dc0_dlm_dc0           1
Author_Author_mel                           1
Author_Author_lev_dist                      1
Author_Author_lev_sim                       1
Author_Author_nmw                           1
Author_Author_sw                            1
Publisher_Publisher_jac_qgm_3_qgm_3         8
Publisher_Publisher_cos_dlm_dc0_dlm_dc0     8
Publisher_Publisher_jac_dlm_dc0_dlm_dc0     8
Publisher_Publisher_mel                     8
Publisher_Publisher_lev_dist                8
Publisher_Publisher_lev_sim       

In [10]:
# Fill the missing feature values using the 'mean' strategy; the id columns
# and the 'match' label are excluded from imputation
H = em.impute_table(
    H,
    strategy='mean',
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'match'],
)

In [11]:
# Compare the matchers with 5-fold cross-validation on H and select the one
# with the best F1; id columns and the label are not used as features
result = em.select_matcher(
    [dt, rf, svm, ln, lg],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'match'],
    k=5,
    target_attr='match',
    metric_to_select_matcher='f1',
    random_state=seed,
)

# Per-matcher cross-validation statistics
result['cv_stats']



Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.713074,0.769206,0.729532
1,RF,0.944444,0.720635,0.806775
2,SVM,0.724762,0.791429,0.725
3,LinReg,0.790303,0.778095,0.773494
4,LogReg,0.918095,0.851429,0.878095


In [None]:
# Evaluate on test set
# TODO : Change this to a separate notebook?

# Columns of the feature-vector tables that are ids / the gold label rather
# than features; they must be kept out of fitting and prediction
non_feature_attrs = ['_id', 'ltable_ID', 'rtable_ID', 'match']

# Use the matcher that cross-validation selected instead of a hard-coded one,
# and fit it on the full training feature vectors H: no matcher object has
# been fitted up to this point in the notebook
best_matcher = result['selected_matcher']
best_matcher.fit(table=H, exclude_attrs=non_feature_attrs, target_attr='match')

# Convert J into a set of feature vectors using F
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='match', show_progress=False)

# The test vectors can contain missing values just like the training vectors
# did, so impute them the same way before predicting
L = em.impute_table(L,
                    exclude_attrs=non_feature_attrs,
                    strategy='mean')

# Predict on L; predictions go to 'predicted', match probabilities to 'proba'
predictions = best_matcher.predict(table=L, exclude_attrs=non_feature_attrs,
                                   append=True, target_attr='predicted',
                                   inplace=False, return_probs=True,
                                   probs_attr='proba')

# Evaluate the predictions against the gold labels, which are stored in the
# 'match' column carried over from J
eval_result = em.eval_matches(predictions, 'match', 'predicted')
em.print_eval_summary(eval_result)
