In [1]:
# Import packages
import pandas as pd
import os, sys
import ydata_profiling as yp
import py_entitymatching as em

In [2]:
# Parameter
PATH_DIR = os.getcwd()
DATASETS_FOLDER = 'datasets'
REPORT_FOLDER = 'reports'
ORIGINAL_REPORT_FOLDER = 'original'
SAMPLED_REPORT_FOLDER = 'sampled'
file1 = 'acmProfiles.csv'
file2 = 'dblpProfiles.csv'
matched = 'dblpAcmIdDuplicates.csv'

In [3]:
# Define Magellan Pineline
class MagellanPineline:
    def __init__(self, fileNameA, fileNameB, matched, expected) -> None:
        pathdir = PATH_DIR + os.sep + DATASETS_FOLDER
        self.__Acsv__ = pathdir + os.sep + fileNameA
        self.__Bcsv__ = pathdir + os.sep + fileNameB
        self.__Matched__ = pathdir + os.sep + matched
        self.A = pd.DataFrame()
        self.B = pd.DataFrame()
        self.sampleA = pd.DataFrame()
        self.sampleB = pd.DataFrame()
        self.C = pd.DataFrame()
        self.matched = pd.DataFrame()
        self.expected = expected

    def readFile(self):
        self.A = em.read_csv_metadata(self.__Acsv__, key='id')
        self.B = em.read_csv_metadata(self.__Bcsv__, key='id')
#         self.C = em.read_csv_metadata(self.__Matched__)

    def explore_sample(self):
        prtA = yp.ProfileReport(self.A)
        prtA.to_file(PATH_DIR + os.sep + REPORT_FOLDER + os.sep + ORIGINAL_REPORT_FOLDER + os.sep + file1[:-4] + '.html')
        prtB = yp.ProfileReport(self.B)
        prtB.to_file(PATH_DIR + os.sep + REPORT_FOLDER + os.sep + ORIGINAL_REPORT_FOLDER + os.sep + file2[:-4] + '.html')
        
    def down_sampling(self, size, y_param):
        self.sampleA, self.sampleB = em.down_sample(self.A, self.B, size=size, y_param=y_param, show_progress=True)
        
    def block(self):
        ob = em.OverlapBlocker()
        self.C = ob.block_tables(self.sampleA, self.sampleB, 'title', 'title', word_level=True, overlap_size=5, 
                    l_output_attrs=['title', 'year', 'authors','venue'], 
                    r_output_attrs=['title', 'year', 'authors','venue'],
                    show_progress=False)
        return self.C
    
    def labling(self):
        self.matched = em.read_csv_metadata(self.__Matched__, key='id', 
                                            ltable=self.sampleA, rtable=self.sampleB,
                         fk_ltable='entityId1', fk_rtable='entityId2',
                         )
        
        label_dict = {(row['entityId2'], row['entityId1']): row['id'] for _, row in self.matched.iterrows()}
        def get_label(row):
            entityId1 = row['ltable_id']
            entityId2 = row['rtable_id']
            if (entityId1, entityId2) in label_dict:
                return 1
            else:
                return 0
        self.C['label'] = self.C.apply(get_label, axis=1)
        return self.C
    
    def train(self):
        IJ = em.split_train_test(self.C, train_proportion=0.5, random_state=0)
        I = IJ['train']
        J = IJ['test']
        dt = em.DTMatcher(name='DecisionTree', random_state=0)
        svm = em.SVMMatcher(name='SVM', random_state=0)
        rf = em.RFMatcher(name='RF', random_state=0)
        lg = em.LogRegMatcher(name='LogReg', random_state=0)
        ln = em.LinRegMatcher(name='LinReg')
        F = em.get_features_for_matching(self.sampleA, self.sampleB, validate_inferred_attr_types=False)
        F.feature_name

    def execute(self):
        self.readFile()

In [4]:
problem = MagellanPineline(file1, file2, matched, 0.8)

In [5]:
problem.readFile()

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [17]:
problem.down_sampling(1000,1)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [7]:
problem.explore_sample()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
problem.sampleA

Unnamed: 0,id,title,year,authors,venue,entityUrl
0,0,The WASA2 object-oriented workflow management system,1999.0,"Gottfried Vossen, Mathias Weske",International Conference on Management of Data,304586
2049,2049,Composing XSL transformations with XML publishing views,2003.0,"Chengkai Li, Philip Bohannon, P. P. S. Narayan",International Conference on Management of Data,872820
2,2,"World Wide Database-integrating the Web, CORBA and databases",1999.0,"Athman Bouguettaya, Boualem Benatallah, Lily Hendra, James Beard, Kevin Smith, Mourad Quzzani",International Conference on Management of Data,304589
2048,2048,Cache-and-query for wide area sensor databases,2003.0,"Amol Deshpande, Suman Nath, Phillip B. Gibbons, Srinivasan Seshan",International Conference on Management of Data,872818
4,4,The CCUBE constraint object-oriented database system,1999.0,"Alexander Brodsky, Victor E. Segal, Jia Chen, Paval A. Exarkhopoulo",International Conference on Management of Data,304582
...,...,...,...,...,...,...
2041,2041,Formal semantics and analysis of object queries,2003.0,G. M. Bierman,International Conference on Management of Data,872807
2042,2042,A theory of redo recovery,2003.0,"David Lomet, Mark Tuttle",International Conference on Management of Data,872806
2043,2043,A characterization of the sensitivity of query optimization to storage access cost parameters,2003.0,"Frederick R. Reiss, Tapas Kanungo",International Conference on Management of Data,872804
2046,2046,Scientific data repositories: designing for a moving target,2003.0,"Etzard Stolte, Christoph von Praun, Gustavo Alonso, Thomas Gross",International Conference on Management of Data,872800


In [9]:
problem.sampleB

Unnamed: 0,id,venue,year,authors,title,entityUrl
2221,2221,SIGMOD Conference,1996,"Donald Kossmann, Michael J. Franklin, Bj�rn ��r J�nsson",Performance Tradeoffs for Client-Server Query Processing,conf/sigmod/FranklinJK96
1536,1536,VLDB,2000,Michael J. Carey,"Toto, We're Not in Kansas Anymore: On Transitioning from Research to the Real (Invited Industria...",conf/vldb/Carey00
1214,1214,SIGMOD Conference,1995,"Nelson Mendon�a Mattos, Jim Melton",An Overview of the Emerging Third-Generation SQL Standard (Tutorial),conf/sigmod/MattosM95
961,961,VLDB,1998,"Mitch Cherniack, Stanley B. Zdonik",Inferring Function Semantics to Optimize Queries,conf/vldb/CherniackZ98
2357,2357,SIGMOD Conference,1998,"V. S. Subrahmanian, Sibel Adali, Maria Luisa Sapino, Piero A. Bonatti",A Multi-Similarity Algebra,conf/sigmod/AdaliBSS98
...,...,...,...,...,...,...
792,792,SIGMOD Conference,2000,"Junghoo Cho, Hector Garcia-Molina",Synchronizing a Database to Improve Freshness,conf/sigmod/ChoG00
844,844,VLDB,1998,"Simonas Saltenis, Christian S. Jensen, Giedrius Slivinskas, Rasa Bliujute",R-Tree Based Indexing of Now-Relative Bitemporal Data,conf/vldb/BliujuteJSS98
1571,1571,SIGMOD Record,1999,"Klaus R. Dittrich, Ruxandra Domenig",An Overview and Classification of Mediated Query Systems,journals/sigmod/DomenigD99
678,678,SIGMOD Conference,1994,Michael Ubell,The Montage Extensible DataBlade Achitecture,conf/sigmod/Ubell94


In [18]:
problem.block()

  projected_dataframe = dataframe[proj_attrs].dropna(0,
  projected_dataframe = dataframe[proj_attrs].dropna(0,


Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_year,ltable_authors,ltable_venue,rtable_title,rtable_year,rtable_authors,rtable_venue
0,0,390,1697,Designing an ultra highly available DBMS (tutorial session),2000.0,"Svein Erik Bratsberg, &#216;ystein Torbj&#248;rnsen",International Conference on Management of Data,Tutorial: Designing an Ultra Highly Available DBMS,2000,"�ystein Torbj�rnsen, Svein Erik Bratsberg",SIGMOD Conference
1,1,1739,1095,Energy efficient indexing on air,1994.0,"Tomasz Imielinski, S. Viswanathan, B. R. Badrinath",International Conference on Management of Data,Energy Efficient Indexing on Air,1994,"S. Viswanathan, B. R. Badrinath, Tomasz Imielinski",SIGMOD Conference
2,2,1516,1910,Adapting materialized views after redefinitions,1995.0,"Ashish Gupta, Inderpal S. Mumick, Kenneth A. Ross",International Conference on Management of Data,Adapting Materialized Views after Redefinitions,1995,"Ashish Gupta, Inderpal Singh Mumick, Kenneth A. Ross",SIGMOD Conference
3,3,902,427,A graphical query language for mobile information systems,2003.0,Ya-Hui Chang,ACM SIGMOD Record,A Graphical Query Language for Mobile Information Systems,2003,Ya-Hui Chang,SIGMOD Record
4,4,1318,1738,VXMLR: A Visual XML-Relational Database System,2001.0,"Aoying Zhou, Hongjun Lu, Shihui Zheng, Yuqi Liang, Long Zhang, Wenyun Ji, Zengping Tian",Very Large Data Bases,VXMLR: A Visual XML-Relational Database System,2001,"Aoying Zhou, Wenyun Ji, Shihui Zheng, Hongjun Lu, Yuqi Liang, Zengping Tian, Long Zhang",VLDB
...,...,...,...,...,...,...,...,...,...,...,...
991,991,3,463,XML-based information mediation with MIX,1999.0,"Chaitan Baru, Amarnath Gupta, Bertram Lud&#228;scher, Richard Marciano, Yannis Papakonstantinou,...",International Conference on Management of Data,XML-Based Information Mediation with MIX,1999,"Yannis Papakonstantinou, Richard Marciano, Chaitanya K. Baru, Pavel Velikhov, Vincent Chu, Amarn...",SIGMOD Conference
992,992,237,764,Enhanced nearest neighbour search on the R-tree,1998.0,"King Lum Cheung, Ada Wai-Chee Fu",ACM SIGMOD Record,Enhanced Nearest Neighbour Search on the R-tree,1998,"Ada Wai-Chee Fu, King Lum Cheung",SIGMOD Record
993,993,811,1187,Relational Databases for Querying XML Documents: Limitations and Opportunities,1999.0,"Jayavel Shanmugasundaram, Kristin Tufte, Chun Zhang, Gang He, David J. DeWitt, Jeffrey F. Naughton",Very Large Data Bases,Relational Databases for Querying XML Documents: Limitations and Opportunities,1999,"Jeffrey F. Naughton, Kristin Tufte, Chun Zhang, Gang He, David J. DeWitt, Jayavel Shanmugasundaram",VLDB
994,994,937,1162,"Don't Scrap It, Wrap It! A Wrapper Architecture for Legacy Data Sources",1997.0,"Mary Tork Roth, Peter M. Schwarz",Very Large Data Bases,"Don't Scrap It, Wrap It! A Wrapper Architecture for Legacy Data Sources",1997,"Peter M. Schwarz, Mary Tork Roth",VLDB


In [20]:
problem.labling()

Metadata file is not present in the given path; proceeding to read the csv file.


Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_year,ltable_authors,ltable_venue,rtable_title,rtable_year,rtable_authors,rtable_venue,label
0,0,390,1697,Designing an ultra highly available DBMS (tutorial session),2000.0,"Svein Erik Bratsberg, &#216;ystein Torbj&#248;rnsen",International Conference on Management of Data,Tutorial: Designing an Ultra Highly Available DBMS,2000,"�ystein Torbj�rnsen, Svein Erik Bratsberg",SIGMOD Conference,1
1,1,1739,1095,Energy efficient indexing on air,1994.0,"Tomasz Imielinski, S. Viswanathan, B. R. Badrinath",International Conference on Management of Data,Energy Efficient Indexing on Air,1994,"S. Viswanathan, B. R. Badrinath, Tomasz Imielinski",SIGMOD Conference,1
2,2,1516,1910,Adapting materialized views after redefinitions,1995.0,"Ashish Gupta, Inderpal S. Mumick, Kenneth A. Ross",International Conference on Management of Data,Adapting Materialized Views after Redefinitions,1995,"Ashish Gupta, Inderpal Singh Mumick, Kenneth A. Ross",SIGMOD Conference,1
3,3,902,427,A graphical query language for mobile information systems,2003.0,Ya-Hui Chang,ACM SIGMOD Record,A Graphical Query Language for Mobile Information Systems,2003,Ya-Hui Chang,SIGMOD Record,1
4,4,1318,1738,VXMLR: A Visual XML-Relational Database System,2001.0,"Aoying Zhou, Hongjun Lu, Shihui Zheng, Yuqi Liang, Long Zhang, Wenyun Ji, Zengping Tian",Very Large Data Bases,VXMLR: A Visual XML-Relational Database System,2001,"Aoying Zhou, Wenyun Ji, Shihui Zheng, Hongjun Lu, Yuqi Liang, Zengping Tian, Long Zhang",VLDB,1
...,...,...,...,...,...,...,...,...,...,...,...,...
991,991,3,463,XML-based information mediation with MIX,1999.0,"Chaitan Baru, Amarnath Gupta, Bertram Lud&#228;scher, Richard Marciano, Yannis Papakonstantinou,...",International Conference on Management of Data,XML-Based Information Mediation with MIX,1999,"Yannis Papakonstantinou, Richard Marciano, Chaitanya K. Baru, Pavel Velikhov, Vincent Chu, Amarn...",SIGMOD Conference,1
992,992,237,764,Enhanced nearest neighbour search on the R-tree,1998.0,"King Lum Cheung, Ada Wai-Chee Fu",ACM SIGMOD Record,Enhanced Nearest Neighbour Search on the R-tree,1998,"Ada Wai-Chee Fu, King Lum Cheung",SIGMOD Record,1
993,993,811,1187,Relational Databases for Querying XML Documents: Limitations and Opportunities,1999.0,"Jayavel Shanmugasundaram, Kristin Tufte, Chun Zhang, Gang He, David J. DeWitt, Jeffrey F. Naughton",Very Large Data Bases,Relational Databases for Querying XML Documents: Limitations and Opportunities,1999,"Jeffrey F. Naughton, Kristin Tufte, Chun Zhang, Gang He, David J. DeWitt, Jayavel Shanmugasundaram",VLDB,1
994,994,937,1162,"Don't Scrap It, Wrap It! A Wrapper Architecture for Legacy Data Sources",1997.0,"Mary Tork Roth, Peter M. Schwarz",Very Large Data Bases,"Don't Scrap It, Wrap It! A Wrapper Architecture for Legacy Data Sources",1997,"Peter M. Schwarz, Mary Tork Roth",VLDB,1


In [22]:
IJ = em.split_train_test(problem.C, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
F = em.get_features_for_matching(problem.sampleA, problem.sampleB, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)

In [23]:
F.feature_name


0                               id_id_exm
1                               id_id_anm
2                          id_id_lev_dist
3                           id_id_lev_sim
4             title_title_jac_qgm_3_qgm_3
5         title_title_cos_dlm_dc0_dlm_dc0
6                         title_title_mel
7                    title_title_lev_dist
8                     title_title_lev_sim
9                           year_year_exm
10                          year_year_anm
11                     year_year_lev_dist
12                      year_year_lev_sim
13        authors_authors_jac_qgm_3_qgm_3
14    authors_authors_cos_dlm_dc0_dlm_dc0
15                    authors_authors_mel
16               authors_authors_lev_dist
17                authors_authors_lev_sim
Name: feature_name, dtype: object

In [153]:
H.head()


Unnamed: 0,_id,ltable_id,rtable_id,id_id_exm,id_id_anm,id_id_lev_dist,id_id_lev_sim,title_title_jac_qgm_3_qgm_3,title_title_cos_dlm_dc0_dlm_dc0,title_title_mel,...,year_year_exm,year_year_anm,year_year_lev_dist,year_year_lev_sim,authors_authors_jac_qgm_3_qgm_3,authors_authors_cos_dlm_dc0_dlm_dc0,authors_authors_mel,authors_authors_lev_dist,authors_authors_lev_sim,label
958,958,1527,2270,0,0.672687,3,0.25,0.564516,0.285714,0.886018,...,1.0,1.0,2.0,0.666667,0.636364,0.6,0.766013,26.0,0.235294,1
401,401,1156,854,0,0.738754,3,0.25,0.506849,0.25,0.903653,...,1.0,1.0,2.0,0.666667,1.0,1.0,1.0,0.0,1.0,1
799,799,2152,372,0,0.172862,3,0.25,0.428571,0.629941,0.701343,...,0.0,0.998003,6.0,0.0,0.007752,0.0,0.487677,106.0,0.086207,0
654,654,2083,916,0,0.43975,4,0.0,0.586207,0.2,0.898636,...,1.0,1.0,2.0,0.666667,0.811594,0.617213,0.841587,47.0,0.241935,1
441,441,2058,199,0,0.096696,4,0.0,0.222222,0.555556,0.697577,...,0.0,0.999501,3.0,0.5,0.0,0.0,0.474221,70.0,0.113924,0


In [154]:
any(pd.notnull(H))


True

In [24]:
H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                strategy='mean')

  imp.statistics_[pd.np.isnan(imp.statistics_)] = val_all_nans


In [156]:
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.971724,0.972589,0.971994
1,RF,0.977804,0.991342,0.984422
2,SVM,0.945944,0.975186,0.960146
3,LinReg,0.957269,1.0,0.978099
4,LogReg,0.970279,0.993939,0.981842


In [26]:
dt = em.DTMatcher(name='DecisionTree', random_state=0)
dt.fit(table=H, 
       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], 
       target_attr='label')

# Convert J into a set of feature vectors using F
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='label', show_progress=False)

L = em.impute_table(L, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                strategy='mean')
# Predict on L 
predictions = dt.predict(table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], 
              append=True, target_attr='predicted', inplace=False, return_probs=True, show_progress=True,
                        probs_attr='proba')

  imp.statistics_[pd.np.isnan(imp.statistics_)] = val_all_nans


TypeError: MLMatcher.predict() got an unexpected keyword argument 'show_progress'