# Notebook for analyzing benchmark task results

Contains the following functionalities:

1. Gets the identifying attributes [1] per task (the benchmark tasks are located in benchmark_tasks.gzip; please unzip the archive before use).
2. Produces results table
3. Calculates profiling dimensions per task

[1] Primpeli, Anna, and Christian Bizer. "Profiling entity matching benchmark tasks." Proceedings of the 29th ACM International Conference on Information & Knowledge Management. 2020.

In [None]:
import seaborn as sns;
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate,cross_val_predict, StratifiedShuffleSplit, GridSearchCV, PredefinedSplit
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from os.path import isfile, join
from os import listdir
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from copy import copy
import statistics
import networkx as nx
import similaritymeasures


# 1. Get ID Features/ Attributes

In [None]:
#get identifying features
# Load the train and test pair feature vectors of the selected task and stack
# them into a single frame.
selected_task = "lspc_computers"
fv_vector_train = pd.read_csv('../benchmark_tasks/%s/train_pairs_fv.csv' % selected_task)
fv_vector_test = pd.read_csv('../benchmark_tasks/%s/test_pairs_fv.csv' % selected_task)
fv_vector = pd.concat([fv_vector_train, fv_vector_test])

def getIDFeatures(fv_vector):
    """Print and return the identifying features of a matching task.

    A random forest is fitted on all feature columns of ``fv_vector`` and the
    features are ranked by importance. Starting from the top-ranked feature,
    features are added one at a time until a random forest evaluated on that
    prefix alone exceeds 95% of the cross-validated F1 score obtained with all
    feature columns.

    Args:
        fv_vector: DataFrame of pair feature vectors. It must contain a
            'label' column and the metadata columns dropped below; every
            remaining column is treated as a feature.

    Returns:
        The selected feature names in descending importance order (the same
        list that is printed). If no prefix exceeds the threshold, all ranked
        features are returned; if the ranking is empty, so is the result.
    """
    meta_columns = ['source_id', 'target_id', 'pair_id', 'label','source','target','agg_score','unsupervised_label']
    X = fv_vector.drop(meta_columns, axis=1)
    y = fv_vector['label'].values
    clf = RandomForestClassifier(random_state=1)
    model = clf.fit(X,y)

    # NOTE(review): showFeatureImportances returns cleaned names (schema.org
    # prefix stripped) and at most 50 of them. Indexing X with these names
    # assumes they still equal X's column names - verify for tasks whose
    # columns carry that prefix.
    features_in_order, _ = showFeatureImportances(X.columns.values,model,'rf',display=False)

    xval_scoring = {'precision' : make_scorer(precision_score),'recall' : make_scorer(recall_score),'f1_score' : make_scorer(f1_score)}

    def mean_f1(features):
        # Same split settings for every evaluation so the scores are comparable.
        scores = cross_validate(clf, features, y, cv=StratifiedShuffleSplit(n_splits=4,random_state =1),
                                scoring=xval_scoring, n_jobs=-1)
        return round(np.mean(scores['test_f1_score']),2)

    max_f1_score = mean_f1(X)

    #gather features that are relevant for 95% of the max f1 score
    important_features = []
    for n_features in range(1, len(features_in_order)+1):
        important_features = features_in_order[:n_features]
        if mean_f1(X[important_features]) > 0.95*max_f1_score:
            break

    print(important_features)
    return important_features
        
def showFeatureImportances(column_names, model, classifierName,display=True):
    """Rank features by the model's importances and optionally plot them.

    Args:
        column_names: Feature names, aligned with the model's importances.
        model: Fitted model handed to ``get_model_importances``.
        classifierName: Classifier key forwarded to ``get_model_importances``
            and shown in the plot title.
        display: When truthy, draw a bar chart of the ranked importances.

    Returns:
        A tuple ``(names, weights)`` holding at most the 50 most important
        features in descending order; names are cleaned of the schema.org
        product prefix and weights are rounded to two decimals.
    """
    raw_importances = get_model_importances(model, classifierName)

    # Shorten schema.org property URIs to their bare property name.
    cleaned_names = []
    for name in column_names:
        cleaned_names.append(name.replace('<http://schema.org/Product/', '').replace('>',''))

    ranked = sorted(zip(cleaned_names, raw_importances), key=lambda pair: pair[1], reverse=True)
    top_ranked = ranked[:50]

    features_in_order = [name for name, _ in top_ranked]
    feature_weights_in_order = [round(weight,2) for _, weight in top_ranked]

    if not display:
        return features_in_order,feature_weights_in_order

    # The bars use the unrounded importances; only the returned weights are rounded.
    positions = range(len(top_ranked))
    plt.figure(figsize=(18,3))
    plt.title('Feature importances for classifier %s (max. top 50 features)' % classifierName)
    plt.bar(positions, [weight for _, weight in top_ranked], align='center', width = 0.8)
    plt.xticks(positions, features_in_order)
    plt.xticks(rotation=90)
    plt.show()

    return features_in_order,feature_weights_in_order

def get_model_importances(model,classifierName=None):
    """Return the per-feature importances of a fitted model.

    Args:
        model: Fitted estimator.
        classifierName: 'logr' or 'svm' to read the flattened ``coef_``;
            any other value reads ``feature_importances_``.

    Returns:
        The importances, or None (after displaying a message) for an 'svm'
        model whose kernel is not 'linear'.
    """
    # Everything that is not a coefficient-based model exposes feature_importances_.
    if classifierName not in ('logr', 'svm'):
        return model.feature_importances_

    # Coefficients are only read for SVMs with a linear kernel.
    if classifierName == 'svm' and model.kernel != 'linear':
        display("Cannot print feature importances without a linear kernel")
        return None

    return model.coef_.ravel()

# Run the identifying-feature selection on the combined train+test feature
# vectors of the task selected above.
getIDFeatures(fv_vector)

# 2. Produce results table

In [None]:
#BENCHMARK TASKS PROFILE AND COMPARISON
# Root directory; every entry listed in it is treated as one benchmark task.
main_path = "../benchmark_tasks/"


# Number of leading F1 scores skipped by getAUC. It is passed explicitly in the
# calls below and therefore replaces getAUC's own default of 20.
ignore_first=50

def getAUC(f1_scores_all, ignore_first=20):
    """Score an F1 curve against the all-zero curve with ``similaritymeasures.dtw``.

    Args:
        f1_scores_all: Sequence of F1 scores, one per position.
        ignore_first: Number of leading scores dropped before the comparison.

    Returns:
        The first value returned by ``similaritymeasures.dtw`` for the
        remaining F1 curve and a zero curve of the same length.
    """
    f1_scores = f1_scores_all[ignore_first:]
    n_points = len(f1_scores)
    positions = np.arange(0, n_points)

    # Both curves are (n, 2) float arrays: column 0 is the value, column 1 the position.
    zero_curve = np.column_stack((np.zeros(n_points), positions))
    f1_curve = np.column_stack((np.asarray(f1_scores, dtype=float), positions))

    area, _ = similaritymeasures.dtw(f1_curve, zero_curve)
    return area

#get all results and calculate the AUC and F1 summary values per task and method
# One row is produced per (task, method); the frame is built once at the end
# because DataFrame.append was removed in pandas 2.0 and copies the whole frame
# on every call.
result_columns = ['method','task','AUC',
                  'F1_85','F1_150', 'F1_last_it', 'STD_last_it']

result_type = 'micro'
methods = ['HeALER','ALMSER', 'ALMSERgroup']

# Not read anywhere in the visible notebook; kept in case another cell uses it.
finished_settings =0

result_rows = []
for setting_path in listdir(main_path):
    results_dir = join(main_path, setting_path, "results")

    for method in methods:
        # Each method's scores are stored in <task>/results/<method>.csv.
        # Building the path directly makes a missing file raise
        # FileNotFoundError instead of silently reusing the previous task's file.
        method_results = pd.read_csv(join(results_dir, method + ".csv"))

        f1_scores = method_results['F1_'+result_type]
        f1_std = method_results['F1_'+result_type+'_std']

        result_rows.append({'task':setting_path,'method':method,
                            'AUC':getAUC(f1_scores,ignore_first),
                            # positional lookups: rows 85, 150 and the last row
                            'F1_85':f1_scores.iloc[85],'F1_150':f1_scores.iloc[150],
                            'F1_last_it': f1_scores.iloc[-1], 'STD_last_it':f1_std.iloc[-1]})

all_results = pd.DataFrame(result_rows, columns=result_columns)

display(all_results)

# 3. Calculate VH and EO profiling dimensions

In [None]:
#todo calculate value heterogeneity for each data source
# For every task: build a graph from the matching pairs, treat each connected
# component as one entity and count the entities whose identifying attributes
# all have pairwise distinct values across the entity's records.
for setting_path in listdir(main_path):
    task_dir = main_path+'/'+setting_path

    #read train test
    train_ = pd.read_csv(task_dir+'/train_pairs_fv.csv')
    test_ = pd.read_csv(task_dir+'/test_pairs_fv.csv')
    pairs_fv = pd.concat([train_,test_])
    # Cast to bool so the row mask works for boolean as well as 0/1 labels.
    matching_pairs = pairs_fv[pairs_fv.label.astype(bool)]

    print("Matching Pairs: ", matching_pairs.shape[0])

    #read all sources
    # Tasks whose name contains 'computers' are read with ';' as separator.
    sep = ';' if 'computers' in setting_path else ','
    data_sources = dict()
    for source_path in listdir(task_dir+'/sources'):
        print("Read source ",source_path)
        source_name = source_path.replace('.csv','')
        data_sources[source_name] = pd.read_csv(task_dir+'/sources/'+source_path, sep=sep)

    Graphtype = nx.Graph()
    G = nx.from_pandas_edgelist(matching_pairs, source= 'source', target='target', create_using=Graphtype)

    con_components = list(nx.connected_components(G))
    print("Entities: ", len(con_components))

    #identifying attributes
    if 'lspc_computers_mutated' in setting_path : id_attr=['Part Number','title']
    elif 'lspc_computers' in setting_path : id_attr=['Part Number']
    elif 'restaurants' in setting_path : id_attr=['address','name']
    elif 'musicbrainz20K_mutated' in setting_path: id_attr=['title','length','artist','number']
    else : id_attr=['title','length','artist','album']

    hetereogeneous_entities = 0
    con_components_lengths = []
    print("ID attributes:", id_attr)

    for c in con_components:
        con_components_lengths.append(len(c))

        node_descs = []
        for node_ in c:
            # Node names are '<source>_<id>'; split on the last underscore only
            # so that a source name containing '_<id>' is left intact.
            node_source, _, node_id = node_.rpartition('_')
            source_pd = data_sources[node_source]
            # Restaurant sources are looked up by the full node name, all others by the numeric id.
            lookup_id = node_ if 'restaurant' in setting_path else int(node_id)
            node_descs.append(source_pd[source_pd.id==lookup_id][id_attr])
        entity_nodes = pd.concat(node_descs)

        # Heterogeneous: no identifying attribute has a value shared by two records.
        n_records = entity_nodes.shape[0]
        is_heter = all(len(set(entity_nodes[id_attr_].values)) >= n_records
                       for id_attr_ in id_attr)

        if is_heter: hetereogeneous_entities+=1

    print(Counter(con_components_lengths))

    # Avoid a ZeroDivisionError for tasks without any matching pair.
    heter_ratio = hetereogeneous_entities/len(con_components) if con_components else 0.0
    print("Heteregoneous entities:%i (%.2f)" % (hetereogeneous_entities,heter_ratio))
display(all_results)

