# Profile each matching task of the setting along the five dimensions:
1. Schema Complexity : number of relevant attributes
2. Sparsity : ratio of missing values to all values of the relevant attributes
3. Size : size of training and validation set
4. Corner Cases : identify corner cases with the optimal-threshold heuristic and calculate (#false_positives + #false_negatives) / #matching_pairs
5. Textuality : average value length, in words, of the top relevant attributes

In [2]:
# Driver cell: the cell that defines these functions and the module-level
# result tables must have been executed before this one.

# Step 1: fill the module-level result tables, one row per feature-vector file.
get_matching_task_profile_info()

# Step 2: condense the collected tables into the profiling-dimension table,
# display it and write it to <main_path>/profiling.csv.
importantProfilingDimensions()
# Optional inspection of the raw intermediate tables:
#display(matching_tasks_profiling)
#matching_tasks_summary


Current dataset pair 2_1
Current dataset pair 2_3
Current dataset pair 2_4
Current dataset pair 2_5
Current dataset pair 3_1
Current dataset pair 4_1
Current dataset pair 4_5
Current dataset pair 4_3
Current dataset pair 5_1
Current dataset pair 5_3


Unnamed: 0,Dataset pair,F1-xval_all_attr,Relevant Attributes,Top Features,Schema Complexity,Textuality,Sparsity,Size,Match#,Corner Cases
0,2_1,0.98,"[album*, length*, title, year, number]","[album_lev, album_token_jaccard, album_relaxed...",5,3.77,0.08,38270,1984,0.22
1,2_3,0.85,"[title*, number*, length, year, language]","[title_relaxed_jaccard, title_jaccard, title_c...",5,7.24,0.12,39260,2030,0.39
2,2_4,0.98,"[length*, album*, title, number]","[length_sim, album_lev]",4,4.26,0.05,38608,1244,0.42
3,2_5,0.98,"[album*, length*, title, number]","[album_relaxed_jaccard, album_lev, album_token...",4,3.77,0.06,38278,1284,0.19
4,3_1,0.98,"[title*, artist, length]",[title_relaxed_jaccard],3,7.49,0.05,38270,1208,0.08
5,4_1,0.99,"[length*, album*, artist, title]","[length_sim, length_num_equal, album_lev]",4,4.2,0.06,38270,1216,0.16
7,4_3,0.97,"[artist*, title*, number, length, language]","[artist_lev, title_containment, title_jaccard]",5,7.81,0.06,39246,2002,0.18
6,4_5,0.99,"[title*, album*, length, artist, number]","[title_relaxed_jaccard, album_lev]",5,7.54,0.06,38261,1980,0.11
8,5_1,0.99,"[album*, length*, artist, title, year]","[album_relaxed_jaccard, album_lev, album_token...",5,3.74,0.07,38186,1969,0.2
9,5_3,0.96,"[title*, artist*, number, length]","[title_jaccard, title_relaxed_jaccard, title_c...",4,7.76,0.06,39260,1262,0.16


In [1]:
from datautils import*
import os
import os.path as path
from learningutils import *
from sklearn import tree
from matching_task import *
import time
import glob


# Empty result tables; get_matching_task_profile_info() adds one row per
# matching task (dataset pair) to each table that is enabled via the flags.

# General statistics of a matching task.
matching_tasks_summary = pd.DataFrame(
    columns=[
        'Dataset',
        '#records_source',
        '#records_target',
        'count_record_pairs',
        '#match',
        '#non-match',
        'count_attr',
        '#short_string_attr',
        '#long_string_attr',
        '#numeric_attr',
        'avg_density_all',
    ]
)

# Baseline results of the non-linear model.
matching_tasks_baseline_rf_results = pd.DataFrame(
    columns=[
        'Dataset',
        'precision',
        'recall',
        'f1',
        'f1_std',
        'proba_scores',
        'proba_scores_std',
        'x-val f1',
        'x-val f1 sigma',
    ]
)

# Baseline results of the linear model; same schema as the table above.
matching_tasks_baseline_svm_results = pd.DataFrame(
    columns=list(matching_tasks_baseline_rf_results.columns)
)

# Profiling features of a matching task.
matching_tasks_profiling = pd.DataFrame(
    columns=[
        'Dataset',
        'F1_xval_max',
        'F1_xval_top_matching_relevant_features',
        'matching_relevant_features',
        'matching_relevant_attributes',
        'matching_relevant_attributes_density',
        'matching_relevant_attributes_count',
        'matching_relevant_attributes_datatypes',
        'top_matching_relevant_features',
        'top_relevant_attributes',
        'top_relevant_attributes_count',
        'top_relevant_attributes_datatypes',
        'top_relevant_attributes_density',
        'avg_length_tokens_top_relevant_attributes',
        'avg_length_words_top_relevant_attributes',
        'corner_cases_top_matching_relevant_features',
    ]
)
 
# Use the flags below to indicate which results should be calculated.
# Each flag enables one section of get_matching_task_profile_info().
summaryFeatures=True
baselineResults=False
profilingFeatures = True                                                           


# Root folder of the dataset. The source tables are read from
# <main_path>/<source_folder>/<name>.csv and the feature-vector files
# (one per dataset pair) from <main_path>/<fv_folder>.
main_path = '../datasets/musicbrainz20K/'
source_folder="sources"
fv_folder = "feature_vector_files/"

# Set the correct separators of the source sets and the gold standard.
sep_for_source_files= ','
# NOTE(review): gs_sep is not read anywhere in the visible code; the gold
# standard is taken from the feature-vector file instead.
gs_sep = ','
train_test_val=False # otherwise nested x-validation for baseline experiments
# Separator between the two source names in a feature-vector file name,
# e.g. "2_1.csv" -> sources "2" and "1".
fv_name_split = "_"
# Change to allow multithreading.
# NOTE(review): threads is not read anywhere in the visible code.
threads=-1

def get_matching_task_profile_info():
    """Profile every matching task found in the feature-vector folder.

    For each file in ``main_path + fv_folder`` the file name (without
    ``.csv``) is taken as the dataset-pair name and split on
    ``fv_name_split`` into the names of the two source tables, which are
    loaded from ``main_path + source_folder``. The gold standard is derived
    from the ``source_id``, ``target_id`` and ``label`` columns of the
    feature-vector file.

    Depending on the module-level flags ``summaryFeatures``,
    ``baselineResults`` and ``profilingFeatures``, one row per dataset pair
    is written into ``matching_tasks_summary``, the two
    ``matching_tasks_baseline_*_results`` tables and
    ``matching_tasks_profiling``. Tasks for which a source table or the
    gold standard is empty are skipped without consuming a row label.

    Returns:
        None. The results are stored in the module-level DataFrames.
    """
    # glob returns files in arbitrary, filesystem-dependent order; sorting
    # keeps the row labels of the result tables reproducible across runs.
    fv_files = sorted(glob.glob(os.path.join(main_path, fv_folder, "*")))

    dat_counter = 0
    for f in fv_files:
        # basename instead of splitting on "/" so the name is extracted
        # correctly regardless of the path separator glob hands back.
        dataset_name = os.path.basename(f).replace(".csv", "")
        print("Current dataset pair %s" %dataset_name)
        name_parts = dataset_name.split(fv_name_split)
        ds1_name, ds2_name = name_parts[0], name_parts[1]

        feature_vector = pd.read_csv(f)

        # The gold standard is the id pair plus its label, renamed to the
        # column name handed to MatchingTask.
        gs = feature_vector[['source_id', 'target_id', 'label']].copy()
        gs.rename(columns={'label': 'matching'}, inplace=True)

        ds1 = pd.read_csv(os.path.join(main_path, source_folder, ds1_name + ".csv"),
                          sep=sep_for_source_files, engine='python')
        ds2 = pd.read_csv(os.path.join(main_path, source_folder, ds2_name + ".csv"),
                          sep=sep_for_source_files, engine='python')

        ds1.rename(columns={'id': 'subject_id'}, inplace=True)
        ds2.rename(columns={'id': 'subject_id'}, inplace=True)

        # Nothing to profile when any of the three inputs is empty.
        if ds1.empty or ds2.empty or gs.empty:
            continue

        # Compare identifiers as strings on both sides.
        ds1['subject_id'] = ds1['subject_id'].apply(str)
        ds2['subject_id'] = ds2['subject_id'].apply(str)
        gs['source_id'] = gs['source_id'].apply(str)
        gs['target_id'] = gs['target_id'].apply(str)

        # Attributes present in both sources, excluding the identifier.
        common_attributes = [value for value in ds1.columns
                             if value in ds2.columns and value != 'subject_id']
        matching_task = MatchingTask(ds1, ds2, gs, feature_vector, common_attributes)

        if summaryFeatures:
            matching_task.getSummaryFeatures()
            summary_features = matching_task.dict_summary
            summary_features['Dataset'] = dataset_name

            # Only the predeclared columns are copied; missing keys become None.
            for key in matching_tasks_summary.columns:
                matching_tasks_summary.loc[dat_counter, key] = summary_features.get(key)

        if baselineResults:
            if train_test_val:
                print("Evaluation with train_validation_test split")
                matching_task.getSplitValidationResults(model="linear")
                matching_task.getSplitValidationResults(model="non-linear")
            else:
                print("Evaluation with Nested-X-Validation (no splits will be considered) - slow for large tasks")
                matching_task.getNestedXValidationResults(model="linear")
                matching_task.getNestedXValidationResults(model="non-linear")

            # Unlike the other two sections, every key of the result dicts
            # is written, not just the predeclared columns.
            linear_results = matching_task.dict_linear_results
            linear_results['Dataset'] = dataset_name
            for key in linear_results:
                matching_tasks_baseline_svm_results.loc[dat_counter, key] = linear_results.get(key)

            non_linear_results = matching_task.dict_non_linear_results
            non_linear_results['Dataset'] = dataset_name
            for key in non_linear_results:
                matching_tasks_baseline_rf_results.loc[dat_counter, key] = non_linear_results.get(key)

        if profilingFeatures:
            matching_task.getProfilingFeatures()
            ident_features_profile = matching_task.dict_profiling_features
            ident_features_profile['Dataset'] = dataset_name
            for key in matching_tasks_profiling.columns:
                matching_tasks_profiling.loc[dat_counter, key] = ident_features_profile.get(key)

        dat_counter += 1

    
def displaySaveResults():
    """Display the collected result tables and save each one as a CSV file.

    A table is shown and written only when the module-level flag that
    controls its computation is set. The files are written to
    ``main_path + source_folder`` and their names are prefixed with the
    current local time formatted as ``%m%d_%H%M%S``.

    Returns:
        None.
    """
    timestr = time.strftime("%m%d_%H%M%S")
    out_dir = os.path.join(main_path, source_folder)

    # The summary table is only filled when summaryFeatures is set, so that
    # is the flag that decides whether it is worth showing and saving.
    if summaryFeatures:
        display(matching_tasks_summary)
        matching_tasks_summary.to_csv(
            os.path.join(out_dir, timestr + 'matching_tasks_profiling.csv'), index=False)

    # The baseline tables are declared as matching_tasks_baseline_*_results;
    # the shorter names used here before do not exist and raised NameError.
    if baselineResults:
        display(matching_tasks_baseline_svm_results)
        display(matching_tasks_baseline_rf_results)
        matching_tasks_baseline_rf_results.to_csv(
            os.path.join(out_dir, timestr + 'matching_tasks_RF_results.csv'), index=False)
        matching_tasks_baseline_svm_results.to_csv(
            os.path.join(out_dir, timestr + 'matching_tasks_SVM_results.csv'), index=False)

    if profilingFeatures:
        display(matching_tasks_profiling)
        matching_tasks_profiling.to_csv(
            os.path.join(out_dir, timestr + 'matching_tasks_profiling_features_summary.csv'),
            index=False)

      
def importantProfilingDimensions():
    """Build, display and save the table of profiling dimensions.

    Combines ``matching_tasks_summary`` (size and number of matches) with
    ``matching_tasks_profiling`` on the ``Dataset`` column, marks every
    relevant attribute that is also a top relevant attribute with a
    trailing ``*``, renames the columns to their presentation names and
    turns the density of the relevant attributes into sparsity
    (``1 - density``). The result is sorted by dataset pair, displayed and
    written to ``main_path + "profiling.csv"``.

    Returns:
        None. Nothing is returned on purpose, so that a notebook cell
        ending with this call does not render the table a second time.
    """
    profiling_dimensions = pd.DataFrame({
        'Dataset': matching_tasks_summary['Dataset'],
        'Size': matching_tasks_summary['count_record_pairs'],
        'Match#': matching_tasks_summary['#match'],
    })
    # 'Dataset' is the only column the two tables share; naming it makes
    # the join key explicit instead of relying on column-name overlap.
    profiling_dimensions = pd.merge(profiling_dimensions, matching_tasks_profiling,
                                    on='Dataset')

    # Build the starred attribute lists in one pass and assign the column
    # once, rather than writing lists into single cells during iteration.
    starred_attributes = [
        [attr + "*" if attr in top_attrs else attr for attr in relevant_attrs]
        for relevant_attrs, top_attrs in zip(
            profiling_dimensions['matching_relevant_attributes'],
            profiling_dimensions['top_relevant_attributes'])
    ]
    profiling_dimensions['matching_relevant_attributes'] = pd.Series(
        starred_attributes, index=profiling_dimensions.index, dtype=object)

    columns = ['Dataset pair', 'F1-xval_all_attr', 'Relevant Attributes', 'Top Features',
               'Schema Complexity', 'Textuality', 'Sparsity', 'Size', 'Match#', 'Corner Cases']
    profiling_dimensions = profiling_dimensions.rename(columns={
        'Dataset': columns[0],
        'F1_xval_max': columns[1],
        'matching_relevant_attributes': columns[2],
        'top_matching_relevant_features': columns[3],
        'matching_relevant_attributes_count': columns[4],
        'avg_length_words_top_relevant_attributes': columns[5],
        'matching_relevant_attributes_density': columns[6],
        'corner_cases_top_matching_relevant_features': columns[9],
    })

    # Copy the selection so the assignment below modifies an independent
    # frame and does not trigger pandas' chained-assignment warning.
    profiling_dimensions = profiling_dimensions[columns].copy()
    # The column holds the density up to here; sparsity is its complement.
    profiling_dimensions['Sparsity'] = 1 - profiling_dimensions['Sparsity']
    profiling_dimensions = profiling_dimensions.sort_values(by=['Dataset pair'])

    display(profiling_dimensions)
    profiling_dimensions.to_csv(os.path.join(main_path, "profiling.csv"), index=False)