In [None]:
%matplotlib inline
from datautils import*
from IPython.display import Markdown, display
import numpy as np
from preprocessing import*
from learning import*
from bootstrapping import*
from bootstrapping_unsupervised import*
from similarityutils import*
from collections import Counter
import pandas as pd

# Run the thresholding comparison.
# NOTE: thresholding() is defined in the following cell ("start from here").
# Unless one of the star imports above already provides a function with this
# name, that cell has to be executed first or this call raises NameError.
thresholding()


In [None]:
#start from here
def thresholding():
    """Compare threshold-selection heuristics for unsupervised bootstrapping.

    All configuration is hard-coded at the top of the function. For every
    entry of ``datasets`` the function:

    1. loads the ``_train`` (pool) and ``_test`` (validation) feature files,
    2. profiles the density of every feature column, i.e. the share of
       values that are not ``-1`` (presumably the missing-value marker),
    3. trains a supervised baseline on the labelled pool via ``batchTraining``,
    4. builds one ``BootstrappingUnsupervised`` sample per entry of
       ``bootstrap_methods`` and trains the same model on that sample,
    5. displays a results table and saves a histogram of the similarity
       scores with one vertical line per selected threshold to
       ``../results/graphs/<dataset>_threshold_comparison.pdf``.

    Returns:
        None. Results are displayed/printed and the figure is written to disk.
    """
    # set these parameters
    # The four lists below are parallel: entry i of each describes method i.
    bootstrap_methods = ['attrelbow_density', 'attrstatic_density', 'attrotsu_density', 'attrvalley_density']
    threshold_labels = ['elbow', 'static', 'Otsu\'s', 'valley']
    threshold_colors = ['#56B4E9', "#E69F00", '#009E73', '#CC79A7']
    threshold_linestyles = ['dotted', None, 'dashdot', '--']
    model = 'rf'
    dataPathmain = "../datasets/author"
    domain = 'product'
    datasets = ['DBPediaAuthors_DnbDataAuthors']

    # Columns are only dropped (and keep their full name) for wdc_product.
    is_wdc = 'wdc_product' in domain

    for dataset in datasets:
        # Reset per dataset so that the plot below shows the thresholds of
        # *this* dataset and not those accumulated from earlier iterations.
        thresholds = []
        # One tuple per table row, in the column order of df_results.
        rows = []

        # get data and profiling information
        display("Unsupervised Matching: %s" % dataset)

        featureFile_train = dataPathmain + '/features_' + dataset + '_train'
        featureFile_test = dataPathmain + '/features_' + dataset + '_test'

        print("Stats about pool data")
        trainingData = getLabelledDataFromFile(featureFile_train, rescale=True)
        print("Profiling Density")
        prof_results = dict()
        data_values = trainingData['feature_values']
        tobedropped = []
        for c in data_values.columns:
            empty_values = len(data_values[data_values[c] == -1])
            density = 1 - float(empty_values) / float(len(data_values[c]))

            # For non-wdc domains the report is keyed by the part of the
            # column name before the first underscore, so columns sharing
            # that prefix overwrite each other in prof_results.
            column_name = c if is_wdc else c.split("_")[0]
            if density < 0.1:
                tobedropped.append(c)
            else:
                prof_results[column_name] = ("%.3f" % density)

        # drop non-dense attributes for the wdc datasets
        if is_wdc:
            print ("Columns to be dropped:", len(tobedropped))
            trainingData['feature_values'] = trainingData['feature_values'].drop(tobedropped, axis=1)
        display(prof_results)
        print("Stats about validation data")
        validationData = getLabelledDataFromFile(featureFile_test, rescale=True)
        if is_wdc:
            validationData['feature_values'] = validationData['feature_values'].drop(tobedropped, axis=1)

        X = trainingData['feature_values']
        y = trainingData['labels']
        ids = trainingData['ids']

        # get results for supervised matching
        prec, recall, fscore, _ = batchTraining(X, y, validationData['feature_values'], validationData['labels'], model,
                                                printResults=False, optimization=False)
        # The baseline has no bootstrap sample, hence the "-" placeholders.
        rows.append((dataset, "-", "-", 'Passive Supervised Learning',
                     "%.3f" % prec, "%.3f" % recall, "%.3f" % fscore))

        for m in bootstrap_methods:
            bootstrap = BootstrappingUnsupervised(data=X, labels=y, ids=ids, bootstrap_method=m, domain=domain)
            # Only the value from the last method is used for the histogram
            # below. NOTE(review): presumably the scores are identical for
            # every method and only the threshold differs - confirm.
            sorted_dataset = bootstrap.sorted_dataset
            thresholds.append(bootstrap.threshold)

            bootstrapping_sample = bootstrap.sample

            prec, recall, fscore, _ = batchTraining(bootstrapping_sample['data'], bootstrapping_sample['labels'],
                                                    validationData['feature_values'], validationData['labels'], model,
                                                    printResults=False, optimization=False,
                                                    showMisclassifications=False, ids=ids)

            print("Method %s gives %f prec, %f recall and %f f1 if noisy data are used to train a RF" % (m, prec, recall, fscore))

            rows.append((dataset, bootstrapping_sample['correctness'], bootstrapping_sample['f1'], m,
                         "%.3f" % prec, "%.3f" % recall, "%.3f" % fscore))

        df_results = pd.DataFrame(rows, columns=['Dataset', 'Bootstrap sample correctness', 'Bootstrap sample f1', 'Method', 'Precision', 'Recall', 'F1- Random Forest'])
        display(df_results)

        # Use the left bin edges returned by np.histogram as x values so the
        # x and y arrays are guaranteed to have the same length (np.arange
        # with a float step does not guarantee its output length).
        simple_hist, bin_edges = np.histogram(sorted_dataset, bins=100, range=(0.0, 1.0))
        fig, ax = plt.subplots()
        ax.plot(bin_edges[:-1], simple_hist, c='#000000')
        for threshold, color, linestyle, label in zip(thresholds, threshold_colors,
                                                      threshold_linestyles, threshold_labels):
            ax.axvline(x=threshold, c=color, linestyle=linestyle, label=label)

        ax.legend(fontsize=12)
        ax.set_xlabel('similarity scores')
        ax.set_ylabel('# record pairs')
        fig.savefig('../results/graphs/%s_threshold_comparison.pdf' % dataset, bbox_inches='tight', format='pdf')
        
        