# Specifications

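Configure and execute a single ALMSER experiment: the cells below set the input/output paths and the active learning settings, then run the experiment and write its results.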

In [1]:
import os

In [2]:
# Separator used when joining two datasource identifiers into one
# "datasource_pair" key (it is also handed to ALMSER_EXP further down).
fv_splitter = "_"

# Directory that is listed to discover the feature vector files.
path = "../datasets/musicbrainz20K/feature_vector_files/"

# Directory holding train_pairs_fv.csv / test_pairs_fv.csv; the result
# files of the experiment are written here as well.
output_path = "../datasets/musicbrainz20K/almser/"

In [3]:
# Active Learning Settings
# max_queries is passed to ALMSER_EXP (presumably the labeling budget per run
# -- confirm in ALMSER_EXP); runs is the number of times the experiment loop
# is repeated.
max_queries = 25
runs = 1
query_strategy = 'disagreement_post_graph' #disagreement, random, disagreeement_stratified, disagreement_post_graph

# One entry per feature vector file, with the ".csv" extension removed.
# os.listdir returns *every* directory entry in arbitrary order, so keep only
# the CSV files (skipping hidden files, checkpoint folders, ...), strip just
# the extension, and sort so that repeated executions see the same order.
files = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(path)
    if f.endswith('.csv')
)

# Passive Learning Results

In [None]:
from learningutils import *
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import Counter 

# Passive-learning baseline: fit one classifier on the complete labeled
# training pairs and evaluate it on the test pairs.

# Columns that are dropped before fitting; everything that remains in the
# frame is used as a feature.
metadata_columns = ['source_id','target_id','pair_id', 'agg_score','source','target', 'label']

pairs_fv_train = pd.read_csv(output_path + "train_pairs_fv.csv")
pairs_fv_test = pd.read_csv(output_path + "test_pairs_fv.csv")

# Split each frame into its feature matrix and its label vector.
train_X, train_y = pairs_fv_train.drop(columns=metadata_columns), pairs_fv_train['label']
test_X, test_y = pairs_fv_test.drop(columns=metadata_columns), pairs_fv_test['label']

# getClassifier comes from the learningutils star import; 'rf' is the
# classifier key (presumably a random forest -- confirm in learningutils).
model = getClassifier('rf')
model.fit(train_X, train_y)
predictions = model.predict(test_X)

# average='binary' reports precision/recall/F1 for the positive class only.
prec, recall, fscore, support = precision_recall_fscore_support(
    test_y,
    predictions,
    average='binary',
)

print("Passive learing results: %f P, %f R, %f F1" % (prec,recall,fscore))

# Load the stored files and start ALMSER

In [4]:
# pandas was previously only available through the star imports below; import
# it explicitly so this cell does not depend on what those modules re-export.
import pandas as pd

from scoreaggregation import *
from ALMSER import *
from ALMSER_EXP import *


def _datasource_pair(pairs):
    """Build the datasource-pair key for every row of a pairs frame.

    The part of the 'source' / 'target' id before its last '_' is taken as
    the datasource identifier, and the two identifiers are joined with
    fv_splitter.
    """
    # `n` must be passed by keyword: newer pandas versions reject a
    # positional split limit for Series.str.rsplit.
    source_ds = pairs['source'].str.rsplit('_', n=1).str[0]
    target_ds = pairs['target'].str.rsplit('_', n=1).str[0]
    return source_ds + fv_splitter + target_ds


def _match_nodes(pairs):
    """Return the set of source and target ids that occur in rows selected by 'label'."""
    # NOTE(review): this relies on 'label' being a boolean column -- confirm
    # against how train/test_pairs_fv.csv are written.
    matches = pairs[pairs.label]
    return set(matches['source'].values) | set(matches['target'].values)


almser_path = output_path
print(almser_path)

pairs_fv_train = pd.read_csv(almser_path + "train_pairs_fv.csv")
pairs_fv_train['datasource_pair'] = _datasource_pair(pairs_fv_train)

pairs_fv_test = pd.read_csv(almser_path + "test_pairs_fv.csv")
pairs_fv_test['datasource_pair'] = _datasource_pair(pairs_fv_test)

# Sanity check: print the ids that appear in label-selected rows of both the
# train and the test split.
all_nodes_test_match = _match_nodes(pairs_fv_test)
all_nodes_train_match = _match_nodes(pairs_fv_train)
print("Intersection:", all_nodes_train_match.intersection(all_nodes_test_match))

unique_source_pairs = files

if runs < 1:
    raise ValueError("runs must be at least 1, got %r" % (runs,))

# (column in almser_exp.results, column in results_all). The matching
# standard deviation column in results_all is the output name + '_std'.
metric_columns = [
    ('P_model', 'P'),
    ('R_model', 'R'),
    ('F1_model_micro', 'F1_micro'),
    ('F1_model_macro', 'F1_macro'),
    ('F1_model_micro_boot', 'F1_micro_boot'),
    ('F1_model_macro_boot', 'F1_macro_boot'),
    ('F1_model_micro_boost_graph', 'F1_model_micro_boost_graph'),
]
model_columns = [model_col for model_col, _ in metric_columns]

# Collect the per-run result frames and concatenate once after the loop,
# instead of growing a frame with pd.concat on every iteration.
per_run_results = []
for run in range(runs):
    print("RUN %i" % run)

    # 'rf' is the classifier key handed to ALMSER_EXP (presumably a random
    # forest -- confirm in ALMSER_EXP).
    almser_exp = ALMSER_EXP(pairs_fv_train, pairs_fv_test, unique_source_pairs, max_queries, 'rf',
                            query_strategy, fv_splitter, bootstrap=True)
    almser_exp.run_AL()

    per_run_results.append(almser_exp.results[model_columns])

# NOTE: after the loop `almser_exp` refers to the experiment of the LAST run only.

# Cast to float so the aggregations below never operate on object columns.
results_concat = pd.concat(per_run_results).astype(float)

# Rows that share the same index label across runs are aggregated together
# (presumably one row per active learning iteration -- confirm in ALMSER_EXP).
results_concat_by_row_index = results_concat.groupby(level=0)
results_concat_mean = results_concat_by_row_index.mean()
# Population standard deviation (ddof=0), i.e. what np.std computes by default.
results_concat_std = results_concat_by_row_index.std(ddof=0)

results_all = pd.DataFrame(index=results_concat_mean.index)
for model_col, out_col in metric_columns:
    results_all[out_col] = results_concat_mean[model_col]
    results_all[out_col + '_std'] = results_concat_std[model_col]


datasets/musicbrainz20K/almser/
Intersection: set()
RUN 0
Bootstrap model
Bootstrap labeled set
Start ALMSER
ALMSER Mode: Active Learning |████████████████████████████████████████████████████████████████████████████████████----------------| 84.0% source                                                  4_8654
target                                                 5_12478
datasource_pair                                            4_5
votes                        (False, True, True, False, False)
disagreement                                               0.4
datasource_pair_frequency                                  NaN
inf_score                                                  0.4
graph_inferred_label                                      True
predicted_label                                          False
graph_cc_size                                                3
sel_proba                                             0.333333
unsupervised_label                                        Tr

In [None]:
#write results
from datetime import datetime

# All output files of one execution share the same base name:
# <runs>_runs_<max_queries>_iter_<query_strategy>_<day>_<month>_<hour>_<minute>
now = datetime.now()
timestamp = now.strftime("%d_%m_%H_%M")
filename = "%i_runs_%i_iter_%s_%s" %(runs,max_queries,query_strategy,timestamp)
base_path = output_path + filename

# Frames held by the experiment object, each written with its own suffix.
# `almser_exp` is whatever the experiment cell bound last.
for attribute, suffix in (
    ("results", "_ALL.csv"),
    ("labeled_set", "_LABELED_SET_INFO.csv"),
    ("informants_eval", "_INFORMANTS_EVAL.csv"),
):
    getattr(almser_exp, attribute).to_csv(base_path + suffix, index=False)

# Mean / standard deviation summary over all runs.
results_all.to_csv(base_path + ".csv", index=False)
