# Specifications

Execute a single ALMSER experiment

In [2]:
import os

In [5]:
# Directory whose entries are listed (os.listdir) to derive the task names.
path = "datasets/magellan_restaurants/feature_vector_files/"
# Directory the train/test pair files are read from and the result CSVs are written to.
output_path = "datasets/magellan_restaurants/almser/"
# Separator placed between the two data-source names when building 'datasource_pair'.
fv_splitter = "__"

In [6]:
# Active Learning Settings
max_queries =25
runs = 1
query_strategy = 'disagreement' #disagreement, random, disagreeement_stratified, disagreement_post_graph

# One task name per feature-vector CSV found in `path`.
# - only '.csv' entries are kept, so stray files or folders (e.g. checkpoints)
#   do not turn into bogus task names;
# - only the file extension is stripped (str.replace would also remove a
#   '.csv' occurring in the middle of a name);
# - the list is sorted because os.listdir returns entries in arbitrary order.
files = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(path)
    if f.endswith('.csv')
)

# Passive Learning Results

In [None]:
from learningutils import *
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import Counter 

# Passive-learning baseline: fit one classifier on every labelled training
# pair and score its predictions on the test pairs.
pairs_fv_train = pd.read_csv(output_path + "train_pairs_fv.csv")
pairs_fv_test = pd.read_csv(output_path + "test_pairs_fv.csv")

# Columns removed from the frames before they are handed to the classifier.
metadata_columns = ['source_id','target_id','pair_id', 'agg_score','source','target', 'label']

train_X, train_y = pairs_fv_train.drop(columns=metadata_columns), pairs_fv_train['label']
test_X, test_y = pairs_fv_test.drop(columns=metadata_columns), pairs_fv_test['label']

model = getClassifier('rf')
model.fit(train_X, train_y)
predictions = model.predict(test_X)

prec, recall, fscore, support = precision_recall_fscore_support(
    test_y, predictions, average='binary'
)

print("Passive learing results: %f P, %f R, %f F1" % (prec,recall,fscore))

# Load the stored files and start ALMSER

In [None]:
from scoreaggregation import *
from ALMSER import *
from ALMSER_EXP import *


almser_path = output_path
print(almser_path)


def _datasource_pair(pairs):
    """Return '<source prefix><fv_splitter><target prefix>' for every row of `pairs`.

    The prefix is everything before the last '_' of the 'source' / 'target' value.
    """
    # `n` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
    source_prefix = pairs['source'].str.rsplit('_', n=1).str[0]
    target_prefix = pairs['target'].str.rsplit('_', n=1).str[0]
    return source_prefix + fv_splitter + target_prefix


def _matching_nodes(pairs):
    """Return the set of 'source' and 'target' values of the rows selected by the boolean 'label' column."""
    matches = pairs[pairs.label]
    return set(matches['source'].values) | set(matches['target'].values)


pairs_fv_train = pd.read_csv(almser_path + "train_pairs_fv.csv")
pairs_fv_train['datasource_pair'] = _datasource_pair(pairs_fv_train)

pairs_fv_test = pd.read_csv(almser_path + "test_pairs_fv.csv")
pairs_fv_test['datasource_pair'] = _datasource_pair(pairs_fv_test)

all_nodes_test_match = _matching_nodes(pairs_fv_test)
all_nodes_train_match = _matching_nodes(pairs_fv_train)

# Nodes that take part in a match in both the training and the test pairs.
print("Intersection:", all_nodes_train_match.intersection(all_nodes_test_match))

unique_source_pairs = files

# Per-iteration metrics taken from every run's `almser_exp.results`.
model_metric_columns = ['P_model','R_model','F1_model_micro','F1_model_macro',
                        'F1_model_micro_boot','F1_model_macro_boot','F1_model_micro_boost_graph']

# (summary column, metric column); every summary column also gets a '<name>_std' companion.
summary_columns = [
    ('P', 'P_model'),
    ('R', 'R_model'),
    ('F1_micro', 'F1_model_micro'),
    ('F1_macro', 'F1_model_macro'),
    ('F1_micro_boot', 'F1_model_micro_boot'),
    ('F1_macro_boot', 'F1_model_macro_boot'),
    ('F1_model_micro_boost_graph', 'F1_model_micro_boost_graph'),
]

# Collect the per-run frames and concatenate once instead of growing a frame
# inside the loop (which also started from an empty, object-typed frame).
run_results = []
for run in range(runs):
    print("RUN %i" % run)

    almser_exp = ALMSER_EXP(pairs_fv_train, pairs_fv_test, unique_source_pairs, max_queries, 'rf',
                        query_strategy, fv_splitter, bootstrap=True)
    almser_exp.run_AL()

    run_results.append(almser_exp.results[model_metric_columns])

if run_results:
    results_concat = pd.concat(run_results)
else:
    # runs == 0: keep an empty frame with the expected columns.
    results_concat = pd.DataFrame(columns=model_metric_columns)

# Rows sharing an index label across runs are aggregated together.
results_concat_by_row_index = results_concat.groupby(results_concat.index)
results_concat_mean = results_concat_by_row_index.mean(numeric_only=False)
results_concat_std = results_concat_by_row_index.apply(np.std)

results_all = pd.DataFrame(
    columns=[column
             for summary_name, _ in summary_columns
             for column in (summary_name, summary_name + '_std')]
)
for summary_name, metric in summary_columns:
    results_all[summary_name] = results_concat_mean[metric]
    results_all[summary_name + '_std'] = results_concat_std[metric]


In [None]:
import networkx as nx
from scoreaggregation import *
from ALMSER import *
from ALMSER_EXP import *
from networkx.algorithms.flow import *


# Toy undirected graph for trying out nx.minimum_cut; every edge stores its
# weight under the 'capacity' attribute.
G = nx.Graph()
G.add_weighted_edges_from(
    [
        ("a", "c", 0.2),
        ("c", "d", 0.1),
        ("c", "e", 0.7),
        ("e", "f", 0.9),
        ("a", "d", 0.3),
        ("f", "b", 3.0),
    ],
    weight="capacity",
)

drawGraph(G)

# Minimum cut between 'd' and 'b'; the partition unpacks into two node sets.
cut_value, partition = nx.minimum_cut(G, 'd','b')
reachable, non_reachable = partition

for node_group in (partition, reachable, non_reachable):
    print(node_group)

In [None]:
#2_13222
from networkx.algorithms import community
from networkx.algorithms.community import greedy_modularity_communities

# Draw the connected component of almser_exp.G that contains node '2_13222',
# with each edge annotated by its 'capacity' attribute.
cc_of_node = nx.node_connected_component(almser_exp.G, '2_13222')
print("Connected component size:  ",len(cc_of_node))

subg = almser_exp.G.subgraph(cc_of_node)
pos = nx.spring_layout(subg)

plt.figure(3, figsize=(15, 12))
node_labels = {n: n for n in subg}  # label every node with its own id
nx.draw(
    subg,
    pos,
    edge_color='black',
    node_size=500,
    node_color='pink',
    alpha=0.9,
    linewidths=1,
    labels=node_labels,
)
edge_labels = nx.get_edge_attributes(subg, 'capacity')
nx.draw_networkx_edge_labels(subg, pos, edge_labels=edge_labels, font_color='red')
plt.axis('off')
plt.show()



In [None]:
from networkx.algorithms.flow import *
# For every pair labelled as a non-match that is still connected in
# almser_exp.G, compute the minimum cut separating its two records and list
# the edges crossing that cut.
check = almser_exp.labeled_set[almser_exp.labeled_set.label==False]
cut_labels = []
cuts_all= []
# Bound up front so the final remove/print do not raise NameError when no
# labelled non-match turns out to be connected.
edge_cut_list = []
for index,row in check.iterrows():
    # nx.has_path needs both nodes to exist in the graph.
    if not (almser_exp.G.has_node(row.source) and almser_exp.G.has_node(row.target)):
        continue
    if not nx.has_path(almser_exp.G, row.source,row.target):
        continue

    print("Found path between : %s - %s " %(row.source,row.target))
    cc_of_node = nx.node_connected_component(almser_exp.G, row.source)
    print("Connected component size:  ",len(cc_of_node))

    subg = almser_exp.G.subgraph(cc_of_node)
    drawGraph(subg)

    cut_weight, partitions = nx.minimum_cut(subg, row.source, row.target)

    edge_cut_list = [] # Computed by listing edges between the 2 partitions
    for p1_node in partitions[0]:
        for p2_node in partitions[1]:
            if subg.has_edge(p1_node,p2_node):
                edge_cut_list.append((p1_node,p2_node))
                # Label recorded for the cut edge in the unlabeled-set metadata (either direction).
                cut_label = almser_exp.unlabeled_set_metadata[((almser_exp.unlabeled_set_metadata.source==p1_node) & (almser_exp.unlabeled_set_metadata.target==p2_node)) |  ((almser_exp.unlabeled_set_metadata.source==p2_node) & (almser_exp.unlabeled_set_metadata.target==p1_node))].label

                print("Cut label", cut_label)

# NOTE(review): only the cut of the last connected pair is removed here, and
# cuts_all / cut_labels are never filled -- confirm whether every cut was
# meant to be removed.
almser_exp.G.remove_edges_from(edge_cut_list)
print(edge_cut_list)

In [None]:
# Sanity check for a single task: hold out a third of its labelled pairs,
# train a small classifier on the rest and report the hold-out accuracy,
# then show the task's share of the total disagreement in the unlabeled set.
task = 'usapartsdirect_harddrivesdirect'
labeled_pairs = almser_exp.labeled_set
record_pairs_train = labeled_pairs[labeled_pairs.datasource_pair == task]

train_X, train_y = almser_exp.get_feature_vector_subset(record_pairs_train)
X_train, X_test, y_train, y_test = train_test_split(
    train_X, train_y, test_size=0.33, random_state=42
)

model = getClassifier(almser_exp.classifier_name, n_estimators=10, random_state=1)
model.fit(X_train, y_train)
pred = model.predict(X_test).tolist()
accur = accuracy_score(y_test, pred)
print(accur)

print(task)
unlabeled_pairs = almser_exp.unlabeled_set
total_dis = unlabeled_pairs.disagreement.values.sum()
print(total_dis)
task_disagreement = unlabeled_pairs[unlabeled_pairs.datasource_pair==task].disagreement
task_disagreement.values.sum()/total_dis

In [None]:
# Among the last 100 rows of the labeled set, count those where the predicted
# label equals the graph-inferred label.
tail_ = almser_exp.labeled_set.tail(100)

labels_agree = tail_.predicted_label == tail_.graph_inferred_label
len(tail_[labels_agree])

In [None]:
# First ten unlabeled pairs when ordered by descending 'inf_score'.
almser_exp.unlabeled_set.sort_values(by='inf_score', ascending=False).iloc[:10]

In [None]:
# Attach the labels kept in the metadata frame to the unlabeled set, keep the
# rows whose predicted label differs from that label, and count how often
# each other signal agrees with the label on those rows.
almser_exp.unlabeled_set['label'] = almser_exp.unlabeled_set_metadata.label
unlabeled_pairs = almser_exp.unlabeled_set
check = unlabeled_pairs[unlabeled_pairs.predicted_label != unlabeled_pairs.label]
check = check[['votes','disagreement', 'graph_inferred_label','clusters_inferred_label','predicted_label','label']]
display(check.head(10))

low_disagr = check[check.disagreement>-1.0]
print("Low disagreement:", low_disagr.shape[0])

# Signals stored as their own column.
for caption, column in (
    ("Correct only graph:", 'graph_inferred_label'),
    ("Correct only clusters:", 'clusters_inferred_label'),
    ("Correct only predicted:", 'predicted_label'),
):
    print(caption, low_disagr[low_disagr[column]==low_disagr.label].shape[0])

# Signals stored as one position of the per-row 'votes' sequence.
for caption, vote_pos in (
    ("Correct only dt:", 1),
    ("Correct only gboost:", 2),
    ("Correct only logr:", 3),
    ("Correct only svm:", 4),
):
    print(caption, low_disagr[low_disagr.label==[votes[vote_pos] for votes in low_disagr.votes]].shape[0])


In [None]:
#write results
from datetime import datetime

# File-name stem: number of runs, query budget, query strategy and a
# day_month_hour_minute timestamp.
now = datetime.now()
timestamp = now.strftime("%d_%m_%H_%M")
filename = "%i_runs_%i_iter_%s_%s" % (runs, max_queries, query_strategy, timestamp)

# Frames of the last experiment object, each written under its own suffix.
for attribute, suffix in (
    ("results", "_ALL.csv"),
    ("labeled_set", "_LABELED_SET_INFO.csv"),
    ("informants_eval", "_INFORMANTS_EVAL.csv"),
):
    getattr(almser_exp, attribute).to_csv(output_path + filename + suffix, index=False)

# Mean/std summary across runs.
results_all.to_csv(output_path + filename + ".csv", index=False)


In [None]:
import seaborn as sns;
# NOTE(review): this value is discarded (it is not the cell's last expression);
# the call is repeated on the next statement -- confirm whether it is needed.
almser_exp.get_heatmap_of_iteration()[0]

# First element returned by get_heatmap_of_iteration(), with missing cells
# replaced by 1, drawn as an annotated seaborn heatmap.
heatm = almser_exp.get_heatmap_of_iteration()[0].fillna(value=1)
ax = sns.heatmap(heatm, annot=True)


In [None]:
# Binary precision / recall / F-score of 'clusters_inferred_label' against the
# labels kept in the unlabeled-set metadata.
# NOTE(review): 'clusters_inferred_label' is assigned in a cell further down
# this notebook -- this cell relies on that cell having been run first.
cluster_true = almser_exp.unlabeled_set_metadata.label
cluster_pred = almser_exp.unlabeled_set.clusters_inferred_label
prec, recall, fscore, support = precision_recall_fscore_support(
    cluster_true, cluster_pred, average='binary'
)

for caption, value in (
    ("Clusters Precision ", prec),
    ("Clusters Recall ", recall),
    ("Clusters Fscore ", fscore),
):
    print(caption, value)

In [None]:
from networkx.algorithms import community
# Run networkx's Girvan-Newman community detection on almser_exp.G; the call
# returns an iterator over community partitions.
communities_generator = community.girvan_newman(almser_exp.G)
# Keep only the first partition the iterator yields.
# NOTE(review): presumably the coarsest split -- confirm against the networkx docs.
top_level_communities = next(communities_generator)


In [None]:
# Sort the members of every community and the communities themselves, then
# list every unordered pair of members that share a community.
top_com = sorted(map(sorted, top_level_communities))
pos_pairs = [
    (com[i], com[j])
    for com in top_com
    for i in range(len(com))
    for j in range(i + 1, len(com))
]

In [None]:
# Flag an unlabeled pair as a cluster-inferred match when the 'all' model
# predicts it positive AND both of its records fall in the same community.
unlabeled_data = almser_exp.get_feature_vector_subset(almser_exp.unlabeled_set, getLabels=False)

predicted_labels = almser_exp.learning_models['all'].predict(unlabeled_data)
almser_exp.unlabeled_set['clusters_inferred_label'] = False

# pos_pairs is a list; a set makes each membership test O(1) instead of a
# scan over every same-community pair for every row.
pos_pair_lookup = set(pos_pairs)
# predicted_labels is used as a boolean mask: only predicted matches are visited.
for ind, row in almser_exp.unlabeled_set[predicted_labels].iterrows():
    if (row.source,row.target) in pos_pair_lookup or (row.target,row.source) in pos_pair_lookup:
        almser_exp.unlabeled_set.at[ind, 'clusters_inferred_label']=True

In [None]:
# Frequency of each 'cc_size' value in the labeled set.
Counter(almser_exp.labeled_set['cc_size'])

In [None]:
# Per-task analysis on the last recorded iteration: number of annotations,
# task F1 and explore/exploit scores.
# Per-task F1 mapping stored in the last row of the results frame.
latest_task_f1 = almser_exp.results.tail(1).F1_pairwise_model.values[0]
freq_counter = dict(Counter(almser_exp.labeled_set.datasource_pair))

# Rows are collected as dicts and turned into a frame once:
# DataFrame.append was removed in pandas 2.0.
task_records = []
for task in almser_exp.unique_source_pairs:
    f1_score_task = latest_task_f1.get(task)
    freq = freq_counter.get(task)

    # The scores are read from the first unlabeled row of the task.
    first_task_row = almser_exp.unlabeled_set[almser_exp.unlabeled_set.datasource_pair==task].head(1)
    space_coverage = first_task_row.dataspace_coverage.values[0]  # read but not stored in the table
    explore_score = first_task_row.explore_score.values[0]
    exploit = first_task_row.exploit_score.values[0]

    task_records.append({'Task': task, 'Annotations': freq,'Task-F1': f1_score_task, 'Explore_Score': explore_score, 'Exploit_Score':exploit})

further_stats = pd.DataFrame(
    task_records,
    columns=['Task','Annotations','Task-F1','Explore_Score','Exploit_Score'],
)

further_stats
#further_stats.to_csv(output_path+"FURTHER_STATS_"+filename+".csv", index=False)