In [None]:
import networkx as nx
import math
import argparse
import torch
from torch.utils.data import DataLoader
import data_process.split_data as st
import data_process.data_loader as dl
from model.sbert import SentenceTransformer, losses
from model.sbert.evaluation import EmbeddingSimilarityEvaluator
import compute_metrics.metric as ms
from parse_config import ConfigParser
from model.utils import PPRPowerIteration

In [None]:
# Seed torch's global RNG before any stochastic object is created.
torch.manual_seed(0)

# Experiment settings; the following cells read every parameter from here.
config = dict(
    batch_size=32,
    epochs=1,
    sampling="closest",
    saving_path="../data/semeval_food/results/",
    name="semeval_food",
    data_path="../data/semeval_food/",
    model_name="/codes/l/Modules/multi-qa-distilbert-cos-v1",
    neg_number=20,
    partition_pattern="internal",
    alpha=0.1,
    seed=47,
)

In [None]:
# Unpack the settings into the module-level names used by the later cells.
saving_path, name, data_path = (
    config[key] for key in ("saving_path", "name", "data_path")
)
sampling_method, neg_number = config["sampling"], config["neg_number"]
partition_pattern, seed = config["partition_pattern"], config["seed"]
batch_size, epochs, alpha = config["batch_size"], config["epochs"], config["alpha"]

In [None]:
# Identifier/path of the pretrained sentence encoder loaded further down.
model_name = config['model_name']

# Build the TaxoDataset from the configured name/path, then hand it to
# st.Dataset together with the sampling settings.
taxonomy = dl.TaxoDataset(
    name,
    data_path,
    raw=True,
    partition_pattern=partition_pattern,
    seed=seed,
)
data_prep = st.Dataset(taxonomy, sampling_method, neg_number, seed)

In [None]:
# Pick the device name once; `target_device` is the torch.device built from it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
target_device = torch.device(device)

In [None]:
# Request the GPU explicitly when one is visible; otherwise pass no device
# argument and let the constructor use its default.
sbert_kwargs = {'device': 'cuda'} if torch.cuda.is_available() else {}
model = SentenceTransformer.SentenceTransformer(model_name, **sbert_kwargs)


In [None]:
# Dedicated torch generator seeded with 0 (manual_seed returns the generator).
g = torch.Generator().manual_seed(0)
g

In [None]:
# Work on a copy of the core subgraph with the pseudo leaf node removed.
core_graph = data_prep.core_subgraph.copy()
core_graph.remove_node(data_prep.pseudo_leaf_node)
nodes_core_subgraph = list(core_graph.nodes)

# Node ids listed in the iteration order of corpusId2nodeId.
nodeIdsCorpus = [
    data_prep.corpusId2nodeId[corpus_id] for corpus_id in data_prep.corpusId2nodeId
]

# The graph's node order must equal the corpus order, otherwise rows of the
# adjacency matrix would not line up with the corpus entries.
assert nodes_core_subgraph == nodeIdsCorpus

adjacency = nx.adjacency_matrix(core_graph)
propagation = PPRPowerIteration(adjacency, alpha=alpha, niter=10).to(target_device)


In [None]:
# Objects needed for fine-tuning; no training call is made in this cell.
train_dataloader = DataLoader(data_prep.trainInput, batch_size=batch_size, shuffle=True)

# Warm up over 10% of the total number of training steps.
total_train_steps = len(train_dataloader) * epochs
warmup_steps = math.ceil(total_train_steps * 0.1)

train_loss = losses.CosineSimilarityLoss(model)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    data_prep.val_examples,
    name='sts-dev',
)

In [None]:
# corpus_embeddings = model.encode(data_prep.corpus, convert_to_tensor=True, show_progress_bar=True)
# preds = propagation(corpus_embeddings,torch.tensor(range(len(nodeIdsCorpus)),device=target_device))


In [None]:
# print(corpus_embeddings.shape)
# Number of edges in the core subgraph (the pseudo leaf node has not been removed from this graph).
print(len(data_prep.core_subgraph.edges))

In [None]:
# Display the id of the pseudo leaf node.
data_prep.pseudo_leaf_node

In [None]:
# data_prep.valid_queries

In [None]:
# data_prep.valid_node_list

In [None]:
# data_prep.valid_node2pos

In [None]:
# data_prep.corpusId2nodeId

In [None]:
# Short aliases for the data_prep attributes used by the evaluation cells.
edges, leaf_node = data_prep.core_subgraph.edges, data_prep.pseudo_leaf_node
queries, node_list, node2positions = (
    data_prep.valid_queries,
    data_prep.valid_node_list,
    data_prep.valid_node2pos,
)
corpusId2nodeId = data_prep.corpusId2nodeId

In [None]:
import numpy as np

In [None]:
# Encode the whole corpus once, as a tensor, so every query can be compared against it.
corpus_embeddings = model.encode(data_prep.corpus, convert_to_tensor=True, show_progress_bar=False)

In [None]:
# corpus_embeddings = torch.randn(11716,50)



In [None]:
# Shape of the encoded corpus tensor.
print(corpus_embeddings.shape)

In [None]:
from sentence_transformers import util

In [None]:
# Score every graph edge for every validation query.
#
# For each query:
#   1. rank the corpus nodes by embedding similarity to the query,
#   2. look up the similarity of both endpoints of every edge,
#   3. rank the edges by the mean of their two endpoint scores.
#
# Results (one entry per query):
#   all_targets      - target positions taken from node2positions
#   all_predictions  - ranked node ids, with the pseudo leaf node appended
#   all_scores       - scores matching all_predictions (sentinel for the leaf)
#   edges_prediction - edges sorted by descending mean endpoint score
#   all_edges_scores - the sorted mean scores themselves

# Retrieve the *whole* corpus. The endpoint lookup below needs a score for
# every node; a hard-coded cut-off smaller than the corpus would silently
# drop nodes and misalign the lookup.
top_k = corpus_embeddings.shape[0]

# Placeholder score given to the pseudo leaf node so that it can be found
# again after the lookup. Assumes no real similarity score equals 2 (holds
# for cosine similarity) - confirm if the score function is ever changed.
LEAF_SENTINEL_SCORE = 2

all_targets = []
all_predictions = []
all_scores = []
all_edges_scores, edges_prediction = [], []

edges_2darray = np.array(list(edges))
parent = edges_2darray[:, 0]
children = edges_2darray[:, 1]
# Column views are loop-invariant: build them once for the broadcasts below.
parent_col = parent[:, None]
children_col = children[:, None]

for idx, query in enumerate(queries):
    query_id = node_list[idx]
    target_positions = node2positions[query_id]
    all_targets.append(target_positions)

    question_embedding = model.encode(query, convert_to_tensor=True)
    hits_score = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)

    # hits_score[0] holds the ranked hits of this (single) query.
    hits = [corpusId2nodeId[hit['corpus_id']] for hit in hits_score[0]]
    scores = [hit['score'] for hit in hits_score[0]]
    hits.append(leaf_node)
    scores.append(LEAF_SENTINEL_SCORE)
    scores_arr = np.array(scores)

    # Broadcasting (n_edges, 1) against hits yields an (n_edges, n_hits) mask;
    # the column indices of its True entries give, edge by edge, the position
    # of the endpoint inside `hits` (each node id must occur exactly once).
    ind_parents = np.where(hits == parent_col)[1]
    ind_child = np.where(hits == children_col)[1]
    scores_2darray = np.append([scores_arr[ind_parents]], [scores_arr[ind_child]], axis=0).T

    # Edges ending in the pseudo leaf node are scored by their parent alone.
    args_leaf = np.where(scores_2darray[:, 1] == LEAF_SENTINEL_SCORE)
    scores_2darray[args_leaf, 1] = scores_2darray[args_leaf, 0]

    scores_mean = scores_2darray.mean(axis=1)
    sorting_args = np.argsort(scores_mean)[::-1]  # descending
    edges_prediction.append(edges_2darray[sorting_args, :])
    all_edges_scores.append(scores_mean[sorting_args])
    all_predictions.append(hits)
    all_scores.append(scores)
    

In [None]:
# Number of target positions recorded for the second query (index 1).
print(len(all_targets[1]))

In [None]:
# Type of one per-query entry of edges_prediction.
print(type(edges_prediction[0]))

In [None]:
# Names used by the rank computation below (same objects, no copies).
all_target, pred_pos = all_targets, edges_prediction


In [None]:
# Stack the per-query sorted edge arrays into a single array and show its shape.
pred_pos_np = np.array(pred_pos)
pred_pos_np.shape

In [None]:
# Rank (1-based) of every target position of every query inside that query's
# sorted edge predictions.
#
#   all_ranks      - one list of ranks per query
#   leaf_ranks     - the subset of queries whose target positions all have
#                    the pseudo leaf node as child
#   non_leaf_ranks - all remaining queries
all_ranks = []
leaf_ranks = []
non_leaf_ranks = []
for idx, target_parents in enumerate(all_target):
    # A query counts as "leaf" only if every target position ends in the
    # pseudo leaf node.
    is_leaf_query = all(child_id == leaf_node for _, child_id in target_parents)

    ranked_edges = pred_pos_np[idx]
    # Initialised once per query so that the ranks of *all* target positions
    # are collected (and an empty target list yields an empty rank list).
    ranks = []
    # Loop variables are deliberately not called `parent`/`child`, so the
    # module-level `parent` array of edge parents is not overwritten.
    for parent_id, child_id in target_parents:
        exact_rows = np.where((ranked_edges == (parent_id, child_id)).all(axis=1))[0]
        if len(exact_rows) > 0:
            pos_idx = exact_rows[0]
        else:
            # No exact edge match: fall back to the first row in which either
            # column matches element-wise.
            pos_idx = np.where(ranked_edges == (parent_id, child_id))[0][0]
        ranks.append(pos_idx + 1)

    all_ranks.append(ranks)
    if is_leaf_query:
        leaf_ranks.append(ranks)
    else:
        non_leaf_ranks.append(ranks)
# all_ranks = []
# pred_pos_np = np.array(pred_pos)
# for idx, target_parents in enumerate(all_target):
#     for (parent,child) in target_parents:
#         ranks = []
#         identify_idx = np.where((pred_pos_np[idx] == (parent,child)).all(axis=1))[0]
#         if len(identify_idx)>0:
#                 posIdx = identify_idx[0]
#         else:
#             posIdx = np.where(pred_pos_np[idx] == (parent,child))[0][0]
#         rank = posIdx + 1
#         ranks.append(rank)
#     all_ranks.append(ranks)


#### The cells below are not used for now

In [None]:
# Inspect the hits left over from the last iteration of the scoring loop.
print(hits_score[0])

In [None]:
# Node id of the first hit.
print(corpusId2nodeId[hits_score[0][0]['corpus_id']])

In [None]:
# Split the hits of the first (only) query into node ids and scores.
first_query_hits = hits_score[0]
hits = [corpusId2nodeId[hit['corpus_id']] for hit in first_query_hits]
scores = [hit['score'] for hit in first_query_hits]

In [None]:
# Append the pseudo leaf node with the placeholder score 2.
# NOTE: not idempotent - re-running this cell appends another entry.
hits.append(leaf_node)
scores.append(2)

In [None]:
# Both lists should have the same length.
print(len(hits),len(scores))

In [None]:
# Scores as a NumPy array so they can be fancy-indexed below.
scores_arr = np.array(scores)
print(scores_arr.shape)

In [None]:
# Shape of the edge array built in the scoring cell.
print(edges_2darray.shape)

In [None]:
# Inspect the ranked node ids and the first column of the edge array.
print(hits)
parent

In [None]:
# (row, column) indices where an edge's parent equals an entry of hits.
# `parent` must still be the array of edge parents built in the scoring cell.
ind_parents = np.where(hits==parent[:,None])

In [None]:
# Column indices, i.e. positions inside hits.
print(ind_parents[1])

In [None]:
# Same lookup for the child column of the edge array.
ind_child = np.where(hits==children[:,None])

In [None]:
# Column indices, i.e. positions inside hits.
print(ind_child[1])

In [None]:
# Keep only the column indices.
# NOTE: rebinds both names to a different kind of object, so this cell must not be run twice in a row.
ind_parents = ind_parents[1]
ind_child = ind_child[1]


In [None]:
# Shape before transposing: one row of parent scores, one row of child scores.
np.append([scores_arr[ind_parents]],[scores_arr[ind_child]],axis=0).shape

In [None]:
# Transpose so that each row holds one edge's [parent score, child score].
scores_2darray = np.append([scores_arr[ind_parents]],[scores_arr[ind_child]],axis=0).T
      

In [None]:
# Rows whose child score is the placeholder value 2 (the one appended for the pseudo leaf node).
args_leaf = np.where(scores_2darray[:,1]==2)
            

In [None]:
# Row indices of those edges.
print(args_leaf[0])

In [None]:
# Child scores of those rows, shown before they are overwritten in the next cell.
scores_2darray[args_leaf,1]

In [None]:
# Replace the placeholder child score with the parent score of the same edge (in place).
scores_2darray[args_leaf,1] = scores_2darray[args_leaf,0]

In [None]:
# Edge score = mean of its two endpoint scores.
scores_mean = scores_2darray.mean(axis=1)

In [None]:
# Edge indices ordered from highest to lowest mean score.
sorting_args = np.argsort(scores_mean)[::-1]

In [None]:
# NOTE: appends to the list filled by the scoring loop, so running this cell adds an extra entry to edges_prediction.
edges_prediction.append(edges_2darray[sorting_args,:])

In [1]:
# Pair two equally long lists into a dict (keys from a, values from b).
a = [1, 2, 3]
b = [4, 5, 6]
tmp = dict(zip(a, b))
tmp

{1: 4, 2: 5, 3: 6}

In [22]:
# Load a pickled dict for inspection.
# NOTE(review): absolute, machine-specific path - the cell fails on any other machine.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
import pickle as pkl
with open('/codes/l/origin_taxocomplete-main/test_dict.pkl','rb') as f:
    dic = pkl.load(f)
dic

{802: {805: 0.5262390375137329,
  804: 0.5231996178627014,
  189: 0.44467097520828247,
  434: 0.4143221378326416,
  889: 0.41142040491104126,
  1393: 0.35933831334114075,
  719: 0.33768099546432495,
  763: 0.33744311332702637,
  1161: 0.335867702960968,
  1442: 0.3218303918838501,
  1349: 0.30516642332077026,
  360: 0.304296612739563,
  623: 0.3012291193008423,
  412: 0.2986292541027069,
  6: 0.28632616996765137,
  654: 0.2837243676185608,
  942: 0.2832934558391571,
  1414: 0.28307217359542847,
  1441: 0.2752276062965393,
  991: 0.2688276767730713,
  364: 0.26684680581092834,
  814: 0.2661438286304474,
  1084: 0.2549022436141968,
  994: 0.2548496723175049,
  959: 0.252655029296875,
  1252: 0.2497808337211609,
  461: 0.2484484612941742,
  160: 0.24832883477210999,
  520: 0.24828647077083588,
  544: 0.24535299837589264,
  600: 0.24437296390533447,
  362: 0.24373134970664978,
  1351: 0.2437140941619873,
  97: 0.23976348340511322,
  946: 0.23840564489364624,
  625: 0.23826389014720917,
  7

In [54]:
import numpy as np

# Three two-column rows that share the same second value.
a = np.array([[first, 1487] for first in (341, 1044, 667)])
a

array([[ 341, 1487],
       [1044, 1487],
       [ 667, 1487]])

In [42]:
# Two rows and two values used by the following cells.
b, c = [[341, 1487], [667, 1487]], [1, 2]


In [43]:
# For every row of b, the indices of the rows of a that equal it exactly.
result = []
for wanted_row in b:
    row_matches = (a == wanted_row).all(axis=1)
    result.append(np.argwhere(row_matches).flatten())
result

[array([0]), array([2])]

In [40]:
import torch
# -100000 is outside the float16 range, so every element is stored as -inf (see the output below).
d = torch.tensor([-100000] * 100,dtype=torch.float16)
d


tensor([-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf], dtype=torch.float16)

In [44]:
# Rebinds `c` (a list before this cell) to a float16 tensor.
c = torch.tensor(c, dtype=torch.float16)
c

tensor([1., 2.], dtype=torch.float16)

In [46]:
# Flatten the per-row index arrays in `result` into one 1-D tensor of integer indices.
res_idx = torch.tensor(result).long().flatten()
res_idx

tensor([0, 2])

In [47]:
# Write the values of c into d at the looked-up positions (modifies d in place).
d[res_idx] = c
d

tensor([1., -inf, 2., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
       dtype=torch.float16)

In [52]:
# 2x2 integer array holding 1..4 in row-major order.
h = np.arange(1, 5).reshape(2, 2)
h

array([[1, 2],
       [3, 4]])

In [53]:
# Rebinds `h` from an ndarray to a nested Python list; a second run fails because lists have no .tolist().
h = h.tolist()
h

[[1, 2], [3, 4]]

In [2]:
import numpy as np

a = [[341, 1487], [1044, 1487], [667, 1487]]
b = [[341, 1487], [667, 1487], [2, 1]]

# Keep the rows of b that also occur in a, in b's order.
result = [item for item in b if item in a]

print(result)

[[341, 1487], [667, 1487]]


In [20]:
# Same rows as before, this time with an empty list of rows to look for.
a = [[341, 1487], [1044, 1487], [667, 1487]]
b = list()


In [21]:
# An empty b is replaced by [[]] so the comparison matrix keeps one row per
# entry of b and one column per entry of a.
if b == []:
    print("yes")
    b = [[]]
match_matrix = np.array([[candidate == wanted for candidate in a] for wanted in b])
# True for every row of a that equals at least one row of b.
tmp = match_matrix.any(axis=0)
tmp

yes


array([False, False, False])

In [13]:
# Indices of the rows of a that were matched.
correct = np.nonzero(tmp)[0]
correct

array([], dtype=int64)

In [14]:
# Indices of the rows of a that were not matched.
incorrect = np.nonzero(~tmp)[0]
incorrect

array([0, 1, 2])

In [15]:
import torch

# Ones for the matched rows followed by zeros for the unmatched rows, as int32.
positive_part = torch.ones(len(correct))
negative_part = torch.zeros(len(incorrect))
labels = torch.cat((positive_part, negative_part)).int()
labels

tensor([0, 0, 0], dtype=torch.int32)

In [17]:
# Check how an empty list behaves in the `== []` test.
a_1 = []
print(a_1)
if a_1 == []:
    print("yes")
    # NOTE(review): `==` is a comparison whose result is discarded, so a_1 is
    # left unchanged (the final print still shows []). If an assignment was
    # intended this should read `a_1 = [[]]` - confirm.
    a_1 == [[]]
print(a_1)

[]
yes
[]


In [22]:
def f(a):
    """Rebind the local name ``a`` to a new list and return it.

    The list passed in by the caller is not modified.
    """
    a = [1]
    return a


a = []
b = f(a)
print(a)
print(b)

[]
[1]


In [25]:
# Count how many entries are less than or equal to 3.
a = np.array([14, 3, 1, 5, 10])
at_most_three = a <= 3
print(np.sum(at_most_three))

2
