In [32]:
import pandas as pd
import chromadb
import argparse
import pickle
from tqdm import tqdm
import os
import sys
import math
import ast
import numpy as np

In [48]:
# Put the parent directory on the import path so the project packages
# (`workloads`, `utils`) imported below can be resolved from this notebook.
sys.path.append("../")

from workloads.inference import infer, create_id_to_abstract_dict, create_paper_id_to_title_dict
from utils.build_graph import build_graph
# NOTE(review): the star import hides where names come from. `compute_accuracy`
# and `compute_percent_include`, used further down, presumably come from this
# module -- consider importing them explicitly once that is confirmed.
from workloads.compute_metrics import *


In [49]:
# Pickle locations, relative to this notebook: the filtered dataset is the
# input, the graph built from it is the output.
filtered_data_path = "../data/filtered_data.pickle"
graph_path = "../data/graph.pickle"

print("Building graph...")

# NOTE: pickle.load can execute arbitrary code; only point this at files
# produced by this project.
with open(filtered_data_path, "rb") as src:
    filtered_data = pickle.load(src)

# Build the graph from the filtered dataset and persist it to graph_path.
graph = build_graph(filtered_data)

with open(graph_path, "wb") as dst:
    pickle.dump(graph, dst)


Building graph...


In [57]:
# Build lookup tables from the filtered dataset loaded earlier.
# `id2title_dict` is later indexed with a paper-id string and yields a title;
# `id2abstract_dict` presumably maps paper id -> abstract in the same way
# (inferred from the helper name -- confirm in workloads.inference).
id2abstract_dict = create_id_to_abstract_dict(filtered_data)
id2title_dict = create_paper_id_to_title_dict(filtered_data)

In [51]:
# Locations handed to `infer`, relative to this notebook.
chroma_db_dir = "../data/chroma_dbs/"
workload_csv = "../workloads/workload.csv"

# Run the workload and collect the results in a DataFrame.
# The two string labels match column names of the resulting frame; the
# trailing 10 is presumably the number of ids retrieved per query --
# TODO confirm against the signature of `infer`.
result_df = infer(
    chroma_db_dir,
    graph,
    workload_csv,
    "arxiv_vector",
    "abstracts",
    id2abstract_dict,
    10,
)

100%|███████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.39s/it]
100%|███████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.58s/it]
100%|███████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.24s/it]


In [52]:
# Preview the first 10 rows of the freshly computed inference results.
result_df.head(10)

Unnamed: 0,paper_id,query,arxiv_vector,abstracts,hybrid
0,2109.01682,find papers from year 2022 published at < J. H...,"[1808.10567, 1012.0224, 1908.10767, 2203.0543,...","[2109.01682, 2003.12792, 2012.07797, 906.5435,...","[1808.10567, 1012.0224, 1908.10767, 2203.0543,..."
1,2109.01682,find papers written by < WenHanChiu > publishe...,"[1611.06118, physics/0003022, 2111.0216, 2002....","[2109.01682, 2003.12792, 2012.07797, 906.5435,...","[1611.06118, physics/0003022, 2111.0216, 2002...."
2,2109.01682,find papers from year 2022 about < results hig...,"[1808.10567, 2203.0543, 1908.10767, 1012.0224,...","[2109.01682, 2003.12792, 2012.07797, 906.5435,...","[1808.10567, 2203.0543, 1908.10767, 1012.0224,..."
3,2007.04365,find papers from year 2020 on < physics.flu-dy...,"[1810.11788, math-ph/0410044, 1803.02354, 2107...","[2007.04365, 2101.10059, 1712.00943, 2212.1376...","[1810.11788, math-ph/0410044, 1803.02354, 2107..."
4,2007.04365,find papers written by < GeorgA.MensahandJonas...,"[2007.04365, 803.2199, 2001.08987, 2212.13765,...","[2007.04365, 2101.10059, 1712.00943, 2212.1376...","[2007.04365, 803.2199, 2001.08987, 2212.13765,..."
5,2007.04365,find papers written by < CamiloF.Silva > on < ...,"[1803.02354, 2107.07588, 2207.04871, 2003.1401...","[2007.04365, 2101.10059, 1712.00943, 2212.1376...","[1803.02354, 2107.07588, 2207.04871, 2003.1401..."
6,2204.10933,find papers written by < AsafCidonandJunfengYa...,"[2204.02828, 1310.4904, 1905.10022, astro-ph/0...","[2204.10933, 2002.0792, 1906.03444, 1806.11146...","[2204.02828, 1310.4904, 1905.10022, astro-ph/0..."
7,2204.10933,find papers from year 2022,"[2307.04285, astro-ph/0207672, 706.0015, 1905....","[2204.10933, 2002.0792, 1906.03444, 1806.11146...","[2307.04285, astro-ph/0207672, 706.0015, 1905...."
8,2204.10933,find papers written by < ChengzhiMao > publish...,"[2204.02828, hep-ph/0301030, hep-ph/0301029, 2...","[2204.10933, 2002.0792, 1906.03444, 1806.11146...","[2204.02828, hep-ph/0301030, hep-ph/0301029, 2..."
9,2310.08105,find papers from year 2023 published at < Natu...,"[2203.0543, 706.0015, 2108.13751, 2212.04214, ...","[2310.08105, 2210.17438, 2208.10828, 2106.0794...","[2203.0543, 706.0015, 2108.13751, 2212.04214, ..."


In [53]:
# Load retrieval results that were previously saved to disk.
# NOTE(review): this reads a CSV instead of reusing `result_df` computed above,
# and the two previews differ (e.g. row 1 of `arxiv_vector`), so the file may be
# stale -- confirm which of the two should feed the metrics below.
# NOTE(review): the "Unnamed: 0.1" / "Unnamed: 0" columns in the preview look like
# saved indexes read back as data (see read_csv's `index_col`); `paper_id` is also
# likely parsed as a float here, which would drop trailing zeros from arXiv ids --
# consider `dtype={'paper_id': str}` after checking downstream use.
results = pd.read_csv('../workloads/res.csv')

In [54]:
# Preview the first rows of the CSV-loaded results; the list columns are still
# stored as strings at this point (note the quoted ids in the preview).
results.head()

Unnamed: 0.1,Unnamed: 0,paper_id,query,arxiv_vector,abstracts,hybrid
0,0,2109.01682,find papers from year 2022 published at < J. H...,"['1808.10567', '1012.0224', '1908.10767', '200...","['2109.01682', '2003.12792', '2012.07797', '90...","['1808.10567', '1012.0224', '1908.10767', '200..."
1,1,2109.01682,find papers written by < WenHanChiu > publishe...,"['1808.10567', '1908.10767', 'physics/0003022'...","['2109.01682', '2003.12792', '2012.07797', '90...","['1808.10567', '1908.10767', 'physics/0003022'..."
2,2,2109.01682,find papers from year 2022 about < results hig...,"['1808.10567', '2203.0543', '1908.10767', '101...","['2109.01682', '2003.12792', '2012.07797', '90...","['1808.10567', '2203.0543', '1908.10767', '101..."
3,3,2007.04365,find papers from year 2020 on < physics.flu-dy...,"['1810.11788', 'math-ph/0410044', '1803.02354'...","['2007.04365', '2101.10059', '1712.00943', '22...","['1810.11788', 'math-ph/0410044', '1803.02354'..."
4,4,2007.04365,find papers written by < GeorgA.MensahandJonas...,"['2007.04365', '803.2199', '2001.08987', '2212...","['2007.04365', '2101.10059', '1712.00943', '22...","['2007.04365', '803.2199', '2001.08987', '2212..."


In [55]:
def _parse_id_list(cell):
    """Return ``cell`` as a Python list of paper ids.

    ``read_csv`` yields each id list as its string repr
    (e.g. ``"['1808.10567', '1012.0224']"``), so strings are parsed with
    ``ast.literal_eval``. Values that are not strings -- i.e. lists already
    parsed by an earlier run of this cell -- are returned unchanged. Without
    this guard a second run of the cell raises ``ValueError`` because
    ``literal_eval`` only accepts strings or AST nodes.
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# Turn the stringified id lists of every retrieval column into real lists.
# The columns are overwritten in place, hence the re-run guard above.
for column in ('arxiv_vector', 'abstracts', 'hybrid'):
    results[column] = results[column].apply(_parse_id_list).tolist()

# Id lists of the first row, printed as a quick sanity check of the parsing.
arxiv_vector_ids = results['arxiv_vector'].iloc[0]
abstract_ids = results['abstracts'].iloc[0]
hybrid_ids = results['hybrid'].iloc[0]

print(arxiv_vector_ids)
print(abstract_ids)
print(hybrid_ids)

['1808.10567', '1012.0224', '1908.10767', '2005.1203', '2203.0543', 'hep-ph/0301030', '2108.13751', '2105.12475', 'hep-ph/0301029', '2011.0355']
['2109.01682', '2003.12792', '2012.07797', '906.5435', '1704.08493', '1812.01594', '2102.02278', '2305.00989', '2208.08432', '2202.13943']
['1808.10567', '1012.0224', '1908.10767', '2005.1203', '2203.0543', '1709.08833', '2107.02977', '2112.02013', '2305.1558', '2109.01038']


In [56]:
# For every query row, print two scores side by side: compute_accuracy of the
# `abstracts` ids against the `arxiv_vector` ids, then against the `hybrid` ids.
# The per-row id lists are deliberately bound to the same notebook-level names
# as before, because later cells read `abstract_ids` / `hybrid_ids` after this loop.
for i in range(len(results)):
    row = results.iloc[i]
    arxiv_vector_ids = row['arxiv_vector']
    abstract_ids = row['abstracts']
    hybrid_ids = row['hybrid']
    vector_score = compute_accuracy(abstract_ids, arxiv_vector_ids)
    hybrid_score = compute_accuracy(abstract_ids, hybrid_ids)
    print(vector_score, hybrid_score)
    

0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.2 0.2
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.1 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.1 0.1
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.1 0.1
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.1 0.1
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.1 0.1
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.1 0.1
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.2 0.2
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0


In [27]:
# NOTE(review): `abstract_ids` is notebook-level state: it holds row 0 right
# after the parsing cell, or the last row once the accuracy loop has run, so
# this result depends on which cell was executed most recently.
print(compute_percent_include(abstract_ids))

In [64]:
# Look up the title of the second id in the current `hybrid_ids` list.
# NOTE(review): `hybrid_ids` is whatever the most recently executed cell left
# behind (row 0 from the parsing cell, or the last row after the accuracy loop).
id2title_dict[hybrid_ids[1]]

'A comprehensive review on topological superconducting materials and\n  interfaces'