In [1]:
import pandas as pd
from google.colab import drive
import os

In [None]:
# Mount Google Drive, then switch the working directory to the project
# folder so the relative paths used by later cells resolve inside it.
drive.mount("/content/gdrive")
os.chdir("/content/gdrive/MyDrive/IR_Project")

Mounted at /content/gdrive


In [None]:
# Load the pickled papers table from the current working directory and
# preview its first five rows (the default for ``head``).
df = pd.read_pickle("filtered_papers.pkl")
df.head(n=5)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,709.0355,Roland Bouffanais,"Nicolas Bodard, Roland Bouffanais, Michel O. D...",Solution of moving-boundary problems by the sp...,"Applied Numerical Mathematics, In Press, 2008",Applied Numerial Mathematics 58 (2008) 968-984,10.1016/j.apnum.2007.04.009,,"[cs.CE, cs.NA]",,This paper describes a novel numerical model...,"[{'version': 'v1', 'created': 'Tue, 4 Sep 2007...",2022-09-29,"[[Bodard, Nicolas, ], [Bouffanais, Roland, ], ..."
1,711.201,Reiner Czerwinski,Reiner Czerwinski,A Polynomial Time Algorithm for Graph Isomorphism,,,,,[cs.CC],http://creativecommons.org/licenses/by/4.0/,We claimed that there is a polynomial algori...,"[{'version': 'v1', 'created': 'Tue, 13 Nov 200...",2022-10-18,"[[Czerwinski, Reiner, ]]"
2,802.2345,Ioannis Chatzigeorgiou,"Ioannis Chatzigeorgiou, Ian J. Wassell and Rol...",On the Frame Error Rate of Transmission Scheme...,"5 pages, 4 figures, Proceedings of the 42nd Co...",,10.1109/CISS.2008.4558591,,"[cs.IT, math.IT]",http://arxiv.org/licenses/nonexclusive-distrib...,It is known that the frame error rate of tur...,"[{'version': 'v1', 'created': 'Sat, 16 Feb 200...",2022-03-08,"[[Chatzigeorgiou, Ioannis, ], [Wassell, Ian J...."
3,803.3946,Adam Smith,Shiva Prasad Kasiviswanathan and Adam Smith,On the `Semantics' of Differential Privacy: A ...,"Older version of this paper was titled: ""A Not...","Journal of Privacy and Confidentiality, 6 (1),...",10.29012/jpc.v6i1.634,,"[cs.CR, cs.DB]",http://arxiv.org/licenses/nonexclusive-distrib...,"Differential privacy is a definition of ""pri...","[{'version': 'v1', 'created': 'Thu, 27 Mar 200...",2023-01-24,"[[Kasiviswanathan, Shiva Prasad, ], [Smith, Ad..."
4,805.074,Luca Venturino,"Antonio De Maio, Marco Lops, Luca Venturino",Diversity-Integration Trade-offs in MIMO Detec...,,"IEEE Transactions on Signal Processing, Vol. 5...",10.1109/TSP.2008.928693,,"[cs.OH, cs.IT, math.IT]",http://arxiv.org/licenses/nonexclusive-distrib...,"In this work, a MIMO detection problem is co...","[{'version': 'v1', 'created': 'Tue, 6 May 2008...",2022-03-09,"[[De Maio, Antonio, ], [Lops, Marco, ], [Ventu..."


In [None]:
!pip install arxiv
!pip install pymupdf
!pip install langchain

Collecting arxiv
  Downloading arxiv-2.0.0-py3-none-any.whl (11 kB)
Collecting feedparser==6.0.10 (from arxiv)
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting sgmllib3k (from feedparser==6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=44d891028166e8f1218b7c74228e9a0d077478f64bead7d974b7a0c0a7b74a10
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser, arxiv
Successfully installed arxiv-2.0.0 feedparser-6.0.10 sgmllib3k-1.0.0
Collecting pymupdf
  Downl

In [None]:
from langchain.document_loaders import ArxivLoader
# import os
# os.mkdir("arvix_dataset")

In [None]:
import time

def prepare_file(paper_id):
    """Fetch one paper through ``ArxivLoader`` and render it as one string.

    The result is ``str(metadata)``, followed by five newline characters,
    followed by the page content of the first document the loader returns.

    Args:
        paper_id: Identifier of the paper; it is converted with ``str()``
            and passed to the loader as the query.

    Returns:
        The combined text, or ``None`` (after printing a notice) when the
        loader returns no documents or the first document has empty
        metadata or empty page content.
    """
    docs = ArxivLoader(query=str(paper_id), load_max_docs=2).load()
    # Guard against an empty result list: indexing it would raise IndexError
    # and mask the actual problem (nothing was found for this query).
    first = docs[0] if docs else None
    if first is not None and first.metadata and first.page_content:
        return str(first.metadata) + "\n\n\n\n\n" + first.page_content
    print(f"missing data for {paper_id}")
    return None

def process_file(paper_id, failed_retrieval: list, retries: int = 3,
                 retry_delay: float = 1, out_dir: str = "arvix_dataset"):
    """Download one paper and store it as ``<out_dir>/<paper_id>.txt``.

    The text is fetched with ``prepare_file`` *before* the output file is
    opened, so a failed fetch never leaves an empty ``.txt`` behind that
    could later be mistaken for a completed download.

    Args:
        paper_id: Identifier of the paper; also used as the file stem.
        failed_retrieval: List that ``paper_id`` is appended to (in place)
            when the paper could not be stored.
        retries: Maximum number of attempts.
        retry_delay: Seconds to sleep after a failed attempt.
        out_dir: Directory the text file is written to; created on demand.

    Returns:
        None. On failure a message is printed and ``paper_id`` is recorded
        in ``failed_retrieval``.
    """
    target = os.path.join(out_dir, f"{paper_id}.txt")
    for _ in range(retries):
        try:
            content = prepare_file(paper_id)
            if content is None:
                # prepare_file reported missing data; treat this as a final
                # failure instead of retrying and writing nothing.
                break
            os.makedirs(out_dir, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                f.write(content)
            return
        except Exception:
            # Deliberately broad (network, parsing, filesystem errors), but
            # unlike a bare ``except`` this lets KeyboardInterrupt through.
            time.sleep(retry_delay)
    print(f"Failed to retrieve paper: {paper_id}")
    failed_retrieval.append(paper_id)

In [None]:
from tqdm import tqdm
import pickle
import glob

# Seconds to wait before each download attempt.
# NOTE(review): presumably chosen to stay within arXiv rate limits -- confirm.
REQUEST_DELAY_SECONDS = 3

# IDs that already have a text file in the dataset folder (".txt" stripped).
current_files = glob.glob("*.txt", root_dir="./arvix_dataset")
current_files = [x[:-4] for x in current_files]

# Checkpoint written by a previous run: the last ID that was processed.
last_paper = None
if os.path.exists("last_paper"):
    with open("last_paper", "r") as f:
        last_paper = f.read().strip()
print(f"Last Paper: {last_paper}")

# IDs that previous runs gave up on.
# NOTE: pickle.load can execute arbitrary code; only load a file that this
# notebook wrote itself.
failed_retrieval = []
if os.path.exists("failed_retrieval.pkl"):
    with open("failed_retrieval.pkl", "rb") as f:
        failed_retrieval = pickle.load(f)
print(f"Failed Retrieval: {failed_retrieval}")

paper_ids = sorted(df['id'].tolist())

# Resume right after the checkpointed ID when it is still in the list.
start_idx = 0
if last_paper:
    try:
        start_idx = paper_ids.index(last_paper) + 1
    except ValueError:
        print("Last paper ID not found in the new list. Starting from the beginning.")
print(f"Starting from index: {start_idx}")
# start_idx equals len(paper_ids) when the checkpoint is the final ID (or the
# list is empty); indexing unconditionally would raise IndexError.
if start_idx < len(paper_ids):
    print(f"Paper ID: {paper_ids[start_idx]}")
else:
    print("No papers left after the last checkpoint.")

# Remaining IDs, minus anything that already has a file on disk.
papers_to_retrieve = sorted(set(paper_ids[start_idx:]) - set(current_files))

for paper_id in tqdm(papers_to_retrieve):
    time.sleep(REQUEST_DELAY_SECONDS)
    process_file(paper_id, failed_retrieval)
    # Persist progress after every paper so an interrupted session can resume.
    last_paper = paper_id
    with open("last_paper", "w") as f:
        f.write(last_paper)
    with open("failed_retrieval.pkl", "wb") as f:
        pickle.dump(failed_retrieval, f)

Last Paper: 2005.12746
Failed Retrieval: ['1711.10534', '1711.10874', '1711.11158', '1712.05081', '1801.04544', '1802.03336', '1802.04944', '1803.02715', '1804.02820', '1806.09768', '1808.06324', '1809.00368', '1809.04441', '1810.10172', '1811.02117', '1811.02129', '1812.04406', '1812.04408', '1812.06663', '1812.07844', '1901.03123', '1901.07924', '1901.07935', '1901.09193', '1902.03820', '1902.06881', '1902.10630', '1903.10360', '1904.03139', '1904.07937', '1904.08396', '1904.09615', '1905.01626', '1905.03109', '1905.03527', '1905.03957', '1905.04235', '1905.04693', '1905.06540', '1906.06521', '1906.09689', '1907.01696', '1907.06441', '1907.08083', '1908.07956', '1908.09042', '1909.07457', '1909.08444', '1910.10294', '1911.01491', '1911.03089', '1911.03129', '1911.09034', '1911.11534', '1911.11582', '1912.05946', '1912.10298', '1912.10606', '1912.12871', '2001.01075', '2001.01717', '2001.02879', '2001.07592', '2001.08559', '2001.10616', '2002.00375', '2003.00835', '2003.11768', '2004.

  0%|          | 61/170275 [05:59<330:41:39,  6.99s/it]

Failed to retrieve paper: 2006.01029


  0%|          | 73/170275 [07:08<327:01:14,  6.92s/it]

Failed to retrieve paper: 2006.01605


  0%|          | 203/170275 [19:57<330:15:20,  6.99s/it]

Failed to retrieve paper: 2006.06119


  0%|          | 204/170275 [20:07<376:32:04,  7.97s/it]

Failed to retrieve paper: 2006.10125


  0%|          | 205/170275 [20:17<406:57:11,  8.61s/it]

Failed to retrieve paper: 2006.11798


  0%|          | 206/170275 [20:28<438:24:18,  9.28s/it]

Failed to retrieve paper: 2006.13844


  0%|          | 207/170275 [20:38<451:13:48,  9.55s/it]

Failed to retrieve paper: 2006.14788


  0%|          | 208/170275 [20:49<460:07:42,  9.74s/it]

Failed to retrieve paper: 2006.14945


  0%|          | 209/170275 [20:59<464:58:29,  9.84s/it]

Failed to retrieve paper: 2006.16645


  0%|          | 210/170275 [21:09<469:06:25,  9.93s/it]

Failed to retrieve paper: 2007.02719


  0%|          | 211/170275 [21:19<472:51:56, 10.01s/it]

Failed to retrieve paper: 2007.06156


  0%|          | 212/170275 [21:29<476:16:52, 10.08s/it]

Failed to retrieve paper: 2007.07066


  0%|          | 213/170275 [21:39<476:28:49, 10.09s/it]

Failed to retrieve paper: 2007.08284


  0%|          | 214/170275 [21:49<477:18:31, 10.10s/it]

Failed to retrieve paper: 2007.10826


  0%|          | 215/170275 [22:00<478:53:15, 10.14s/it]

Failed to retrieve paper: 2007.13140


  0%|          | 216/170275 [22:10<478:28:24, 10.13s/it]

Failed to retrieve paper: 2007.14245


  0%|          | 217/170275 [22:20<477:29:28, 10.11s/it]

Failed to retrieve paper: 2008.00397


  0%|          | 218/170275 [22:30<477:35:10, 10.11s/it]

Failed to retrieve paper: 2008.03226


  0%|          | 219/170275 [22:40<479:19:20, 10.15s/it]

Failed to retrieve paper: 2008.08612


  0%|          | 220/170275 [22:50<477:56:00, 10.12s/it]

Failed to retrieve paper: 2009.05131


  0%|          | 221/170275 [23:00<477:28:31, 10.11s/it]

Failed to retrieve paper: 2010.07532


  0%|          | 222/170275 [23:10<478:01:20, 10.12s/it]

Failed to retrieve paper: 2010.10658


  0%|          | 223/170275 [23:21<480:38:31, 10.18s/it]

Failed to retrieve paper: 2010.11869


  0%|          | 224/170275 [23:31<480:30:41, 10.17s/it]

Failed to retrieve paper: 2012.07731


  0%|          | 225/170275 [23:41<479:49:45, 10.16s/it]

Failed to retrieve paper: 2012.11355


  0%|          | 226/170275 [23:51<481:33:15, 10.19s/it]

Failed to retrieve paper: 2101.05612


  0%|          | 227/170275 [24:02<482:34:53, 10.22s/it]

Failed to retrieve paper: 2101.07624


  0%|          | 228/170275 [24:12<480:42:38, 10.18s/it]

Failed to retrieve paper: 2101.07842


  0%|          | 229/170275 [24:22<478:57:35, 10.14s/it]

Failed to retrieve paper: 2101.08547


  0%|          | 230/170275 [24:32<479:52:10, 10.16s/it]

Failed to retrieve paper: 2101.09933


  0%|          | 431/170275 [44:29<318:26:07,  6.75s/it]

Failed to retrieve paper: 2102.01724


  0%|          | 824/170275 [1:26:06<338:34:01,  7.19s/it]

Failed to retrieve paper: 2102.07684


  1%|          | 1028/170275 [1:46:40<409:23:26,  8.71s/it]

Failed to retrieve paper: 2102.10287


  1%|          | 1192/170275 [2:03:31<321:48:05,  6.85s/it]

Failed to retrieve paper: 2102.12736


  1%|          | 1245/170275 [2:09:00<327:10:17,  6.97s/it]

Failed to retrieve paper: 2102.13392


  1%|          | 1375/170275 [2:23:04<317:12:42,  6.76s/it]

Failed to retrieve paper: 2103.01612


  1%|          | 1420/170275 [2:28:03<363:09:40,  7.74s/it]

Failed to retrieve paper: 2103.02343


  1%|          | 1473/170275 [2:34:24<377:22:04,  8.05s/it]

Failed to retrieve paper: 2103.03102


  1%|          | 1576/170275 [2:45:21<384:12:12,  8.20s/it]

Failed to retrieve paper: 2103.04789


  1%|          | 1577/170275 [2:45:32<411:17:59,  8.78s/it]

Failed to retrieve paper: 2103.04794


  1%|          | 1764/170275 [3:04:49<344:09:26,  7.35s/it]

Failed to retrieve paper: 2103.07620


  1%|          | 1830/170275 [3:11:26<347:48:05,  7.43s/it]

Failed to retrieve paper: 2103.08743


  1%|          | 1875/170275 [3:15:57<326:13:53,  6.97s/it]

Failed to retrieve paper: 2103.09488


  1%|          | 1887/170275 [3:17:16<326:51:33,  6.99s/it]

Failed to retrieve paper: 2103.09696


  1%|          | 1895/170275 [3:18:05<339:09:30,  7.25s/it]

Failed to retrieve paper: 2103.09748


  1%|          | 1900/170275 [3:18:40<367:08:13,  7.85s/it]

Failed to retrieve paper: 2103.09865


  1%|          | 1948/170275 [3:23:42<353:30:30,  7.56s/it]

Failed to retrieve paper: 2103.10502


  1%|▏         | 2151/170275 [3:46:00<332:41:05,  7.12s/it]

Failed to retrieve paper: 2103.13342


  1%|▏         | 2165/170275 [3:47:39<327:11:13,  7.01s/it]

Failed to retrieve paper: 2103.13514


  1%|▏         | 2258/170275 [3:57:48<294:55:24,  6.32s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Failed to retrieve paper: 2103.14755
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-de3b0db5f749>", line 37, in <cell line: 33>
    with open("last_paper", "w") as f:
OSError: [Errno 107] Transport endpoint is not connected: 'last_paper'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_con

In [None]:
# import glob
# import ast
# from tqdm import tqdm

# file_path="./arvix_dataset"

# papers = []
# metadata = []
# failed = []
# for file in tqdm(glob.glob(f"{file_path}/*.txt")):
#     try:
#         with open(file, "r") as f:
#             data = f.read().split("\n\n\n\n\n")
#             meta = ast.literal_eval(data[0])
#             content = data[1]
#             papers.append(content)
#             metadata.append(meta)
#     except:
#         failed.append(file)

  1%|          | 144/13405 [04:58<80:29:15, 21.85s/it]

In [None]:
# failed

In [None]:
# import glob

# current_files = glob.glob("*.txt",root_dir="./arvix_dataset")
# len(current_files)

13405