In [None]:
import pyterrier as pt
import pandas as pd
# Initialise PyTerrier only when it has not been started yet, so this cell can be re-run safely.
if not pt.started():
   pt.init()

import json
import requests
import os
import pickle

from tqdm import tqdm

from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Look up the 'irds:cord19/fulltext/trec-covid' dataset through PyTerrier.
dataset = pt.get_dataset('irds:cord19/fulltext/trec-covid')

In [None]:
# Relevance judgements of the dataset.
qrels = dataset.get_qrels()
# Distinct 'docno' values from the qrels; used below as the documents of interest.
rel_doc_nos = qrels['docno'].unique().tolist()
# Cell output: number of distinct judged documents.
len(rel_doc_nos)

In [None]:
#metadata = pd.read_csv("/workspaces/CORD19_Plus/data/metadata.csv")

In [None]:
# One-off preparation, kept for reference: restrict the full metadata to rows whose
# cord_uid appears in rel_doc_nos and save the result to rel_metadata.csv.
#metadata_filtered = metadata[metadata['cord_uid'].isin(rel_doc_nos)]
#save filtered metadata to a csv file
#metadata_filtered.to_csv('/workspaces/CORD19_Plus/data/rel_metadata.csv', index=False)
# Regular path: load the previously saved, already filtered metadata.
metadata_filtered = pd.read_csv('/workspaces/CORD19_Plus/data/rel_metadata.csv')

In [None]:
#get current publications

def load_jsonl_to_dataframe(file_path):
    """
    Read a JSON Lines file into a pandas DataFrame.

    Each non-blank line is parsed as one JSON document and becomes one row.
    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of aborting the load with a JSON decode error.

    :param file_path: Path to the UTF-8 encoded JSONL file.
    :return: DataFrame built from the parsed records, in file order.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # json.loads("") raises, so tolerate empty lines explicitly.
                continue
            records.append(json.loads(stripped))

    return pd.DataFrame(records)

def extract_pdf_url(res):
    """
    Return the first non-empty ``url_for_pdf`` found in an API response.

    The decoded JSON body is expected to be a mapping. Every top-level value
    that is itself a mapping is inspected, in order, for a ``url_for_pdf``
    entry.

    :param res: Response-like object exposing a ``json()`` method.
    :return: The first truthy ``url_for_pdf`` value, or ``None`` when the body
             is not valid JSON, is not a mapping, or contains no such URL.
    """
    try:
        payload = res.json()
    except ValueError:
        # The body could not be decoded as JSON (e.g. an HTML error page).
        return None

    if not isinstance(payload, dict):
        return None

    for section in payload.values():
        if isinstance(section, dict):
            pdf_url = section.get("url_for_pdf")
            # A null/empty entry must not end the search: a later section
            # may still carry a usable PDF link.
            if pdf_url:
                return pdf_url
    return None

def build_api_url_call(doi, email="unpaywall_01@example.com"):
    """
    Build the Unpaywall v2 lookup URL for a DOI.

    The DOI is percent-encoded (keeping ``/``) so that characters such as
    ``#`` or ``?`` inside a DOI are sent as part of the path instead of being
    interpreted as a URL fragment or query string.

    :param doi: DOI to look up; converted with ``str()`` before encoding.
    :param email: Contact address passed in the ``email`` query parameter.
    :return: The request URL as a string.
    """
    from urllib.parse import quote

    encoded_doi = quote(str(doi), safe="/")
    encoded_email = quote(str(email), safe="@")
    return f"https://api.unpaywall.org/v2/{encoded_doi}?email={encoded_email}"


# Function to handle each request and extract the PDF URL
def fetch_pdf_url(missing_doi, timeout=30):
    """
    Look up the PDF URL for one DOI.

    :param missing_doi: DOI to query.
    :param timeout: Seconds to wait for the HTTP request before giving up, so
                    a stalled connection cannot block a worker indefinitely.
    :return: Tuple ``(missing_doi, pdf_url)``; ``pdf_url`` is ``None`` when no
             PDF link was found or when the request/decoding failed.
    """
    try:
        res = requests.get(build_api_url_call(missing_doi), timeout=timeout)
        pdf_url = extract_pdf_url(res)
    except (requests.RequestException, ValueError):
        # Network problems or an undecodable body count as "no URL found" so
        # that one bad DOI does not abort a whole batch of lookups.
        return missing_doi, None
    return missing_doi, (pdf_url or None)

def append_to_jsonl(file_path, data):
    """
    Add one record to the end of a JSON Lines file.

    :param file_path: Location of the JSONL file; it is opened in append mode.
    :param data: Dictionary to serialise with ``json.dumps`` and write as a
                 single newline-terminated line.
    """
    with open(file_path, mode='a', encoding='utf-8') as sink:
        # Serialise first, then emit the record followed by the line break.
        record_line = json.dumps(data)
        sink.write(f"{record_line}\n")

In [None]:
# Load the existing index file into a DataFrame.
df = load_jsonl_to_dataframe("/workspaces/CORD19_Plus/data/index.jsonl")
# Distinct values of the index's 'key' column, i.e. the ids that already have an index entry.
avail_ids = df['key'].unique().tolist()

In [None]:
# Cell output: number of distinct keys present in the index.
len(avail_ids)

In [None]:
# Cell output: number of distinct judged documents (set() removes any duplicates).
len(set(rel_doc_nos))

In [None]:
# NOTE(review): hard-coded ratio; presumably len(avail_ids) / len(set(rel_doc_nos)) copied
# from an earlier run — TODO confirm and compute it from the variables instead.
14704/37924

In [None]:
# Judged documents that have no entry in the index yet.
missing_ids = set(rel_doc_nos) - set(avail_ids)
len(missing_ids)

In [None]:
# Distinct DOIs of the documents that are still missing from the index.
# Rows without a DOI are dropped: a NaN entry would otherwise be sent to the
# API as the literal DOI "nan" and could never be mapped back to a cord_uid.
missing_dois = (
    metadata_filtered.loc[metadata_filtered['cord_uid'].isin(missing_ids), 'doi']
    .dropna()
    .unique()
    .tolist()
)
# Filled in a later cell: DOI -> cord_uid.
doi2cord_uid_map = {}
len(missing_dois)

In [None]:
# Build the DOI -> cord_uid lookup once instead of filtering the whole frame per DOI.
# keep='first': when several rows share a DOI, the first row's cord_uid wins.
rows_with_doi = metadata_filtered.dropna(subset=['doi']).drop_duplicates(subset='doi', keep='first')
first_uid_by_doi = dict(zip(rows_with_doi['doi'], rows_with_doi['cord_uid']))

for missing_doi in missing_dois:
    if missing_doi in first_uid_by_doi:
        doi2cord_uid_map[missing_doi] = first_uid_by_doi[missing_doi]
    else:
        # Report DOIs without a metadata row (e.g. NaN) rather than hiding every error.
        print(missing_doi)

In [None]:
pdf_urls = {}
max_workers = 25

# Query the API for all missing DOIs concurrently.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Map every future back to the DOI it was submitted for, so failures can be attributed.
    future_to_id = {executor.submit(fetch_pdf_url, missing_doi): missing_doi for missing_doi in missing_dois}

    # Collect results as they finish; tqdm shows the overall progress.
    for future in tqdm(as_completed(future_to_id), total=len(future_to_id)):
        try:
            missing_doi, pdf_url = future.result()
        except Exception as exc:
            # A single failed lookup must not discard the results gathered so far.
            print(f"{future_to_id[future]}: {exc}")
            continue
        if pdf_url:
            pdf_urls[missing_doi] = pdf_url


In [None]:
#pickle.dump(pdf_urls, open("/workspaces/CORD19_Plus/data/next_pdf_urls.pkl", "wb"))

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only load cache files created by this notebook.
urls_cache_path = "/workspaces/CORD19_Plus/data/next_pdf_urls.pkl"
if os.path.exists(urls_cache_path):
    # Reuse the cached DOI -> PDF URL mapping; the file is closed deterministically.
    with open(urls_cache_path, "rb") as cache_file:
        urls = pickle.load(cache_file)
else:
    # No cache yet: persist the freshly fetched mapping so later runs can skip the API calls.
    urls = pdf_urls
    with open(urls_cache_path, "wb") as cache_file:
        pickle.dump(urls, cache_file)
len(urls)

In [None]:
def download_mising_pdf(doi, url, doi2cord_uid_map, path_to_save = "/workspaces/CORD19_Plus/data/pdfs", timeout=60):
    """
    Download one PDF and store it as ``<cord_uid>.pdf`` (best effort).

    Files that already exist are left untouched. The download is first written
    to a ``.part`` file and only renamed on success, so an interrupted transfer
    never leaves a truncated PDF that later runs would skip as "already there".
    HTTP error responses are treated as failures instead of being saved as PDFs.

    :param doi: DOI of the publication; used to look up the target file name.
    :param url: URL the PDF is fetched from.
    :param doi2cord_uid_map: Mapping from DOI to cord_uid.
    :param path_to_save: Directory the PDF is written to.
    :param timeout: Seconds to wait for the HTTP request before giving up.
    :return: ``None``. On failure the cord_uid is printed; when the DOI has no
             mapping the DOI itself is printed.
    """
    cord_uid = doi2cord_uid_map.get(doi)
    if cord_uid is None:
        # Without a cord_uid there is no target file name; report and skip.
        print(f"{doi}")
        return

    filename = os.path.join(path_to_save, f"{cord_uid}.pdf")
    if os.path.exists(filename):
        return

    partial = filename + ".part"
    try:
        response = requests.get(url, timeout=timeout)
        # Do not store 4xx/5xx error pages under a .pdf name.
        response.raise_for_status()
        with open(partial, "wb") as f:
            f.write(response.content)
        os.replace(partial, filename)
    except Exception:
        # Clean up a half-written file, then report the failed document.
        if os.path.exists(partial):
            os.remove(partial)
        print(f"{cord_uid}")

In [None]:
#now download those missing pdfs and update index
import os

max_workers = 15

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Create a dictionary to store the future object and the corresponding missing_id
    future_to_id = {executor.submit(download_mising_pdf, doi, url, doi2cord_uid_map): doi for doi, url in urls.items()}
    
    # Iterate over completed futures and update the progress bar
    for future in tqdm(as_completed(future_to_id), total=len(urls)):
        _ = future.result()


In [None]:
# Append an index entry for every PDF that was actually downloaded.
index_path = "/workspaces/CORD19_Plus/data/index.jsonl"
# Same directory as the default `path_to_save` of download_mising_pdf.
pdf_dir = "/workspaces/CORD19_Plus/data/pdfs"

for key, val in tqdm(urls.items(), total=len(urls)):
    cord_uid = doi2cord_uid_map.get(key)
    if cord_uid is None:
        # DOI without a known cord_uid: nothing could have been downloaded for it.
        continue

    pdf_name = f"{cord_uid}.pdf"
    if not os.path.exists(os.path.join(pdf_dir, pdf_name)):
        # Download failed or was skipped; do not record a PDF that is not on disk.
        continue

    # NOTE(review): re-running this cell appends the same entries again — confirm
    # whether duplicate keys in the index are acceptable.
    data = {
        'status': 1,
        'key': cord_uid,
        'pdf_path': pdf_name,
        'pdf_url': val,
    }
    append_to_jsonl(index_path, data)