## Try to download 1000 random PDFs from CORD19 and analyse the failures

In [1]:
# Imports
from cord19_plus.downloadpdf.downloaders import Index, Status, IndexRow
from collections import Counter
from pathlib import Path
from typing import Union
import pickle
from urllib.parse import urlparse
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Locations of the sample: the index file and the directory holding the PDFs
base_path = Path("")  # set path to 1000 random records
download_dir_path = base_path / "pdfs"
index_path = base_path / "index.jsonl"

In [None]:
# load the index from the JSONL file, passing the PDF download directory along
index = Index.from_jsonl(index_path, download_dir_path)

In [None]:
# number of entries in the index
# NOTE(review): the index holds entries of several statuses, so this is presumably
# the number of sampled records rather than the number of downloaded PDFs — confirm
len(index)

In [None]:
def filter_by_status(index: Index, status: Union[Status, list[Status]]) -> list[IndexRow]:
    """Return the index entries whose status matches one of the given statuses.

    Args:
        index: Index to filter; its entries are read via ``index.items()``.
        status: A single ``Status`` member, or a list, tuple, set or frozenset
            of members.

    Returns:
        The entries whose ``status`` attribute equals the ``value`` of one of
        the requested statuses, in the iteration order of ``index.items()``.
    """
    # Anything that is not already a collection is treated as one single status.
    if not isinstance(status, (list, tuple, set, frozenset)):
        status = [status]

    # Entries carry the raw enum value, so compare against ``.value``.
    wanted = {s.value for s in status}
    return [entry for _doi, entry in index.items() if entry.status in wanted]

In [None]:
# entries whose status is Status.DOWNLOADED
downloaded = filter_by_status(index, Status.DOWNLOADED)

In [None]:
# number of entries with status DOWNLOADED
len(downloaded)

In [None]:
# Gather every entry carrying one of the four failure statuses, report how many
# there are, and tally them per status.
not_downloaded = filter_by_status(
    index,
    [
        Status.NOT_IN_OPENALEX,
        Status.NOT_OPEN_ACCESS,
        Status.RATE_LIMIT_ERROR,
        Status.UNAVAILABLE,
    ],
)
print(len(not_downloaded))
# entries hold the raw status value; turn it back into the enum member
reasons = list(map(Status, (entry.status for entry in not_downloaded)))
counts = dict(Counter(reasons))

In [None]:
# number of not-downloaded entries per status
# most common reason is Status.UNAVAILABLE
counts

In [None]:
# keep only the not-downloaded entries whose raw status value is 0
# NOTE(review): 0 is presumably Status.UNAVAILABLE.value — confirm, and prefer
# the enum member over the bare literal
unavailable = [entry for entry in not_downloaded if entry.status == 0]

In [None]:
# number of not-downloaded entries with raw status value 0
len(unavailable)

In [None]:
# Pie chart of the download outcomes; the per-category counts are hard-coded
# and expressed as fractions of 1000.
plt.figure(dpi=150)
plt.pie(
    [count / 1000 for count in (9, 477, 145, 6, 363)],
    explode=(0, 0, 0, 0, 0),
    labels=["Connection Error", "Downloaded", "Not Open Access", "Not in OpenAlex", "Not a PDF"],
    autopct="%1.1f%%",
    startangle=20,
)
plt.show()

## Recrawled all unavailable URLs with the script `find_last_url.py` to find the last available URL
- 1 recrawl failed because of connection problems


In [None]:
# Load every pickled response object found in the "pickles" directory.
# NOTE(review): unpickling can execute arbitrary code — only run this on
# pickle files you produced yourself (presumably via find_last_url.py).
pickle_dir = base_path / Path("pickles")
responses = []
for file in pickle_dir.iterdir():
    # read_bytes() opens and closes the file itself, so no handle is leaked
    responses.append(pickle.loads(file.read_bytes()))

Parse the URL of the last entry in each response's redirect history (or the response's own URL when there is no history)

In [None]:
# For each response, parse the URL of the last entry in its history; a response
# with an empty history contributes its own URL instead.
parsed_last_url = []
for r in responses:
    source = r.history[-1] if r.history else r
    p_url = urlparse(source.url)
    parsed_last_url.append(p_url)

In [None]:
# extract the network location (netloc) of every parsed URL so failures can be counted per website
netlocs = [pr.netloc for pr in parsed_last_url]

In [None]:
# the ten network locations that occur most often, each with its count
most_common_unavailable = Counter(netlocs).most_common(10)
print(most_common_unavailable)

Analyse status codes

In [None]:
# status code of every loaded response
status_codes = [r.status_code for r in responses]

In [None]:
# tally the status codes, most frequent first; meaning of the common codes:
# 403 Forbidden
# 200 OK
# 404 Not Found
status_counts = Counter(status_codes).most_common()
print(status_counts)

In [None]:
# Pie chart of how often each HTTP status code occurred among the responses.
plt.figure(dpi=150)
plt.pie(
    [count for _code, count in status_counts],
    labels=[f"HTTP({code!s})" for code, _count in status_counts],
    autopct="%1.1f%%",
)
plt.show()

In [None]:
# group the responses by their status code: {status_code: [response, ...]}
status_dict = {}
for r in responses:
    status_dict.setdefault(r.status_code, []).append(r)

In [None]:
# per status code, parse the URL of every response in that group
# (this uses r.url itself, not the last entry of r.history)
status_dict_url_parse = {}
for key, group in status_dict.items():
    for r in group:
        p_url = urlparse(r.url)
        status_dict_url_parse.setdefault(key, []).append(p_url)

In [None]:
# One bar chart per status code showing the ten most frequent network locations.
# NOTE: `counts` is rebound here, replacing the per-status tally computed earlier
# in the notebook under the same name.
for key, parsed_urls in status_dict_url_parse.items():
    plt.figure(dpi=150)
    counts = Counter(p.netloc for p in parsed_urls).most_common(10)
    print(key, counts)
    plt.bar(
        [netloc for netloc, _n in counts],
        [n for _netloc, n in counts],
    )
    # slanted, right-aligned tick labels
    plt.xticks(rotation=50, ha="right")
    plt.show()