In [1]:
import json
from statextract.helpers import fetch_work
# Load the sampled work ids, one id per line.
# The encoding is pinned so the read does not depend on the platform's default
# locale, and surrounding whitespace / blank lines are dropped so that no empty
# id is handed to fetch_work further down.
with open('sampled_work_ids_med.txt', 'r', encoding='utf-8') as f:
    sampled_work_ids = [
        work_id
        for work_id in (line.strip() for line in f.read().splitlines())
        if work_id
    ]

import random
# A fixed seed makes the shuffled order the same on every run.
rng = random.Random(42)

# To work on a random subset instead of the full list, sample before shuffling,
# e.g. `sampled_work_ids = rng.sample(sampled_work_ids, 500)`.

# Shuffles the id list in place.
rng.shuffle(sampled_work_ids)

# One fetch_work call per id, in shuffled order.
works_fetched = list(map(fetch_work, sampled_work_ids))

In [None]:
import asyncio
from statextract.fetchers.fetchers import CachingPaperFetcher, CombinedPaperFetcher
from statextract.fetchers.lol_fetcher import LibraryLolFetcher
from statextract.fetchers.openalex_fetcher import OpenAlexFetcher
from statextract.md_retriever import parse_work
from statextract.typedefs import PaperMD
from pathlib import Path
from statextract.get_dois import extract_text


# Sources handed to CombinedPaperFetcher, listed as OpenAlex then LibraryLol
# (presumably tried in that order -- confirm in CombinedPaperFetcher).
_paper_sources = [OpenAlexFetcher(), LibraryLolFetcher()]
# The combined fetcher is wrapped in CachingPaperFetcher.
fetcher = CachingPaperFetcher(CombinedPaperFetcher(_paper_sources))
# fetcher = CombinedPaperFetcher([OpenAlexFetcher(), LibraryLolFetcher()])
# mds = get_all_mds('A5072310807', first_author=True)

def try_parse_work(w):
    """Best-effort wrapper around ``parse_work``.

    Returns the result of ``parse_work(w)``. If that call raises any
    ``Exception``, the exception is printed and ``None`` is returned instead.
    """
    try:
        parsed = parse_work(w)
    except Exception as exc:
        # Report the failure and carry on, so one bad work does not abort the batch.
        print(exc)
        return None
    else:
        return parsed

# Parse every fetched work and drop the ones try_parse_work returned None for.
mds = [md for md in map(try_parse_work, works_fetched) if md is not None]

    # for md in mds:
    #     print(md.id)
    #     print(md.title)
    #     print(md.doi)
    #     print(md.full['type'])
    #     print()

    # print(len(mds))
    # exit()


# dois = [md.doi for md in mds]
# At most three fetches may hold this semaphore at the same time.
sem = asyncio.Semaphore(3)


async def fetch_pdf(md: PaperMD):
    """Fetch the paper for ``md`` through the shared ``fetcher``.

    Args:
        md: Metadata of the paper, passed to ``fetcher.fetch``.

    Returns:
        A ``(md, result)`` tuple, where ``result`` is whatever
        ``fetcher.fetch(md)`` produced.
    """
    async with sem:
        fetched = await fetcher.fetch(md)
    return md, fetched

pdf_urls = await asyncio.gather(*[fetch_pdf(md) for md in mds])

In [None]:
# Number of (md, result) tuples that came back from the fetch step.
len(pdf_urls)

In [17]:


from statextract.helpers import form_path_base


# Papers whose path base contains one of these fragments are skipped.
excluded_path_fragments = ("10.3390_bs9010011", "10.1037_pspi0000235")

# NOTE(review): only the metadata half of each (md, result) tuple is used here;
# the fetch result itself is ignored -- confirm that is intended.
mds = [
    md
    for (md, _) in pdf_urls
    if not any(
        fragment in str(form_path_base(md))
        for fragment in excluded_path_fragments
    )
]

print(len(mds))

# Run text extraction over the remaining papers. The result is zipped with
# `mds` below, so it is assumed to hold one status per paper in the same order
# -- TODO confirm against extract_text.
extraction_status = extract_text(
    mds,
    Path("data/pdfs"),
    Path("data/mds"),
    Path("data/images"),
    concurrent=3,
)

# Keep the papers whose status entry is truthy.
mds_successful = [md for md, ok in zip(mds, extraction_status) if ok]
print(len(mds_successful))

968
467


In [18]:
import pymupdf

from statextract.helpers import form_path_base

def get_num_pages(md: PaperMD):
    """Return the page count of the PDF stored for ``md``.

    The file is looked up at ``data/pdfs/<form_path_base(md)>.pdf``.

    Args:
        md: Paper metadata; passed to ``form_path_base`` to build the file name.

    Returns:
        The ``page_count`` that PyMuPDF reports for that file.
    """
    pdf_path = Path("data/pdfs") / f"{form_path_base(md)}.pdf"
    # Open through a context manager so the document is closed as soon as the
    # count has been read. This function runs once per paper, and leaving every
    # document open would leak one file handle per call.
    with pymupdf.open(pdf_path) as pdf:
        return pdf.page_count

# Keep only the papers whose PDF has at most 15 pages.
short_mds = list(filter(lambda md: get_num_pages(md) <= 15, mds_successful))

print(len(short_mds), len(mds_successful))

442 467


In [21]:
# Write the `id` of every short paper to a JSON list on disk.
with open('short_mds_ids.json', 'w') as ids_file:
    short_md_ids = [md.id for md in short_mds]
    json.dump(short_md_ids, ids_file)


In [None]:
from statextract.helpers import form_path_base
from statextract.prefilter import prefilter_regex


# Shared by all pvalue_prefilter coroutines; allows three holders at a time.
sem_prefilter = asyncio.Semaphore(3)


async def pvalue_prefilter(md: PaperMD, text_path: Path):
    """Run ``prefilter_regex`` over the text stored at ``text_path``.

    Args:
        md: The paper the text belongs to (not used in the body).
        text_path: File whose contents are passed to ``prefilter_regex``.

    Returns:
        ``None`` when ``text_path`` does not exist, otherwise whatever
        ``prefilter_regex`` returns for the file's text.
    """
    # NOTE(review): nothing inside this section awaits, so the semaphore
    # probably never makes a caller wait -- confirm whether it is still needed.
    async with sem_prefilter:
        if text_path.exists():
            return prefilter_regex(text_path.read_text())
        return None
        

# with multiprocess.Pool(3) as p:
#     res = list(p.starmap(pvalue_prefilter, [(md, Path("data/mds") / f"{form_path_base(md)}.md") for md in mds_successful]))
# mds_successful = [md for md in mds_successful if md.title == 'Learning to administrate, administrating to learn.']


res = await asyncio.gather(*[pvalue_prefilter(md, Path("data/mds") / f"{form_path_base(md)}.md") for md in mds_successful])
mds_successful = [md for (md, success) in zip(mds_successful, res) if success]

len(mds_successful)
    

In [None]:
# Host/share prefix that is placed in front of the PDF paths below.
WSL_PATH_BASE = Path("wsl.localhost/Ubuntu-22.04")

# Directory that holds the PDFs, expressed underneath WSL_PATH_BASE.
pdf_dir = WSL_PATH_BASE.joinpath("mnt", "storage", "python", "pcurves", "data", "pdfs")
paths = [pdf_dir / f"{form_path_base(md)}.pdf" for md in mds_successful]

# Print one file:// link per remaining paper.
for p in paths:
    print(f"file://{p}")
