<a href="https://colab.research.google.com/github/tasinfrancesco/soc_sci_code/blob/main/PDF_read_iterator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import glob
docs = glob.glob("SocSciPDFs/document_*")
for doc in docs:
  os.remove(doc)

In [16]:
"""
This section implements two different methods to download pdfs from a link.

1) the first method (copy-pasted from Stack Overflow) appeads in download pdfs and only works for response = 200 (successful response)
   meaning that it won't work for response = 403 requests (i.e., request understood, but doesn't have the authorisation).

2) for response = 403, the download_pdfs function calls on async_download_pdfs, which implements asynchronous running of commands (probably not the
   right way to call this) to allow a firefox crawler to access the page and wait for it to load before downloading the pdf file.
   That's why we the "install-deps" call at the beginning, to ensure the correct version of firefox is available for python.
"""
# !pip install playwright
# !playwright install
# !playwright install-deps
# !pip install nest_asyncio

import urllib.request
import os
import time
import random
import requests
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
nest_asyncio.apply()
async def async_download_pdfs(pdf_url, filename):
      async with async_playwright() as p:
            #We define the browser, the context and the page
            browser = await p.firefox.launch(headless=True)
            context = await browser.new_context()
            page = await browser.new_page()
            async with page.expect_download() as download_info:
               #If we navigate without try, it will throw an exception and it will stop our script, so, we wrap it inside a try except block
              try:
                await page.goto(pdf_url)
                print("all good")
              except:
                # From my understaning the previous command fails because it ecceeds the default 3000ms timeout threshold. The file is still downloaded though.
                # I'm reporting the name of the file just so people can later check it was downloaded correctly
                print(f"Slower download for {filename}")
              download = await download_info.value
              await download.save_as(filename)
              await context.close()
              await browser.close()
              return 1, filepath

def download_pdfs(pdf_url, output_dir, filenum):
    # Track successful and failed downloads. THIS FUNCTION WAS JUST TAKEN FROM STACK OVERFLOW!!!
    successful = 0
    failed = 0
    error_403 = 0

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Create a filename from the URL
    filename = os.path.join(output_dir, f"document_{filenum}.pdf")

    # If URL has a filename, use that instead

    try:
        # Download with stream=True for larger files
        response = requests.get(pdf_url, stream=True)

        if response.status_code == 200:
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f"Downloaded: {filename}")

        elif response.status_code == 403:
          loop = asyncio.get_event_loop()
          successful, filename = loop.run_until_complete(async_download_pdfs(pdf_url, filename))
          print(f"Downloaded: {filename}")
        else:
            print(f"Failed to download {pdf_url}: HTTP {response.status_code}")

        # Be nice to the server - add some delay
        time.sleep(random.uniform(1.0, 3.0))

    except Exception as e:
        print(f"Error downloading {pdf_url}: {str(e)}")

    return successful, filename


class URL_iterator():
  #returns iteratively the next file from the URL list (as an open file). It still requires file.read() to actually read it
  def __init__(self, URL_list, output_dir):
    self.url_list = URL_list
    self.idx = 0
    self.output_dir = output_dir

  def __iter__(self):
    return self

  def __next__(self):
    if self.idx > len(self.url_list):
      raise StopIteration
    else:
      successful, filepath = download_pdfs(self.url_list[self.idx], self.output_dir, self.idx)
      self.idx += 1
      return successful, filepath



if __name__ == "__main__":
  url = "http://www.nature.com:80/polopoly_fs/1.20821!/menu/main/topColumns/topLeftColumn/pdf/538291a.pdf"
  urls = ["http://www.nature.com:80/polopoly_fs/1.20821!/menu/main/topColumns/topLeftColumn/pdf/538291a.pdf", "https://www.frontiersin.org/articles/10.3389/fpsyg.2023.1258721/pdf?isPublishedV2=False", "http://www.jaad.org/article/S0190962223009933/pdf" ]
  down_pdfs = URL_iterator(urls, "SocSciPDFs")

  pdf_filepaths = []
  for i in range(3):
    success, filepath = next(down_pdfs)
    if success:
      pdf_filepaths.append(filepath)


Downloaded: SocSciPDFs/document_0.pdf
Downloaded: SocSciPDFs/document_1.pdf
Slower download for SocSciPDFs/document_2.pdf
Downloaded: SocSciPDFs/document_1.pdf


In [37]:
"""
Here is the PDF to txt section
"""
# !pip install pdf-comments-extractor
!pip install --upgrade pdf-comments-extractor
from pdf_comments_extractor import AcademicPaperExtractor

extractor = AcademicPaperExtractor("SocSciPDFs/document_0.pdf")
data = extractor.extract_sections()
print(data)



NameError: name 'Dict' is not defined

In [36]:
import sys
for s in (sys.path):
  print(s)

/content
/env/python
/usr/lib/python312.zip
/usr/lib/python3.12
/usr/lib/python3.12/lib-dynload

/usr/local/lib/python3.12/dist-packages
/usr/lib/python3/dist-packages
/usr/local/lib/python3.12/dist-packages/IPython/extensions
/root/.ipython


In [None]:
# BASE CODE FOR ASYNCIO IMPLEMENTATION

# import asyncio
# from playwright.async_api import async_playwright

# p = await async_playwright().start()

# browser = await p.firefox.launch(headless=True)  # Set True to hide browser
# page = await browser.new_page()
# await page.goto("http://www.jaad.org/article/S0190962223009933/pdf")

# # Wait for PDF to load
# page.wait_for_timeout(3000)

# # Save the PDF
# pdf_content = page.pdf()
# with open("article.pdf", "wb") as f:
#     f.write(pdf_content)

# browser.close()


Error: Page.goto: Download is starting
Call log:
  - navigating to "http://www.jaad.org/article/S0190962223009933/pdf", waiting until "load"
