In [3]:
# https://github.com/kjanjua26/Bulk_CV_Paper_Downloader

import argparse
from glob import glob
from tqdm import tqdm
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import glob 
import pandas as pd 
import re

class CVPRDownloader:
    def __init__(self, year, download_path):
        self.url = 'http://openaccess.thecvf.com/CVPR{}.py'.format(year)
        print(self.url)
        self.year=year
        self.download_path=download_path
    
    def get_pdf(self):
        """
        CVPR2018 onwards segregates the paper listings in days first.
        """

        pdf_link = []
        r = requests.get(self.url)
        soup = BeautifulSoup(r.content, "html.parser")
        titles_ = soup.find_all('dt', attrs={'class':'ptitle'})
        pdfs_ = soup.find_all('dd')
        for i in pdfs_:
            date = i.text.split(' ')[-1].replace(']', '')
            if '[<a href' in str(i):
                link_ = str(i).split('\n')[1].split('=')[1].split(">")[0].replace('"','')
                comp_link = 'http://openaccess.thecvf.com/'+link_+'='+date
                pdf_link.append(comp_link)
        return pdf_link

    def download_file(self, download_url=None):
        """
        Download the papers in the specified folder.
        """

        paper_count = 0
        paper_links = []
        
        if download_url is None:
            r = requests.get(self.url)
        else:
            r = requests.get(download_url)
        
        soup = BeautifulSoup(r.content, "html.parser")
        pdfs_ = soup.find_all('dd')
        for ix in str(pdfs_).split('\n'):
            if '[<a href="content_CVPR_' or '[<a href="content_cvpr_' in ix:
                if 'papers' in ix:
                    paper_count += 1
                    link = ix.split('[<a href="')[-1].split('">')[0]
                    paper_link = 'https://openaccess.thecvf.com/' + link
                    print('Paper: ', paper_link)
                    title = link.split('/')[-1].split('.')[0].split('_CVPR_')[0]
                    try:
                        response = urlopen(paper_link)
                        file = open(self.download_path + "/{}.pdf".format(title), 'wb')
                    except:
                        print("Paper # {} - {} Failed".format(paper_count, title))

                    file.write(response.read())
                    file.close()
                    print("Paper # {} - {} Done".format(paper_count, title))

    def bulk_download(self):
        year_int = int(self.year)
        if year_int >= 2018:
            pdf_link = self.get_pdf()
            for xi in range(len(pdf_link)):
                self.download_file(pdf_link[xi])
        else:
            self.download_file()

for year in tqdm(range(1980, 2023)):
    downloader = CVPRDownloader(year, '/gscratch/prl/wagnew3/microsoft_academic_graph/cvpr_papers/')
    downloader.bulk_download()

  0%|                                                                                      | 0/43 [00:00<?, ?it/s]

http://openaccess.thecvf.com/CVPR1980.py


  2%|█▊                                                                            | 1/43 [00:00<00:19,  2.15it/s]

http://openaccess.thecvf.com/CVPR1981.py


  5%|███▋                                                                          | 2/43 [00:00<00:15,  2.68it/s]

http://openaccess.thecvf.com/CVPR1982.py


  7%|█████▍                                                                        | 3/43 [00:01<00:13,  2.90it/s]

http://openaccess.thecvf.com/CVPR1983.py


  9%|███████▎                                                                      | 4/43 [00:01<00:12,  3.02it/s]

http://openaccess.thecvf.com/CVPR1984.py


 12%|█████████                                                                     | 5/43 [00:01<00:12,  3.08it/s]

http://openaccess.thecvf.com/CVPR1985.py


 14%|██████████▉                                                                   | 6/43 [00:02<00:11,  3.12it/s]

http://openaccess.thecvf.com/CVPR1986.py


 16%|████████████▋                                                                 | 7/43 [00:02<00:11,  3.18it/s]

http://openaccess.thecvf.com/CVPR1987.py


 19%|██████████████▌                                                               | 8/43 [00:02<00:10,  3.21it/s]

http://openaccess.thecvf.com/CVPR1988.py


 21%|████████████████▎                                                             | 9/43 [00:02<00:10,  3.22it/s]

http://openaccess.thecvf.com/CVPR1989.py


 23%|█████████████████▉                                                           | 10/43 [00:03<00:10,  3.24it/s]

http://openaccess.thecvf.com/CVPR1990.py


 26%|███████████████████▋                                                         | 11/43 [00:03<00:09,  3.23it/s]

http://openaccess.thecvf.com/CVPR1991.py


 28%|█████████████████████▍                                                       | 12/43 [00:03<00:09,  3.23it/s]

http://openaccess.thecvf.com/CVPR1992.py


 30%|███████████████████████▎                                                     | 13/43 [00:04<00:09,  3.25it/s]

http://openaccess.thecvf.com/CVPR1993.py


 33%|█████████████████████████                                                    | 14/43 [00:04<00:08,  3.23it/s]

http://openaccess.thecvf.com/CVPR1994.py


 35%|██████████████████████████▊                                                  | 15/43 [00:04<00:08,  3.25it/s]

http://openaccess.thecvf.com/CVPR1995.py


 37%|████████████████████████████▋                                                | 16/43 [00:05<00:08,  3.24it/s]

http://openaccess.thecvf.com/CVPR1996.py


 40%|██████████████████████████████▍                                              | 17/43 [00:05<00:08,  3.23it/s]

http://openaccess.thecvf.com/CVPR1997.py


 42%|████████████████████████████████▏                                            | 18/43 [00:05<00:07,  3.22it/s]

http://openaccess.thecvf.com/CVPR1998.py


 44%|██████████████████████████████████                                           | 19/43 [00:06<00:07,  3.23it/s]

http://openaccess.thecvf.com/CVPR1999.py


 47%|███████████████████████████████████▊                                         | 20/43 [00:06<00:07,  3.23it/s]

http://openaccess.thecvf.com/CVPR2000.py


 49%|█████████████████████████████████████▌                                       | 21/43 [00:06<00:06,  3.23it/s]

http://openaccess.thecvf.com/CVPR2001.py


 51%|███████████████████████████████████████▍                                     | 22/43 [00:06<00:06,  3.22it/s]

http://openaccess.thecvf.com/CVPR2002.py


 53%|█████████████████████████████████████████▏                                   | 23/43 [00:07<00:06,  3.21it/s]

http://openaccess.thecvf.com/CVPR2003.py


 56%|██████████████████████████████████████████▉                                  | 24/43 [00:07<00:05,  3.23it/s]

http://openaccess.thecvf.com/CVPR2004.py


 58%|████████████████████████████████████████████▊                                | 25/43 [00:07<00:05,  3.22it/s]

http://openaccess.thecvf.com/CVPR2005.py


 60%|██████████████████████████████████████████████▌                              | 26/43 [00:08<00:05,  3.24it/s]

http://openaccess.thecvf.com/CVPR2006.py


 63%|████████████████████████████████████████████████▎                            | 27/43 [00:08<00:04,  3.23it/s]

http://openaccess.thecvf.com/CVPR2007.py


 65%|██████████████████████████████████████████████████▏                          | 28/43 [00:08<00:04,  3.24it/s]

http://openaccess.thecvf.com/CVPR2008.py


 67%|███████████████████████████████████████████████████▉                         | 29/43 [00:09<00:04,  3.26it/s]

http://openaccess.thecvf.com/CVPR2009.py


 70%|█████████████████████████████████████████████████████▋                       | 30/43 [00:09<00:03,  3.27it/s]

http://openaccess.thecvf.com/CVPR2010.py


 72%|███████████████████████████████████████████████████████▌                     | 31/43 [00:09<00:03,  3.28it/s]

http://openaccess.thecvf.com/CVPR2011.py


 74%|█████████████████████████████████████████████████████████▎                   | 32/43 [00:10<00:03,  3.27it/s]

http://openaccess.thecvf.com/CVPR2012.py


 77%|███████████████████████████████████████████████████████████                  | 33/43 [00:10<00:03,  3.26it/s]

http://openaccess.thecvf.com/CVPR2013.py
Paper:  https://openaccess.thecvf.com/content_cvpr_2013/papers/Kim_Deformable_Spatial_Pyramid_2013_CVPR_paper.pdf
Paper # 1 - Kim_Deformable_Spatial_Pyramid_2013 Done
Paper:  https://openaccess.thecvf.com/content_cvpr_2013/papers/Sholomon_A_Genetic_Algorithm-Based_2013_CVPR_paper.pdf
Paper # 2 - Sholomon_A_Genetic_Algorithm-Based_2013 Done
Paper:  https://openaccess.thecvf.com/content_cvpr_2013/papers/Li_Exploring_Compositional_High_2013_CVPR_paper.pdf
Paper # 3 - Li_Exploring_Compositional_High_2013 Done
Paper:  https://openaccess.thecvf.com/content_cvpr_2013/papers/Shi_Hyperbolic_Harmonic_Mapping_2013_CVPR_paper.pdf
Paper # 4 - Shi_Hyperbolic_Harmonic_Mapping_2013 Done
Paper:  https://openaccess.thecvf.com/content_cvpr_2013/papers/Garg_Dense_Variational_Reconstruction_2013_CVPR_paper.pdf
Paper # 5 - Garg_Dense_Variational_Reconstruction_2013 Done
Paper:  https://openaccess.thecvf.com/content_cvpr_2013/papers/Takeda_Fusing_Depth_from_2013_CVPR_

 77%|███████████████████████████████████████████████████████████                  | 33/43 [01:30<00:27,  2.75s/it]

Paper # 120 - Xu_Incorporating_User_Interaction_2013 Failed





ValueError: write to closed file

In [None]:
from bs4 import BeautifulSoup
import requests
from seleniumrequests import Firefox

# Simple usage with built-in WebDrivers:

# def get_paper(paper_name):
paper_name="Imagenet: A large-scale hierarchical image database"
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
search_paper_name=paper_name.replace('+', '%2B').replace('%', '%25').replace('&', '&26').replace(':', '%3A').replace(' ', '%20')
url = f'https://www.semanticscholar.org/search?q={search_paper_name}&sort=relevance'
print(url)
response=requests.get(url,headers=headers)
soup=BeautifulSoup(response.content,'lxml')
i = soup.findAll(text=True)
print(i)
item=soup.find_all("div", class_="cl-paper-row serp-papers__paper-row paper-row-normal")

    # ref=item.select('[data-clk-atid]')
    # name=ref[1].text
    # url=ref[0].attrs['href']
    # print(url)
    # if paper_name.lower()!=name.lower():
    #     print(f'names dont match! {paper_name.lower()} {name.lower()}')
    

# paper_name="Imagenet: A large-scale hierarchical image database"
# get_paper(paper_name)

In [None]:
from stem import Signal
from stem.control import Controller

with Controller.from_port(port = 9051) as controller:
    controller.authenticate(password='your password set for tor controller port in torrc')
    print("Success!")
    controller.signal(Signal.NEWNYM)
    print("New Tor connection processed")

In [None]:
import pypdfium2 as pdfium
import requests
from tika import parser
from PyPDF2 import PdfReader

paper_name="Imagenet: A large-scale hierarchical image database"
url = 'https://ieeexplore.ieee.org/iel5/5191365/5206488/05206848.pdf'
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
response = requests.get(url,headers=headers)

file_name=f'/gscratch/prl/wagnew3/microsoft_academic_graph/scratch/{paper_name}.pdf'
print(file_name)
pdf = open(file_name, 'wb')
pdf.write(response.content)
print(len(response.content))
pdf.close()

EOF_MARKER = b'%%EOF'

with open(file_name, 'rb') as f:
    contents = f.read()
    # check if EOF is somewhere else in the file
    if EOF_MARKER in contents:
        # we can remove the early %%EOF and put it at the end of the file
        contents = contents.replace(EOF_MARKER, b'')
        contents = contents + EOF_MARKER
    else:
        # Some files really don't have an EOF marker
        # In this case it helped to manually review the end of the file
        print(contents[-8:]) # see last characters at the end of the file
        # printed b'\n%%EO%E'
        contents = contents[:-6] + EOF_MARKER
    with open(file_name.replace('.pdf', '') + '_fixed.pdf', 'wb') as f:
        f.write(contents)

fixed_name=file_name.replace('.pdf', '') + '_fixed.pdf'
reader = PdfReader(fixed_name)
text = ""
for page in reader.pages:
    text += page.extract_text() + "\n"

print(text)