# Scraped `.pdf` to `.txt` Speed Test

1. Loop
2. List Comprehension

In [1]:
import sys
# Show the exact interpreter build so the timings below can be tied to a Python version.
print(sys.version)

3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 21:00:12) [Clang 16.0.6 ]


In [2]:
import time as tm
import datetime as dt
from datetime import datetime # .now
# Timestamp taken when this cell runs, stored as display text rather than a datetime.
program_begin = f"{datetime.now():%Y-%m-%d, %H:%M:%S}"
print(program_begin)

2024-03-04, 00:45:51


In [3]:
import os
import numpy as np
import pandas as pd
from PyPDF2 import PdfReader

In [4]:
# Folder (relative to the working directory) holding the immigrant visa PDFs.
path = 'iv'
# Check for a directory specifically: a regular file named 'iv' would pass a
# plain existence check but break the os.listdir(path) call that follows.
print(os.path.isdir(path))

True


In [5]:
# The folder also contains an automatically generated .DS_Store entry, so the
# raw listing must be filtered before it can serve as the document list.
sorted(os.listdir(path))[:5]

['.DS_Store',
 'iv_2017-03-31.pdf',
 'iv_2017-04-30.pdf',
 'iv_2017-05-31.pdf',
 'iv_2017-06-30.pdf']

In [6]:
# Absolute path of every PDF: working directory + folder + file name.
# The listing is sorted by name so the files start from 2017 and end with the
# most recent month; entries that are not 'iv_*.pdf' are skipped.
full_names = [
    f'{os.getcwd()}/{path}/{entry}'
    for entry in sorted(os.listdir(path))
    if entry.startswith('iv_') and entry.endswith('.pdf')
]
print('The first file is:\n' + full_names[0])
print('The last file is:\n' + full_names[-1])

The first file is:
/Users/tiangeng/Documents/python_files/iv/iv_2017-03-31.pdf
The last file is:
/Users/tiangeng/Documents/python_files/iv/iv_2024-01-31.pdf


In [7]:
# Root folder for the converted output.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
public_data_folder = r'/Users/tiangeng/Public/data'
# Name of the sub-folder under public_data_folder that receives the .txt files.
path_txt = 'ivtxt'

In [8]:
def dtime(file):
    """Return the last-modification time of ``file`` as formatted text.

    Parameters
    ----------
    file : path-like
        Path handed to ``os.path.getmtime``.

    Returns
    -------
    str
        The modification time in local time, formatted as
        ``YYYY-MM-DD, Weekday, HH:MM:SS``.
    """
    modified_at = datetime.fromtimestamp(os.path.getmtime(file))
    return modified_at.strftime("%Y-%m-%d, %A, %H:%M:%S")

In [9]:
# Build the output folder path once instead of repeating the concatenation.
# This also keeps quotes out of the f-string below: reusing the enclosing quote
# type inside an f-string is a SyntaxError on Python versions before 3.12.
txt_folder = public_data_folder + '/' + path_txt
if not os.path.exists(txt_folder):
    print('Create a new folder to store the converted .txt files.')
    os.makedirs(txt_folder)
else:
    # NOTE: dtime reports the folder's last-modification time
    # (os.path.getmtime), which is not necessarily when it was created.
    print(f'The folder exists, it was created at:\n{dtime(txt_folder)}')

The folder exists, it was created at:
2024-03-03, Sunday, 22:38:49


In [10]:
# Derive each output name from its PDF path: take the last '/'-separated
# component, keep what precedes its first dot, and append '.txt'.
txt_names = [pdf_path.rsplit('/', 1)[-1].partition('.')[0] + '.txt'
             for pdf_path in full_names]
print(f"The first .txt filename is:\n{txt_names[0]}\nThere're {len(txt_names)} .txt files.")

The first .txt filename is:
iv_2017-03-31.txt
There're 83 .txt files.


# `.pdf` to `.txt` Conversion

In [11]:
# One PdfReader per PDF, in the same order as full_names; both timed sections
# below work from these reader objects.
pdf_readers_list = list(map(PdfReader, full_names))
print(pdf_readers_list[0])

<PyPDF2._reader.PdfReader object at 0x159cc56d0>


## List Comprehension

In [12]:
# Benchmark 1: one nested list comprehension that flattens every page of every
# PDF into a single list of page texts, bracketed by wall-clock timestamps.
print(datetime.now().strftime("%Y-%m-%d, %H:%M:%S"))
start_time = datetime.now()
# Iterate over the pages directly rather than indexing reader.pages by
# position; the extracted texts and their order are the same.
test_list = [page.extract_text()
             for reader in pdf_readers_list
             for page in reader.pages]
end_time = datetime.now()
print(datetime.now().strftime("%Y-%m-%d, %H:%M:%S"))
# Last expression of the cell, so the notebook displays it as the cell output.
f'The list comprehension takes: {round((end_time-start_time).total_seconds())} seconds.'

2024-03-04, 00:45:54
2024-03-04, 00:46:42


'The list comprehension takes: 48 seconds.'

In [13]:
# Total number of page texts collected across all PDFs.
len(test_list)

3946

In [14]:
# Inspect the text extracted from the last page of the last PDF.
test_list[-1]

'Foreign State of Chargeability                                                                  \nor Place of Birth Visa Class IssuancesImmigrant Visa Issuances                                                                                                           \nby Foreign State of Chargeability                                                              \nor Place of Birth                                                                                           \nJanuary 2024 (FY 2024)\nYemen FX 18                         \nYemen IR1 162                       \nYemen IR2 85                         \nYemen IR5 69                         \nYemen IW 1                           \nZambia CR1 3                           \nZambia E3 7                           \nZambia F4 2                           \nZambia FX 1                           \nZambia I5 4                           \nZambia IR1 3                           \nZambia IR2 4                           \nZambia IR5 2           

In [15]:
# Page count of each PDF, in the same order as pdf_readers_list.
n_pages = [len(pdf.pages) for pdf in pdf_readers_list]
print(n_pages)

[60, 65, 68, 67, 62, 62, 52, 71, 68, 67, 65, 67, 67, 70, 71, 70, 68, 68, 57, 69, 66, 66, 65, 65, 64, 69, 68, 68, 69, 57, 58, 68, 65, 65, 63, 59, 52, 13, 6, 8, 15, 17, 28, 20, 21, 23, 22, 20, 28, 35, 37, 40, 41, 41, 42, 38, 38, 39, 38, 40, 41, 40, 39, 41, 38, 42, 39, 40, 40, 38, 39, 39, 42, 38, 40, 39, 39, 35, 37, 38, 37, 37, 37]


**Cut the flat list of pages into uneven, per-document groups using `islice` from `itertools`**

In [16]:
# Regroup the flat page list into one sub-list per PDF. A single shared
# iterator is consumed chunk by chunk, each chunk as long as that PDF's page
# count, so consecutive chunks never overlap.
print(datetime.now().strftime("%Y-%m-%d, %H:%M:%S"))
from itertools import islice
it = iter(test_list)
sliced = [list(islice(it, n)) for n in n_pages]
print(datetime.now().strftime("%Y-%m-%d, %H:%M:%S"))

2024-03-04, 00:46:42
2024-03-04, 00:46:42


## Loop

In [17]:
# Benchmark 2: an explicit loop over the documents that builds one list of
# page texts per PDF (new_TEXT[i] holds the pages of pdf_readers_list[i]).
new_TEXT = [None] * len(pdf_readers_list)
loop_start = datetime.now()
for i, reader in enumerate(pdf_readers_list):  # i loops over the pdf documents
    # Touch only the current document's pages; collecting the pages of every
    # reader on each pass would be repeated work inside the timed region.
    new_TEXT[i] = [pg.extract_text() for pg in reader.pages]
    # Progress marker on every tenth document.
    if i % 10 == 0:
        print(f'Iter #{i+1}: {datetime.now().strftime("%Y-%m-%d, %H:%M:%S")}')
loop_end = datetime.now()

Iter #1: 2024-03-04, 00:46:43
Iter #11: 2024-03-04, 00:46:49
Iter #21: 2024-03-04, 00:46:56
Iter #31: 2024-03-04, 00:47:03
Iter #41: 2024-03-04, 00:47:07
Iter #51: 2024-03-04, 00:47:11
Iter #61: 2024-03-04, 00:47:16
Iter #71: 2024-03-04, 00:47:22
Iter #81: 2024-03-04, 00:47:28


In [18]:
# Elapsed wall-clock time between loop_start and loop_end, rounded to whole seconds.
f'The loop takes: {round((loop_end-loop_start).total_seconds())} seconds.'

'The loop takes: 47 seconds.'

In [19]:
# Shape check on the loop result: outer list is per file, inner lists are per page.
print(len(new_TEXT)) # number of files
print(len(new_TEXT[-1])) # number of pages in the last file

83
37


# Save `.txt` to File

In [None]:
%%timeit
# Write one .txt file per PDF: each page's text followed by a newline.
# - strict=True fails loudly if the name list and the text list ever differ
#   in length, instead of silently pairing the wrong items.
# - The context manager closes each file even if a write raises.
# - The encoding is explicit so the output does not depend on the locale default.
out_dir = public_data_folder + '/' + path_txt
for txt, pages in zip(txt_names, new_TEXT, strict=True):
    with open(out_dir + '/' + txt, 'w', encoding='utf-8') as file:
        for each_page in pages:
            file.write(each_page + "\n")