<a href="https://colab.research.google.com/github/tylerdq/notebooks/blob/master/pdfgrep_gdrive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plot key terms in PDF file(s) from a Google Drive folder

In [0]:
#@title Set up
#@markdown 1. Execute this cell
#@markdown 2. Follow authentication link & instructions
#@markdown 3. Click folder icon in left sidebar
#@markdown 4. Navigate to the location under "drive" folder with your PDFs
#@markdown 5. Right-click the folder, click "Copy path", paste when requested
!sudo apt install pdfgrep
import subprocess, os, glob, csv
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
path = input('Please enter the path with the PDFs you would like to search: ')
os.chdir(path)
print('Thank you, please proceed to the next cell')

In [0]:
#@title Term and page count for a given search phrase in all PDFs
#@markdown *Note: First time is slow due to caching, subsequent runs are faster*
term = "" #@param {type:"string"}


def run_pdfgrep(mode_flag, pattern, files):
    """Run a case-insensitive, cached pdfgrep search and return its stdout as text.

    Args:
        mode_flag: '-p' for per-page match counts or '-c' for per-file counts.
        pattern: the search expression handed to pdfgrep.
        files: the PDF paths to search.

    Returns:
        The decoded standard output; empty text when nothing was printed.

    Raises:
        subprocess.CalledProcessError: pdfgrep exited with a status other
            than 0 (matches found) or 1 (no matches found).
    """
    # -H forces the "file:" prefix even when only one PDF is searched, so the
    # output always has the same number of fields.  '--' ends option parsing so
    # a pattern or file name starting with '-' is not taken for an option.
    proc = subprocess.run(
        ['pdfgrep', '-i', '-H', mode_flag, '--cache', '--', pattern] + list(files),
        stdout=subprocess.PIPE)
    # Like grep, pdfgrep reports "no match" with exit status 1; that is a
    # normal outcome here, not a failure.
    if proc.returncode not in (0, 1):
        proc.check_returncode()
    return proc.stdout.decode('utf-8')


def parse_count_lines(output, n_fields):
    """Split pdfgrep count lines into rows of [file, int, ...] with n_fields items.

    The numeric fields sit at the end of each line, so the line is split from
    the right; a ':' inside the file name therefore stays part of the name.
    Lines with fewer than n_fields fields (e.g. blank lines) are skipped.
    """
    rows = []
    for line in output.splitlines():
        parts = line.rsplit(':', n_fields - 1)
        if len(parts) == n_fields:
            rows.append(parts[:1] + [int(value) for value in parts[1:]])
    return rows


def summarize_counts(page_rows, file_rows):
    """Combine per-page and per-file rows into one frame: file, pages, count.

    Args:
        page_rows: [file, page, count] rows, one per page that has a match.
        file_rows: [file, count] rows, one per searched file.

    Returns:
        A DataFrame with one row per file that has at least one matching page:
        'pages' is the number of matching pages, 'count' the per-file total.
    """
    if not page_rows:
        return pd.DataFrame(columns=['file', 'pages', 'count'])
    page_df = pd.DataFrame(page_rows, columns=['file', 'page', 'count'])
    file_df = pd.DataFrame(file_rows, columns=['file', 'count'])
    pages_per_file = page_df.groupby('file').size().to_frame('pages').reset_index()
    # Inner merge on 'file': files without any matching page drop out.
    return pages_per_file.merge(file_df)


if not term:
    print('Please enter a search term in the form field and run this cell again')
else:
    # Sorted so the bars appear in the same order on every run.
    pdf_files = sorted(glob.glob('*pdf'))
    if not pdf_files:
        print('No PDF files found in ' + os.getcwd())
    else:
        df = summarize_counts(
            parse_count_lines(run_pdfgrep('-p', term, pdf_files), 3),
            parse_count_lines(run_pdfgrep('-c', term, pdf_files), 2))
        if df.empty:
            # plot.bar raises on a frame with nothing to draw.
            print('No matches found for ' + repr(term))
        else:
            count = df.plot.bar(x='file', y='count', rot=90, figsize=(12, 6))
            pages = df.plot.bar(x='file', y='pages', rot=90, figsize=(12, 6))

In [0]:
#@title Term count for each page with matches in a single PDF
filename = "" #@param {type:"string"}
term = "" #@param {type:"string"}


def parse_page_counts(output):
    """Turn pdfgrep ``page:count`` lines into a list of (page, count) int pairs.

    Lines without a ':' separator (e.g. blank lines) are skipped.
    """
    pairs = []
    for line in output.splitlines():
        page, sep, hits = line.partition(':')
        if sep:
            pairs.append((int(page), int(hits)))
    return pairs


if not filename or not term:
    print('Please fill in both the filename and the term, then run this cell again')
else:
    # '--' ends option parsing so a term or file name starting with '-' is not
    # taken for an option.
    proc = subprocess.run(
        ['pdfgrep', '-i', '-p', '--cache', '--', term, filename],
        stdout=subprocess.PIPE)
    # Like grep, pdfgrep reports "no match" with exit status 1; only other
    # non-zero statuses are raised as subprocess.CalledProcessError.
    if proc.returncode not in (0, 1):
        proc.check_returncode()
    # Values are already integers after parsing, so no dtype conversion is needed.
    df = pd.DataFrame(parse_page_counts(proc.stdout.decode('utf-8')),
                      columns=['page', 'count'])
    if df.empty:
        # plot.bar raises on a frame with nothing to draw.
        print('No matches found for ' + repr(term) + ' in ' + filename)
    else:
        ax = df.plot.bar(x='page', y='count', rot=90, figsize=(24, 6))