In [4]:
import PyPDF2
import re
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Pages are read in document order and their text is concatenated with no
    separator, so the last line of one page runs directly into the first line
    of the next.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The concatenated text of all pages.
    """
    # Open the PDF file in binary mode
    with open(pdf_path, 'rb') as file:
        # PdfReader and its .pages sequence replace PdfFileReader /
        # getNumPages() / getPage(), which were deprecated in PyPDF2 2.x and
        # removed in 3.0.
        pdf_reader = PyPDF2.PdfReader(file)

        # Extract while the file is still open. `or ""` keeps the join safe
        # if a page yields no text.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    return text

# PDF of MO codes to parse; a relative path, so it is resolved against the current working directory.
pdf_path = 'MO_CODES_Numerical_20191119.pdf'


In [5]:
# Extract text from MO Codes document
extracted_text = extract_text_from_pdf(pdf_path)

# Use regex to find all code entries (4 digits, a space, a capital letter, and any further text).
# Raw strings keep `\d` from being read as an (invalid) string escape sequence.
raw_matches = re.findall(r'\d{4} [A-Z].+', extracted_text)

# Some lines had extra text lumped onto the end from another part of the PDF, so this cleans that up
# by cutting everything from one of these markers through the end of the entry.
trailing_junk = re.compile(r'SUSPICIOUS.+|NUMERIC.+|MO CODE.+|TRAFFIC.+')
matches = [trailing_junk.sub('', entry).strip() for entry in raw_matches]

# Preview the matches
matches[:5]

['0100 Suspect Impersonate',
 '0101 Aid victim',
 '0102 Blind',
 '0103 Physically disabled',
 '0104 Customer']

In [6]:
# Separate codes from descriptions
# One anchored pattern captures both parts in a single pass: the 4-digit code, then
# (optionally) a space and the description text that follows it. An entry whose
# description was stripped away entirely gets an empty description instead of
# raising on a failed match.
entry_pattern = re.compile(r'^(\d{4})(?: (.*))?')
parsed_entries = [entry_pattern.match(x) for x in matches]

# Get all 4-digit numbers
codes = [m.group(1) for m in parsed_entries]
# Get all text that comes after a 4-digit number and a space
descriptions = [m.group(2) or '' for m in parsed_entries]

# Make a dataframe of it
moCodes_df = pd.DataFrame({'code': codes, 'description': descriptions})
moCodes_df

Unnamed: 0,code,description
0,0100,Suspect Impersonate
1,0101,Aid victim
2,0102,Blind
3,0103,Physically disabled
4,0104,Customer
...,...,...
817,4021,T/C - Topanga
818,4024,T/C - Central Traffic (CTD)
819,4025,T/C - South Traffic (STD)
820,4026,T/C - Valley Traffic (VTD)


In [7]:
# Save as CSV in the project
# index=False leaves the DataFrame's row index out of the file.
# NOTE(review): `code` values are strings with leading zeros (e.g. "0100"); anything reading this
# file back likely needs dtype={'code': str} to keep them — confirm with consumers.
moCodes_df.to_csv('moCodes_csv.csv', index=False)