In [1]:
!pip install pdfminer3

Collecting pdfminer3
  Downloading pdfminer3-2018.12.3.0.tar.gz (5.0 MB)
[K     |████████████████████████████████| 5.0 MB 8.3 MB/s 
[?25hCollecting pycryptodome
  Downloading pycryptodome-3.14.1-cp35-abi3-manylinux2010_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 42.8 MB/s 
Building wheels for collected packages: pdfminer3
  Building wheel for pdfminer3 (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer3: filename=pdfminer3-2018.12.3.0-py3-none-any.whl size=117822 sha256=fde62cb07ace5d54887313ab7d7a35199ee82191f93ad5c6f3c43b2fbca78b9e
  Stored in directory: /root/.cache/pip/wheels/f6/1b/21/339d1825e274c4a9829233a986f93dcedb98913f98e85b2916
Successfully built pdfminer3
Installing collected packages: pycryptodome, pdfminer3
Successfully installed pdfminer3-2018.12.3.0 pycryptodome-3.14.1


### Using PDF Miner library I converted all the data from the pdf to textual data in the form of string. We can use this string data to query the CPT codes from it

In [4]:
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
page_interpreter = PDFPageInterpreter(resource_manager, converter)

with open('/content/UHC-Commercial-Advance-Notification-Prior-Authorization-Requirements-10-1-2021.pdf', 'rb') as fh:

    for page in PDFPage.get_pages(fh,
                                  caching=True,
                                  check_extractable=True):
        page_interpreter.process_page(page)

    text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()



### We will do some data cleaning, first replacing the "*" in the data with space so that we can easily get CPT codes which have asterick in it. Also we need to remove the footers that are in each page. Since the footer consist of the number 2020. We don't want our code to consider it as a CPT code

In [5]:
text=text.replace("*"," ")
text=text.replace('CPT® is a registered trademark of the American Medical Association.' ,'')
text=text.replace('PCA-1-20-01515-Clinical-WEB_05262020' ,'')
text=text.replace('© 2020 United HealthCare Services, Inc.' ,'')


### Here's the preview of the textual data in the file

In [6]:
print(text[0:3000])

Prior Authorization Requirements  
for UnitedHealthcare 
Effective Oct. 1, 2021 

 

General Information 
This list contains notification/prior authorization review requirements for care providers who participate with United 
Healthcare Commercial for inpatient and outpatient services, as referenced in the 2021 UnitedHealthcare Care 
Provider Administrative Guide 

Specific state rules may apply. For more information on whether authorization is required or not, please go to 
UHCprovider.com and click on the UnitedHealthcare Provider Portal button in the top right corner. Then, select 
the Prior Authorization and Notification tool tile on your Provider Portal dashboard. 
This list changes periodically. Updates are announced routinely in the UnitedHealthcare Network Bulletin. If 
viewing a printed copy, please visit UHCprovider.com/priorauth > Advance Notification and Plan Requirement 
Resources > Select a Plan Type for the most current information. 
To provide notification/request prior

### Here I am using regular expressions to generate a particular CPT code. After analyzing the document, I found that there are many CPT codes of different variation. And few of them are not even numeric.

### I am generating three columns in my dataframe.
### 1. All the numeric CPT codes which requires prior authorization
### 2. All the non-numeric which is the combination of the codes which requires prior authorization
### 3. All the codes which do not require prior authorization
### 4. And finally all the CPT codes in the document


In [8]:
import re
import pandas as pd
extracted_code=[]
prior_auth_req=[]
main=text.split()
numeric_only=[]
alpha_numeric_only=[]
no_auth=[]

for i in range(0,len(main)):

  # no_auth list has all the codes which do not require prior authorization
  if main[i]=='Notification/prior' and main[i+1]=='authorization' and main[i+2]=='not':
    start=main.index(main[i+9])
    end=main.index('Z42.1')
    pattern1=re.compile(r'^[a-zA-Z].{1,4}[0-9]+$')

    for j in range(start,end+1):
      if pattern1.search(main[j]):
        no_auth.append(main[j])

  #numeric_only and alpha numeric_only are the lists which consist all the CPT codes which require prior authorization.
  elif main[i]=='Prior':
    
    start=main.index(main[i+3])

    end=len(main)

    pattern2=re.compile(r'^[a-zA-Z]{1,4}[0-9]+$')
    for j in range(start,end):
      if main[j].isdigit():
        numeric_only.append(main[j])
     
      elif pattern2.search(main[j]):
        alpha_numeric_only.append(main[j])

  else:
    pass

#The below for loop stores all the kind of CPT codes in the extracted_code list
for i in range(0,len(main)):
  if main[i]=='Notification/prior' and main[i+1]=='authorization' and main[i+2]=='not':
    start=main.index(main[i+9])
    end=main.index('Z42.1')
    pattern1=re.compile(r'^[a-zA-Z].{1,4}[0-9]+$')

    for j in range(start,end+1):
      if pattern1.search(main[j]):
        extracted_code.append(main[j])


  elif main[i]=='Prior':
    
    start=main.index(main[i+3])

    end=len(main)

    pattern2=re.compile(r'^[a-zA-Z]{1,4}[0-9]+$')
    for j in range(start,end):
      if main[j].isdigit():
        extracted_code.append(main[j])
     
      elif pattern2.search(main[j]):
        extracted_code.append(main[j])

  else:
    pass


#Finally creating a dataframe of all the codes
data=pd.DataFrame(numeric_only,columns=['Numeric CPT Codes that require PA'])
data['Alpha_Numeric CPT codes that require PA']=pd.Series(alpha_numeric_only)
data['No Authorization Required CPT Codes']=pd.Series(no_auth)
data['All Extracted CPT Codes']=pd.Series(extracted_code)
data

Unnamed: 0,Numeric CPT Codes that require PA,Alpha_Numeric CPT codes that require PA,No Authorization Required CPT Codes,All Extracted CPT Codes
0,2021,L8600,C50.019,2021
1,2021,Q5120,C50.112,2021
2,23473,Q5122,C50.219,23473
3,24362,J1454,C50.411,24362
4,27120,J2469,C50.512,27120
...,...,...,...,...
216715,33979,,,41105
216716,33929,,,41116
216717,33981,,,41825
216718,33975,,,42107


### Saving the file into CSV

In [12]:
data.to_csv('/content/final_dataframe_of_CPT_codes.csv')

### Based on my observation few codes in the breast reconstruction does not require a prior authorization. Hence I decided to have a separate column for it. There might be multiple ways in which we can have more filtered results, like based on the procedure name e.g Arthoplasty we only generate the CPT codes for that procedure only, that can also be done, studying the location in it and also deciding a terminating index can work to get such customised CPT codes.