In [None]:
!pip install PyPDF2
!pip install ntlk

### 1. Create Helper Function to Extract Text from PDF
- Use PyPDF2 to read each PDF and extract its text

In [22]:
import PyPDF2

def pdfToText(pdfFilePath, outputFilePath):
    """Extract the text of every page of a PDF and write it to a file.

    Args:
        pdfFilePath: Path of the PDF file to read.
        outputFilePath: Path of the file that receives the extracted text.
            An existing file at this path is overwritten.

    Returns:
        None. Progress is reported with ``print``.
    """
    print(f"Processing {pdfFilePath}")

    # The context manager closes the PDF even when parsing or text
    # extraction raises, so no file handle is leaked on a bad document.
    with open(pdfFilePath, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` is defensive: a page that yields no text must not break
        # the join below.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Pages are joined with a newline so that the last word of one page is
    # not glued to the first word of the next page.
    text = "\n".join(page_texts)

    # NOTE: no encoding is passed, so the platform default is used; a reader
    # of this file has to open it the same way.
    with open(outputFilePath, 'w') as f:
        f.write(text)

    # Print a message to confirm that the file has been saved
    print(f"Text saved to {outputFilePath}.")

### 2. Run the helper function on each ESG report and store the sentences in a dataframe
- Each row represents a sentence
- Save each dataframe as a CSV file

In [23]:
import unicodedata
def remove_non_english_chars(text):
    """Return ``text`` reduced to its ASCII characters.

    The text is first NFD-normalized, which separates an accented letter
    into its base letter and a combining mark. Combining marks (category
    ``Mn``) and every character with a code point of 128 or above are then
    dropped, so "é" becomes "e" while characters with no ASCII base letter
    disappear entirely.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = []
    for ch in decomposed:
        # Outside the ASCII range: discard.
        if ord(ch) >= 128:
            continue
        # Non-spacing combining mark left over from the decomposition.
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

In [24]:
import os
import re
import nltk
import pandas as pd

input_folder = 'ESG_reports'
output_folder = 'pdf_text'

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(output_folder, exist_ok=True)

# A line made up only of digits is treated as a page number. It must be
# removed while line breaks are still present: once the text has been
# flattened to one line, a page number cannot be told apart from other digits.
PAGE_NUMBER_LINE_RE = re.compile(r"^[ \t]*\d+[ \t]*$", re.MULTILINE)

# Second-pass sentence boundary: split after ".", "?" or "!" when the next
# character is an upper-case letter, except after abbreviation-like tokens
# ("e.g.", "Mr.") or a digit followed by a period. Written as a raw string so
# the regex escapes are not read as (invalid) Python string escapes.
SENTENCE_BOUNDARY_RE = re.compile(
    r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)(?<!\d\.)\s*(?=[A-Z])"
)

# sorted() gives a reproducible processing order; os.listdir order is arbitrary.
for filename in sorted(os.listdir(input_folder)):
    stem, extension = os.path.splitext(filename)
    # Only PDFs can be parsed; this also skips ".DS_Store" and other stray files.
    if extension.lower() != ".pdf":
        continue

    input_filename = os.path.join(input_folder, filename)
    output_filename = os.path.join(output_folder, stem + ".csv")
    # pdfToText writes the raw text to output_filename; the same path is
    # overwritten with the sentence CSV at the end of this iteration.
    pdfToText(input_filename, output_filename)

    # read the extracted text back
    with open(output_filename, 'r') as f:
        text = f.read()

    # clean the data
    text = remove_non_english_chars(text)
    text = PAGE_NUMBER_LINE_RE.sub("", text)  # remove page numbers (digit-only lines)
    text = re.sub(r'\s+', ' ', text)  # collapse line breaks and repeated whitespace

    # NOTE(review): nltk.sent_tokenize needs NLTK's Punkt tokenizer data; no
    # download call is visible in this cell - confirm it is installed.
    sentences = nltk.sent_tokenize(text)  # split the text into sentences
    processed_sentences = []
    for sentence in sentences:
        sentence = re.sub(r"\.(\s*\.){1,}", ". ", sentence)  # Remove consecutive periods
        split_sentences = SENTENCE_BOUNDARY_RE.split(sentence)  # split further on missed boundaries

        # do not include empty sentences or "."
        split_sentences = [s for s in split_sentences if s != "" and s != "."]
        processed_sentences.extend(split_sentences)

    # create a dataframe with one sentence per row and save it as CSV
    df = pd.DataFrame({'sentences': processed_sentences})
    df.to_csv(output_filename, index=False)

Processing ESG_reports/NKE_2021.pdf
Text saved to pdf_text/NKE_2021.csv.
Processing ESG_reports/WFC_2022.pdf
Text saved to pdf_text/WFC_2022.csv.
Processing ESG_reports/QCOM_2022.pdf
Text saved to pdf_text/QCOM_2022.csv.
Processing ESG_reports/ABT_2021.pdf
Text saved to pdf_text/ABT_2021.csv.
Processing ESG_reports/BMY_2021.pdf
Text saved to pdf_text/BMY_2021.csv.
Processing ESG_reports/MCD_2020.pdf
Text saved to pdf_text/MCD_2020.csv.
Processing ESG_reports/DHR_2022.pdf
Text saved to pdf_text/DHR_2022.csv.
Processing ESG_reports/T_2022.pdf
Text saved to pdf_text/T_2022.csv.
Processing ESG_reports/PM_2022.pdf
Text saved to pdf_text/PM_2022.csv.
Processing ESG_reports/DIS_2022.pdf
Text saved to pdf_text/DIS_2022.csv.
Processing ESG_reports/UPS_2021.pdf
Text saved to pdf_text/UPS_2021.csv.
Processing ESG_reports/AMD_2021.pdf
Text saved to pdf_text/AMD_2021.csv.
Processing ESG_reports/ADBE_2021.pdf
Text saved to pdf_text/ADBE_2021.csv.
Processing ESG_reports/CSCO_2022.pdf
Text saved to pd