In [22]:
from pyspark.sql import SparkSession
from langchain_text_splitters import RecursiveCharacterTextSplitter
import json
import os

In [7]:
# Create (or reuse) the Spark session for this notebook, with the driver
# host set to localhost.
spark = (
    SparkSession.builder
    .appName('SparkApp')
    .config('spark.driver.host', 'localhost')
    .getOrCreate()
)

In [3]:
def read_esg_jsons(lst_of_json_path):
    '''
    Takes in a list of JSON file paths as input
    and returns a list with the parsed content of each file,
    in the same order as the input paths.

    An empty input list gives back an empty list. Errors raised by
    open() or json.load() (missing file, malformed JSON) are not caught.
    '''
    res = []
    for json_path in lst_of_json_path:
        # JSON text is UTF-8 by specification; being explicit keeps
        # non-ASCII characters from being mis-decoded when the platform's
        # default encoding is something else.
        with open(json_path, 'r', encoding='utf-8') as f:
            res.append(json.load(f))
    return res

def generate_page_id_num_tokenstr_pair(
        pageid_doc_d,
        chunksize = 200,
        chunk_overlap = 20,
        separators = None
    ):
    '''
    Splits each page of a document into smaller tokens (text chunks).

    Inputs:
    pageid_doc_d  - dictionary with <source_path>_<page_id> of PDF as key
                    and page text in string as value
    chunksize     - handed to RecursiveCharacterTextSplitter as chunk_size
    chunk_overlap - handed to RecursiveCharacterTextSplitter as chunk_overlap
    separators    - list of separator strings handed to the splitter;
                    when None, the list built at the top of the function
                    body is used (blank line, newline, space, empty string)

    Returns a dictionary where
    the key is <source_path>_<page_id>_<token_num>
    (token_num counts from 0 within each page)
    and value as token in string format.
    An empty input dictionary gives back an empty dictionary.
    '''
    if separators is None:
        # Built on every call so no list object is shared between calls
        # (a list literal in the signature would be a single shared object).
        separators = ["\n\n", "\n", " ", ""]

    # initialise text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunksize,
        chunk_overlap = chunk_overlap,
        separators = separators
    )

    # One entry per chunk; the page key is extended with the chunk index.
    pageidnum_tokenstr_d = {
        f'{pageid}_{idx}': tokenstr
        for pageid, pagestring in pageid_doc_d.items()
        for idx, tokenstr in enumerate(text_splitter.split_text(pagestring))
    }
    return pageidnum_tokenstr_d

In [25]:
json_folder_dir = 'data/esg-json/'

# os.listdir returns entries in arbitrary order and also lists hidden
# entries and sub-directories (e.g. .ipynb_checkpoints). Keep only regular,
# non-hidden files and sort them so every run processes the same files in
# the same order.
lst_of_json_filename = sorted(
    json_filename
    for json_filename in os.listdir(json_folder_dir)
    if not json_filename.startswith('.')
    and os.path.isfile(os.path.join(json_folder_dir, json_filename))
)
lst_of_json_paths = [
    os.path.join(json_folder_dir, json_filename)
    for json_filename in lst_of_json_filename
]

# Load each file, then split every loaded document into chunks.
lst_of_jsons = read_esg_jsons(lst_of_json_paths)

# Note: the overlap passed here (10) differs from the function's default (20).
lst_of_tokenized_jsons = [
    generate_page_id_num_tokenstr_pair(loaded_json, 200, 10, ["\n\n", "\n", " ", ""])
    for loaded_json in lst_of_jsons
]

In [26]:
# Preview only the first five chunks of the first document; displaying the
# whole structure would write every chunk of every document into the
# notebook output.
[dict(list(tokenized_json.items())[:5]) for tokenized_json in lst_of_tokenized_jsons[:1]]

[{'data/esg-pdf/2023-saudi-aramco-sustainability-report-full-en.pdf_0_0': 'Aramco\nSustainability Report 2023\nInvesting in growth \nInnovating for sustainability',
  'data/esg-pdf/2023-saudi-aramco-sustainability-report-full-en.pdf_1_0': 'ARAMCO | SUSTAINABILITY REPORT 2023\n01\nAbout this report\nWe are Aramco, one \nof the world’s largest \nintegrated energy and \nchemicals companies\nOur vision',
  'data/esg-pdf/2023-saudi-aramco-sustainability-report-full-en.pdf_1_1': 'Aramco’s vision is to be the world’s preeminent \nintegrated energy and chemicals company, operating \nin a safe, sustainable, and reliable manner.\nOur mission',
  'data/esg-pdf/2023-saudi-aramco-sustainability-report-full-en.pdf_1_2': 'Aramco strives to provide reliable, affordable, and \nmore sustainable energy to communities around the \nworld, and to deliver value to its shareholders through',
  'data/esg-pdf/2023-saudi-aramco-sustainability-report-full-en.pdf_1_3': 'business cycles by maintaining its preeminen