In [None]:
# !pip install pymupdf
# !pip install pdfplumber - this one is for tables, but I couldn't get it to work properly

In [1]:
import fitz
import pandas as pd
from pathlib import Path
import re
from collections import defaultdict
from fuzzywuzzy import fuzz, process



In [2]:
# Locate the handbook PDF: it sits in the "raw" folder two levels above the working directory
preprocessing_folder = Path.cwd().parent.parent
pdf_path = preprocessing_folder / "raw" / "Handbook_for_Solar_PV.pdf"

In [3]:
# Open the handbook with PyMuPDF (imported as `fitz`); `pdf` is indexed by page number further down
pdf = fitz.open(pdf_path)

In [4]:
#Params
# Index of the contents page within `pdf` (the page is looked up with pdf[contents_page_num])
contents_page_num = 1
# Page range of the main document body; these are the same values as the range
# given to pdf.pages(...) when the main text is extracted.
# NOTE(review): presumably 0-based with an exclusive end, like range() - confirm against the PyMuPDF docs
main_page_start = 5
main_page_end = 31

# 1. Main document

### Obtain list of chapters and subheaders from contents page (to verify later that we have covered all sections)
Saved as dictionaries of lists

In [5]:
#Extract contents page
contents_page = pdf[contents_page_num]

#Initiate 2 defaultdicts
#  chapters_subheaders1: chapter name    -> list of its subheader1 names
#  subheaders1_2:        subheader1 name -> list of its subheader2 names
chapters_subheaders1 = defaultdict(list)
subheaders1_2 = defaultdict(list)

## Cleaning ##
#Split items, remove \n and non-header items
contents = contents_page.get_text().replace("�", " ")
#Entries are split wherever a new line starts with a digit; the split consumes that digit,
#so a numbered subheader such as "1.2 Foo" is left starting with "."
contents_lst = re.split(r"\n\d", contents)
contents_lst = [re.sub(r"\n", "", x).strip() for x in contents_lst]
contents_lst = [x for x in contents_lst if len(x) > 1 and x != "Contents"]

#Categorise items
#Track the most recent chapter / subheader1 explicitly. Looking up the last dictionary key
#instead fails with IndexError when no parent exists yet, and returns the wrong parent when a
#name repeats (re-assigning an existing key keeps its original position in the dictionary).
chapter = None
subheader1 = None

for s in contents_lst:

    #Strip all non-alphabets, with some exceptions
    s_alpha = re.sub(r'[^a-zA-Z\s()“”–]', '', s).strip()

    #If line starts with . - subheader1
    if s.startswith("."):
        if chapter is None:
            #Orphan subheader (no chapter seen yet) - nothing to attach it to
            continue
        chapters_subheaders1[chapter].append(s_alpha)
        subheaders1_2[s_alpha] = []
        subheader1 = s_alpha

    #If bullet is in line - subheader2
    elif "•" in s:
        if subheader1 is None:
            #Orphan bullet (no subheader1 seen yet) - nothing to attach it to
            continue
        subheaders1_2[subheader1].append(s_alpha)

    #Otherwise chapter name
    else:
        chapters_subheaders1[s_alpha] = []
        chapter = s_alpha

### Extract info from main document
1. Obtain chapter names and subheaders
2. Clean chapter names - first page of each chapter tends to have incomplete chapter names due to formatting

In [26]:
def headers_lines_main(pdf_main):

    """
    Tag every non-empty text span in the document with the chapter and subheaders it falls under.

    Headings are recognised purely from span formatting:
        - font size above 10                      -> chapter name
        - font "Univers-BoldOblique"              -> subheader 1
        - font "Univers-Light" with non-zero color -> subheader 2
    Heading spans themselves are also emitted (their text equals the heading they set).
    Subheader 2 is reset whenever the chapter or subheader 1 changes; subheader 1 is
    carried over when the chapter changes.

    Arguments:
        pdf_main: iterable of pymupdf pages (each must support page.get_text("dict"))
    Returns:
        List of tuples; each text span is represented as a tuple
        (chapter, subheader1, subheader2, text). Header fields are None until the
        first heading of that level has been seen.

    """

    #Initiate empty list
    lines_tagged_lst = []

    #Initiate chapters and subheaders as none
    curr_chapter = None
    curr_subheader1 = None
    curr_subheader2 = None

    for page in pdf_main:
        for block in page.get_text("dict")["blocks"]:
            #only blocks of type 0 contain text
            if block['type'] != 0:
                continue

            for line in block["lines"]:
                for span in line["spans"]:

                    #Replace the placeholder character BEFORE stripping, so that spans made up
                    #only of placeholders count as empty and no stray edge spaces are left behind
                    text = span["text"].replace("�", " ").strip()

                    #skip empty spans, figure captions and chapter headers
                    if text == "" or text.startswith(("figure", "Chapter")):
                        continue

                    #update current chapter if font size is large and we have reached a new chapter
                    if span["size"] > 10:
                        if curr_chapter != text:
                            curr_chapter = text
                            curr_subheader2 = None #reset subheader2
                    #update subheader 1 if italic
                    elif span["font"] == "Univers-BoldOblique":
                        curr_subheader1 = text
                        curr_subheader2 = None #reset subheader2
                    #update subheader 2 if same font as regular text but different color
                    elif span["font"] == 'Univers-Light' and span["color"] != 0:
                        curr_subheader2 = text

                    lines_tagged_lst.append((curr_chapter, curr_subheader1, curr_subheader2, text))

    return lines_tagged_lst

In [27]:
#Restrict to the main body using the page-range parameters defined in the Params cell
#(previously the same numbers were hard-coded here, leaving the parameters unused)
pdf_main = pdf.pages(main_page_start, main_page_end)
headers_lines_m = headers_lines_main(pdf_main)
headers_lines_df = pd.DataFrame(headers_lines_m, columns = ['Chapter', 'Subheader1', 'Subheader2', 'Text'])

In [30]:
#Drop irrelevant rows
def to_drop(x):
    """
    Decide whether a tagged row should be discarded.

    Arguments:
        x: row with 'Text', 'Chapter', 'Subheader1' and 'Subheader2' entries
    Returns:
        1 if the text merely repeats one of its own headers or is shorter than
        2 characters, otherwise 0
    """
    text = x['Text']
    header_matches = [text == x[level] for level in ('Chapter', 'Subheader1', 'Subheader2')]

    #Row is just a heading repeated as text
    if any(header_matches):
        return 1

    #Row is too short to carry content
    return 1 if len(text) < 2 else 0

#Flag every row with to_drop (1 = discard) and keep only the rows flagged 0
rows_flagged = headers_lines_df.apply(to_drop, axis = 1)
headers_lines_df = headers_lines_df[rows_flagged == 0].copy()

In [31]:
#List of chapter names from contents page, vs those parsed from main doc
chapters = list(chapters_subheaders1.keys())
#Rows that appear before the first chapter heading have no chapter (None); drop those first
#and lower-case through the .str accessor so a missing value cannot raise
chapters_doc = headers_lines_df['Chapter'].dropna().str.lower().unique().tolist()

In [32]:
#Fuzzy matching to obtain full chapter name
#Minimum fuzzywuzzy score (0-100) a parsed name must exceed to be accepted as a match
#NOTE(review): in the saved output 'installing a solar pv system' scores 86 against
#'Solar Photovoltaic (“PV”) Systems – An Overview', which looks like a false match while
#every genuine match scores 90+ - consider raising the threshold after confirming
min_match_score = 50

response = []
for name_to_find in chapters_doc:
    resp_match = process.extractOne(name_to_find, chapters)
    #extractOne gives None when there is nothing to match against
    if resp_match is not None and resp_match[1] > min_match_score:
        row = {'original_name':name_to_find,'matched_name':resp_match[0], 'score':resp_match[1]}
        response.append(row)

#Name the columns explicitly so they exist even when nothing matched
#(the merge cell selects 'original_name' and 'matched_name')
results = pd.DataFrame(response, columns = ['original_name', 'matched_name', 'score'])

results

Unnamed: 0,original_name,matched_name,score
0,an overview,Solar Photovoltaic (“PV”) Systems – An Overview,90
1,solar photovoltaic (“pv”) systems – an overview,Solar Photovoltaic (“PV”) Systems – An Overview,100
2,on a building,Solar PV Systems on a Building,90
3,solar pv systems on a building,Solar PV Systems on a Building,100
4,contractor,Appointing a Solar PV System Contractor,90
5,appointing a solar pv system contractor,Appointing a Solar PV System Contractor,100
6,requirements,Solar PV System Installation Requirements,90
7,solar pv system installation requirements,Solar PV System Installation Requirements,100
8,installing a solar pv system,Solar Photovoltaic (“PV”) Systems – An Overview,86
9,maintenance,Operations and Maintenance,90


In [33]:
#Merge in correct chapter names
#  - drop any columns left by a previous run of this cell, so re-running does not create
#    suffixed duplicates (original_name_x, matched_name_y, ...)
#  - lower-case through .str so a row with no chapter becomes NaN instead of raising
#  - left merge: rows whose chapter has no match keep a null matched_name
headers_lines_df = (
    headers_lines_df
    .drop(columns = ["original_name", "matched_name"], errors = "ignore")
    .assign(Chapter = lambda d: d['Chapter'].str.lower())
    .merge(results[["original_name", "matched_name"]],
           left_on = "Chapter",
           right_on = "original_name",
           how = "left")
)

In [34]:
#Drop rows with no matched names (only 3 rows), then use the matched name as the chapter.
#Filter and column selection are done in one step: previously the filtered frame was
#overwritten by a selection from the unfiltered one, so no rows were actually dropped.
has_match = headers_lines_df["matched_name"].notna()
headers_lines_df_clean = (
    headers_lines_df
    .loc[has_match, ['matched_name', 'Subheader1', 'Subheader2', 'Text']]
    .rename(columns = {"matched_name":"Chapter"})
)

In [44]:
#Give rows without a subheader2 an explicit placeholder label
headers_lines_df_clean = headers_lines_df_clean.assign(
    Subheader2 = headers_lines_df_clean["Subheader2"].fillna("No subheader2")
)

In [46]:
# Concatenate the text of each (Chapter, Subheader1, Subheader2) group into one space-separated string.
# NOTE(review): groupby leaves out rows whose key is NaN by default, so rows without a
# Subheader1 would not show up in this result - confirm that is intended
headers_lines_df_clean.groupby(['Chapter', "Subheader1", "Subheader2"]).agg({'Text': ' '.join})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Text
Chapter,Subheader1,Subheader2,Unnamed: 3_level_1
Appointing a Solar PV System Contractor,Getting Started,Choosing between bids,If there are several bids for the installation...
Appointing a Solar PV System Contractor,Getting Started,Get an experienced and licensed contractor,Experience in installing grid-connected solar ...
Appointing a Solar PV System Contractor,Getting Started,No subheader2,"First, compile a list of potential solar..."
Appointing a Solar PV System Contractor,Getting Started,Other relevant matters,Another matter to be aware of is that PV modul...
Appointing a Solar PV System Contractor,Getting Started,Regular maintenance,During the defect liability period (usually fo...
Appointing a Solar PV System Contractor,Getting Started,Solar PV system warranty,A solar PV system is an investment that should...
Appointing a Solar PV System Contractor,Introduction,No subheader2,You will need to select a contractor to instal...
Operations and Maintenance,Operations of Solar PV Systems,No subheader2,The most practical indicator of the performanc...
Operations and Maintenance,Recommended Preventive Maintenance Works,No subheader2,It is recommended that preventive inspection a...
Solar PV System Installation Requirements,Application of Electrical Installation Licence,No subheader2,Your LEW will be able to advise you whether yo...


### To do/try:
- extract text from relevant appendices
- try other q&a generators?

1. https://huggingface.co/iarfmoose/t5-base-question-generator
2. https://github.com/AMontgomerie/question_generator
3. https://github.com/topics/question-generator
4. https://github.com/ramsrigouthamg/Questgen.ai
5. https://towardsdatascience.com/questgen-an-open-source-nlp-library-for-question-generation-algorithms-1e18067fcdc6

### Failed attempt at extracting tables :'(

In [None]:
import pdfplumber
import pandas as pd

# Open the PDF file using pdfplumber.
# pdf_path (defined near the top of the notebook) replaces a machine-specific absolute path,
# and the handle gets its own name so it does not overwrite the PyMuPDF `pdf` object.
with pdfplumber.open(pdf_path) as plumber_pdf:
    # Iterate through all the pages in the PDF
    for page in plumber_pdf.pages:
        # Extract the table from the page
        table = page.extract_table()
        # extract_table returns None when no table is found on the page; skip those pages
        if not table:
            continue
        # Convert the table into a pandas DataFrame (first row is the header)
        df = pd.DataFrame(table[1:], columns=table[0])
        # Print the DataFrame
        print(df)


In [None]:
# Open the handbook with pdfplumber, using pdf_path from the top of the notebook instead of a
# machine-specific absolute path.
# NOTE: this rebinds `pdf` (previously the PyMuPDF document) and the handle is never closed.
pdf = pdfplumber.open(pdf_path)

In [None]:
# Detect columns from text alignment and rows from ruling lines
table_settings = {
    "vertical_strategy": "text",
    "horizontal_strategy": "lines"
}
# Page index 10 is hard-coded - NOTE(review): presumably a page known to contain a table; confirm
table = pdf.pages[10].extract_table(table_settings)
# extract_table returns None when nothing is detected. The first row is the header, so it is
# used for the column names only (previously it was also repeated as the first data row).
table_df = pd.DataFrame(table[1:], columns=table[0]) if table else pd.DataFrame()
table_df