#### References
* https://www.sbert.net/examples/applications/image-search/README.html


In [1]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from nltk.tokenize import sent_tokenize
import pandas as pd
from typing import Union
import pdfplumber
import fitz
import glob, sys
import os
from glob import glob
import re
from collections import OrderedDict
import requests
import pickle
from pdf2image import convert_from_path
from pdf2image import convert_from_bytes

In [2]:
pdf_folder_path = f'../data/genai_poc/'
os.listdir(pdf_folder_path)

['processed', 'raw', '.ipynb_checkpoints']

In [258]:
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) \
[for fn in os.listdir(pdf_folder_path)  if fn not in ['AI_POC_Papers_Oct_11.zip']]

In [7]:
all_pdfs = os.listdir("../data/genai_poc/raw/")

In [8]:
all_pdfs = [i for i in all_pdfs if i.endswith('.pdf')]

In [9]:
pdf_id_name_dict = {i:all_pdfs[i] for i in range(0,len(all_pdfs))}

In [10]:
pdf_id_name_dict

{0: 'influence of defects on mech behavior of Cu SAC305 Cu joint.pdf',
 1: 'coefficient extraction of sac305 using equation informed NNs.pdf',
 2: 'influence of flux and related factors on IMC growth in SAC305 solder joints.pdf',
 3: 'effect of microalloying and surface finish on thermal cycling SAC305.pdf',
 4: 'modeling of aged SAC305 Bi joints under various cycling conditions.pdf',
 5: 'effect of ageing on mech properties of SAC305.pdf',
 6: 'reliability modeling of fatigue lige sac305 using arrhenius model.pdf'}

In [3]:
class pdf_extraction_utils():
    
    def __init__(self,pdf_path):
        self.pdf_data_path = pdf_path
        
    def read_pdf(self):
        '''
        Read PDF using Langchain Unstructured PDF Loader api
        '''
        pdf_loader = UnstructuredPDFLoader(self.pdf_data_path, mode="elements")
        documents = pdf_loader.load()
        return documents
    
    def extract_data(self, extraction_category: str = 'NarrativeText'):
        '''
        Extract the raw texts of different categories from the passed pdf
        '''
        documents = self.read_pdf()
        
        all_cats = []
        for doc in documents:
            all_cats.append(doc.metadata['category'])
        
        lis = []
        for i, doc in enumerate(documents):
            if doc.metadata['category'] == extraction_category: 
                lis.append([doc.metadata['page_number'],doc.page_content])
        
        
        check = 1
        temp = []
        temp_data_list = []
        for item in lis:
            if item[0] == check:
                temp.append(item[1])
            else:
                temp_data_list.append([check,"\n\n".join(temp)])
                check += 1
                temp = []
                temp.append(item[1])
                
        df = pd.DataFrame(temp_data_list,columns=["page","raw_texts_"+extraction_category])
        return df

In [14]:
pdf_data = pdf_extraction_utils('../data/genai_poc/raw/coefficient extraction of sac305 using equation informed NNs.pdf')

In [17]:
df = pdf_data.extract_data(extraction_category='NarrativeText')

In [166]:
class MultiPDFExtraction():
    
    def __init__(self,pdf_dir: str):
        self.path = pdf_dir
        all_pdfs = os.listdir(self.path)
        all_pdfs = [i for i in all_pdfs if i.endswith('.pdf')]
        self.pdf_id_name_dict = {i:all_pdfs[i] for i in range(0,len(all_pdfs))}
        
    def process_extract_pdfs(self):
        full_df = pd.DataFrame()
        for id_, pdfs in self.pdf_id_name_dict.items():
            try:
                pdf_data = pdf_extraction_utils(self.path + pdfs)
                df = pdf_data.extract_data()
                df["doc_id"] = id_ 
                full_df = pd.concat([full_df,df])
            except:
                continue

        return full_df
    
    def groupby_split_sections_from_text(self, groupd_data):
        sections = groupd_data['sections'].explode()
        sections = sections.reset_index()
        return sections
    
    def detect_single_multi_sentence(self, section: str) -> bool:
        list_of_sentences = sent_tokenize(section)
        if len(list_of_sentences) > 1:
            return True
        else:
            return False
    
    def process_extract_sections_pdfs(self, section_spliter: str = "\n\n", extraction_type: str = Union["all","infer"]):
        text_df = self.process_extract_pdfs()
        text_df['sections'] = text_df['raw_texts_NarrativeText'].str.split(section_spliter)
        
        subset_df = text_df[['doc_id','page','sections']]
        sub_df = subset_df.groupby(['doc_id','page'])
        
        sub_df_section = sub_df.apply(lambda x: self.groupby_split_sections_from_text(x)).reset_index()
        sub_df_section.drop(['index'],axis=1,inplace=True)
        sub_df_section = sub_df_section.rename(columns={'level_2':'section_id'})
        
        if extraction_type == 'infer':
            sub_df_section['is_section'] = sub_df_section['sections'].apply(lambda x: self.detect_single_multi_sentence(x))
            sub_df_section = sub_df_section[sub_df_section.is_section]
            sub_df_section = sub_df_section.reset_index(drop=True)
            sub_df_section = sub_df_section.drop(['is_section'],axis=1)
        return sub_df_section

In [167]:
test = MultiPDFExtraction("../data/genai_poc/raw/")

In [26]:
test_df = test.process_extract_pdfs()

In [27]:
test_df

Unnamed: 0,page,raw_texts_NarrativeText,doc_id
0,1,Abstract: The fracture behavior of the Cu/Sn-3...,0
1,2,in their mechanical properties being different...,0
2,3,improving the reliability of solder joints. Th...,0
3,4,Figure 1. (a) The flow diagram of tensile spec...,0
4,5,inspection area was indicated by the blue rect...,0
...,...,...,...
7,8,Figure 9. The hysteresis loop for the solder j...,6
8,9,Figure 11. The evolutions in the hysteresis lo...,6
9,10,Testing temperature (°C)\n\ntion of the inelas...,6
10,11,Figure 16. The Coffin Manson equation at diffe...,6


In [28]:
test.pdf_id_name_dict

{0: 'influence of defects on mech behavior of Cu SAC305 Cu joint.pdf',
 1: 'coefficient extraction of sac305 using equation informed NNs.pdf',
 2: 'influence of flux and related factors on IMC growth in SAC305 solder joints.pdf',
 3: 'effect of microalloying and surface finish on thermal cycling SAC305.pdf',
 4: 'modeling of aged SAC305 Bi joints under various cycling conditions.pdf',
 5: 'effect of ageing on mech properties of SAC305.pdf',
 6: 'reliability modeling of fatigue lige sac305 using arrhenius model.pdf'}

In [29]:
test_df.to_csv("../data/genai_poc/processed/text_extracted_page_data_31_10_2023.csv",index=False)

In [30]:
test_section_df = test.process_extract_sections_pdfs(extraction_type='infer')

In [31]:
test_section_df

Unnamed: 0,doc_id,page,section_id,sections
0,0,1,0,Abstract: The fracture behavior of the Cu/Sn-3...
1,0,1,5,The failure of a solder joint is a problem tha...
2,0,2,0,in their mechanical properties being different...
3,0,2,1,Existing studies have conducted extensive nume...
4,0,2,2,"Currently, many related studies have been carr..."
...,...,...,...,...
519,6,10,4,Figure 15. The Coffin Manson model of SAC305 s...
520,6,11,0,Figure 16. The Coffin Manson equation at diffe...
521,6,11,2,Figure 17. The prediction models for the coffi...
522,6,11,3,when the Arrhenius model is applied. By utiliz...


In [32]:
test_section_df['sections'].iloc[520]

'Figure 16. The Coffin Manson equation at different testing temperatures.'

In [268]:
test_section_df.to_csv("../data/genai_poc/processed/text_extracted_sectional_data_25_10_2023.csv",index=False)

In [54]:
# test_section_df[test_section_df['sections']=='of the element and 𝜀(cid:3033)(cid:3043) was the equivalent plastic strain at failure. Since the length of the element was quite wide, for simplicity, the average value was used as the characteristic']

In [69]:
test_df['raw_texts_NarrativeText'].iloc[4]

'inspection area was indicated by the blue rectangle in Figure 1e. The scanning parameters of the X-ray µ-CT were optimized by considering the quality of the scanning images and the unloading effect caused by suspending the experiment. During the test, the voltage and current were 150 kV and 67 µA, respectively. The size of the X-ray raw tomography images was 2048 × 2048 pixels. To obtain a clear image of the internal structure of the pores, the specimen was scanned at 180◦, and 501 images were obtained in total. Therefore, the resolution of obtained image was 5.46 µm/pixel. The exposure time of each projection image was approximately 10 s, and the process lasted for approximately 2.5 h. The defect distribution of the Cu/SAC305/Cu solder joint was reconstructed using the VG studio software, as shown in the Figure 2. The different colors of the defects in the Figure 2 indicated the volume of each defect. The geometric shapes of defects were mainly divided into spherical shape and ﬂat sh

##### ----------------------------------------------------- #####

##### Challanges
* If a section is continuing in next page
* Parsed data layout orientation is not always fully correct
* Topic distributed accross different table of content items

In [29]:
class pymupdf_extraction_utils():
    
    def __init__(self,pdf_path = None):
        self.set_header_type = None
        self.pdf_data_path = pdf_path
        self.doc = self.read_pdf()
        
    def read_pdf(self):
        '''
        Read PDF using Langchain Unstructured PDF Loader api
        '''
        return fitz.open(self.pdf_data_path)
    
    def save_page_images_old(self, image_fold_path = None):
        zoom_x = 1.0  # horizontal zoom
        zoom_y = 1.0  # vertical zoom
        mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension
        
        if image_fold_path is not None:
            image_path = image_fold_path
        else:
            image_path = os.path.dirname(os.path.abspath(self.pdf_data_path))+'/out'
            
        fold = os.path.basename(self.pdf_data_path).replace('.pdf','')
        
        fold_path = os.path.join(image_path, fold)
        
        if not os.path.exists(fold_path):
            os.makedirs(fold_path)
        
        if len(glob(fold_path+'/*.jpg')) == 0:
            print("Converting Pages in Images...")
            for page in self.doc:  # iterate through the pages
                pix = page.get_pixmap(matrix=mat)  # render page to an image
                pix.save("%s/page-%i.jpg"%(fold_path, page.number))
        return fold_path
    
    def save_page_images(self, image_fold_path = None):
        
        if image_fold_path is not None:
            image_path = image_fold_path
        else:
            image_path = os.path.dirname(os.path.abspath(self.pdf_data_path))+'/out'
            
        fold = os.path.basename(self.pdf_data_path).replace('.pdf','')
        
        fold_path = os.path.join(image_path, fold)
        
        if not os.path.exists(fold_path):
            os.makedirs(fold_path)
        
        if len(glob(fold_path+'/*.jpg')) == 0:
            print("Converting Pages in Images...")
            pdf_im_data = convert_from_path(self.pdf_data_path)
            for i, pg_im in enumerate(pdf_im_data):
                pg_im.save("%s/page-%i.jpg"%(fold_path, i))
        return fold_path
    
    def remove_unicode(self,x):
        return x.encode("ascii", "ignore").decode()
    
    def get_table_of_content(self):
        return self.doc.get_toc()
    
    def get_page_content(self, page_no):
        pg = self.doc.load_page(page_no)
        paras = pg.get_text('blocks')

        blocks = []
        for para in paras:
            para_content = re.sub(' +', ' ', para[4].replace('\n',' ').strip())
            blocks.append(para_content)
        return blocks
    
    def section_name_splitter(self, x):
        spliter = re.findall("[0-9].1.",x)[0]
        hh = x.split(f' {spliter} ')
        return [hh[0], spliter+' '+hh[1]]
    
    def section_name_merger(self, x: list):
        merged_string = f"{x[0]} {x[1]}"
        return merged_string

    def extract_page_data(self):
        '''
        Extract the raw texts of different categories from the passed pdf
        '''
        documents = self.doc
        
        page_blocks = {}
        for pg_no in range(documents.page_count):
            pg_content = documents.load_page(pg_no)
            
            paras = pg_content.get_text('blocks')
            
            blocks = []
            for para in paras:
                blocks.append(para[4].replace('\n',' ').strip())
            
            page_blocks[pg_no] = blocks
        
        return page_blocks
    
    def get_content_structure(self):
        temp = []
        sec_count = 0
        sub_sec = 0
        l = self.get_table_of_content()
        if len(l) == 0:
            l = self.get_table_of_content_ocr()
        for j in range(0,len(l)):
            tag = 'root'
            i = l[j]
            level, section_name, page = i[0], i[1].strip(), i[2]
            if level == 1:
                root_section_name = section_name
                sec_count = sec_count + 1
                sub_sec = 0
                try:
                    if l[j+1][0] > 1:
                        tag = "root_with_child"
                except:
                    pass
            if level>1:
                tag = 'child'
                sub_sec += 1 
                if sub_sec == 1:
                    section_text_name = f'{sec_count}. {root_section_name} {sec_count}.{sub_sec}. {section_name}'
                else:
                    section_text_name = f'{sec_count}.{sub_sec}. {section_name}'
            elif section_name == "References":
                section_text_name = f'{section_name}'
            else:
                section_text_name = f'{sec_count}. {section_name}'
            temp.append([section_text_name, page, tag])
        return temp
    
    
    def get_content_structure_working(self):
        temp = []
        sec_count = 0
        sub_sec = 0
        
        if self.extraction_mode == "auto":
            l = self.get_table_of_content()
            process_flag = "digital"
            if len(l) == 0:
                l = self.get_table_of_content_ocr_working()
                process_flag = "ocr"
        elif self.extraction_mode == 'digital':
            l = self.get_table_of_content()
            process_flag = "digital"
        elif self.extraction_mode == 'ocr':
            l = self.get_table_of_content_ocr_working()
            process_flag = "ocr"
            
        for j in range(0,len(l)):
            tag = 'root'
            i = l[j]
            level, section_name, page = i[0], i[1].strip(), i[2]
            if level == 1:
                root_section_name = section_name
                sec_count = sec_count + 1
                sub_sec = 0
                try:
                    if l[j+1][0] > 1:
                        tag = "root_with_child"
                except:
                    pass
            if level>1:
                tag = 'child'
                sub_sec += 1 
                section_text_name = f'{sec_count}.{sub_sec}. {section_name}'
            elif section_name == "References":
                section_text_name = f'{section_name}'
            else:
                section_text_name = f'{sec_count}. {section_name}'
            temp.append([section_text_name, page, tag])
        return process_flag, temp
    
    
    def get_table_of_content_ocr(self, image_fold_path = None, header_type = 'caps'):
        img_fold_path = self.save_page_images(image_fold_path = image_fold_path)
        
        pg_headers = []
        for i, pg in enumerate(self.doc):
            filename = f"{img_fold_path}/page-{str(i)}.jpg"
            files = {'my_file': (filename, open(filename, 'rb'))}

            response = requests.post(
                'http://20.83.24.160:8889/getLayout',
                files=files)
            headers = response.json()['header']
            
            if self.set_header_type is not None:
                header_type = self.set_header_type
                
            if header_type == 'caps':
                headers = [j for j in headers if j.isupper()]
                
            if len(headers) != 0:
                pg_content = self.get_page_content(i)
                header_pos_ind = {}
                for h in headers:
                    try:
                        header_pos_ind[h] = pg_content.index(h)
                    except:
                        for j, line in enumerate(pg_content):
                            if h in line:
                                header_pos_ind[h] = j
                                break
                headers_sorted = [k for k, v in sorted(header_pos_ind.items(), key=lambda item: item[1])]
                for h in headers_sorted:
                    pg_headers.append([1, h, i+1])
        return pg_headers
    
    
    def get_table_of_content_ocr_working(self, image_fold_path = None, header_type = 'caps'):
        img_fold_path = self.save_page_images(image_fold_path = image_fold_path)

        if os.path.exists(f"{img_fold_path}/header.h"):
            with open(f"{img_fold_path}/header.h", "rb") as f:
                pg_headers = pickle.load(f)
        else:
            pg_headers = []
            for i, pg in enumerate(self.doc):
                filename = f"{img_fold_path}/page-{str(i)}.jpg"
                files = {'my_file': (filename, open(filename, 'rb'))}

                response = requests.post(
                    'http://20.83.24.160:8889/getLayout',
                    files=files)
                headers = response.json()['header']
                
                headers = [i for i in headers if i != '']
                
                if self.set_header_type is not None:
                    header_type = self.set_header_type
                
                if header_type == 'caps':
                    headers = [j for j in headers if j.isupper()]

                if len(headers) != 0:
                    pg_content = self.get_page_content(i)
                    
                    #pg_content = [' '.join(pg_content)]
                    
                    line_headers = OrderedDict()
                    for j, line in enumerate(pg_content):
                        temp_line_header = []
                        for h in headers:
                            if h in line:
                                temp_line_header.append(h)
                        line_headers[line] = temp_line_header

                    line_headers = {k:v for k,v in line_headers.items() if len(v) != 0}

                    headers_sorted = []
                    for k, v in line_headers.items():
                        if len(v) >1:
                            v.sort(key=len, reverse=True) 
                            for it_ in v:
                                k = k.replace(it_, '_'.join(it_.split()))

                            k_split = k.split(" ")

                            index_dict = {}

                            for it_ in v:
                                try:
                                    index_dict[it_] = k_split.index('_'.join(it_.split()))
                                except:
                                    continue

                            index_dict_key_sorted = [k for k, v in sorted(index_dict.items(), key=lambda item: item[1])]
                            headers_sorted.extend(index_dict_key_sorted)
                        else:
                            headers_sorted.append(v[0])
                    for h in headers_sorted:
                        pg_headers.append([1, h, i+1])

            with open(f"{img_fold_path}/header.h","wb") as f:
                pickle.dump(pg_headers,f)
            
        return pg_headers
    
    
    def get_content_structure_filtered(self):
        _ , all_content_struc = self.get_content_structure_working()
        filtered_content_struc = [[i[0].encode("ascii", "ignore").decode(), i[1]] for i in all_content_struc] #if i[2]!= 'root_with_child']
        #filtered_content_struc = [[re.sub(r'[^\x00-\x7F]+',' ', i[0]), i[1]] for i in all_content_struc]        
        filtered_content_struc.append(['DUMMY',filtered_content_struc[-1][1]])
        return _ , filtered_content_struc
    
    
    def get_content_section_wise(self, extraction_mode = 'auto', set_header_type = None):
        
        if set_header_type is None:
            self.set_header_type = 'all'
        else:
            self.set_header_type = set_header_type
            
        self.extraction_mode = extraction_mode
        
        process_flag, ll = self.get_content_structure_filtered()
        
        print(ll)
        print(f"\n\nProcessing through : {process_flag}")
        if process_flag == "digital":
            all_sec_content = []
            j = 0
            sec_content = ''
            breaker = 3
            dic = {}
            
            try:
                while j < len(ll)-1:
                    sec_start_page_no, start_sec_name = ll[j][1], ll[j][0]
                    sec_end_page_no, end_sec_name = ll[j+1][1], ll[j+1][0]
                    flag = False
                    for i in range(sec_start_page_no, sec_end_page_no+1):
                        pg = self.doc.load_page(i-1)
                        paras = pg.get_text('blocks')

                        blocks = []
                        for para in paras:
                            blocks.append(para[4].replace('\n',' ').strip().encode("ascii", "ignore").decode())

                        if i != sec_end_page_no:
                            sec_content += ' '.join(blocks)
                        else:
                            try:
                                next_section_header_index = blocks.index(end_sec_name)
                                
                                till_pg_section = blocks[:next_section_header_index]
                                sec_content += ' '.join(till_pg_section)

                                next_sec_start_section = blocks[next_section_header_index:]
                                next_sec_start_content = ' '.join(next_sec_start_section)
                            except Exception as e:
                                #print(e)
                                #print(blocks, " ---- ", start_sec_name, " --- ",end_sec_name,">>>>>>>>>>>>>>>>> \n\n")
                                if end_sec_name != "References":
                                    new_ll = ll[:j+1]
                                    n_1 = ll[j+1][0]
                                    n_2 = ll[j+2][0]
                                    merged_section_name = pdf.section_name_merger([n_1, n_2])
                                    new_ll.append([merged_section_name, sec_end_page_no])
                                    new_ll.extend(ll[j+3:])
                                    ll = new_ll
                                else:
                                    ll[-2][1] += 1
                                flag = True
                                break

                    if flag == False:
                        dic[start_sec_name] = sec_content
                        #all_sec_content.append([start_sec_name, sec_content])
                        sec_content = next_sec_start_content
                        j += 1
                    else:
                        sec_content = ''
            except Exception as e:
                #print(e)
                pass
            return dic
        else:
            all_sec_content = []
            j = 0
            full_content = ''

            for pg_no in range(self.doc.page_count):
                full_content += ' '.join(self.get_page_content(pg_no))

            dic = {}
            for i, sec_pg in enumerate(ll):
                header = sec_pg[0]
                temp = full_content.split(header)
                if len(temp) == 1:
                    try:
                        number = re.findall("[0-9]+. ",header)[0]
                        #print(header,": ",number,"#####\n\n")
                        header_ = header.replace(number,'')
                        header_ = re.sub(' +', ' ',header_)
                    except Exception as e:
                        header_ = header
                    
                    try:
                        temp = full_content.split(header_)
                        first_por = temp[0]
                        second_por = temp[1]
                    except Exception as e:
                        #print(e)
                        pass
                else:
                    first_por = temp[0]
                    second_por = temp[1]

                if i > 0:
                    dic[prev_header_mem] = first_por
                prev_header_mem = header
                full_content = second_por
                
            return dic

In [277]:
# all_pdfs = os.listdir("../data/genai_poc/raw/")

# all_pdfs = [i for i in all_pdfs if i.endswith('.pdf')]

In [278]:
# all_pdfs

In [279]:
# all_pdf_contents = {}
# for pdf_file in all_pdfs:
#     pdf = pymupdf_extraction_utils('../data/genai_poc/raw/' + pdf_file)
#     all_pdf_contents[pdf_file] = pdf.get_content_section_wise()

In [280]:
# all_pdf_contents[list(all_pdf_contents.keys())[6]]

In [281]:
#effect of microalloying and surface finish on thermal cycling SAC305.pdf - working
#influence of defects on mech behavior of Cu SAC305 Cu joint.pdf - working
#influence of flux and related factors on IMC growth in SAC305 solder joints.pdf - 
#modeling of aged SAC305 Bi joints under various cycling conditions.pdf - working
#reliability modeling of fatigue lige sac305 using arrhenius model.pdf - 

#Qualitative Model Describing Hot Tear Above VIPPO and Numerous Other Design Elements.pdf - Check (High Priority) [Faulty PDF. Page 3 and 4 are same]


In [25]:
# pdf_data_extracted.to_pickle("../data/genai_poc/processed/AI_POC_Papers_Dec11_SMTA_all_pdf_data.pkl")

In [34]:
class extract_section_for_all_pdf(pymupdf_extraction_utils):
    
    def __init__(self, pdf_folder_path):
        self.pdf_folder_path = pdf_folder_path
        
    def list_all_files_from_folder(self, path):
        all_pdfs = os.listdir(path)
        all_pdfs = [i for i in all_pdfs if i.endswith('.pdf')]
        pdf_id_name_dict = {i:all_pdfs[i] for i in range(0,len(all_pdfs))}
        return pdf_id_name_dict
    
    def extract_sectional_data(self):
        all_pdfs_to_process = self.list_all_files_from_folder(self.pdf_folder_path)

        temp = []
        for id_, pdf_file in all_pdfs_to_process.items():
            pdf = pymupdf_extraction_utils(self.pdf_folder_path+pdf_file)
            try:
                print(f"Processing for {pdf_file} ...")
                content = pdf.get_content_section_wise(extraction_mode="ocr", set_header_type='caps')
                temp.append([id_, pdf_file, content])
            except:
                print(f"Issue with {pdf_file} to process")
            print("\n\n=====================================\n\n")
        
        return pd.DataFrame(temp, columns=['id','pdf','data'])

In [76]:
gen = extract_section_for_all_pdf("../data/genai_poc/raw/AI_POC_47Papers_Nov06_SMTA/")

In [77]:
df_oct_47 = gen.extract_sectional_data()

Processing for EFFECT OF REFLOW TIME ON WETTING BEHAVIOUR, INTERFACIAL REACTION.pdf ...
[['1. EXPERIMENTAL', 2], ['2. Shear test results', 4], ['3. CONCLUSION', 5], ['DUMMY', 5]]


Processing through : ocr




Processing for Predicting the Saturation of Solder Joint Cycles to Failure with Thermal Cycling Dwell Times.pdf ...
[['1. ABSTRACT', 1], ['2. SATURATION CYCLES PHENOMENOLOGICAL', 5], ['3. PREDICTIVE SATURATION MODEL', 6], ['4. CONCLUSIONS', 8], ['5. ACKNOWLEDGEMENTS', 9], ['6. REFERENCES', 9], ['DUMMY', 9]]


Processing through : ocr




Processing for Qualitative Model Describing Hot Tear Above VIPPO and Numerous Other Design Elements.pdf ...
[['1. QUALITATIVE MODEL DESCRIBING HOT TEAR ABOVE VIPPO AND NUMEROUS', 1], ['2. ABSTRACT', 1], ['3. INTRODUCTION', 1], ['4. MODEL FOR HOT TEAR AT INTERFACES', 2], ['5. DIRECTIONAL PHASE TRANSITIONS', 3], ['6. DIRECTIONAL PHASE TRANSITIONS', 4], ['7. LOAD', 5], ['8. CONCLUSION', 7], ['9. REFERENCES', 7], ['DUMMY', 7]]


Processing through : 





Processing for CRACK GROWTH RATE MEASUREMENT AND ANALYSIS FOR WLCSP Sn-Ag-Cu SOLDER JOINTS.pdf ...
[['1. CRACK GROWTH RATE MEASUREMENT AND ANALYSIS FOR WLCSP', 1], ['2. ABSTRACT', 1], ['3. INTRODUCTION', 1], ['4. TEST SAMPLE PREPARATION', 1], ['5. TEST SAMPLE CHARACTERIZATION', 1], ['6. TEST PROCEDURE', 2], ['7. MODELING', 2], ['8. RESULTS', 3], ['9. MEASUREMENT OF CRACK AREA', 3], ['10. VARIATION IN MEASUREMENTS', 4], ['11. MICROSTRUCTURAL EVALUATION', 5], ['12. CONCLUSIONS', 6], ['13. ACKNOWLEDGEMENTS', 7], ['14. REFERENCES', 7], ['DUMMY', 7]]


Processing through : ocr




Processing for PROCESS CAPABILITY, WETTING BEHAVIOR AND TEMPERATURE DEPENDENT SHEAR STRENGTH OF ALTERNATIVE LEAD FREE SOLDER JOINTS.pdf ...
[['1. ABSTRACT', 1], ['2. INTRODUCTION', 1], ['3. EXPERIMENTS', 2], ['4. RESULTS AND DISCUSSION', 2], ['5. REFERENCES', 6], ['DUMMY', 6]]


Processing through : ocr




Processing for High-Reliability Lead-free Solder for Electronics in Automotive Applications.pdf ...
[['1

In [78]:
gen = extract_section_for_all_pdf("../data/genai_poc/raw/AI_POC_Papers_Dec11_SMTA/")

In [79]:
df_dec_30 = gen.extract_sectional_data()

Processing for IPC-CC-830B vs The Real World Part Factors Influencing the Tendency for A Conformal Coating to Crack During Thermal Shock Cycling.pdf ...
[['1. ABSTRACT', 1], ['2. INTRODUCTION', 1], ['DUMMY', 1]]


Processing through : ocr




Processing for Evaluations of High Reliability Lead-Free Solder Paste.pdf ...
[['1. ABSTRACT', 1], ['2. EXPERIMENTAL', 1], ['3. RESULTS AND DISCUSSION', 4], ['4. EPMA', 5], ['5. EPMA', 5], ['6. CONCLUSIONS', 10], ['DUMMY', 10]]


Processing through : ocr




Processing for Assessment of the Behavior of High Reliability Solder Alloys in Accelerated Thermal Testing.pdf ...
[['1. ASSESSMENT OF THE BEHAVIOR OF HIGH RELIABILITY SOLDER', 1], ['2. ABSTRACT', 1], ['3. INTRODUCTION', 1], ['4. RESULTS AND DISCUSSION', 3], ['5. CONCLUSION', 9], ['DUMMY', 9]]


Processing through : ocr




Processing for HnP Defects by Emulating The Gap Between BGA and PCB During Reflow.pdf ...
[['1. ABSTRACT', 1], ['2. NTRODUCTION', 1], ['3. CONCLUSION', 7], ['4. REFERENCES'





Processing for Degradation of Leadfree Solder Materials Subjected to Isothermal Aging With Use of the CABGA208 Package.pdf ...
[['1. DEGRADATION OF LEADFREE SOLDER MATERIALS SUBJECTED', 1], ['2. ABSTRACT', 1], ['3. INTRODUCTION', 1], ['4. TEST VEHICLE', 2], ['5. ISOTHERMAL TESTING PROCESS', 2], ['6. CONCLUSION', 5], ['7. REFERENCES', 5], ['DUMMY', 5]]


Processing through : ocr




Processing for Microstructure of Weak-Micro-Via and its Failure Prevention.pdf ...
[['1. ABSTRACT', 1], ['2. EXPERIMENTAL', 1], ['3. RESULTS', 2], ['4. CONCLUSION', 6], ['5. ACKNOWLEDGEMENTS', 6], ['6. REFERENCE', 6], ['DUMMY', 6]]


Processing through : ocr






In [80]:
df_dec_30.shape, df_oct_47.shape

((30, 3), (39, 3))

In [90]:
df = pd.concat([df_dec_30, df_oct_47], ignore_index=True)

In [91]:
df.reset_index(inplace=True)

In [95]:
df = df.drop('id',axis=1).rename(columns={'index':'id'})

In [97]:
df_grpd = df.groupby(['id'])

In [98]:
def grpd_func(x):
    temp_data = x['data'].values.tolist()[0]
    data_values = []
    for k,v in temp_data.items():
        data_values.append([k, v])
    return pd.DataFrame(data_values, columns=['section','content'])

In [99]:
df_sub = df_grpd.apply(lambda x: grpd_func(x)).reset_index().drop("level_1",axis=1)

In [100]:
df_extended = df[['id','pdf']].merge(df_sub, on = 'id', how = 'inner')

In [101]:
df_extended

Unnamed: 0,id,pdf,section,content
0,0,IPC-CC-830B vs The Real World Part Factors Inf...,1. ABSTRACT,As electronics continue to become ever more d...
1,0,IPC-CC-830B vs The Real World Part Factors Inf...,2. INTRODUCTION,With the increased adoption of electronics in...
2,1,Evaluations of High Reliability Lead-Free Sold...,1. ABSTRACT,In recent years there has been an increased d...
3,1,Evaluations of High Reliability Lead-Free Sold...,2. EXPERIMENTAL,Alloy Properties Testing was done on a develo...
4,1,Evaluations of High Reliability Lead-Free Sold...,3. RESULTS AND DISCUSSION,Melting Point The melting point of the HR6A a...
...,...,...,...,...
497,68,Component and Printed Wiring Board Finish Effe...,7. Void Formation,Typical X-ray results showing the amount of v...
498,68,Component and Printed Wiring Board Finish Effe...,8. CONCLUSIONS,Finish effects on QFN thermal pad solder join...
499,68,Component and Printed Wiring Board Finish Effe...,9. FUTURE WORK,Possible future work will be to evaluate addi...
500,68,Component and Printed Wiring Board Finish Effe...,10. ACKNOWLEDGEMENTS,The author would like to acknowledge L3Harris...


In [9]:
pdf = pymupdf_extraction_utils('../data/genai_poc/raw/AI_POC_Papers_Dec11_SMTA/Acceleration Life Comparison of Solder Joint with Temperature Deviation under Thermal Cycling.pdf')

In [10]:
content = pdf.get_content_section_wise(extraction_mode="ocr", set_header_type='caps')

Converting Pages in Images...
[['1. ACCELERATION LIFE COMPARISON OF SOLDER JOINT WITH', 1], ['2. ABSTRACT', 1], ['3. EXPERIMETNAL PROCEDURE', 1], ['4. RESULTS AND DISCUSSION', 2], ['5. CONCLUSIONS', 4], ['6. REFERENCES', 4], ['DUMMY', 4]]


Processing through : ocr


In [11]:
content

{'1. ACCELERATION LIFE COMPARISON OF SOLDER JOINT WITH': ' TEMPERATURE DEVIATION UNDER THERMAL CYCLING  Won Sik Hong, Ph.D. and Mi Song Kim, Ph.D. Candidate Korea Electronics Technology Institute (KETI) Republic of Korea wshong@keti.re.kr; althd267@keti.re.kr  ',
 '2. ABSTRACT': ' To compare solder joint reliability of automotive electronics, various thermal cycling test conditions for accelerated life test (ALT) are used. ALT conditions of car electronics are different with mounted positions which is divided into engine room, cabin, truck and area of no exposed to direct sunlight. Accordingly, since the degradation level of the solder joint according to the thermal cycle test conditions is different, there is a need to quantitatively compare the degradation of the solder joint according to each ALT condition. Thus, in this study, we conducted various thermal cycling test using by 3216 chip resistors mounted test coupon. 3216 chip resistor mounted test vehicle was soldered with Sn- 3.0

In [47]:
# all_sec_content = []
# j = 0
# sec_content = ''

# while j < len(ll)-1:
#     sec_start_page_no, start_sec_name = ll[j][1], ll[j][0]
#     sec_end_page_no, end_sec_name = ll[j+1][1], ll[j+1][0]
#     flag = False
#     for i in range(sec_start_page_no, sec_end_page_no+1):
#         pg = doc.load_page(i-1)
#         paras = pg.get_text('blocks')
        
#         blocks = []
#         for para in paras:
#             blocks.append(para[4].replace('\n',' ').strip())
            
#         if i != sec_end_page_no:
#             sec_content += ' '.join(blocks)
#         else:
#             try:
#                 next_section_header_index = blocks.index(end_sec_name)
#                 till_pg_section = blocks[:next_section_header_index]
#                 sec_content += ' '.join(till_pg_section)
                
#                 next_sec_start_section = blocks[next_section_header_index:]
#                 next_sec_start_content = ' '.join(next_sec_start_section)
#             except Exception as e:
#                 print(e)
#                 splitter_section_names = pdf.section_name_splitter(end_sec_name)
#                 new_ll = ll[:j+1]
#                 new_ll.append([splitter_section_names[0], sec_end_page_no])
#                 new_ll.append([splitter_section_names[1], sec_end_page_no])
#                 new_ll.extend(ll[j+2:])
#                 ll = new_ll
#                 flag = True
#                 break
    
#     if flag == False:
#         all_sec_content.append([start_sec_name, sec_content])
#         sec_content = next_sec_start_content
#         j += 1
#     else:
#         sec_content = ''

In [None]:
# all_sec_content = []
# for j in range(len(ll)):
#     sec_start_page_no, start_sec_name = ll[j][1], ll[j][0]
#     sec_end_page_no, end_sec_name = ll[j+1][1], ll[j+1][0]
    
#     sec_content = ''
    
#     for i in range(sec_start_page_no, sec_end_page_no+1):
#         pg = doc.load_page(i-1)
#         paras = pg.get_text('blocks')
        
#         blocks = []
#         for para in paras:
#             blocks.append(para[4].replace('\n',' ').strip())
            
#         if i != sec_end_page_no:
#             sec_content += ' '.join(blocks)
#         else:
#             try:
#                 next_section_header_index = blocks.index(end_sec_name)
#             except:
#                 splitter_section_names = pdf.section_name_splitter(end_sec_name)
#                 ll[j][0] = splitter_section_names[0]
#                 ll[j+1][0] = splitter_section_names[1]
                
#             till_pg_section = blocks[:next_section_header_index]
#             sec_content += ' '.join(till_pg_section)
#     all_sec_content.append([start_sec_name, sec_content])