In [1]:
%load_ext autoreload
%autoreload 2

from llm import load_models
from dotenv import load_dotenv
from questioning import load_qdf, ask_qdfs, pdf_to_documents, documents_to_vector_store, query_action_detail, query_action_list, ask_RAG
import json
from logging_config import logging_config
logger = logging_config(f'GPT@JRC','logger.log')
import pandas as pd
from tqdm import tqdm

# Per-PDF pair of page numbers, keyed by PDF file name. Element 0 is passed
# as `page_start` and element 1 as `page_end` to query_action_list, which
# bounds the part of the plan searched for actions. pdf_to_excel looks the
# PDF up here by name, so a plan without an entry cannot be processed.
# NOTE(review): whether pages are 0- or 1-based and whether the end page is
# inclusive is decided inside query_action_list — confirm there.
action_page_limit = {
    'Torino.pdf': (143, 191),
    'Zaragoza.pdf': (0, 200),
    'Bologna.pdf': (128, 194),
    'Izmir.pdf': (58, 142)
}

# Notebook-wide setup: everything bound here is read as a global by the
# cells below (pdf_to_excel also captures llm and emb as default arguments).
load_dotenv()  # python-dotenv: load variables from a .env file into the environment
# llm is handed to the query helpers; emb is handed to documents_to_vector_store.
llm, emb = load_models(service = 'GPT@JRC')
qdf = load_qdf() # questions from JSON; passed to ask_qdfs as `qdf`
pdf_prefix='pdf_input/'  # prepended to a PDF file name to build its input path

In [1]:
# Exploratory run on one plan: parse the PDF, build its vector store, then
# extract the action list from the page window configured for that plan.
pdf = 'Torino.pdf'
document, documents = pdf_to_documents(f'{pdf_prefix}{pdf}')
vector_store = documents_to_vector_store(documents, emb)

# Unpack the configured (start, end) pair once instead of indexing it twice.
first_page, last_page = action_page_limit[pdf]
action_list = query_action_list(
    llm=llm,
    documents=documents,
    page_start=first_page,
    page_end=last_page,
    logger=logger,
)


NameError: name 'pdf_to_documents' is not defined

In [None]:
def _query_actions_frame(action_list, template_key, desc, llm, vector_store, pdf):
    """Query every action with *template_key* and return the answers as a DataFrame.

    ``query_action_detail`` is called once per action (with a tqdm progress
    bar labelled *desc*). Results that are ``None`` are dropped before the
    rest is flattened with ``pd.json_normalize``.
    """
    results = [
        query_action_detail(
            action=action,
            template_key=template_key,
            llm=llm,
            vector_store=vector_store,
            logger=logger,
            pdf=pdf,
        )
        for action in tqdm(action_list, desc=desc)
    ]
    return pd.json_normalize([result for result in results if result is not None])


def pdf_to_excel(pdf, excel_output, llm=llm, emb=emb, pdf_prefix='pdf_input/'):
    """Run every query against one plan PDF and write the answers to a workbook.

    Sheets are written in this order: ``standard_qs``, ``barriers``,
    ``participatory_processes``, ``Actions`` and ``Actions_SMART``.

    Args:
        pdf: File name of the plan. It is appended to *pdf_prefix* to locate
            the file and is the key looked up in ``action_page_limit``.
        excel_output: Destination handed to ``pd.ExcelWriter``.
        llm: Model passed to every query helper. The default is the
            notebook-level ``llm`` as bound when this function was defined.
        emb: Embedding model passed to ``documents_to_vector_store``. The
            default is the notebook-level ``emb`` bound at definition time.
        pdf_prefix: String prepended to *pdf* to build the input path.

    Raises:
        KeyError: If *pdf* has no entry in ``action_page_limit``.
    """
    logger.info(f'Starting PDF {pdf}')

    # Resolve the action page window first: an unconfigured PDF should fail
    # here, not after the parsing, embedding and LLM queries have been paid for.
    try:
        page_limits = action_page_limit[pdf]
    except KeyError:
        raise KeyError(
            f'No action page range configured for {pdf!r}; '
            f'add it to action_page_limit (known: {sorted(action_page_limit)})'
        ) from None

    logger.debug('Reading the documents..')
    # Only the second element of the returned pair is needed below.
    _, documents = pdf_to_documents(f'{pdf_prefix}{pdf}')

    logger.debug('Converting to vector store ...')
    vector_store = documents_to_vector_store(documents, emb)

    with pd.ExcelWriter(excel_output) as xlsx:

        # Standard questions (the notebook-level `qdf`).
        qs_responses = ask_qdfs(
            qdf = qdf,
            llm = llm,
            vector_store = vector_store,
            pdf = pdf,
            logger=logger
        )
        qs_responses.to_excel(xlsx,sheet_name='standard_qs')

        # Barriers to emission reduction.
        bars = pd.DataFrame(ask_RAG(
            embed_query = 'List all barriers to accelerating emission reduction in view of the 2030 climate-neutrality goal that are explicitly described in the plan',
            vector_store = vector_store,
            llm = llm,
            logger = logger,
            template_key = 'barriers_prompt',
            template_kwargs={},
            pdf = pdf
        ))
        bars.to_excel(xlsx, sheet_name = 'barriers')

        # Participatory processes. The query text is sent as written,
        # including its leading whitespace, so it is left untouched.
        participatory_processes = pd.DataFrame(ask_RAG(
            embed_query='''
            - List all participatory process described in the SECAP plan.
            - Describe how citizens' contribute to specific priorities in the plan
            - Inclusion of participatory processes and engagement of stakeholder
            - Barriers to /challenges for citizen engagement and solutions/opportunities to remove it
            ''',
            template_key='participatory_processes',
            template_kwargs={},
            vector_store=vector_store,
            llm=llm,
            logger=logger,
            pdf=pdf
        ))
        participatory_processes.to_excel(xlsx, sheet_name='participatory_processes')

        logger.info(f'Starting actions in {pdf}')

        action_list = query_action_list(
            llm=llm,
            documents=documents,
            page_start=page_limits[0],
            page_end=page_limits[1],
            logger=logger
        )

        # Two passes over the same action list, one per prompt template.
        action_details = _query_actions_frame(
            action_list, 'action_details', 'Action details', llm, vector_store, pdf
        )
        action_details.to_excel(xlsx, sheet_name='Actions')

        action_SMART = _query_actions_frame(
            action_list, 'action_SMART', 'Action SMART', llm, vector_store, pdf
        )
        action_SMART.to_excel(xlsx, sheet_name='Actions_SMART')

from pathlib import Path

# Workbooks are written under ./output; create the directory up front so the
# run does not fail when the first workbook is opened for writing.
Path('output').mkdir(parents=True, exist_ok=True)

for pdf in ['Torino.pdf','Izmir.pdf']:
    # Swap only the file extension: str.replace("pdf", "xlsx") would also
    # rewrite any "pdf" occurring inside the name itself.
    pdf_to_excel(pdf, f'output/{Path(pdf).with_suffix(".xlsx")}')