In [None]:
import hydra
import lightning.pytorch as pl
import llama_index.core.query_pipeline
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import tqdm

import qut01

In [None]:
# Set up logging, then compose the hydra config that the rest of the notebook reads from.
data_config_name = "statement_sampler.yaml"
logger = qut01.utils.logging.setup_logging_for_analysis_script()
logger.info(f"initializing hydra and fetching data config for '{data_config_name}'...")

# hydra overrides: the data config file first, followed by the classification setup settings
overrides = [f"data={data_config_name}"]
overrides.extend(
    (
        "data.classif_setup=c2-c3-c4-c5-c6",
        "data.num_criteria=8",
    )
)
config = qut01.utils.config.init_hydra_and_compose_config(overrides=overrides)
logger.info("initialization complete!")

In [None]:
# Building blocks of the prompt used to ask an LLM whether a target sentence is relevant to the
# "supply chains" (c2-sc) criterion. Each fragment is kept as its own variable so that it can be
# inspected or swapped independently; the fragments are assembled into one template at the end.

# generic role/background statement
role_and_background_str = (
    "You are an analyst that inspects modern slavery declarations made by Australian reporting entities. "
    "You are specialized in the analysis of statements made with respect to the Australian Modern Slavery "
    "Act of 2018, and not of any other legislation."
)

# c2-sc definitions
# note: the description below is summarized from the spec, which is largely based on the guidance doc
definitions_c2_sc_str = (
    "You are currently looking for sentences in statements that describe the SUPPLY CHAINS of an entity, "
    "where supply chains refer to the sequences of processes involved in the procurement of products and "
    "services (including labour) that contribute to the reporting entity's own products and services. The "
    "description of a supply chain can be related, for example, to 1) the products that are provided by "
    "suppliers; 2) the services provided by suppliers, or 3) the location, category, contractual "
    "arrangement, or other attributes that describe the suppliers. Any sentence that contains these kinds "
    "of information is considered relevant. Descriptions that apply to indirect suppliers (i.e. "
    "suppliers-of-suppliers) are considered relevant. Descriptions of the supply chains of entities owned "
    "or controlled by the reporting entity making the statement are also considered relevant. However, "
    "descriptions of 'downstream' supply chains, i.e. of how customers and clients of the reporting "
    "entity use its products or services, are NOT considered relevant. Finally, sentences that describe "
    "how the reporting entity lacks information on some of its supply chain, or how some of its supply "
    "chains are still unmapped or unidentified, are also considered relevant."
)

# task description
# @@@@@ note: this wording is for "DESCRIPTIONS", not actions
relevance_classif_descriptions_str = (
    "Given the above definitions of what constitutes a relevant sentence, you will need to determine if a "
    "target sentence is relevant or not inside a larger block of text. The target sentence will first be "
    "provided by itself so you can know which sentence we want to classify. It will then be provided again "
    "as part of the larger block of text it originally came from (extracted from a PDF file) so you can "
    "analyze it with more context. While some of the surrounding sentences may be relevant according to "
    "the earlier definitions, we are only interested in classifying the target sentence according to the "
    "relevance of its own content. You must avoid labeling sentences with only vague descriptions or "
    "corporate talk (and no actual information) as relevant."
)

# constraint on the expected answer format
answer_formatting_str = (
    "The answer you provide regarding whether the sentence is relevant or not can only be 'YES' or 'NO', "
    "and nothing else."
)

# @@@@@ TODO: write the examples section (`examples_c2_sc_str`) and add it to the template below

# the two fragments below keep their `{target_sentence}` and `{text}` placeholders unfilled; these
# are the variables that get substituted when the template is used
target_sentence_str = "The target sentence to classify is the following:\n------------\n{target_sentence}\n------------"

sentence_with_context_str = (
    "The same target sentence inside its original block of text:\n------------\n{text}\n------------"
)

final_question_str = "Is the target sentence relevant? (YES/NO)"

# full template: all fragments in order, separated by blank lines
template_c2_sc_str = "\n\n".join(
    [
        role_and_background_str,
        definitions_c2_sc_str,
        relevance_classif_descriptions_str,
        answer_formatting_str,
        # (the examples section would go here once written)
        target_sentence_str,
        sentence_with_context_str,
        final_question_str,
    ]
)

In [None]:
import llama_index.core
import llama_index.core.query_pipeline
import llama_index.llms.openai

# Name of the class whose relevance label is being queried.
query_target_class = "c2 (supply chains)"

# Show the assembled template before wrapping it, so the exact prompt text can be checked by eye.
print(f"{query_target_class} prompt template preview:\n\n{template_c2_sc_str}")
prompt_c2_sc = llama_index.core.PromptTemplate(template_c2_sc_str)

# Only the model name is given here; every other constructor argument (temperature, max_tokens,
# additional_kwargs, max_retries, timeout, reuse_client) is left at the library default.
llm_gpt3 = llama_index.llms.openai.OpenAI(model="gpt-3.5-turbo")

# Two-stage chain: the prompt template feeds the LLM.
pipeline_stages = [prompt_c2_sc, llm_gpt3]
query_pipeline = llama_index.core.query_pipeline.QueryPipeline(chain=pipeline_stages, verbose=True)

In [None]:
# Build the datamodule described by the hydra config, prepare it, and fetch one of its data loaders.
target_subset_name = "valid"

logger.info(f"Instantiating datamodule: {config.data.datamodule._target_}")  # noqa
datamodule: pl.LightningDataModule = hydra.utils.instantiate(config.data.datamodule)
assert isinstance(datamodule, pl.LightningDataModule), f"unexpected type: {type(datamodule)}"

logger.info("running 'datamodule.prepare_data()'...")
datamodule.prepare_data()

logger.info("running 'datamodule.setup()'...")
datamodule.setup(stage="fit")

# The loader is looked up by name on the datamodule, i.e. `<subset>_dataloader`.
# NOTE(review): this assumes the datamodule exposes a `valid_dataloader` method -- confirm.
logger.info(f"fetching {target_subset_name} data loader...")
dataloader_getter_name = f"{target_subset_name}_dataloader"
dataloader_getter = getattr(datamodule, dataloader_getter_name)
dataloader = dataloader_getter()
logger.info(f"{target_subset_name} data loader ready!")

In [None]:
# Find one sample labelled as relevant for `query_target_class`, to be used as the LLM query input.
#
# The loader is traversed once, batch by batch. The previous version called `next(iter(dataloader))`
# inside the loop, which builds a brand new iterator on every pass: a loader that does not shuffle
# then yields the same first batch each time, and the loop cannot end if that batch holds no
# relevant sample for the class.
#
# Leaves `batch`, `batch_size`, `sample_idx`, `sample_id`, `target_sentence`, `text` and
# `target_relevance_flag` defined for the selected sample.
#
# Raises:
#     ValueError: if `query_target_class` is not found in the batch class names.
#     RuntimeError: if the loader is exhausted without finding a relevant sample.
target_relevance_flag = 0
for batch in dataloader:
    batch_size = int(batch["batch_size"])
    class_idx = batch["class_names"].index(query_target_class)
    # visit the samples of this batch in random order so that the selection stays randomized
    for sample_idx in map(int, np.random.permutation(batch_size)):
        target_relevance_flag = batch["relevance"][sample_idx][class_idx].item()
        if target_relevance_flag > 0:
            break
    if target_relevance_flag > 0:
        break
else:
    # only reached when every batch was visited without a `break`
    raise RuntimeError(f"found no relevant sample for class '{query_target_class}' in the data loader")

sample_id = batch["batch_id"][sample_idx]
target_sentence = batch["sentence_orig_text"][sample_idx]
text = batch["text"][sample_idx]
print(f"{sample_id=}")
print(f"{target_sentence=}")
print(f"{text=}")
print(f"{target_relevance_flag=}")
print("\n\n")

In [None]:
# Run the query pipeline on the selected sample; the keyword names match the `{target_sentence}`
# and `{text}` placeholders of the prompt template.
response = query_pipeline.run(target_sentence=target_sentence, text=text)

In [None]:
# Display the text of the reply (the prompt asks for 'YES' or 'NO' only).
response.message.content

In [None]:
# Display the whole response object via its `dict()` method.
# NOTE(review): presumably a pydantic-style dump including metadata -- confirm against the installed version.
response.dict()