In [1]:
import os 
# Load KEY=VALUE pairs from the local ".env" file into os.environ.
# NOTE(review): presumably this provides the API credentials the rest of the
# notebook needs — confirm which variables are expected.
with open(".env", "r") as f:
    for line_number, line in enumerate(f, start=1):
        entry = line.strip()
        # Blank lines and "#" comments carry no assignment; skip them instead
        # of failing on the unpacking below.
        if not entry or entry.startswith("#"):
            continue
        # Split on the FIRST "=" only: values (e.g. base64-padded keys) may
        # themselves contain "=" characters.
        key, separator, value = entry.partition("=")
        if not separator:
            # Keep raising ValueError for malformed lines, but do not echo the
            # line content since it may hold a secret.
            raise ValueError(f"Malformed line {line_number} in .env: expected KEY=VALUE")
        os.environ[key.strip()] = value.strip()

import palimpzest as pz

### Define relevant schemata
For the Paper Reference extraction workload, the relevant entities are a Scientific Paper (an annotated PDF document) and a Reference (a citation to another document). The schema for the Scientific Paper is as follows:

| Title | Year | Author | Journal | Subject | DOI |
|----|----|----|----|----|----|
| Title of the paper   | 2024    | Jane Doe  | Journal X | Paper is about ... | 10.1234/5678 |

The schema for the Reference is as follows:

| Index | Title | Author | Year |
|----|----|----|----|
| 0   | Referenced paper | John Smith | 2000 | 


In [3]:
class ScientificPaper(pz.PDFFile):
    """Represents a scientific research paper, which in practice is usually from a PDF file"""

    # NOTE(review): the docstring and every `desc` string are kept verbatim;
    # presumably palimpzest reads them at runtime — confirm before rewording.

    # Fields declared with required=True: title, first author, journal, DOI URL.
    # Fields declared with required=False: year, one-sentence subject summary.
    paper_title = pz.Field(
        desc="The title of the paper. This is a natural language title, not a number or letter.",
        required=True,
    )
    paper_year = pz.Field(
        desc="The year the paper was published. This is a number.",
        required=False,
    )
    paper_author = pz.Field(
        desc="The name of the first author of the paper",
        required=True,
    )
    paper_journal = pz.Field(
        desc="The name of the journal the paper was published in",
        required=True,
    )
    paper_subject = pz.Field(
        desc="A summary of the paper contribution in one sentence",
        required=False,
    )
    paper_doiURL = pz.Field(
        desc="The DOI URL for the paper",
        required=True,
    )

class Reference(pz.Schema):
    """ Represents a reference to another paper, which is cited in a scientific paper"""

    # NOTE(review): the docstring and every `desc` string are kept verbatim;
    # presumably palimpzest reads them at runtime — confirm before rewording.

    # All four fields are declared with required=True.
    reference_index = pz.Field(
        desc="The index of the reference in the paper",
        required=True,
    )
    reference_title = pz.Field(
        desc="The title of the paper being cited",
        required=True,
    )
    reference_first_author = pz.Field(
        desc="The author of the paper being cited",
        required=True,
    )
    reference_year = pz.Field(
        desc="The year in which the cited paper was published",
        required=True,
    )

### Define Workload

The next cell defines the workload to extract the references from a scientific paper. First, we load a folder from our drive that contains the PDF files we want to process.
These PDFs are parsed using the Scientific Paper schema and then filtered for those that mention the phosphorylation of Exo1.
Finally, we extract the Reference objects of the scientific papers.

In [4]:
## Define workload
# `papers`: the "bdf-usecase3-tiny" dataset read with the ScientificPaper
# schema, narrowed by a natural-language filter on Exo1 phosphorylation.
papers = (
    pz.Dataset("bdf-usecase3-tiny", schema=ScientificPaper)
    .filter("The paper mentions phosphorylation of Exo1")
)

# `references`: each remaining paper converted into Reference records; the
# "oneToMany" cardinality requests multiple output records per input paper.
references = papers.convert(
    Reference,
    desc="A paper cited in the reference section",
    cardinality="oneToMany",
)

### Execute Workload
Finally, we execute the workload and print the results of the processing, as well as the statistics about how much time and cost it takes to process the workload.
First, we run the workload deciding to optimize for cost using the `MinCost` policy.

In [5]:
# Run the `references` workload under the MinCost policy.
policy = pz.MinCost()
engine = pz.StreamingSequentialExecution

iterable = pz.Execute(
    references,
    allow_bonded_query=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    policy=policy,
    execution_engine=engine,
)

print()
# Each item yielded by `iterable` unpacks into (results, plan, stats).
# `stats` stays bound after the loop and is printed by a later cell.
for results, plan, stats in iterable:
    for r in results:
        # Only records with a non-empty title are printed.
        if not r.reference_title:
            continue
        print(f"{r.reference_first_author}, '{r.reference_title}', {r.reference_year}", flush=True)


Available models:  [GPT_3_5, GPT_4]

LOGICAL PLANS: 2
INITIAL PLANS: 144
INITIAL PLANS: 80
DEDUP PLANS: 60
PARETO PLANS: 3
Time for planning: 0.25363683700561523
Mimitou, 'Nucleases and helicases take center stage in homologous recombination', 2009
Shiotani, 'Single-stranded DNA orchestrates an ATM-to-ATR switch at DNA breaks', 2009
Sartori, 'Human CtIP promotes DNA end resection', 2007
Bernstein, 'At loose ends: resecting a double-strand break', 2009
None, 'A DNA exonuclease induced during meiosis of Schizosaccharomyces pombe', 2008
None, 'Molecular interactions of human Exo1 with DNA', 2005
None, 'Human exonuclease 1 functionally complements its yeast homologues in DNA recombination, RNA primer removal, and mutation avoidance', 2005
None, 'Analysis of interactions between mismatch repair initiation factors and the replication processivity factor PCNA', 2001
None, 'Exonuclease I of Saccharomyces cerevisiae functions in mitotic recombination in vivo and in vitro', 2000
None, 'Decreased

In [7]:
# `stats` is the loop variable left bound by the most recently executed
# `for results, plan, stats in iterable` cell, i.e. the last value it yielded.
print(stats)

Total_plan_time=131.50309419631958 
Total_plan_cost=0.09834699999999987 
0. MarshalAndScanDataOp time=0.14904117584228516 cost=0.0 
1. LLMConvertConventional time=29.876057386398315 cost=0.04502249999999999 
2. LLMFilter time=10.838376760482788 cost=0.0175725 
3. LLMConvertConventional time=108.7512903213493 cost=0.035751999999999874 



Since some of the references extracted from the papers are missing their authors, due to the cheaper model being used, we run the workload again using the `MaxQuality` policy to get more accurate results, at the cost of a higher price.

In [6]:
# Run the same `references` workload again, this time under the MaxQuality policy.
policy = pz.MaxQuality()
engine = pz.StreamingSequentialExecution

iterable = pz.Execute(
    references,
    allow_bonded_query=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    policy=policy,
    execution_engine=engine,
)

print()
# Each item yielded by `iterable` unpacks into (results, plan, stats).
# `stats` stays bound after the loop and is printed by a later cell.
for results, plan, stats in iterable:
    for r in results:
        # Only records with a non-empty title are printed.
        if not r.reference_title:
            continue
        print(f"{r.reference_first_author}, '{r.reference_title}', {r.reference_year}", flush=True)

Available models:  [GPT_3_5, GPT_4]

LOGICAL PLANS: 2
INITIAL PLANS: 144
INITIAL PLANS: 80
DEDUP PLANS: 60
PARETO PLANS: 3
Time for planning: 0.3397045135498047
Mimitou, E.P., 'Nucleases and helicases take center stage in homologous recombination', 2009
Shiotani, B., 'Single-stranded DNA orchestrates an ATM-to-ATR switch at DNA breaks', 2009
Sartori, A.A., 'Human CtIP promotes DNA end resection', 2007
Bernstein, K.A., 'At loose ends: resecting a double-strand break', 2009
Szankasi, P., 'A DNA exonuclease induced during meiosis of Schizosaccharomyces pombe', 1992
Lee Bi, B.I., 'Molecular interactions of human Exo1 with DNA', 2002
Qiu, J., 'Human exonuclease 1 functionally complements its yeast homologues in DNA recombination, RNA primer removal, and mutation avoidance', 1999
Lee, S.D., 'Analysis of interactions between mismatch repair initiation factors and the replication processivity factor PCNA', 2006
Kirkpatrick, D.T., 'Decreased meiotic intergenic recombination and increased meiosi

In [10]:
# `stats` is the loop variable left bound by the most recently executed
# `for results, plan, stats in iterable` cell, i.e. the last value it yielded.
print(stats)

Total_plan_time=225.37218475341797 
Total_plan_cost=0.8608459999999991 
0. MarshalAndScanDataOp time=0.1463642120361328 cost=0.0 
1. BondedQueryConvertStrategy time=23.795532941818237 cost=0.16295 
2. BondedQueryConvertStrategy time=120.77890276908877 cost=0.4626099999999992 
3. LLMFilter time=81.29191517829894 cost=0.23528599999999988 

