In [1]:
import dspy

In [2]:
from typing import Literal

In [3]:
class ResumeExtraction(dspy.Signature):
    """
    Extract the relevant information from text of resume through ocr.
    Output provide the relevant categories if available from the text.
    """
    resume_text: str = dspy.InputField()

    # output
    name: str = dspy.OutputField(desc="who's resume is it?")
    emails: str = dspy.OutputField(desc="personal email")
    phones: list[str] = dspy.OutputField(desc="personal phone number")
    addresses: list[str] = dspy.OutputField(desc="personal address")
    websites: list[str] = dspy.OutputField(desc="related websites")
    social_profiles: dict[str, str] = dspy.OutputField(desc="social media name as key : profile link as value.")

    skills_categorized: dict[str, list[str]] = dspy.OutputField(desc="set of skills extracted from resume and categorised.") 
    experience_insights: list[dict[str, str]] = dspy.OutputField(desc="list of experiences from each institution/organisation/project")
    achievement_metrics: list[dict[str, str]] =  dspy.OutputField(desc="metric infomation from achievements.")
    career_progression: dict[str, str] =  dspy.OutputField(desc="career progress timeline/ duration per position.")
    personality_traits: list[str] =  dspy.OutputField(desc="personality traits observed or mentioned.") 

In [4]:
lm = dspy.LM("ollama_chat/qwen3:4b", api_base='http://localhost:11434', api_key='')

In [5]:
dspy.configure(lm=lm)

In [6]:
module = dspy.Predict(ResumeExtraction)

In [7]:
email = "../output-structured/email_1.md"
resume = "../output-structured/resume_1.md"
scipub = "../output-structured/scipub_1.md"

In [8]:
with open(resume, "r", encoding="utf-8") as f:
    file_text = f.read()

In [9]:
text = file_text

In [10]:
response = module(resume_text=text)

In [11]:
response

Prediction(
    name='Mingming Hao',
    emails='[]',
    phones=[],
    addresses=['Jinan, PRC', 'Houston, TX'],
    websites=[],
    social_profiles={},
    skills_categorized={'Medicine': ['Medicine'], 'Cytogenetics': ['Cytogenetics'], 'Molecular Genetics': ['Molecular Genetics']},
    experience_insights=[{'institution': 'Shandong Medical University', 'position': 'Undergraduate research with Dr. Yishou Guo', 'location': 'Jinan, PRC'}, {'institution': 'University of Texas M.D. Anderson Cancer Center', 'position': 'Graduate research with Dr. Michele Sawadogo', 'location': 'Houston, TX'}],
    achievement_metrics=[{'publication': 'Repeat mole among three sisters from & family', 'year': '1988'}, {'publication': 'A simple method for premetaphase chromosome preparations', 'year': '1989'}, {'publication': 'High-resolution G-banding chromosome analysis of 32 mentally retarded children', 'year': 'In press'}],
    career_progression={'1986-89': 'Undergraduate research with Dr. Yishou Guo', '

In [12]:
from datetime import datetime

In [13]:
class EmailExtraction(dspy.Signature):
    """Enhanced email data needs to be extracted from the email text exctracted via ocr"""
    email_text: str = dspy.InputField(desc="Input data")

    # output
    email_from: str = dspy.OutputField(desc="sender email")
    email_to: list[str] = dspy.OutputField(desc="receipient emails")
    cc: list[str] = dspy.OutputField(desc="receipient emails carbon copy")
    bcc: list[str] = dspy.OutputField(desc="receipient emails blind carbon copy")
    subject: str = dspy.OutputField(desc="email subject")
    date: datetime = dspy.OutputField(desc="email sent date")
    body: str = dspy.OutputField(desc="email body")
    attachments: list[str] = dspy.OutputField(desc="mentioned attachements")

    emotional_tone: dict[str, float] = dspy.OutputField(desc="professional, friendly ... etc.")
    intent_hierarchy: list[dict[str, str]] = dspy.OutputField(desc="primary, secondary and optional intents.")  
    action_items: list[str] = dspy.OutputField()
    stakeholders: list[dict[str, str]] = dspy.OutputField()
    follow_up_required: bool = dspy.OutputField()
    priority_level: Literal['low', 'medium', 'high'] = dspy.OutputField()
    relationship_context: str = dspy.OutputField(desc="what is the email related to ?")

In [14]:
with open(email, "r", encoding="utf-8") as f:
    file_text = f.read()

In [18]:
email_module = dspy.Predict(EmailExtraction)

In [19]:
response = email_module(email_text=file_text)

In [20]:
response

Prediction(
    email_from='John.Hoel@PMMC.com',
    email_to=['Kim Tucker/OAG@OAG'],
    cc=[],
    bcc=[],
    subject="RE: Today's Roll Call",
    date=datetime.datetime(2001, 1, 4, 21, 41, 27),
    body='ImfreengsfinFloriontioructl Foolery.',
    attachments=[],
    emotional_tone={'professional': 0.5, 'friendly': 0.5, 'urgent': 0.5, 'neutral': 0.5},
    intent_hierarchy=[{'intent': 'confirm attendance', 'confidence': 'high'}, {'intent': 'request follow-up', 'confidence': 'medium'}],
    action_items=[],
    stakeholders=[{'name': 'Kim Tucker', 'role': 'recipient'}],
    follow_up_required=False,
    priority_level='high',
    relationship_context='Roll call meeting attendance confirmation'
)

In [21]:
class ScientificPaperExtractaction(dspy.Signature):
    """Extract scientific paper data with LLM insights from the ocr extracted scientific publication data."""
    scipub_text: str = dspy.InputField()

    # output
    title: str = dspy.OutputField(desc="title or best description of the scientific publication.")
    authors: list[str] = dspy.OutputField()
    affiliations: list[str] = dspy.OutputField()
    abstract: str = dspy.OutputField(desc="abstract of the paper")
    keywords: list[str] = dspy.OutputField(desc="most impactful words/tokens in the paper.")
    sections: dict[str, str] = dspy.OutputField(desc="breakdown of the sections in the paper.")
    citations: list[str] = dspy.OutputField()
    references: list[str] = dspy.OutputField()
    figures_tables: list[str] = dspy.OutputField()
    doi: str = dspy.OutputField()
    journal: str = dspy.OutputField()
    publication_date: datetime = dspy.OutputField()

    research_contribution: dict[str, str] = dspy.OutputField(desc="who and what was contributed. provide the who as key and what as value.")
    methodology_type: str = dspy.OutputField()
    research_gaps_identified: list[str] = dspy.OutputField()
    future_work_suggestions: list[str] = dspy.OutputField()

In [22]:
scipub_module = dspy.Predict(ScientificPaperExtractaction)

In [23]:
with open(scipub, "r", encoding="utf-8") as f:
    file_text = f.read()

In [24]:
response = scipub_module(scipub_text=file_text)

In [25]:
# import dspy

# # 1. Configure LMs
# ollama_lm = dspy.LM('ollama_chat/llama3.2', api_base='http://localhost:11434', api_key='')
# gpt4o = dspy.LM('openai/gpt-4o', api_key='YOUR_OPENAI_API_KEY')
# dspy.configure(lm=ollama_lm)

# # 2. Define your DSPy program
# module = dspy.ChainOfThought("question -> answer")

# # 3. Prepare your data (replace with your actual data)
# trainset = [...]  # List of dspy.Example objects
# devset = [...]    # List of dspy.Example objects

# # 4. Set up the optimizer with teacher
# optimizer = dspy.MIPROv2(
#     metric=lambda x, y, trace=None: x.answer == y.answer,
#     prompt_model=ollama_lm,
#     teacher_settings=dict(lm=gpt4o),
#     auto="medium",
#     num_threads=4
# )

# # 5. Run optimization
# optimized_module = optimizer.compile(
#     module,
#     trainset=trainset,
#     max_bootstrapped_demos=4,
#     max_labeled_demos=4,
#     requires_permission_to_run=False
# )

In [26]:
response

Prediction(
    title='Post-radiolabelling for detecting DNA damage',
    authors=['WilliamP.Watson'],
    affiliations=['ShellRcitdRecr ME9 8AG.UK'],
    abstract='The biochemical and molecular basis of cancer continues to be an expanding area of research, driven by concerns about chemical exposure risks. This paper discusses methods for detecting DNA damage, emphasizing covalent binding of carcinogens to DNA as a critical initiating event in carcinogenesis. It reviews techniques like radiolabelled carcinogen administration, DNA adduct analysis via post-labelling, and the limitations of current methods in human studies, highlighting the need for alternative approaches to assess DNA damage and cancer risks.',
    keywords=['DNA damage', 'carcinogens', 'covalent binding', 'post-radiolabelling', 'DNA adducts', 'mutagenesis', 'cancer risk assessment'],
    sections={'Introduction': 'The biochemical and molecular basis of cancer continues to be an expanding area of research. Much of the dr

In [27]:
class Extractor:
    def __init__(self):
        self.resume_extractor: dspy.Predict = dspy.Predict(ResumeExtraction)
        self.email_extractor: dspy.Predict = dspy.Predict(EmailExtraction)
        self.scipub_extractor: dspy.Predict = dspy.Predict(ScientificPaperExtractaction)

    def get_extraction(self, doc_type: str, doc_text: str):  # pyright: ignore[reportUnknownParameterType]
        match doc_type:
            case "resume":
                ex = self.resume_extractor(resume_text=doc_text)  # pyright: ignore[reportUnknownVariableType]
                return ex  # pyright: ignore[reportUnknownVariableType]
            case "scientific_publication":
                ex = self.scipub_extractor(scipub_text=doc_text)  # pyright: ignore[reportUnknownVariableType]
                return ex  # pyright: ignore[reportUnknownVariableType]
            case "email":
                ex = self.email_extractor(email_text=doc_text)  # pyright: ignore[reportUnknownVariableType]
                return ex  # pyright: ignore[reportUnknownVariableType]
            case _:
                return "Unknown document type."

In [28]:
extractor = Extractor()

In [34]:
resp = extractor.get_extraction(doc_type="scientific_publication", doc_text=file_text)

In [35]:
resp.toDict()

{'title': 'Post-radiolabelling for detecting DNA damage',
 'authors': ['WilliamP.Watson'],
 'affiliations': ['ShellRcitdRecr ME9 8AG.UK'],
 'abstract': 'The biochemical and molecular basis of cancer continues to be an expanding area of research, driven by concerns about chemical exposure risks. This paper discusses methods for detecting DNA damage, emphasizing covalent binding of carcinogens to DNA as a critical initiating event in carcinogenesis. It reviews techniques like radiolabelled carcinogen administration, DNA adduct analysis via post-labelling, and the limitations of current methods in human studies, highlighting the need for alternative approaches to assess DNA damage and cancer risks.',
 'keywords': ['DNA damage',
  'carcinogens',
  'covalent binding',
  'post-radiolabelling',
  'DNA adducts',
  'mutagenesis',
  'cancer risk assessment'],
 'sections': {'Introduction': 'The biochemical and molecular basis of cancer continues to be an expanding area of research. Much of the dr