In [1]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from dataclasses import dataclass, field
import json
import asyncio
from enum import Enum
import dspy

In [2]:
class LLMProvider(Enum):
    OPENAI="openai"
    ANTHROPIC = "anthropic"
    COHERE= "cohere"
    LOCAL = "local"

In [3]:
@dataclass
class EnhancedResumeData:
    """Enhanced resume data with LLM-extracted insights"""
    # SpaCy extracted data
    basic_info: dict[str, any] = field(default_factory=dict)
    
    # LLM enhanced data
    skills_categorized: dict[str, list[str]] = field(default_factory=dict)  
    experience_insights: list[dict[str, any]] = field(default_factory=list)
    achievement_metrics: list[dict[str, any]] = field(default_factory=list)
    career_progression: dict[str, any] = field(default_factory=dict)
    personality_traits: list[str] = field(default_factory=list)
    industry_fit: dict[str, float] = field(default_factory=dict) 

In [4]:
@dataclass
class EnhancedEmailData:
    """Enhanced email data with LLM analysis"""
    # SpaCy extracted data
    basic_structure: dict[str, any] = field(default_factory=dict)
    
    # LLM enhanced data
    emotional_tone: dict[str, float] = field(default_factory=dict)  # professional, friendly, urgent, etc.
    intent_hierarchy: list[dict[str, any]] = field(default_factory=list)  # primary, secondary intents
    action_items: list[str] = field(default_factory=list)
    stakeholders: list[dict[str, any]] = field(default_factory=list)
    follow_up_required: bool = False
    priority_level: str = "medium"
    relationship_context: str = ""

In [5]:
@dataclass
class EnhancedScientificData:
    """Enhanced scientific paper data with LLM insights"""
    # SpaCy extracted data
    basic_structure: dict[str, any] = field(default_factory=dict)
    
    # LLM enhanced data
    research_contribution: dict[str, any] = field(default_factory=dict)
    methodology_type: str = ""
    novelty_assessment: dict[str, float] = field(default_factory=dict)
    research_gaps_identified: list[str] = field(default_factory=list)
    future_work_suggestions: list[str] = field(default_factory=list)
    interdisciplinary_connections: list[str] = field(default_factory=list)
    reproducibility_score: float = 0.0

In [6]:
@dataclass
class ContactInfo:
    """Structure for contact information"""
    emails: list[str] = field(default_factory=list)
    phones: list[str] = field(default_factory=list)
    addresses: list[str] = field(default_factory=list)
    websites: list[str] = field(default_factory=list)
    social_profiles: dict[str, str] = field(default_factory=dict)

In [7]:
from datetime import datetime
from collections import defaultdict

In [8]:
@dataclass
class ResumeData:
    """Structured resume data"""
    personal_info: ContactInfo = field(default_factory=ContactInfo)
    summary: str = ""
    skills: list[str] = field(default_factory=list)
    experience: list[dict[str, any]] = field(default_factory=list)
    education: list[dict[str, any]] = field(default_factory=list)
    certifications: list[str] = field(default_factory=list)
    languages: list[str] = field(default_factory=list)

@dataclass
class EmailData:
    """Structured email data"""
    sender: str = ""
    recipients: list[str] = field(default_factory=list)
    cc: list[str] = field(default_factory=list)
    bcc: list[str] = field(default_factory=list)
    subject: str = ""
    date: datetime | None = None
    body: str = ""
    attachments: list[str] = field(default_factory=list)
    sentiment: str = ""
    intent: str = ""
    entities: list[dict[str, any]] = field(default_factory=list)

In [18]:
@dataclass
class ScientificPaperData:
    """Structured scientific paper data"""
    title: str = ""
    authors: list[str] = field(default_factory=list)
    affiliations: list[str] = field(default_factory=list)
    abstract: str = ""
    keywords: list[str] = field(default_factory=list)
    sections: dict[str, str] = field(default_factory=dict)
    citations: list[str] = field(default_factory=list)
    references: list[str] = field(default_factory=list)
    figures_tables: list[str] = field(default_factory=list)
    doi: str = ""
    journal: str = ""
    publication_date: datetime | None = None


In [9]:
from spacy.language import Language

In [10]:
@Language.component("custom_entity_ruler")
def custom_entity_ruler(doc):
    """Custom component for domain-specific entity recognition"""
    return doc

In [11]:
import re

In [12]:
from spacy.tokens import Doc, Span

In [13]:
class DocumentParser:
    """Base class for document parsing using SpaCy building blocks"""
    
    def __init__(self, model_name: str = "en_core_web_sm"):
        """Initialize the parser with SpaCy components"""
        # Load Language model
        self.nlp = spacy.load(model_name)
        
        # Initialize matchers
        self.matcher = Matcher(self.nlp.vocab)
        self.phrase_matcher = PhraseMatcher(self.nlp.vocab)
        
        # Setup custom patterns and rules
        self._setup_patterns()

        # Add custom pipeline components
        if "custom_entity_ruler" not in self.nlp.pipe_names:
            self.nlp.add_pipe("custom_entity_ruler", last=True)
    
    def _setup_patterns(self):
        """Setup common patterns for all document types"""
        # Email patterns
        email_pattern = [{"LIKE_EMAIL": True}]
        self.matcher.add("EMAIL", [email_pattern])
        
        # Phone patterns
        phone_patterns = [
            [{"SHAPE": "ddd-ddd-dddd"}],
            [{"SHAPE": "(ddd) ddd-dddd"}],
            [{"TEXT": {"REGEX": r"\+?\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}"}}]
        ]
        self.matcher.add("PHONE", phone_patterns)
        
        # URL patterns
        url_pattern = [{"LIKE_URL": True}]
        self.matcher.add("URL", [url_pattern])
        
        # Date patterns
        date_patterns = [
            [{"SHAPE": "dd/dd/dddd"}],
            [{"SHAPE": "dd-dd-dddd"}],
            [{"ENT_TYPE": "DATE"}]
        ]
        self.matcher.add("DATE", date_patterns)

    def preprocess_text(self, text: str) -> str:
        """Clean and preprocess text"""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters that might interfere with parsing
        text = re.sub(r'[^\w\s@.-]', ' ', text)
        return text.strip()
    
    def extract_entities(self, doc: Doc) -> list[dict[str, any]]:
        """Extract named entities using SpaCy's EntityRecognizer"""
        entities = []
        for ent in doc.ents:
            entities.append({
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
                'confidence': getattr(ent, 'kb_id_', 0.0)
            })
        return entities
    
    def extract_contact_info(self, doc: Doc) -> ContactInfo:
        """Extract contact information using matchers"""
        contact = ContactInfo()
        
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            label = self.nlp.vocab.strings[match_id]
            span = doc[start:end]
            
            if label == "EMAIL":
                contact.emails.append(span.text)
            elif label == "PHONE":
                contact.phones.append(span.text)
            elif label == "URL":
                contact.websites.append(span.text)
        
        return contact

In [15]:
class ResumeParser(DocumentParser):
    """Specialized parser for resumes"""
    
    def __init__(self, model_name: str = "en_core_web_sm"):
        super().__init__(model_name)
        self._setup_resume_patterns()
    
    def _setup_resume_patterns(self):
        """Setup resume-specific patterns"""
        # Skills section patterns
        skills_headers = ["skills", "technical skills", "core competencies", "expertise"]
        skills_patterns = [[{"LOWER": {"IN": skills_headers}}]]
        self.matcher.add("SKILLS_HEADER", skills_patterns)
        
        # Experience section patterns
        exp_headers = ["experience", "work experience", "employment", "professional experience"]
        exp_patterns = [[{"LOWER": {"IN": exp_headers}}]]
        self.matcher.add("EXPERIENCE_HEADER", exp_patterns)
        
        # Education section patterns
        edu_headers = ["education", "academic background", "qualifications"]
        edu_patterns = [[{"LOWER": {"IN": edu_headers}}]]
        self.matcher.add("EDUCATION_HEADER", edu_patterns)
        
        # Job title patterns
        job_titles = ["manager", "developer", "engineer", "analyst", "director", "specialist"]
        job_patterns = [[{"LOWER": {"IN": job_titles}}]]
        self.matcher.add("JOB_TITLE", job_patterns)
    
    def parse(self, text: str) -> ResumeData:
        """Parse resume text and extract structured data"""
        text = self.preprocess_text(text)
        doc = self.nlp(text)
        
        resume_data = ResumeData()
        
        # Extract contact information
        resume_data.personal_info = self.extract_contact_info(doc)
        
        # Extract sections using sentence segmentation
        sections = self._identify_sections(doc)
        
        # Parse each section
        for section_name, section_text in sections.items():
            section_doc = self.nlp(section_text)
            
            if "skill" in section_name.lower():
                resume_data.skills = self._extract_skills(section_doc)
            elif "experience" in section_name.lower():
                resume_data.experience = self._extract_experience(section_doc)
            elif "education" in section_name.lower():
                resume_data.education = self._extract_education(section_doc)
            elif "summary" in section_name.lower() or "objective" in section_name.lower():
                resume_data.summary = section_text
        
        return resume_data
    
    def _identify_sections(self, doc: Doc) -> dict[str, str]:
        """Use dependency parsing and pattern matching to identify resume sections"""
        sections = {}
        matches = self.matcher(doc)
        
        section_starts = []
        for match_id, start, end in matches:
            label = self.nlp.vocab.strings[match_id]
            if "_HEADER" in label:
                section_starts.append((start, label.replace("_HEADER", "").lower(), end))
        
        # Sort by position
        section_starts.sort(key=lambda x: x[0])
        
        # Extract text between section headers
        for i, (start, section_name, end) in enumerate(section_starts):
            next_start = section_starts[i + 1][0] if i + 1 < len(section_starts) else len(doc)
            section_text = doc[end:next_start].text
            sections[section_name] = section_text
        
        return sections
    
    def _extract_skills(self, doc: Doc) -> list[str]:
        """Extract skills using NER and POS tagging"""
        skills = []
        
        # Extract noun phrases as potential skills
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) <= 3:  # Short phrases likely to be skills
                skills.append(chunk.text)
        
        # Extract technical terms (proper nouns, specific patterns)
        for token in doc:
            if token.pos_ == "PROPN" and not token.ent_type_:
                skills.append(token.text)
        
        return list(set(skills))  # Remove duplicates
    
    def _extract_experience(self, doc: Doc) -> list[dict[str, any]]:
        """Extract work experience using NER and pattern matching"""
        experiences = []
        
        # Use sentence boundaries to separate different experiences
        for sent in doc.sents:
            exp_entry = {}
            
            # Extract organizations
            orgs = [ent.text for ent in sent.ents if ent.label_ == "ORG"]
            if orgs:
                exp_entry["company"] = orgs[0]
            
            # Extract dates
            dates = [ent.text for ent in sent.ents if ent.label_ == "DATE"]
            if dates:
                exp_entry["duration"] = dates
            
            # Extract job titles using pattern matching
            matches = self.matcher(sent)
            for match_id, start, end in matches:
                if self.nlp.vocab.strings[match_id] == "JOB_TITLE":
                    exp_entry["title"] = sent[start:end].text
            
            if exp_entry:
                exp_entry["description"] = sent.text
                experiences.append(exp_entry)
        
        return experiences
    
    def _extract_education(self, doc: Doc) -> list[dict[str, any]]:
        """Extract education information"""
        education = []
        
        for sent in doc.sents:
            edu_entry = {}
            
            # Extract organizations (schools/universities)
            orgs = [ent.text for ent in sent.ents if ent.label_ == "ORG"]
            if orgs:
                edu_entry["institution"] = orgs[0]
            
            # Extract dates
            dates = [ent.text for ent in sent.ents if ent.label_ == "DATE"]
            if dates:
                edu_entry["graduation_date"] = dates[0]
            
            if edu_entry:
                edu_entry["description"] = sent.text
                education.append(edu_entry)
        
        return education

In [16]:
class EmailParser(DocumentParser):
    """Specialized parser for emails"""
    
    def __init__(self, model_name: str = "en_core_web_sm"):
        super().__init__(model_name)
        self._setup_email_patterns()
    
    def _setup_email_patterns(self):
        """Setup email-specific patterns"""
        # Email header patterns
        header_patterns = [
            [{"LOWER": "from"}, {"TEXT": ":"}],
            [{"LOWER": "to"}, {"TEXT": ":"}],
            [{"LOWER": "subject"}, {"TEXT": ":"}],
            [{"LOWER": "date"}, {"TEXT": ":"}]
        ]
        self.matcher.add("EMAIL_HEADER", header_patterns)
        
        # Greeting patterns
        greetings = ["dear", "hello", "hi", "greetings"]
        greeting_patterns = [[{"LOWER": {"IN": greetings}}]]
        self.matcher.add("GREETING", greeting_patterns)
        
        # Closing patterns
        closings = ["sincerely", "regards", "best", "thanks", "thank you"]
        closing_patterns = [[{"LOWER": {"IN": closings}}]]
        self.matcher.add("CLOSING", closing_patterns)
    
    def parse(self, text: str) -> EmailData:
        """Parse email text and extract structured data"""
        text = self.preprocess_text(text)
        doc = self.nlp(text)
        
        email_data = EmailData()
        
        # Extract header information
        self._extract_headers(text, email_data)
        
        # Extract body content
        email_data.body = self._extract_body(doc)
        
        # Extract entities
        email_data.entities = self.extract_entities(doc)
        
        # Analyze sentiment using TextCategorizer (if available)
        email_data.sentiment = self._analyze_sentiment(doc)
        
        # Determine intent
        email_data.intent = self._determine_intent(doc)
        
        return email_data
    
    def _extract_headers(self, text: str, email_data: EmailData):
        """Extract email headers using regex patterns"""
        # Simple regex patterns for email headers
        from_match = re.search(r'From:\s*(.+)', text, re.IGNORECASE)
        if from_match:
            email_data.sender = from_match.group(1).strip()
        
        to_match = re.search(r'To:\s*(.+)', text, re.IGNORECASE)
        if to_match:
            email_data.recipients = [r.strip() for r in to_match.group(1).split(',')]
        
        subject_match = re.search(r'Subject:\s*(.+)', text, re.IGNORECASE)
        if subject_match:
            email_data.subject = subject_match.group(1).strip()
    
    def _extract_body(self, doc: Doc) -> str:
        """Extract email body content, removing headers"""
        # Find the start of the actual content (after headers)
        body_start = 0
        for i, token in enumerate(doc):
            if token.text.lower() in ['dear', 'hello', 'hi'] or token.like_email:
                body_start = i
                break
        
        return doc[body_start:].text
    
    def _analyze_sentiment(self, doc: Doc) -> str:
        """Analyze email sentiment (simplified version)"""
        positive_words = ['thank', 'great', 'excellent', 'good', 'appreciate']
        negative_words = ['sorry', 'problem', 'issue', 'urgent', 'disappointed']
        
        pos_count = sum(1 for token in doc if token.text.lower() in positive_words)
        neg_count = sum(1 for token in doc if token.text.lower() in negative_words)
        
        if pos_count > neg_count:
            return "positive"
        elif neg_count > pos_count:
            return "negative"
        else:
            return "neutral"
    
    def _determine_intent(self, doc: Doc) -> str:
        """Determine email intent using pattern matching"""
        question_indicators = ['?', 'how', 'what', 'when', 'where', 'why', 'can you']
        request_indicators = ['please', 'could you', 'would you', 'need']
        
        text_lower = doc.text.lower()
        
        if any(indicator in text_lower for indicator in question_indicators):
            return "inquiry"
        elif any(indicator in text_lower for indicator in request_indicators):
            return "request"
        else:
            return "informational"

In [19]:
class ScientificPaperParser(DocumentParser):
    """Specialized parser for scientific publications"""
    
    def __init__(self, model_name: str = "en_core_web_sm"):
        super().__init__(model_name)
        self._setup_scientific_patterns()
    
    def _setup_scientific_patterns(self):
        """Setup scientific paper-specific patterns"""
        # Section headers
        section_headers = [
            "abstract", "introduction", "methodology", "methods", "results", 
            "discussion", "conclusion", "references", "acknowledgments"
        ]
        section_patterns = [[{"LOWER": {"IN": section_headers}}]]
        self.matcher.add("SECTION_HEADER", section_patterns)
        
        # Citation patterns
        citation_patterns = [
            [{"TEXT": "("}, {"LIKE_NUM": True}, {"TEXT": ")"}],
            [{"TEXT": "["}, {"LIKE_NUM": True}, {"TEXT": "]"}]
        ]
        self.matcher.add("CITATION", citation_patterns)
        
        # Figure/Table references
        fig_table_patterns = [
            [{"LOWER": "figure"}, {"LIKE_NUM": True}],
            [{"LOWER": "table"}, {"LIKE_NUM": True}],
            [{"LOWER": "fig"}, {"TEXT": "."}, {"LIKE_NUM": True}]
        ]
        self.matcher.add("FIG_TABLE", fig_table_patterns)
    
    def parse(self, text: str) -> ScientificPaperData:
        """Parse scientific paper and extract structured data"""
        text = self.preprocess_text(text)
        doc = self.nlp(text)
        
        paper_data = ScientificPaperData()
        
        # Extract title (usually first sentence or line)
        paper_data.title = self._extract_title(doc)
        
        # Extract authors and affiliations
        paper_data.authors, paper_data.affiliations = self._extract_authors_affiliations(doc)
        
        # Extract sections
        paper_data.sections = self._extract_sections(doc)
        
        # Extract abstract
        if "abstract" in paper_data.sections:
            paper_data.abstract = paper_data.sections["abstract"]
        
        # Extract citations and references
        paper_data.citations = self._extract_citations(doc)
        paper_data.references = self._extract_references(doc)
        
        # Extract figures and tables
        paper_data.figures_tables = self._extract_figures_tables(doc)
        
        # Extract keywords using NER and noun phrases
        paper_data.keywords = self._extract_keywords(doc)
        
        return paper_data
    
    def _extract_title(self, doc: Doc) -> str:
        """Extract paper title (typically the first sentence)"""
        # Find the first sentence that's likely a title
        for sent in doc.sents:
            if len(sent.text.split()) > 3 and not sent.text.lower().startswith(('abstract', 'keywords')):
                return sent.text.strip()
        return ""
    
    def _extract_authors_affiliations(self, doc: Doc) -> tuple[list[str], list[str]]:
        """Extract authors and their affiliations using NER"""
        authors = []
        affiliations = []
        
        # Look for person entities in the first few sentences
        for i, sent in enumerate(doc.sents):
            if i > 5:  # Authors usually appear early
                break
            
            for ent in sent.ents:
                if ent.label_ == "PERSON":
                    authors.append(ent.text)
                elif ent.label_ == "ORG":
                    affiliations.append(ent.text)
        
        return list(set(authors)), list(set(affiliations))
    
    def _extract_sections(self, doc: Doc) -> dict[str, str]:
        """Extract paper sections using pattern matching"""
        sections = {}
        matches = self.matcher(doc)
        
        section_starts = []
        for match_id, start, end in matches:
            if self.nlp.vocab.strings[match_id] == "SECTION_HEADER":
                section_name = doc[start:end].text.lower()
                section_starts.append((start, section_name))
        
        # Sort by position and extract content
        section_starts.sort(key=lambda x: x[0])
        
        for i, (start, section_name) in enumerate(section_starts):
            next_start = section_starts[i + 1][0] if i + 1 < len(section_starts) else len(doc)
            section_content = doc[start:next_start].text
            sections[section_name] = section_content
        
        return sections
    
    def _extract_citations(self, doc: Doc) -> list[str]:
        """Extract in-text citations"""
        citations = []
        matches = self.matcher(doc)
        
        for match_id, start, end in matches:
            if self.nlp.vocab.strings[match_id] == "CITATION":
                citations.append(doc[start:end].text)
        
        return list(set(citations))
    
    def _extract_references(self, doc: Doc) -> list[str]:
        """Extract reference list (simplified)"""
        references = []
        
        # Look for references section
        if "references" in self._extract_sections(doc):
            ref_section = self._extract_sections(doc)["references"]
            ref_doc = self.nlp(ref_section)
            
            # Each sentence in references section is likely a reference
            for sent in ref_doc.sents:
                if len(sent.text.strip()) > 20:  # Filter out short lines
                    references.append(sent.text.strip())
        
        return references
    
    def _extract_figures_tables(self, doc: Doc) -> list[str]:
        """Extract figure and table references"""
        fig_tables = []
        matches = self.matcher(doc)
        
        for match_id, start, end in matches:
            if self.nlp.vocab.strings[match_id] == "FIG_TABLE":
                fig_tables.append(doc[start:end].text)
        
        return list(set(fig_tables))
    
    def _extract_keywords(self, doc: Doc) -> list[str]:
        """Extract keywords using noun phrases and NER"""
        keywords = []
        
        # Extract important noun phrases
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) <= 3 and chunk.root.pos_ in ["NOUN", "PROPN"]:
                keywords.append(chunk.text.lower())
        
        # Extract technical terms (entities not recognized as standard types)
        for ent in doc.ents:
            if ent.label_ in ["PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"]:
                keywords.append(ent.text.lower())
        
        return list(set(keywords))

In [20]:
def create_parser(document_type: str) -> DocumentParser:
    """Factory function to create appropriate parser based on document type"""
    parsers = {
        "resume": ResumeParser,
        "email": EmailParser,
        "scientific": ScientificPaperParser
    }
    
    if document_type not in parsers:
        raise ValueError(f"Unknown document type: {document_type}")
    
    return parsers[document_type]()

In [33]:
email = "../output-structured/email_1.md"
resume = "../output-structured/resume_2.md"
scipub = "../output-structured/scipub_1.md"

In [38]:
with open(scipub, "r", encoding="utf-8") as f:
    file_text = f.read()

In [39]:
file_text

'# Post-radiolabelling for detecting DNA damage \n\n##### Mutagenesks vol.2 no.5 pp.319–331,1987\n\n## REVIEW \n\n### WilliamP.Watson \n\nShellRcitdRecr ME9 8AG.UK \n\n\n\n## I ntroduction \n\nThe biochemical and molecular basis of cancer continues to be an expanding area of research. Much of the driving force behind this important multi-disciplinary area is due to concerns about increasing human healthrisks caused by exposure to toxic chemicalsofbothindustrial andnaturalorigin.Insomeinsances the aetiological agents.for example vinyl chloride.as a cause of angiosarcomas in human liver,and tobaccosmoke asthemajor cause of lung cancer.have been identified byclinical and epidemiological studies involvinghigh-exposure poplations.There is.howeveran increasing nced for reliable prospective methods for predicting human cancer risks as a result ofchemical exposures.Thesomaticmutaionthoryofcancersuggests tat cancer is caused by genetic damage induced by chemicals or viruses. Such mutations resu

In [30]:
resume_parser = create_parser("resume")

In [40]:
scipub_parser = create_parser("scientific")

In [26]:
email_data = resume_parser.parse(file_text)

In [36]:
resume_data = resume_parser.parse(file_text)

In [41]:
scipub_data = resume_parser.parse(file_text)

In [27]:
email_data

EmailData(sender='', recipients=[], cc=[], bcc=[], subject='', date=None, body='John.Hoel@PMMC.comon 01 04 200109 41 27 PM To  Kim Tucker OAG@OAG cc Subject  RE  Today s Roll Call ImfreengsfinFloriontioructl Foolery.', attachments=[], sentiment='neutral', intent='informational', entities=[{'text': '01 04', 'label': 'DATE', 'start': 21, 'end': 26, 'confidence': ''}, {'text': '200109', 'label': 'DATE', 'start': 27, 'end': 33, 'confidence': ''}, {'text': '41 27 PM', 'label': 'TIME', 'start': 34, 'end': 42, 'confidence': ''}, {'text': 'Kim Tucker', 'label': 'PERSON', 'start': 47, 'end': 57, 'confidence': ''}])

In [37]:
resume_data

ResumeData(personal_info=ContactInfo(emails=[], phones=['1966', '1971', '1971', '1974', '1967', '1970Reverend', '1970', '1973Fellow', '1989', '1992Academic', '1990', '1991', '1992Chairman', '1986', '1990', '1988', '1990Editor', '1983', '1991', '3715', '3719.Yen', '1991', '5077', '5081', '1991', '46264630'], addresses=[], websites=[], social_profiles={}), summary='', skills=[], experience=[], education=[], certifications=[], languages=[])

In [42]:
scipub_data

ResumeData(personal_info=ContactInfo(emails=[], phones=['1987', '1981', 'Swenberg.1978.genere', 'Schimke.1982', 'Lavi1982', '1981', '1981', '1982', '1979', '1982', 'K.Randerathetal.1981', '1984a', '1985a', '1982', '1980', '1980', '1981', 'Sanger.1981', '1983'], addresses=[], websites=['agents.for', 'chloride.as', 'cancer.have', 'is.howeveran', 'which.has', 'Swenberg.1978.genere', 'carcinogenesis.However.the', 'carcinogens.in', 'animal.However.this', 'thehydrolysateonh.plc.withappropriatesynteticrerence', 'polycyclicaromatichydrocarbons.aromaticaminesandmycotoxinshave', 'can.arise', 'ofh.p.l.c.has'], social_profiles={}), summary='', skills=[], experience=[], education=[], certifications=[], languages=[])

In [43]:
text_ex = """
John Doe
    Software Engineer
    Email: john.doe@email.com
    Phone: (555) 123-4567
    
    SUMMARY
    Experienced software engineer with 5 years in web development.
    
    SKILLS
    Python, JavaScript, React, Node.js, SQL, Git
    
    EXPERIENCE
    Senior Developer at Tech Corp (2020-2023)
    - Led development of web applications
    - Managed team of 3 developers
    
    EDUCATION
    BS Computer Science, State University (2018)
"""


resume_parser = create_parser("resume")
resume_data = resume_parser.parse(text_ex)

In [44]:
resume_data

ResumeData(personal_info=ContactInfo(emails=['john.doe@email.com'], phones=['4567', '2020', '2023', '2018'], addresses=[], websites=['Node.js'], social_profiles={}), summary='', skills=['Python', 'Git'], experience=[{'company': 'Tech Corp  ', 'duration': ['2020-2023'], 'title': 'Developer', 'description': 'Senior Developer at Tech Corp  2020-2023  - Led development of web applications - Managed team of 3 developers'}], education=[{'institution': 'BS Computer Science  State University  ', 'graduation_date': '2018', 'description': 'BS Computer Science  State University  2018'}], certifications=[], languages=[])

In [45]:
class LLMEnhancer:
    """LLM integration for semantic understanding and advanced extraction"""
    
    def __init__(self, provider: LLMProvider = LLMProvider.LOCAL, model_name: str = "llama3"):
        self.provider = provider
        self.model_name = model_name
        self._setup_client()
    
    def _setup_client(self):
        """Setup LLM client based on provider"""
        if self.provider == LLMProvider.LOCAL:
            # For local models like Ollama
            self.client = None  # Initialize your local LLM client
        elif self.provider == LLMProvider.OPENAI:
            # import openai
            # self.client = openai.OpenAI(api_key="your-key")
            pass
        # Add other providers as needed
    
    async def analyze_with_llm(self, text: str, prompt_template: str, schema: dict = None) -> dict[str, any]:
        """Generic LLM analysis method"""
        # This is a placeholder - implement based on your LLM provider
        # For demonstration purposes
        
        prompt = prompt_template.format(text=text)
        
        # Mock response structure - replace with actual LLM call
        if "resume" in prompt_template.lower():
            return self._mock_resume_analysis()
        elif "email" in prompt_template.lower():
            return self._mock_email_analysis()
        elif "scientific" in prompt_template.lower():
            return self._mock_scientific_analysis()
        
        return {}
    
    def _mock_resume_analysis(self) -> dict[str, any]:
        """Mock LLM resume analysis - replace with actual implementation"""
        return {
            "skills_categorized": {
                "technical": ["Python", "Machine Learning", "SQL"],
                "soft": ["Leadership", "Communication", "Problem Solving"],
                "domain": ["Healthcare", "Fintech"]
            },
            "achievement_metrics": [
                {"achievement": "Increased system performance", "metric": "40%", "impact": "high"}
            ],
            "career_progression": {
                "trend": "upward",
                "leadership_growth": True,
                "technical_depth": "increasing"
            }
        }
    
    def _mock_email_analysis(self) -> dict[str, any]:
        """Mock LLM email analysis"""
        return {
            "emotional_tone": {"professional": 0.8, "urgent": 0.3, "friendly": 0.6},
            "intent_hierarchy": [
                {"intent": "request_information", "confidence": 0.9},
                {"intent": "schedule_meeting", "confidence": 0.7}
            ],
            "action_items": ["Provide quarterly report", "Schedule follow-up meeting"]
        }
    
    def _mock_scientific_analysis(self) -> dict[str, any]:
        """Mock LLM scientific paper analysis"""
        return {
            "research_contribution": {
                "type": "methodological",
                "novelty": "moderate",
                "significance": "high"
            },
            "methodology_type": "experimental",
            "research_gaps_identified": ["Limited sample size", "Geographic bias"]
        }

In [50]:
class HybridDocumentParser:
    """Hybrid parser combining SpaCy efficiency with LLM intelligence"""
    
    def __init__(self, use_llm: bool = True, llm_provider: LLMProvider = LLMProvider.LOCAL):
        # Initialize SpaCy components
        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)
        self.phrase_matcher = PhraseMatcher(self.nlp.vocab)
        
        # Initialize LLM enhancer
        self.use_llm = use_llm
        if use_llm:
            self.llm_enhancer = LLMEnhancer(provider=llm_provider)
        
        self._setup_patterns()
    
    def _setup_patterns(self):
        """Setup SpaCy patterns for fast extraction"""
        # Email patterns
        email_pattern = [{"LIKE_EMAIL": True}]
        self.matcher.add("EMAIL", [email_pattern])
        
        # Phone patterns
        phone_patterns = [
            [{"SHAPE": "ddd-ddd-dddd"}],
            [{"SHAPE": "(ddd) ddd-dddd"}]
        ]
        self.matcher.add("PHONE", phone_patterns)
        
        # Skills patterns (common technical terms)
        tech_skills = ["python", "javascript", "sql", "react", "kubernetes", "aws"]
        skill_patterns = [self.nlp(skill) for skill in tech_skills]
        self.phrase_matcher.add("TECH_SKILLS", skill_patterns)
    
    async def parse_resume(self, text: str, enhanced: bool = True) -> dict | EnhancedResumeData:
        """Parse resume with optional LLM enhancement"""
        # Phase 1: Fast SpaCy extraction
        doc = self.nlp(text)
        basic_data = self._extract_basic_resume_data(doc)
        
        if not enhanced or not self.use_llm:
            return basic_data
        
        # Phase 2: LLM enhancement for complex understanding
        llm_insights = await self._enhance_resume_with_llm(text, basic_data)
        
        return EnhancedResumeData(
            basic_info=basic_data,
            **llm_insights
        )
    
    async def parse_email(self, text: str, enhanced: bool = True) -> dict | EnhancedEmailData:
        """Parse email with optional LLM enhancement"""
        # Phase 1: SpaCy structure extraction
        doc = self.nlp(text)
        basic_data = self._extract_basic_email_data(doc)
        
        if not enhanced or not self.use_llm:
            return basic_data
        
        # Phase 2: LLM semantic analysis
        llm_insights = await self._enhance_email_with_llm(text, basic_data)
        
        return EnhancedEmailData(
            basic_structure=basic_data,
            **llm_insights
        )
    
    async def parse_scientific_paper(self, text: str, enhanced: bool = True) -> dict | EnhancedScientificData:
        """Parse scientific paper with optional LLM enhancement"""
        # Phase 1: SpaCy structural extraction
        doc = self.nlp(text)
        basic_data = self._extract_basic_scientific_data(doc)
        
        if not enhanced or not self.use_llm:
            return basic_data
        
        # Phase 2: LLM research intelligence
        llm_insights = await self._enhance_scientific_with_llm(text, basic_data)
        
        return EnhancedScientificData(
            basic_structure=basic_data,
            **llm_insights
        )
    
    def _extract_basic_resume_data(self, doc) -> dict[str, any]:
        """Fast SpaCy-based resume extraction"""
        data = {
            "emails": [],
            "phones": [],
            "skills": [],
            "organizations": [],
            "dates": []
        }
        
        # Pattern matching for structured data
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            label = self.nlp.vocab.strings[match_id]
            span_text = doc[start:end].text
            
            if label == "EMAIL":
                data["emails"].append(span_text)
            elif label == "PHONE":
                data["phones"].append(span_text)
        
        # Entity extraction
        for ent in doc.ents:
            if ent.label_ == "ORG":
                data["organizations"].append(ent.text)
            elif ent.label_ == "DATE":
                data["dates"].append(ent.text)
        
        # Phrase matching for skills
        phrase_matches = self.phrase_matcher(doc)
        for match_id, start, end in phrase_matches:
            if self.nlp.vocab.strings[match_id] == "TECH_SKILLS":
                data["skills"].append(doc[start:end].text)
        
        return data
    
    def _extract_basic_email_data(self, doc) -> dict[str, any]:
        """Fast SpaCy-based email extraction"""
        return {
            "entities": [{"text": ent.text, "label": ent.label_} for ent in doc.ents],
            "sentences": [sent.text for sent in doc.sents],
            "emails": [token.text for token in doc if token.like_email],
            "word_count": len([token for token in doc if not token.is_space])
        }
    
    def _extract_basic_scientific_data(self, doc) -> dict[str, any]:
        """Fast SpaCy-based scientific paper extraction"""
        return {
            "entities": [{"text": ent.text, "label": ent.label_} for ent in doc.ents],
            "noun_phrases": [chunk.text for chunk in doc.noun_chunks],
            "sentences": [sent.text for sent in doc.sents[:10]],  # First 10 sentences
            "organizations": [ent.text for ent in doc.ents if ent.label_ == "ORG"],
            "people": [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        }
    
    async def _enhance_resume_with_llm(self, text: str, basic_data: dict) -> dict[str, any]:
        """Use LLM for advanced resume analysis"""
        
        resume_prompt = """
        Analyze this resume text and provide insights in JSON format:
        
        Resume Text: {text}
        
        Please categorize skills, extract achievement metrics, assess career progression,
        and identify personality traits. Focus on semantic understanding and context.
        
        Return structured JSON with skills_categorized, achievement_metrics, career_progression fields.
        """
        
        return await self.llm_enhancer.analyze_with_llm(text, resume_prompt)
    
    async def _enhance_email_with_llm(self, text: str, basic_data: dict) -> dict[str, any]:
        """Use LLM for advanced email analysis"""
        
        email_prompt = """
        Analyze this email for emotional tone, intent hierarchy, and action items:
        
        Email Text: {text}
        
        Provide emotional tone scores, ranked intents, extracted action items,
        and assess urgency and relationship context.
        
        Return JSON with emotional_tone, intent_hierarchy, action_items fields.
        """
        
        return await self.llm_enhancer.analyze_with_llm(text, email_prompt)
    
    async def _enhance_scientific_with_llm(self, text: str, basic_data: dict) -> dict[str, any]:
        """Use LLM for advanced scientific paper analysis"""
        
        scientific_prompt = """
        Analyze this scientific paper excerpt for research contribution and methodology:
        
        Paper Text: {text}
        
        Assess the research contribution type, methodology, novelty, and identify
        research gaps and future work suggestions.
        
        Return JSON with research_contribution, methodology_type, research_gaps_identified fields.
        """
        
        return await self.llm_enhancer.analyze_with_llm(text, scientific_prompt)

In [64]:
async def choose_strategy(mode: str, file_text: str, parser: HybridDocumentParser) -> str:
    """
    Choose parsing strategy based on requirements:
    """
        
    match mode:
        case "quick":
            parsed_data = await parser.parse_resume(file_text, enhanced=False)
            return parsed_data
            
        case "enhanced":
            try:
                parsed_data = await parser.parse_resume(file_text, enhanced=True)
                return parsed_data
            except Exception as e:
                print(f"exception occured {e}")
                parsed_data = await parser.parse_resume(file_text, enhanced=False)
                return parsed_data
            
        case _:
            raise Exception("UnSupported mode used")

In [52]:
# Initialize hybrid parser
parser = HybridDocumentParser(use_llm=True)

In [54]:
# Strategy 1: SpaCy only (fast, basic)
basic_result = await parser.parse_resume(file_text, enhanced=False)
print("Basic extraction:", basic_result)

Basic extraction: {'emails': [], 'phones': [], 'skills': [], 'organizations': ['formationofDNAadductsbyindirectmechanismscanalsooccur', 'DNAsuch', 'mutagen', 'carcinogen', 'deoxyribonucleoside3'], 'dates': ['Several years ago', '1981']}


In [59]:
# Strategy 2: Full LLM enhancement (slow, comprehensive)
enhanced_result = await parser.parse_resume(file_text, enhanced=True)
print("Enhanced extraction:", enhanced_result.skills_categorized)

Enhanced extraction: {'technical': ['Python', 'Machine Learning', 'SQL'], 'soft': ['Leadership', 'Communication', 'Problem Solving'], 'domain': ['Healthcare', 'Fintech']}


In [69]:
strategy = await choose_strategy(
    mode="enhanced",
    file_text=file_text,
    parser=parser
)
print(f"Recommended strategy: {strategy}")

Recommended strategy: EnhancedResumeData(basic_info={'emails': [], 'phones': [], 'skills': [], 'organizations': ['formationofDNAadductsbyindirectmechanismscanalsooccur', 'DNAsuch', 'mutagen', 'carcinogen', 'deoxyribonucleoside3'], 'dates': ['Several years ago', '1981']}, skills_categorized={'technical': ['Python', 'Machine Learning', 'SQL'], 'soft': ['Leadership', 'Communication', 'Problem Solving'], 'domain': ['Healthcare', 'Fintech']}, experience_insights=[], achievement_metrics=[{'achievement': 'Increased system performance', 'metric': '40%', 'impact': 'high'}], career_progression={'trend': 'upward', 'leadership_growth': True, 'technical_depth': 'increasing'}, personality_traits=[], industry_fit={})
