In [2]:
import os
from pprint import pprint
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger

class DocumentPropertyExtractor:
    """Extract structured resume fields from a single document via an OpenAI tagging chain.

    The document is loaded with a LangChain loader chosen from the file
    extension, then passed through ``create_metadata_tagger`` using the schema
    returned by ``_build_schema``.

    Args:
        file_name: Path to a ``.pdf``, ``.doc``, ``.docx`` or ``.txt`` file
            (``str`` or ``os.PathLike``). The extension is matched
            case-insensitively.
        openai_api_key: Optional OpenAI API key. When omitted, the
            ``OPENAI_API_KEY`` environment variable is used. Keys must never
            be embedded in the notebook source.
    """

    # Extensions accepted by extract_properties (compared in lower case).
    ALLOWED_EXTENSIONS = ('.pdf', '.doc', '.docx', '.txt')

    def __init__(self, file_name, openai_api_key=None):
        self.file_name = file_name
        self.openai_api_key = openai_api_key

    @staticmethod
    def _build_schema():
        """Return a fresh tagging schema (field definitions plus the required list)."""
        properties = {
            "Name": {
                "type": "string",
                "description": "The person's full name"
            },
            "Email": {
                "type": "string",
                "description": "The person's email address"
            },
            "Phone": {
                "type": "string",
                "description": "The person's phone number"
            },
            "Address": {
                "type": "string",
                "description": "The address the person mentions",
                "examples": (
                    "27 Jalan Pandan 3/9, Pandan Jaya, Kuala Lumpur, Malaysia.",
                    ["Kuala Lumpur, Malaysia."]
                )
            },
            "City": {
                "type": "string",
                "description": "The person's city"
            },
            "State": {
                "type": "string",
                "description": "The person's state"
            },
            "Country": {
                "type": "string",
                "description": "The person's country"
            },
            "Linkedin": {
                "type": "string",
                "description": "The hyperlink to the linkedin url in the resume"
            },
            "Companies": {
                "type": "string",
                "description": "A list of companies the person worked for"
            },
            "Role": {
                "type": "string",
                "description": "A list of roles the person worked in at the company"
            },
            "Project": {
                "type": "string",
                "description": "List out the projects that the candidate mentions in the resume"
            },
            "University": {
                "type": "string",
                "description": "The list of universities this person attended"
            },
            "Degree": {
                "type": "string",
                "description": "The degree the person obtained at the university"
            },
            "Major": {
                "type": "string",
                "description": "The major the person studied at the university"
            },
            "CGPA": {
                "type": "string",
                "description": "The person's CGPA"
            },
            "Skills": {
                "type": "string",
                "description": "You are the Subject Matter Expert of your domain and evaluating the skills highlighted within this resume. List out the skills that the candidate mentions in the resume",
                "examples": (
                    "Project Management, Lean Six Sigma, Process Automation, Continuous Improvement, Data Analytic’s, Financial Analysis, Business Transformation, Finance & Enterprise Performance, Change Management, Data Engineering, Data Science, Market Research, Process Control, ETL, Talend",
                    ["Python", "SQL", "Tableau", "Data Engineering", "Data Governance", "C", "C++", "Data Analysis", "Requirement Gathering", "Data Visualization", "Data Science", "Data Analytics", "Project Management"]
                )
            },
            "Certifications": {
                "type": "string",
                "description": "A list of certifications the person mentions"
            },
        }
        # Every field is required. Deriving the list from the property names
        # keeps the two in sync (the hand-written list used to say
        # "Certification", which matched no property).
        return {"properties": properties, "required": list(properties)}

    def _resolve_api_key(self):
        """Return the API key from the constructor argument or ``OPENAI_API_KEY``.

        Raises:
            RuntimeError: If neither source provides a key.
        """
        api_key = self.openai_api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError(
                "OpenAI API key not provided: pass openai_api_key=... or set "
                "the OPENAI_API_KEY environment variable."
            )
        return api_key

    @staticmethod
    def _make_loader(document_file):
        """Return the LangChain loader matching ``document_file``'s extension."""
        lowered = document_file.lower()
        if lowered.endswith('.pdf'):
            return PyPDFLoader(document_file)
        if lowered.endswith(('.docx', '.doc')):
            # NOTE(review): Docx2txtLoader is also used for legacy '.doc'
            # files, as before; confirm it can actually parse that format.
            return Docx2txtLoader(document_file)
        if lowered.endswith('.txt'):
            return TextLoader(document_file)
        raise ValueError(f"Invalid file extension for file {document_file}")

    def extract_properties(self):
        """Load the document and tag it with the resume schema.

        Returns:
            list[dict]: A one-element list holding the metadata dict of the
            first loaded document (the extracted fields merged with the
            loader's own metadata).

        Raises:
            ValueError: If the file extension is not supported, or the loader
                returns no documents.
            RuntimeError: If no OpenAI API key is available.
        """
        document_file = os.fspath(self.file_name)
        if not document_file.lower().endswith(self.ALLOWED_EXTENSIONS):
            raise ValueError("No valid document file found.")
        pprint(document_file)

        # Fail fast on missing credentials, before any file is read.
        api_key = self._resolve_api_key()

        # Load the document using the appropriate loader
        documents_all = self._make_loader(document_file).load()
        if not documents_all:
            raise ValueError(f"No content could be loaded from {document_file}")

        # LLM
        llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613", openai_api_key=api_key)
        prompt = ChatPromptTemplate.from_template(
            """Extract relevant information according to the schema from the following text


        {input}
        """
        )

        document_transformer = create_metadata_tagger(self._build_schema(), llm, prompt=prompt)

        # Only the first loaded document's metadata is returned, so only that
        # document is sent to the LLM.
        # NOTE(review): loaders may return several documents for one file;
        # confirm whether the remaining ones should be tagged and merged too.
        extracted_document = document_transformer.transform_documents(documents_all[:1])

        # transform_documents returns a list of Documents; keep the metadata dict.
        return [extracted_document[0].metadata]

In [4]:
# Path of the sample resume, resolved against the current working directory.
file_name = os.path.join(
    os.getcwd(),
    './HR Documents/CV - Mobile Developer - Ameer Fares.pdf.docx',
)

In [5]:
# Example usage within a Jupyter Notebook cell:
# build the resume path from the working directory, echo it, then create the extractor.
file_name = os.path.join(
    os.getcwd(),
    './HR Documents/CV - Mobile Developer - Ameer Fares.pdf.docx',
)
pprint(file_name)

extractor = DocumentPropertyExtractor(file_name)

('c:\\Users\\Amirah\\Python_Projects\\LLM\\./HR Documents/CV - Mobile '
 'Developer - Ameer Fares.pdf.docx')


In [6]:
# Load the document and run the metadata tagger; the result is a list holding
# the first loaded document's metadata dict.
extraction = extractor.extract_properties()

('c:\\Users\\Amirah\\Python_Projects\\LLM\\./HR Documents/CV - Mobile '
 'Developer - Ameer Fares.pdf.docx')


In [7]:
# Show the extracted metadata as the cell output.
extraction

[{'Name': 'AMEER FARES',
  'Email': 'ameer@myquickhr.com',
  'Phone': '011-6436 1989',
  'Address': '',
  'City': 'Kuala Lumpur',
  'State': '',
  'Country': 'Malaysia',
  'Linkedin': 'linkedin.com/in/ameerfares',
  'Companies': 'Dell Technologies, QiJang Technologies',
  'Role': 'Software Engineer, Android Developer Freelance, Software Developer Intern',
  'Project': 'Project MMG, Remote Wound Monitoring System, Tender, O2O (Online-2-Offline) Application, Dell Hack2Hire 2022',
  'University': 'University of Malaya',
  'Degree': 'Computer Science',
  'Major': 'Computer Software Engineering',
  'CGPA': 'First-Class Honors',
  'Skills': 'Android Development, Kotlin, JavaScript, MySQL, MongoDB, Firebase, GitHub, Java, Linux, Python',
  'Certifications': 'AWS Academy Graduate - AWS Academy Cloud Foundations - Amazon Web Services (AWS): AWS Certificate Link',
  'source': 'c:\\Users\\Amirah\\Python_Projects\\LLM\\./HR Documents/CV - Mobile Developer - Ameer Fares.pdf.docx'}]