In [1]:
pip install pandas scikit-learn joblib pdfminer.six

Note: you may need to restart the kernel to use updated packages.


In [3]:
import re
import pdfminer.high_level

def extract_information_from_resume(pdf_path):
    """
    Extracts name, phone number, email, skills, education, and certifications from a PDF resume.

    Args:
        pdf_path (str): The path to the PDF resume file.

    Returns:
        dict: A dictionary containing the extracted information.
              The dictionary may contain the following keys:
                  - "name" (str): The extracted name, or None if not found.
                  - "phone" (str): The extracted phone number, or None if not found.
                  - "email" (str): The extracted email address, or None if not found.
                  - "skills" (list): A list of skills, or an empty list if no skills are found.
                  - "education" (list): A list of education details, or an empty list if no education details are found.
                  - "certifications" (list): A list of certifications, or an empty list if no certifications are found.
    """

    text = extract_text_from_pdf(pdf_path)

    if not text:
        return {}  # Return an empty dictionary if no text could be extracted

    name = extract_name_from_resume(text)
    phone = extract_contact_number_from_resume(text)
    email = extract_email_from_resume(text)
    skills = extract_skills_from_resume(text)
    education = extract_education_from_resume(text)
    certifications = extract_certifications_from_resume(text)

    return {
        "name": name,
        "phone": phone,
        "email": email,
        "skills": skills,
        "education": education,
        "certifications": certifications,
    }


def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text, or None if an error occurs.
    """
    try:
        return pdfminer.high_level.extract_text(pdf_path)
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        return None
    except Exception as e:
        print(f"An error occurred while extracting text: {e}")
        return None


def extract_name_from_resume(text):
    """
    Extracts the name from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        str: The extracted name, or None if not found.
    """
    # Use regex to find name (assumes name is at the beginning of the text)
    name_match = re.search(r"^[A-Z\s]+", text, re.MULTILINE)  # added multiline to handle name on multiple lines
    if name_match:
        return name_match.group().strip()
    return None


def extract_contact_number_from_resume(text):
    """
    Extracts a contact number from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        str: The extracted contact number, or None if not found.
    """
    # Regex to find phone numbers
    phone_match = re.search(r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)
    if phone_match:
        return phone_match.group()
    return None


def extract_email_from_resume(text):
    """
    Extracts an email address from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        str: The extracted email address, or None if not found.
    """
    # Regex to find email addresses
    email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}", text)
    if email_match:
        return email_match.group()
    return None


def extract_skills_from_resume(text):
    """
    Extracts skills from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        list: A list of extracted skills.
    """
    # Keywords for skills
    skills_keywords = [
        "Python", "Java", "SQL", "Javascript", "HTML", "CSS", "Machine Learning", "Deep Learning",
        "Figma", "UI/UX", "Teamwork", "Communication", "Leadership", "Time Management"
    ]
    extracted_skills = []
    for skill in skills_keywords:
        if re.search(r"\b" + re.escape(skill) + r"\b", text, re.IGNORECASE):
            extracted_skills.append(skill)
    return extracted_skills


def extract_education_from_resume(text):
    """
    Extracts education details from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        list: A list of extracted education details.
    """
    # Regex to find education details
    education_matches = re.findall(r"(EDUCATION.*?)(PROJECTS|CERTIFICATIONS|SKILLS)", text, re.DOTALL | re.IGNORECASE)  # added or conditions to handle missing sections
    if education_matches:
        return [education_matches[0][0].strip()]
    return


def extract_certifications_from_resume(text):
    """
    Extracts certifications from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        list: A list of extracted certifications.
    """
    # Regex to find certifications
    certifications_matches = re.findall(r"(CERTIFICATIONS.*?)(SKILLS|INTERESTS)", text, re.DOTALL | re.IGNORECASE)  # added or conditions to handle missing sections
    if certifications_matches:
        return [certifications_matches[0][0].strip()]
    return


if __name__ == "__main__":
    pdf_path = "UPDATEDRESUMEEDIT.pdf"  # Replace with the actual path to your PDF file
    extracted_data = extract_information_from_resume(pdf_path)

    print(extracted_data)

{'name': 'LAKSHMI VARSHITHA KOTAPATI \n\nG', 'phone': '7672036039', 'email': 'knaga5433@gmail.com', 'skills': ['Python', 'Java', 'SQL', 'Javascript', 'HTML', 'CSS', 'Machine Learning', 'Deep Learning', 'Figma', 'UI/UX', 'Teamwork', 'Communication', 'Leadership', 'Time Management'], 'education': ['EDUCATION \n\nKalasalingam Academy of Research and Education                                                                               2021-2025 \n\nBachelor of Technology in Computer Science and Engineering – CGPA: 8.93 ( present )                   Krishnankoil, Tamil Nadu \n\nNarayana Junior College                                                                                                                            2019-2021 \n\nBoard of Intermediate Education– Percentage: 93%                                                                         Vijayawada, Andhra Pradesh \n\nLittle Flower English Medium School                                                                      

In [10]:
import re
import pdfminer.high_level

def extract_information_from_resume(pdf_path):
    """
    Extracts name, phone number, email, skills, education, and certifications from a PDF resume.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: A paragraph containing the extracted information, 
             with each piece of information on a new line.
    """

    text = extract_text_from_pdf(pdf_path)

    if not text:
        return "Could not extract text from the PDF."

    name = extract_name_from_resume(text)
    phone = extract_contact_number_from_resume(text)
    email = extract_email_from_resume(text)
    skills = extract_skills_from_resume(text)
    education = extract_education_from_resume(text)
    certifications = extract_certifications_from_resume(text)

    result_paragraph = ""

    if name:
        result_paragraph += "Name: " + name + "\n"
    else:
        result_paragraph += "Name: Not found\n"

    if phone:
        result_paragraph += "Phone Number: " + phone + "\n"
    else:
        result_paragraph += "Phone Number: Not found\n"

    if email:
        result_paragraph += "Email: " + email + "\n"
    else:
        result_paragraph += "Email: Not found\n"

    if skills:
        result_paragraph += "Skills: " + ", ".join(skills) + "\n"
    else:
        result_paragraph += "Skills: Not found\n"

    if education:
        result_paragraph += "Education: " + ", ".join(education)  + "\n"
    else:
        result_paragraph += "Education: Not found\n"

    if certifications:
        result_paragraph += "Certifications: " + "\n".join(certifications)
    else:
        result_paragraph += "Certifications: Not found"

    return result_paragraph


def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text, or None if an error occurs.
    """
    try:
        return pdfminer.high_level.extract_text(pdf_path)
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        return None
    except Exception as e:
        print(f"An error occurred while extracting text: {e}")
        return None


def extract_name_from_resume(text):
    """
    Extracts the name from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        str: The extracted name, or None if not found.
    """
    # Use regex to find name (assumes name is at the beginning of the text)
    name_match = re.search(r"^[A-Z\s]+", text, re.MULTILINE)  # added multiline to handle name on multiple lines
    if name_match:
        return name_match.group().strip()
    return None


def extract_contact_number_from_resume(text):
    """
    Extracts a contact number from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        str: The extracted contact number, or None if not found.
    """
    # Regex to find phone numbers
    phone_match = re.search(r"(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", text)
    if phone_match:
        return phone_match.group()
    return None


def extract_email_from_resume(text):
    """
    Extracts an email address from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        str: The extracted email address, or None if not found.
    """
    # Regex to find email addresses
    email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}", text)
    if email_match:
        return email_match.group()
    return None


def extract_skills_from_resume(text):
    """
    Extracts skills from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        list: A list of extracted skills.
    """
    # Keywords for skills
    skills_keywords = [
        "Python", "Java", "SQL", "Javascript", "HTML", "CSS", "Machine Learning", "Deep Learning",
        "Figma", "UI/UX", "Teamwork", "Communication", "Leadership", "Time Management", 'Python', 'Data Analysis', 'Machine Learning', 'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau',
    'Java', 'C++', 'JavaScript', 'HTML', 'CSS', 'React', 'Angular', 'Node.js', 'MongoDB', 'Express.js', 'Git',
    'Research', 'Statistics', 'Quantitative Analysis', 'Qualitative Analysis', 'SPSS', 'R', 'Data Visualization', 'Matplotlib',
    'Seaborn', 'Plotly', 'Pandas', 'Numpy', 'Scikit-learn', 'TensorFlow', 'Keras', 'PyTorch', 'NLTK', 'Text Mining',
    'Natural Language Processing', 'Computer Vision', 'Image Processing', 'OCR', 'Speech Recognition', 'Recommendation Systems',
    'Collaborative Filtering', 'Content-Based Filtering', 'Reinforcement Learning', 'Neural Networks', 'Convolutional Neural Networks',
    'Recurrent Neural Networks', 'Generative Adversarial Networks', 'XGBoost', 'Random Forest', 'Decision Trees', 'Support Vector Machines',
    'Linear Regression', 'Logistic Regression', 'K-Means Clustering', 'Hierarchical Clustering', 'DBSCAN', 'Association Rule Learning',
    'Apache Hadoop', 'Apache Spark', 'MapReduce', 'Hive', 'HBase', 'Apache Kafka', 'Data Warehousing', 'ETL', 'Big Data Analytics',
    'Cloud Computing', 'Amazon Web Services (AWS)', 'Microsoft Azure', 'Google Cloud Platform (GCP)', 'Docker', 'Kubernetes', 'Linux',
    'Shell Scripting', 'Cybersecurity', 'Network Security', 'Penetration Testing', 'Firewalls', 'Encryption', 'Malware Analysis',
    'Digital Forensics', 'CI/CD', 'DevOps', 'Agile Methodology', 'Scrum', 'Kanban', 'Continuous Integration', 'Continuous Deployment',
    'Software Development', 'Web Development', 'Mobile Development', 'Backend Development', 'Frontend Development', 'Full-Stack Development',
    'UI/UX Design', 'Responsive Design', 'Wireframing', 'Prototyping', 'User Testing', 'Adobe Creative Suite', 'Photoshop', 'Illustrator',
    'InDesign', 'Figma', 'Sketch', 'Zeplin', 'InVision', 'Product Management', 'Market Research', 'Customer Development', 'Lean Startup',
    'Business Development', 'Sales', 'Marketing', 'Content Marketing', 'Social Media Marketing', 'Email Marketing', 'SEO', 'SEM', 'PPC',
    'Google Analytics', 'Facebook Ads', 'LinkedIn Ads', 'Lead Generation', 'Customer Relationship Management (CRM)', 'Salesforce',
    'HubSpot', 'Zendesk', 'Intercom', 'Customer Support', 'Technical Support', 'Troubleshooting', 'Ticketing Systems', 'ServiceNow',
    'ITIL', 'Quality Assurance', 'Manual Testing', 'Automated Testing', 'Selenium', 'JUnit', 'Load Testing', 'Performance Testing',
    'Regression Testing', 'Black Box Testing', 'White Box Testing', 'API Testing', 'Mobile Testing', 'Usability Testing', 'Accessibility Testing',
    'Cross-Browser Testing', 'Agile Testing', 'User Acceptance Testing', 'Software Documentation', 'Technical Writing', 'Copywriting',
    'Editing', 'Proofreading', 'Content Management Systems (CMS)', 'WordPress', 'Joomla', 'Drupal', 'Magento', 'Shopify', 'E-commerce',
    'Payment Gateways', 'Inventory Management', 'Supply Chain Management', 'Logistics', 'Procurement', 'ERP Systems', 'SAP', 'Oracle',
    'Microsoft Dynamics', 'Tableau', 'Power BI', 'QlikView', 'Looker', 'Data Warehousing', 'ETL', 'Data Engineering', 'Data Governance',
    'Data Quality', 'Master Data Management', 'Predictive Analytics', 'Prescriptive Analytics', 'Descriptive Analytics', 'Business Intelligence',
    'Dashboarding', 'Reporting', 'Data Mining', 'Web Scraping', 'API Integration', 'RESTful APIs', 'GraphQL', 'SOAP', 'Microservices',
    'Serverless Architecture', 'Lambda Functions', 'Event-Driven Architecture', 'Message Queues', 'GraphQL', 'Socket.io', 'WebSockets'
    'Ruby', 'Ruby on Rails', 'PHP', 'Symfony', 'Laravel', 'CakePHP', 'Zend Framework', 'ASP.NET', 'C#', 'VB.NET', 'ASP.NET MVC', 'Entity Framework',
    'Spring', 'Hibernate', 'Struts', 'Kotlin', 'Swift', 'Objective-C', 'iOS Development', 'Android Development', 'Flutter', 'React Native', 'Ionic',
    'Mobile UI/UX Design', 'Material Design', 'SwiftUI', 'RxJava', 'RxSwift', 'Django', 'Flask', 'FastAPI', 'Falcon', 'Tornado', 'WebSockets',
    'GraphQL', 'RESTful Web Services', 'SOAP', 'Microservices Architecture', 'Serverless Computing', 'AWS Lambda', 'Google Cloud Functions',
    'Azure Functions', 'Server Administration', 'System Administration', 'Network Administration', 'Database Administration', 'MySQL', 'PostgreSQL',
    'SQLite', 'Microsoft SQL Server', 'Oracle Database', 'NoSQL', 'MongoDB', 'Cassandra', 'Redis', 'Elasticsearch', 'Firebase', 'Google Analytics',
    'Google Tag Manager', 'Adobe Analytics', 'Marketing Automation', 'Customer Data Platforms', 'Segment', 'Salesforce Marketing Cloud', 'HubSpot CRM',
    'Zapier', 'IFTTT', 'Workflow Automation', 'Robotic Process Automation (RPA)', 'UI Automation', 'Natural Language Generation (NLG)',
    'Virtual Reality (VR)', 'Augmented Reality (AR)', 'Mixed Reality (MR)', 'Unity', 'Unreal Engine', '3D Modeling', 'Animation', 'Motion Graphics',
    'Game Design', 'Game Development', 'Level Design', 'Unity3D', 'Unreal Engine 4', 'Blender', 'Maya', 'Adobe After Effects', 'Adobe Premiere Pro',
    'Final Cut Pro', 'Video Editing', 'Audio Editing', 'Sound Design', 'Music Production', 'Digital Marketing', 'Content Strategy', 'Conversion Rate Optimization (CRO)',
    'A/B Testing', 'Customer Experience (CX)', 'User Experience (UX)', 'User Interface (UI)', 'Persona Development', 'User Journey Mapping', 'Information Architecture (IA)',
    'Wireframing', 'Prototyping', 'Usability Testing', 'Accessibility Compliance', 'Internationalization (I18n)', 'Localization (L10n)', 'Voice User Interface (VUI)',
    'Chatbots', 'Natural Language Understanding (NLU)', 'Speech Synthesis', 'Emotion Detection', 'Sentiment Analysis', 'Image Recognition', 'Object Detection',
    'Facial Recognition', 'Gesture Recognition', 'Document Recognition', 'Fraud Detection', 'Cyber Threat Intelligence', 'Security Information and Event Management (SIEM)',
    'Vulnerability Assessment', 'Incident Response', 'Forensic Analysis', 'Security Operations Center (SOC)', 'Identity and Access Management (IAM)', 'Single Sign-On (SSO)',
    'Multi-Factor Authentication (MFA)', 'Blockchain', 'Cryptocurrency', 'Decentralized Finance (DeFi)', 'Smart Contracts', 'Web3', 'Non-Fungible Tokens (NFTs)']
    
    extracted_skills = []
    for skill in skills_keywords:
        if re.search(r"\b" + re.escape(skill) + r"\b", text, re.IGNORECASE):
            extracted_skills.append(skill)
    return extracted_skills


def extract_education_from_resume(text):
    education = []

    # List of education keywords to match against
    education_keywords = [
        'Computer Science', 'Information Technology', 'Software Engineering', 'Electrical Engineering', 'Mechanical Engineering', 'Civil Engineering',
        'Chemical Engineering', 'Biomedical Engineering', 'Aerospace Engineering', 'Nuclear Engineering', 'Industrial Engineering', 'Systems Engineering',
        'Environmental Engineering', 'Petroleum Engineering', 'Geological Engineering', 'Marine Engineering', 'Robotics Engineering', 'Biotechnology',
        'Biochemistry', 'Microbiology', 'Genetics', 'Molecular Biology', 'Bioinformatics', 'Neuroscience', 'Biophysics', 'Biostatistics', 'Pharmacology',
        'Physiology', 'Anatomy', 'Pathology', 'Immunology', 'Epidemiology', 'Public Health', 'Health Administration', 'Nursing', 'Medicine', 'Dentistry',
        'Pharmacy', 'Veterinary Medicine', 'Medical Technology', 'Radiography', 'Physical Therapy', 'Occupational Therapy', 'Speech Therapy', 'Nutrition',
        'Sports Science', 'Kinesiology', 'Exercise Physiology', 'Sports Medicine', 'Rehabilitation Science', 'Psychology', 'Counseling', 'Social Work',
        'Sociology', 'Anthropology', 'Criminal Justice', 'Political Science', 'International Relations', 'Economics', 'Finance', 'Accounting', 'Business Administration',
        'Management', 'Marketing', 'Entrepreneurship', 'Hospitality Management', 'Tourism Management', 'Supply Chain Management', 'Logistics Management',
        'Operations Management', 'Human Resource Management', 'Organizational Behavior', 'Project Management', 'Quality Management', 'Risk Management',
        'Strategic Management', 'Public Administration', 'Urban Planning', 'Architecture', 'Interior Design', 'Landscape Architecture', 'Fine Arts',
        'Visual Arts', 'Graphic Design', 'Fashion Design', 'Industrial Design', 'Product Design', 'Animation', 'Film Studies', 'Media Studies',
        'Communication Studies', 'Journalism', 'Broadcasting', 'Creative Writing', 'English Literature', 'Linguistics', 'Translation Studies',
        'Foreign Languages', 'Modern Languages', 'Classical Studies', 'History', 'Archaeology', 'Philosophy', 'Theology', 'Religious Studies',
        'Ethics', 'Education', 'Early Childhood Education', 'Elementary Education', 'Secondary Education', 'Special Education', 'Higher Education',
        'Adult Education', 'Distance Education', 'Online Education', 'Instructional Design', 'Curriculum Development'
        'Library Science', 'Information Science', 'Computer Engineering', 'Software Development', 'Cybersecurity', 'Information Security',
        'Network Engineering', 'Data Science', 'Data Analytics', 'Business Analytics', 'Operations Research', 'Decision Sciences',
        'Human-Computer Interaction', 'User Experience Design', 'User Interface Design', 'Digital Marketing', 'Content Strategy',
        'Brand Management', 'Public Relations', 'Corporate Communications', 'Media Production', 'Digital Media', 'Web Development',
        'Mobile App Development', 'Game Development', 'Virtual Reality', 'Augmented Reality', 'Blockchain Technology', 'Cryptocurrency',
        'Digital Forensics', 'Forensic Science', 'Criminalistics', 'Crime Scene Investigation', 'Emergency Management', 'Fire Science',
        'Environmental Science', 'Climate Science', 'Meteorology', 'Geography', 'Geomatics', 'Remote Sensing', 'Geoinformatics',
        'Cartography', 'GIS (Geographic Information Systems)', 'Environmental Management', 'Sustainability Studies', 'Renewable Energy',
        'Green Technology', 'Ecology', 'Conservation Biology', 'Wildlife Biology', 'Zoology']

    for keyword in education_keywords:
        pattern = r"(?i)\b{}\b".format(re.escape(keyword))
        match = re.search(pattern, text)
        if match:
            education.append(match.group())

    return education


def extract_certifications_from_resume(text):
    """
    Extracts certifications from the resume text.

    Args:
        text (str): The resume text.

    Returns:
        list: A list of extracted certifications.
    """
    # Regex to find certifications
    certifications_matches = re.findall(r"(CERTIFICATIONS.*?)(SKILLS|INTERESTS)", text, re.DOTALL | re.IGNORECASE)  # added or conditions to handle missing sections
    if certifications_matches:
        return [certifications_matches[0][0].strip()]
    return


if __name__ == "__main__":
    pdf_path = r'C:\Users\asus\Downloads\projectibm\projectibm\Resume-Screening-with-Machine-Learning-Job-Recommendations-Parsing-Categorization-main\UPDATEDRESUMEEDIT.pdf'  # Replace with the actual path to your PDF file
    extracted_data = extract_information_from_resume(pdf_path)

    print(extracted_data)

Name: LAKSHMI VARSHITHA KOTAPATI 

G
Phone Number: 7672036039
Email: knaga5433@gmail.com
Skills: Python, Java, SQL, Javascript, HTML, CSS, Machine Learning, Deep Learning, Figma, UI/UX, Teamwork, Communication, Leadership, Time Management, Python, Machine Learning, Communication, Deep Learning, SQL, Java, JavaScript, HTML, CSS, Research, Natural Language Processing, Computer Vision, Microsoft Azure, Web Development, Figma
Education: Computer Science, Management, EDUCATION, Web Development
Certifications: CERTIFICATIONS 

•  Microsoft Certified: Azure Fundamentals   
•  Postman API Fundamentals Student Expert Certification  
•  Natural Language Processing in Microsoft Azure  
•  Computer Vision in Microsoft Azure  
•  DIGITAL
