In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install pdfplumber spacy scikit-learn joblib
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import os
import re
import json
import pdfplumber
import spacy
import joblib
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize NLP model
nlp = spacy.load("en_core_web_sm")

class ResumeParser:
    def __init__(self):
        # Initialize with default skills and degrees (can be loaded from model)
        self.SKILLS_DB = [
            "Python", "SQL", "Java", "JavaScript", "R", "C++", "C#", "PHP", "Swift", "Go",
            "Machine Learning", "Deep Learning", "TensorFlow", "PyTorch", "Keras",
            "Data Analysis", "Data Visualization", "Data Engineering", "Big Data", "Hadoop",
            "Spark", "Pandas", "NumPy", "SciPy", "Scikit-learn", "NLTK", "OpenCV",
            "Power BI", "Tableau", "Looker", "Qlik", "Excel", "Google Data Studio",
            "MySQL", "PostgreSQL", "MongoDB", "Oracle", "SQL Server", "Redis", "Cassandra",
            "AWS", "Azure", "GCP", "Docker", "Kubernetes", "Terraform", "CI/CD",
            "Git", "GitHub", "GitLab", "JIRA", "Linux", "Bash", "Airflow", "Jenkins"
        ]

        self.DEGREES = [
            "B.Tech", "B.E", "Bachelor", "BS", "BSc", "BA", "B.Com", "BBA", "BCA",
            "M.Tech", "M.E", "Master", "MS", "MSc", "MA", "MBA", "MCA", "PGDM",
            "PhD", "Doctorate", "Postdoc", "Diploma", "Associate Degree"
        ]

        # Try to load trained model if exists
        self.load_model()

    def load_model(self):
        """Load trained model with skills and degrees"""
        try:
            model = joblib.load('resume_model.pkl')
            self.SKILLS_DB = model['SKILLS_DB']
            self.DEGREES = model['DEGREES']
            print("Loaded model successfully")
        except:
            print("Using default skills/degrees")
            self.save_model()  # Save default model

    def save_model(self):
        """Save current skills and degrees as model"""
        model = {
            'SKILLS_DB': self.SKILLS_DB,
            'DEGREES': self.DEGREES
        }
        joblib.dump(model, 'resume_model.pkl')
        print("Model saved to resume_model.pkl")

    def extract_text(self, pdf_path):
        """Extract text from PDF resume"""
        try:
            with pdfplumber.open(pdf_path) as pdf:
                return "\n".join(page.extract_text() or '' for page in pdf.pages)
        except Exception as e:
            print(f"Error reading PDF: {str(e)}")
            return None

    def extract_name(self, text):
        """Extract candidate name using NLP"""
        doc = nlp(text)
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                return ent.text
        return "Not Found"

    def extract_skills(self, text):
        """Identify skills from text"""
        text_lower = text.lower()
        return list(OrderedDict.fromkeys(
            skill for skill in self.SKILLS_DB if skill.lower() in text_lower
        ))

    def calculate_experience(self, text):
        """Calculate total work experience in years"""
        text = text.lower()
        total_months = 0
        total_months += sum(int(y)*12 for y in re.findall(r'(\d+)\s*(?:years?|yrs?)', text))
        total_months += sum(int(m) for m in re.findall(r'(\d+)\s*(?:months?|mos?)', text))
        return round(total_months/12, 2)

    def extract_education(self, text):
        """Extract education degrees"""
        return sorted(list(set(
            degree for degree in self.DEGREES
            if re.search(r'\b'+re.escape(degree)+r'\b', text, re.IGNORECASE)
        )))

    def extract_projects(self, text):
        """Extract project mentions"""
        lines = text.split("\n")
        projects = [
            line.strip() for line in lines
            if any(word in line.lower() for word in ["project", "developed", "implemented"])
        ]
        return projects[:3] if projects else ["No projects mentioned"]

    def suggest_role(self, skills):
        """Predict suitable job role based on skills"""
        role_requirements = {
            "Data Analyst": ["SQL", "Excel", "Power BI", "Tableau"],
            "Data Scientist": ["Python", "Machine Learning", "TensorFlow"],
            "Software Engineer": ["Python", "Java", "C++", "JavaScript"],
            "DevOps Engineer": ["Docker", "Kubernetes", "AWS", "CI/CD"],
            "Product Manager": ["Agile", "JIRA", "Product Roadmap"]
        }
        scores = {role: 0 for role in role_requirements}
        for role, req_skills in role_requirements.items():
            for skill in skills:
                if skill in req_skills:
                    scores[role] += 1
        return max(scores.items(), key=lambda x: x[1])[0]

    def calculate_match(self, resume_text, job_desc):
        """Calculate match score between resume and job description"""
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform([resume_text, job_desc])
        return round(cosine_similarity(vectors[0], vectors[1])[0][0] * 100, 2)

    def parse_resume(self, pdf_path, job_description=""):
        """Main function to parse resume and return structured data"""
        text = self.extract_text(pdf_path)
        if not text:
            return {"error": "Failed to extract text from PDF"}

        result = {
            "name": self.extract_name(text),
            "skills": self.extract_skills(text),
            "experience": self.calculate_experience(text),
            "education": self.extract_education(text),
            "projects": self.extract_projects(text),
            "suggested_role": self.suggest_role(self.extract_skills(text))
        }

        if job_description:
            result["match_score"] = self.calculate_match(text, job_description)

        return result

# Example Usage
if __name__ == "__main__":
    parser = ResumeParser()

    # Example usage with a PDF file
    result = parser.parse_resume(
        "/content/drive/MyDrive/ATS-Compliant Resume Optimizer/Harshaverse_Resume (1).pdf",
        job_description="Looking for Python developer with Machine Learning experience"
    )

    print(json.dumps(result, indent=2))

    # Save the model file
    parser.save_model()

Using default skills/degrees
Model saved to resume_model.pkl




{
  "name": "B.Tech ComputerScience&Engineering IITGandhinagar",
  "skills": [
    "Python",
    "SQL",
    "R",
    "C++",
    "Go",
    "MySQL",
    "PostgreSQL",
    "Docker",
    "Kubernetes",
    "Git",
    "GitHub"
  ],
  "experience": 0.0,
  "education": [
    "B.Tech"
  ],
  "projects": [
    "\u25e6 DevelopedaRESTAPIandusedNATSmessagingtoenablecommunicationbetweenvariousmicro-",
    "\u25e6 Integratedthedistinctmicro-servicesanddeployedtheprojectoncloudserverbybuildingDocker",
    "\u2022 Freelancer(SolutionWritingProject)[Toppr] Feb-Mar2022"
  ],
  "suggested_role": "Software Engineer",
  "match_score": 2.87
}
Model saved to resume_model.pkl
