In [1]:
import pandas as pd
import numpy as np

In [2]:
from pdfminer.high_level import extract_text

def load_pdf_text(pdf_path):
    
    try:
        # extract_text is a convenient function from pdfminer.six
        text = extract_text(pdf_path)
        print(f"Successfully extracted {len(text)} characters from the PDF.")
        return text
    except FileNotFoundError:
        print(f"Error: The file was not found at path: {pdf_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

# --- Usage Example ---
# 1. IMPORTANT: Replace 'your_resume.pdf' with the actual path to your PDF file.
pdf_file_path = r"C:\Users\nt397\Downloads\Generated_Resume (1).pdf"
# 2. Call the function to load the PDF content
resume_text = load_pdf_text(pdf_file_path)


if resume_text:
    print("\n--- Extracted Text (Preview) ---")
    print(resume_text)
    # The 'resume_text' variable now holds all the text from your PDF

Successfully extracted 2833 characters from the PDF.

--- Extracted Text (Preview) ---
Prabhav Sharma 
Delhi, New Delhi-12 | +91-7303025805 | prabhavs2004@gmail.com | GitHub | Portfolio | LinkedIn 

Summary 
Highly motivated Computer Science student with internships at Bharti Airtel, honing Python (Pandas, 
Scikit-learn) and SQL skills. Proven ability in data-driven problem-solving through Amazon hackathon 
participation. Adept at collaborating in teams. 
  Developing analytical models using Python (Pandas, Scikit-learn). 

  Executing complex database analyses via SQL. 

  Visualizing insights effectively with associated tools and utilizing Git, showcasing collaborative project 

management abilities. 

Skills 
• Data Analysis & Manipulation: Python Programming (Pandas, NumPy), Data Cleansing, ETL Process 
• Statistical Modeling: Classification Modeling, Regression Analysis, Machine Learning (Sklearn) 
• Data Visualization: Power BI, Tableau, Data Storytelling 
• Collaborative Tech

In [3]:
import nltk
# Download the stop words list and the tokenizer data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nt397\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nt397\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
jd_text = """
Job Opening: Data Analyst

Company: Quantiva Analytics Pvt. Ltd.
Location: Gurgaon / Bangalore / Remote
Employment Type: Full-Time

Quantiva Analytics Pvt. Ltd. is a data-driven technology organization focused on business intelligence, reporting automation, and advanced analytics solutions. The company is looking to hire a Data Analyst who can convert raw data into meaningful insights to support decision-making across various business functions.

Role Overview

The selected candidate will be responsible for handling end-to-end analytical tasks, including data extraction, cleaning, preprocessing, visualization, and insight generation. The role requires strong analytical thinking, attention to detail, and the ability to collaborate with stakeholders to interpret business requirements and deliver data-backed recommendations.

Key Responsibilities

Extract, clean, and validate datasets from multiple internal and external sources.

Conduct exploratory data analysis (EDA) to identify trends, patterns, and anomalies.

Develop and maintain dashboards and automated reports using Power BI, Tableau, or Python visualization libraries.

Write optimized SQL queries for data retrieval and transformation tasks.

Collaborate with business and product teams to define analytical objectives and metrics.

Present findings through structured reports and visual summaries.

Support forecasting, KPI monitoring, and performance tracking initiatives.

Required Technical Skills

Strong command of SQL for querying and data manipulation.

Proficiency in Python, especially using Pandas, NumPy, Matplotlib, and Seaborn.

Experience with visualization tools such as Power BI, Tableau, or Google Data Studio.

Knowledge of statistical concepts and analytical methods.

Advanced Excel skills, including pivot tables and power query.

Familiarity with Git/GitHub is preferred.

Eligibility Criteria

Degree: B.Tech/B.E., BCA, MCA, B.Sc, or M.Sc (any specialization).

Graduation Year: 2025 or earlier.

Strong analytical mindset with problem-solving skills.

Good written and verbal communication abilities.

Compensation

Internship Stipend: ₹12,000 per month (if applicable).

Duration: 3 months.

Full-time role may be offered based on performance.
"""

In [5]:
!pip install -q sentence-transformers


In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

# Clean text function (your cleaning code)
def clean_and_remove_stopwords(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s\.@]', ' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [
        word for word in tokens 
        if word not in stop_words and (len(word) > 1 or word in ['@', '.'])
    ]
    clean_text = ' '.join(filtered_tokens)
    clean_text = re.sub(r'\s*@\s*', '@', clean_text)
    clean_text = re.sub(r'\s*\.\s*', '.', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text


# 1. Clean your texts
cleaned_jd_text = clean_and_remove_stopwords(jd_text)
cleaned_resume_text = clean_and_remove_stopwords(resume_text)

# 2. Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Convert texts to embeddings
jd_embedding = model.encode(cleaned_jd_text, convert_to_tensor=True)
resume_embedding = model.encode(cleaned_resume_text, convert_to_tensor=True)

# 4. Compute similarity
similarity_score = util.cos_sim(jd_embedding, resume_embedding).item()
print("Similarity Score:", similarity_score)

# 5. Decision based on score
if similarity_score >= 0.60:
    print("The Candidate is eligible")

elif 0.45 <= similarity_score < 0.60:
    print("Needs manual review")

else:
    print("The Candidate is not eligible")



Similarity Score: 0.778251051902771
The Candidate is eligible


In [7]:
cleaned_resume_text

'prabhav sharma delhi new delhi 12 91 7303025805 prabhavs2004@gmail.com github portfolio linkedin summary highly motivated computer science student internships bharti airtel honing python pandas scikit learn sql skills.proven ability data driven problem solving amazon hackathon participation.adept collaborating teams.developing analytical models using python pandas scikit learn.executing complex database analyses via sql.visualizing insights effectively associated tools utilizing git showcasing collaborative project management abilities.skills data analysis manipulation python programming pandas numpy data cleansing etl process statistical modeling classification modeling regression analysis machine learning sklearn data visualization power bi tableau data storytelling collaborative technologies git github version control sql databases projects conversational data analytics platform python pandas sql developed streamlit application transformed complex datasets natural language insights

In [8]:
import spacy
from spacy.matcher import Matcher
import re
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# --- 1. Define the input text from the provided resume sources ---
# Reconstructing the resume text from the PDF file content provided in the initial prompt.



# Load the small English model.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # This block usually indicates setup is needed, but we assume it's loaded in this environment
    print("Error: The spaCy model 'en_core_web_sm' is not loaded.")
    # exit() # Commented out exit for execution environment

# Define specific lists for better recognition
SKILL_TERMS = [
    "python", "sql", "excel", "pandas", "numpy", "scikit-learn", "tableau", "power bi",
    "matplotlib", "seaborn", "mysql", "git", "github", "vs code", "regression",
    "classification", "clustering", "feature engineering", "xgboost", "jupyter notebook",
    "google colab", "communication", "adaptability", "problem-solving", "teamwork"
]

# Common degree names
EDUCATION_TERMS = ["b.tech", "m.tech", "phd", "msc", "bachelor", "master", "diploma", "bsc", "class xii"]


def extract_entities(cleaned_text):
    """
    Extracts key entities (Skills, Education, Contact Info) using spaCy NER and custom rules.
    """
    # Process text in lower case for better pattern matching
    doc = nlp(cleaned_text.lower()) 

    # Initialize dictionary for results
    extracted_data = {
        'skills': set(),
        'education': set(),
        'experience_roles': set(),
        'contact_info': {}
    }

    # --- 1. Custom Rule-Based Matching (Matcher) ---
    matcher = Matcher(nlp.vocab)
    
    # Pattern for skills (case-insensitive phrase matching)
    skill_patterns = [[{"LOWER": term}] for term in SKILL_TERMS]
    # Add patterns for multi-word skills explicitly
    skill_patterns.extend([
        [{"LOWER": "power"}, {"LOWER": "bi"}],
        [{"LOWER": "jupyter"}, {"LOWER": "notebook"}],
        [{"LOWER": "google"}, {"LOWER": "colab"}],
        [{"LOWER": "feature"}, {"LOWER": "engineering"}],
        [{"LOWER": "data"}, {"LOWER": "science"}],
        [{"LOWER": "data"}, {"LOWER": "analytics"}]
    ])
    matcher.add("SKILL", skill_patterns)

    # Pattern for Education
    education_patterns = [
        [{"LOWER": {"IN": EDUCATION_TERMS}}, {"POS": {"IN": ["ADP", "NOUN", "PROPN", "CCONJ", "ADJ"]}, "OP": "+"}],
        [{"LOWER": {"IN": EDUCATION_TERMS}}, {"TEXT": "|", "OP": "*"}, {"ENT_TYPE": "ORG", "OP": "+"}] 
    ]
    matcher.add("EDUCATION", education_patterns)

    matches = matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        match_name = nlp.vocab.strings[match_id]
        
        if match_name == "SKILL":
            extracted_data['skills'].add(span.text) 
        elif match_name == "EDUCATION":
            text = span.text.replace('|', '').strip()
            if text:
                extracted_data['education'].add(text)

    # --- 2. Standard NER Extraction and General Cleanup ---
    for ent in nlp(cleaned_text).ents: # Use original case text for NER (better for proper nouns)
        # Extract potential company/university names (ORG entities)
        if ent.label_ == "ORG" and len(ent.text.split()) > 1:
            if ent.text.lower() not in ["power bi", "forage", "linkedin", "github", "ducat"]: 
                is_in_education_span = any(ent.text.lower() in edu for edu in extracted_data['education'])
                if not is_in_education_span:
                    extracted_data['experience_roles'].add(ent.text)

        # Extract potential degree/training names/years
        if ent.label_ == "DATE" or (ent.label_ == "CARDINAL" and len(ent.text) == 4 and ent.text.isdigit()):
            extracted_data['education'].add(ent.text)
        
    # Manually add the organizations that were part of the Education section for robustness
    extracted_data['education'].add("Jims Management Technical Campus, Greater Noida")
    extracted_data['education'].add("City Public School, Noida")
    extracted_data['education'].add("Ducat, Noida Sec-63")
    extracted_data['experience_roles'].add("Forage (Data Analytics and Visualization Job Simulation)") 
    # Add project organizations
    extracted_data['experience_roles'].add("Telecom") 

    # --- 3. Regex for Contact Info ---
    
    phone_match = re.search(r'\b\d{10,12}\b', cleaned_text)
    if phone_match:
        extracted_data['contact_info']['phone'] = phone_match.group(0)

    email_match = re.search(r'[\w\.-]+@[\w\.-]+', cleaned_text)
    if email_match:
        extracted_data['contact_info']['email'] = email_match.group(0)

    # --- 4. Final cleanup and formatting ---
    
    extracted_data['skills'] = sorted([s.title() for s in extracted_data['skills'] if not re.match(r'^\d{4}$', s)])
    extracted_data['education'] = sorted(list(set(e.strip().title() for e in extracted_data['education'])))
    extracted_data['experience_roles'] = sorted(list(set(r.strip().title() for r in extracted_data['experience_roles'])))
    
    return extracted_data


# --- Usage Example with Corrected Input ---

# Extract the structured data
structured_data = extract_entities(cleaned_resume_text)

# Print the structured result
print("\n" + "="*50)
print("STRUCTURED RESUME DATA EXTRACTION")
print("="*50)

print("\n--- Contact Information ---")
for key, value in structured_data['contact_info'].items():
    print(f"{key.capitalize()}: {value}")

print("\n--- Identified Skills ---")
print(", ".join(structured_data['skills']))

print("\n--- Education/Certifications ---")
print(", ".join(structured_data['education']))

print("\n--- Experience/Organizations (General) ---")
print(", ".join(structured_data['experience_roles']))


STRUCTURED RESUME DATA EXTRACTION

--- Contact Information ---
Phone: 7303025805
Email: prabhavs2004@gmail.com

--- Identified Skills ---
Classification, Data Analytics, Data Science, Git, Github, Matplotlib, Numpy, Pandas, Power Bi, Python, Regression, Sql, Tableau

--- Education/Certifications ---
2020, 2022, April 2025 June 2025, B.Tech Cse, City Public School, Noida, Ducat, Noida Sec-63, Jims Management Technical Campus, Greater Noida

--- Experience/Organizations (General) ---
Forage (Data Analytics And Visualization Job Simulation), Pandas Data, Pandas Sql, Telecom


In [None]:
import re
import warnings
warnings.filterwarnings("ignore")



def extract_name(text):
    try:
        return text.strip().split('\n')[0].strip()
    except:
        return "Name Not Found"


def extract_email_and_phone(text):
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    phone_match = re.search(r'(\d{10})', text)

    return {
        'email': email_match.group(0) if email_match else "Email Not Found",
        'phone': phone_match.group(0) if phone_match else "Phone Not Found"
    }


def screen_single_resume(jd_text, resume_text):

    score = similarity_score

    # Classification logic
    if score >= 0.60:
        status = "Accepted"
    elif 0.45 <= score < 0.60:
        status = "Needs Manual Review"
    else:
        status = "Rejected"

    name = extract_name(resume_text)
    contact = extract_email_and_phone(resume_text)

    return {
        'Name': name,
        'Email': contact['email'],
        'Phone': contact['phone'],
        'Similarity Score': round(score, 4),
        'Status': status
    }


# -------------------------------------------------------
# 5. EXECUTION BLOCK
# -------------------------------------------------------
if __name__ == "__main__":
    result = screen_single_resume(jd_text, cleaned_resume_text)

    print("=" * 60)
    print("RESUME SCREENING REPORT (TRANSFORMER MODEL)")
    print("=" * 60)

    print(f"{'Candidate Name:':<20} {result['Name']}")
    print(f"{'Email ID:':<20} {result['Email']}")
    print(f"{'Phone No:':<20} {result['Phone']}")
    print("-" * 60)
    print(f"{'Match Score:':<20} {result['Similarity Score']}")
    print(f"{'Final Status:':<20} {result['Status']}")
    print("=" * 60)


RESUME SCREENING REPORT (TRANSFORMER MODEL)
Candidate Name:      prabhav sharma delhi new delhi 12 91 7303025805 prabhavs2004@gmail.com github portfolio linkedin summary highly motivated computer science student internships bharti airtel honing python pandas scikit learn sql skills.proven ability data driven problem solving amazon hackathon participation.adept collaborating teams.developing analytical models using python pandas scikit learn.executing complex database analyses via sql.visualizing insights effectively associated tools utilizing git showcasing collaborative project management abilities.skills data analysis manipulation python programming pandas numpy data cleansing etl process statistical modeling classification modeling regression analysis machine learning sklearn data visualization power bi tableau data storytelling collaborative technologies git github version control sql databases projects conversational data analytics platform python pandas sql developed streamlit ap

In [24]:
import pickle
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s\.@]', ' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [
        word for word in tokens 
        if word not in stop_words and (len(word) > 1 or word in ['@', '.'])
    ]
    clean_text = ' '.join(filtered_tokens)
    clean_text = re.sub(r'\s*@\s*', '@', clean_text)
    clean_text = re.sub(r'\s*\.\s*', '.', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

# Similarity function
from sentence_transformers import SentenceTransformer, util
def similarity_fn(jd_text, resume_text, model):
    jd_embedding = model.encode(jd_text, convert_to_tensor=True)
    resume_embedding = model.encode(resume_text, convert_to_tensor=True)
    return util.cos_sim(jd_embedding, resume_embedding).item()

# Save functions to pickle
data = {
    "clean_function": clean_text,
    "similarity_function": similarity_fn
}

with open("resume_similarity.pkl", "wb") as f:
    pickle.dump(data, f)


In [18]:
%%writefile resume_similarity_module.py
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

def clean_and_remove_stopwords(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s\.@]', ' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [
        word for word in tokens
        if word not in stop_words and (len(word) > 1 or word in ['@', '.'])
    ]
    clean_text = ' '.join(filtered_tokens)
    clean_text = re.sub(r'\s*@\s*', '@', clean_text)
    clean_text = re.sub(r'\s*\.\s*', '.', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

def compute_similarity(jd_text, resume_text, model):
    jd_clean = clean_and_remove_stopwords(jd_text)
    resume_clean = clean_and_remove_stopwords(resume_text)
    jd_emb = model.encode(jd_clean, convert_to_tensor=True)
    resume_emb = model.encode(resume_clean, convert_to_tensor=True)
    score = float(util.cos_sim(jd_emb, resume_emb))

    if score >= 0.60:
        status = "Eligible"
    elif 0.45 <= score < 0.60:
        status = "Needs Manual Review"
    else:
        status = "Not Eligible"

    return {
        "Similarity Score": round(score, 3),
        "Status": status
    }

def save_pickle():
    data = {
        "clean_function": clean_and_remove_stopwords,
        "similarity_function": compute_similarity
    }
    with open("resume_similarity.pkl", "wb") as f:
        pickle.dump(data, f)
    print("Pickle file created: resume_similarity.pkl")


Writing resume_similarity_module.py


In [19]:
from resume_similarity_module import save_pickle
save_pickle()

Pickle file created: resume_similarity.pkl
