# 1.Import Libaries

In [1]:
!pip install pdfminer.six nltk scikit-learn sentence-transformers rapidfuzz pymupdf pytesseract pillow

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m82.2 MB/s[0m eta [3

In [2]:
import numpy as np
import pandas as pd
import re
import fitz
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import fuzz
import os
from PIL import Image
import pytesseract

In [3]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

# 2.Extract text data from PDF/IMAGE

In [4]:
stop = set(stopwords.words('english'))
lem = WordNetLemmatizer()

In [50]:
resume_path='DS_VARUN_REDDY.RESUME.pdf'
jd_path='ds_jd.pdf'

In [51]:
# Define skills dictionary
SKILLS = {
    "programming": ["python", "r", "sql", "java", "scala"],
    "libraries": ["pandas", "numpy", "scikit-learn", "sklearn", "nltk", "tensorflow",
                  "pytorch", "matplotlib", "seaborn", "xgboost", "lightgbm"],
    "ml_nlp": ["tfidf", "naive bayes", "logistic regression", "svm", "random forest",
               "bert", "nlp", "tokenization", "lemmatization", "transformer"],
    "viz_bi": ["tableau", "power bi", "looker", "plotly"],
    "cloud": ["aws", "s3", "lambda", "ec2", "azure", "adf", "databricks", "gcp", "bigquery"],
    "bigdata": ["hadoop", "spark", "hive", "kafka"],
    "tools": ["git", "docker", "linux", "airflow", "jira"]
}
ALL_SKILLS = [s.lower() for v in SKILLS.values() for s in v]

In [52]:
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_text = page.get_text().strip()
                if page_text:
                    text += page_text + " "
                else:
                    # Fallback to OCR if no text found
                    pix = page.get_pixmap(dpi=200)
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    ocr_text = pytesseract.image_to_string(img)
                    text += ocr_text + " "
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text.strip()

In [53]:
resume_text = extract_text_from_pdf(resume_path)
jd_text = extract_text_from_pdf(jd_path)
print(f"Resume text length: {len(resume_text)}")
print(f"JD text length: {len(jd_text)}")

Resume text length: 3019
JD text length: 3301


# 3.Preprocess the text data

In [54]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s+#]', ' ', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lem.lemmatize(token) for token in tokens if token not in stop and len(token) > 2]
    return " ".join(tokens)

In [55]:
clean_resume = clean_text(resume_text)
clean_job = clean_text(jd_text)
print(f"Cleaned resume tokens: {len(clean_resume.split())}")
print(f"Cleaned JD tokens: {len(clean_job.split())}")

Cleaned resume tokens: 323
Cleaned JD tokens: 320


# 4.TF-IDF similarity for raw text data

In [56]:
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform([clean_resume, clean_job])
tfidf_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# 5.BERT similarity

In [57]:
model = SentenceTransformer('all-MiniLM-L6-v2')
jd_emb = model.encode(clean_job, convert_to_tensor=True)
res_emb = model.encode(clean_resume, convert_to_tensor=True)
bert_sim = util.cos_sim(jd_emb, res_emb).item()

# 6.Relevant Skill Search

In [58]:
def skill_search(clean_text):
    words = set(clean_text.split())
    skills_found = set()
    for skill in ALL_SKILLS:
        if skill in clean_text:
            skills_found.add(skill)
            continue
        for token in words:
            if fuzz.ratio(skill, token) >= 85:
                skills_found.add(skill)
                break
    return skills_found

In [59]:
resume_skills = skill_search(clean_resume)
jd_skills = skill_search(clean_job)
print(f"Resume Skills: {resume_skills}")
print(f"JD Skills: {jd_skills}")
coverage = len(resume_skills & jd_skills) / (len(jd_skills) or 1)

Resume Skills: {'sql', 'pandas', 'numpy', 'seaborn', 'logistic regression', 'tableau', 'linux', 'python', 'r', 'random forest', 'aws', 'git', 'tensorflow', 'hadoop', 'svm', 'matplotlib'}
JD Skills: {'sql', 'pandas', 'numpy', 'hive', 'gcp', 'python', 'r', 'aws', 'spark', 'pytorch', 'tensorflow', 'git', 'hadoop', 'azure', 'transformer'}


# 7.Calculating ATS-SCORE :-

In [60]:
def calculate_ats_score(resume_path, jd_path):

    print("Extracting text from files...")

    print("Cleaning the text received from pdf...")

    print("Calculating TF-IDF similarity...")

    print("Calculating BERT similarity...")

    print("Calculating skill coverage...")


    # Calculate final ATS score
    ats_score = round(100 * (0.34 * tfidf_score + 0.33 * bert_sim + 0.33 * coverage), 2)

    # Print results
    print(f"\n=== ATS SCORE RESULTS ===")
    print(f"TF-IDF Similarity:  {tfidf_score:.3f}")
    print(f"BERT Similarity:    {bert_sim:.3f}")
    print(f"Skill Coverage:     {coverage:.3f}")
    print(f"ATS Score (0-100):  {ats_score}")

    # Detailed analysis
    print(f"\n=== DETAILED ANALYSIS ===")
    print(f"Missing skills from JD: {jd_skills - resume_skills}")
    print(f"Extra skills in resume: {resume_skills - jd_skills}")
    print(f"Matching skills: {resume_skills & jd_skills}")

    return ats_score, tfidf_score, bert_sim, coverage

In [61]:
calculate_ats_score(resume_path,jd_path)

Extracting text from files...
Cleaning the text received from pdf...
Calculating TF-IDF similarity...
Calculating BERT similarity...
Calculating skill coverage...

=== ATS SCORE RESULTS ===
TF-IDF Similarity:  0.274
BERT Similarity:    0.776
Skill Coverage:     0.600
ATS Score (0-100):  54.74

=== DETAILED ANALYSIS ===
Missing skills from JD: {'hive', 'gcp', 'spark', 'pytorch', 'azure', 'transformer'}
Extra skills in resume: {'seaborn', 'linux', 'logistic regression', 'tableau', 'random forest', 'svm', 'matplotlib'}
Matching skills: {'sql', 'pandas', 'numpy', 'python', 'r', 'aws', 'git', 'tensorflow', 'hadoop'}


(np.float64(54.74), np.float64(0.27433796883296746), 0.7760463953018188, 0.6)