In [7]:
# Colab setup: install/upgrade the third-party packages imported by the cells below.
# NOTE(review): the recorded output of this cell shows pip reporting a dependency conflict
# (fastai requires torch<2.7 but torch 2.7.0 was installed) — consider pinning versions.
!pip install -U torch torchvision spacy transformers accelerate pdfplumber python-docx beautifulsoup4 requests sentence-transformers
# Fetch the small English spaCy pipeline that a later cell loads via spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting torchvision
  Downloading torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Downloading torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchvision
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, but you have torch 2.7.0 which is incompatible.[0m[31m
[0mSuccessfully installed torchvision-0.22.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Downlo

In [8]:
from google.colab import files

# Open Colab's upload widget; the result is iterated below to obtain the
# name under which the uploaded file was saved.
uploaded = files.upload()

# Cancelling the dialog leaves nothing to iterate over; fail with a clear
# message instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded. Please upload a PDF or DOCX resume.")

# Only the first uploaded file is processed by the rest of the notebook.
filename = next(iter(uploaded))
print(f"Uploaded: {filename}")

Saving Rahul_Sivakumar_Resume (1).pdf to Rahul_Sivakumar_Resume (1) (2).pdf
Uploaded: Rahul_Sivakumar_Resume (1) (2).pdf


In [9]:
import pdfplumber
import docx

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, one page after another.

    Args:
        file_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string in which each page that yielded text contributes its
        text followed by a newline. Pages for which ``extract_text()`` returns
        nothing (``None`` or an empty string) are skipped. An empty string is
        returned when no page yields text.
    """
    page_chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if not extracted:
                # Nothing usable on this page; move on.
                continue
            page_chunks.append(extracted + "\n")
    return "".join(page_chunks)

def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file joined with newlines.

    Args:
        file_path: Path of the document, passed straight to ``docx.Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, in
        order, separated by ``"\\n"``.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

# Choose the extractor from the file extension (compared case-insensitively).
# Anything other than .pdf / .docx is rejected up front.
if not filename.lower().endswith((".pdf", ".docx")):
    raise ValueError("Unsupported file format. Use PDF or DOCX.")
elif filename.lower().endswith(".pdf"):
    resume_text = extract_text_from_pdf(filename)
else:
    # Only .docx can reach this branch.
    resume_text = extract_text_from_docx(filename)




In [10]:
import re
import spacy
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model; calculate_similarity_score() uses it to encode texts.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Reference descriptions that generate_summary() compares the resume's
# projects/experience text against when computing the resume score.
ideal_projects = "Developed scalable applications, led full-stack projects using modern frameworks, applied machine learning for real-world problems."
ideal_experience = "Worked in fast-paced environments, contributed to production systems, collaborated with cross-functional teams using Agile practices."

# spaCy pipeline used by extract_entities() for named-entity recognition.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): `summarizer` is not referenced by any cell visible in this notebook,
# yet it is still constructed here — confirm whether it is still needed.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def clean_text(text):
    """Collapse whitespace in ``text``.

    Leading and trailing whitespace is removed, then every run of whitespace
    characters is replaced by a single space.
    """
    trimmed = text.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed

def match_section(text, keywords):
    """Search ``text`` for any of ``keywords``, ignoring case.

    Each keyword is matched literally (regex metacharacters are escaped) and
    may occur anywhere in ``text``.

    Args:
        text: The string to search, typically one line of a resume.
        keywords: Iterable of literal keywords. ``None``, an empty iterable
            and empty-string keywords are ignored.

    Returns:
        The ``re.Match`` for the first occurrence of any keyword, or ``None``
        when nothing matches or there is no usable keyword.
    """
    # An empty alternative would produce a pattern that matches every string,
    # so drop empty keywords and bail out when nothing is left.
    alternatives = [re.escape(keyword) for keyword in (keywords or []) if keyword]
    if not alternatives:
        return None
    return re.search('|'.join(alternatives), text, re.IGNORECASE)

def extract_section(text, section_names, next_section_names=None):
    """Return the lines of ``text`` that belong to a named section.

    The text is scanned line by line. A line matching ``section_names``
    (via ``match_section``) marks the section heading; if several such lines
    are seen before the scan stops, the last one is used. Once a heading has
    been seen, the first line that matches ``next_section_names`` (and does
    not itself match ``section_names``) ends the section.

    Args:
        text: Full text to scan.
        section_names: Keywords identifying the heading of the wanted section.
        next_section_names: Optional keywords identifying the start of a
            following section. When omitted or empty, the section runs to the
            end of the text.

    Returns:
        The lines strictly between the heading and the terminating line,
        joined with newlines and stripped; ``""`` when no heading is found.
    """
    all_lines = text.splitlines()
    heading_idx = None
    stop_idx = len(all_lines)  # default: section runs to the end of the text

    for idx, current in enumerate(all_lines):
        if match_section(current, section_names):
            # A later heading match replaces an earlier one.
            heading_idx = idx
            continue
        if heading_idx is None or not next_section_names:
            continue
        if match_section(current, next_section_names):
            stop_idx = idx
            break

    if heading_idx is None:
        return ""
    body_lines = all_lines[heading_idx + 1:stop_idx]
    return "\n".join(body_lines).strip()

def extract_entities(text):
    """Group the named entities spaCy finds in ``text`` into resume categories.

    Entity labels are mapped as follows:
      * ``ORG`` -> organizations
      * ``SKILL``, ``PRODUCT``, ``WORK_OF_ART`` -> skills
      * ``EVENT`` -> projects
    Entities with any other label are ignored.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        A dict with keys ``"skills"``, ``"organizations"`` and ``"projects"``.
        Each value is a list of unique entity strings in sorted order, so the
        result is the same on every run (plain set iteration order is not).
    """
    skill_labels = {'SKILL', 'PRODUCT', 'WORK_OF_ART'}
    skills, orgs, projects = set(), set(), set()

    for ent in nlp(text).ents:
        if ent.label_ == 'ORG':
            orgs.add(ent.text)
        elif ent.label_ in skill_labels:
            skills.add(ent.text)
        elif ent.label_ == 'EVENT':
            projects.add(ent.text)

    return {
        "skills": sorted(skills),
        "organizations": sorted(orgs),
        "projects": sorted(projects)
    }

def extract_resume_data(text):
    """Split resume text into named sections and attach NER-derived extras.

    Each section is pulled out with ``extract_section`` using a list of
    heading keywords and a list of keywords that end the section. Named
    entities from ``extract_entities`` supply ``ner_skills`` and ``ner_orgs``
    and act as a fallback for ``projects`` when no projects section is found.

    Args:
        text: Full resume text.

    Returns:
        A dict with the keys ``education``, ``experience``, ``skills``,
        ``certifications``, ``achievements``, ``projects`` (section text, or
        newline-joined NER projects as a fallback), ``ner_skills`` and
        ``ner_orgs``.
    """
    # key -> (heading keywords, keywords that terminate the section)
    section_specs = {
        "education": (["Education", "Academic Background"],
                      ["Experience", "Work", "Skills"]),
        "experience": (["Experience", "Internship", "Professional Experience", "Work History"],
                       ["Skills", "Achievements", "Certifications"]),
        "skills": (["Skills", "Technical Proficiency"],
                   ["Certifications", "Achievements", "Projects"]),
        "certifications": (["Certifications", "Licenses"],
                           ["Achievements", "Projects", "Experience"]),
        "achievements": (["Achievements", "Honors", "Awards"],
                         ["Projects", "Experience", "Certifications"]),
        "projects": (["Projects", "Portfolio"],
                     ["Experience", "Skills"]),
    }

    parsed = {}
    for key, (headings, terminators) in section_specs.items():
        parsed[key] = extract_section(text, headings, terminators)

    entities = extract_entities(text)

    if not parsed["projects"]:
        # No explicit projects section: fall back to NER-detected projects.
        parsed["projects"] = "\n".join(entities.get("projects", []))
    parsed["ner_skills"] = entities.get("skills", [])
    parsed["ner_orgs"] = entities.get("organizations", [])
    return parsed

# Hugging Face model id of the causal language model that generate_summary() prompts.
MODEL_ID = "tiiuae/falcon-7b-instruct"

# Tokenizer and model are module-level so generate_summary() can reuse them across calls.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    # Half precision when CUDA is available, full precision otherwise.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    # Device placement is delegated to the library.
    device_map="auto"
)


def calculate_similarity_score(text, ideal_text):
    """Score how close ``text`` is to ``ideal_text`` as a percentage.

    Both texts are encoded with the module-level ``embedder`` and compared
    with ``cosine_similarity``; the similarity is multiplied by 100.

    Args:
        text: Candidate text. ``None``, empty or whitespace-only input
            short-circuits to a score of ``0.0`` without encoding anything.
        ideal_text: Reference text to compare against.

    Returns:
        ``0.0`` for missing input, otherwise the cosine similarity times 100.
    """
    has_content = bool(text) and bool(text.strip())
    if has_content:
        candidate_vec = embedder.encode([text])[0]
        reference_vec = embedder.encode([ideal_text])[0]
        similarity_matrix = cosine_similarity([candidate_vec], [reference_vec])
        return similarity_matrix[0][0] * 100
    return 0.0

def generate_summary(data, plagiarism_insight=None):
    """Produce a summary of parsed resume data, using the LLM when possible.

    The sections in ``data`` are laid out as a structured text block. If that
    block is short (fewer than 50 words after cleaning) it is returned as is.
    Otherwise the module-level ``tokenizer``/``model`` are prompted to write a
    summary paragraph, and a resume score (0-100) is appended.

    Args:
        data: Dict as produced by ``extract_resume_data``. The keys read are
            ``education``, ``experience``, ``projects``, ``skills``,
            ``ner_skills``, ``certifications`` and ``achievements``; missing
            or blank sections are shown as "Not available".
        plagiarism_insight: Optional text appended as an extra section.

    Returns:
        * the structured text block when it is too short to summarise;
        * otherwise the generated summary followed by a
          ``**Resume Score**: <score> / 100`` line;
        * if tokenisation/generation raises, an error message followed by the
          structured text block (best effort, no exception propagates).
    """
    def safe(val):
        return val if val and val.strip() else "Not available"

    education = safe(data.get('education'))
    experience = safe(data.get('experience'))
    projects = safe(data.get('projects'))
    skills = data.get('skills') or ", ".join(data.get('ner_skills', [])) or "Not available"
    certifications = safe(data.get('certifications'))
    achievements = safe(data.get('achievements'))

    summary_parts = [
        f"**Education**:\n{education}",
        f"**Experience**:\n{experience}",
        f"**Projects**:\n{projects}",
        f"**Skills**:\n{skills}",
        f"**Certifications**:\n{certifications}",
        f"**Achievements**:\n{achievements}",
    ]

    if plagiarism_insight:
        summary_parts.append("**Plagiarism Insight**:\n" + plagiarism_insight)

    full_text = "\n\n".join(summary_parts)

    # Prompt-friendly copy: ASCII only, single line, capped at 3000 characters.
    full_text_clean = re.sub(r"[^\x00-\x7F]+", " ", full_text)
    full_text_clean = full_text_clean.strip().replace("\n", " ")
    full_text_clean = full_text_clean[:3000]

    # Too little content to be worth summarising: hand back the raw layout.
    if len(full_text_clean.split()) < 50:
        return full_text

    # Score only genuine resume content. Using the display values here would
    # embed the "Not available" placeholder and give missing sections a score.
    scored_sections = [
        section for section in (data.get('projects'), data.get('experience'))
        if section and section.strip()
    ]
    combined_text = "\n".join(scored_sections)
    score_projects = calculate_similarity_score(combined_text, ideal_projects)
    score_experience = calculate_similarity_score(combined_text, ideal_experience)
    # Cosine similarity can be slightly negative; keep the reported score in 0-100.
    average_score = (score_projects + score_experience) / 2
    resume_score = round(min(100.0, max(0.0, average_score)), 2)

    prompt = f"""You are a resume summarizer.

Given the following structured resume content, generate a concise, clear, and professional summary paragraph that highlights the candidate's background, skills, and achievements.

Resume:
{full_text_clean}

Summary:"""

    try:
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)

        output = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

        # The generated sequence starts with the prompt tokens. Decode only
        # what comes after them instead of splitting the decoded text on
        # "Summary:", which breaks when that marker also occurs in the resume
        # or in the generated text.
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[0][prompt_length:]
        summary = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Append the score
        summary += f"\n\n**Resume Score**: {resume_score} / 100"
        return summary

    except Exception as e:
        # Deliberate best effort: report the failure and fall back to the raw layout.
        return f"Failed to summarize with LLM. Error: {str(e)}\n\n{full_text}"

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]



In [11]:
import re

def extract_linkedin_url(text):
    """Find the first LinkedIn profile URL in ``text``.

    Recognises ``linkedin.com/in/<handle>`` with or without a leading
    ``http://``/``https://`` and ``www.``, ignoring letter case. Resume text
    frequently lists the profile without a scheme, so a scheme-less match is
    accepted when it is not glued to a preceding host/path character, and is
    returned with ``https://`` prepended.

    Args:
        text: Text to search; ``None`` or an empty string yields ``None``.

    Returns:
        The matched URL (always starting with ``http://`` or ``https://``),
        or ``None`` when no profile URL is found.
    """
    if not text:
        return None

    # Either an explicit scheme, or no scheme provided the match does not
    # start in the middle of another host name or path (e.g. "evil-linkedin.com").
    pattern = r"(?:https?://|(?<![\w.\-/@]))(?:www\.)?linkedin\.com/in/[A-Za-z0-9\-_]+"
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None

    url = match.group(0)
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url

In [13]:
# Split the uploaded resume into sections and named entities.
resume_data = extract_resume_data(resume_text)

# Profile URL found in the resume, if any. It is not used further in this cell.
linkedin_url = extract_linkedin_url(resume_text)

# No plagiarism insight is passed, so the summary contains resume content only.
summary = generate_summary(resume_data)

with open("resume_summary.txt", "w", encoding="utf-8") as f:
    f.write("Resume Summary\n")
    f.write("="*50 + "\n")
    f.write(summary + "\n")

# The message previously claimed a plagiarism insight was included, which
# generate_summary() was never given.
print("\nSummary saved to 'resume_summary.txt'")


Summary with plagiarism insight saved to 'resume_summary.txt'
