In [65]:
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
import joblib

def extract_features(token, index):
    return {
        'token': token.lower(),
        'is_capitalized': token[0].isupper(),
        'is_all_caps': token.isupper(),
        'is_title': token.istitle(),
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:]
    }

def load_training_data(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    X, y = [], []
    for item in data:
        tokens = item["tokens"]
        labels = item["labels"]
        if len(tokens) != len(labels):
            continue  # skip inconsistent entries
        for idx, token in enumerate(tokens):
            X.append(extract_features(token, idx))
            y.append(labels[idx])
    return X, y

# Load and prepare training data
X_raw, y = load_training_data("name_training_data.json")
vec = DictVectorizer(sparse=False)
X_vec = vec.fit_transform(X_raw)

# Train and save model
clf = LogisticRegression(max_iter=200)
clf.fit(X_vec, y)

joblib.dump(clf, "name_model.joblib")
joblib.dump(vec, "name_vectorizer.joblib")

# Evaluate
y_pred = clf.predict(X_vec)
print(classification_report(y, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7034
           1       1.00      1.00      1.00      2728

    accuracy                           1.00      9762
   macro avg       1.00      1.00      1.00      9762
weighted avg       1.00      1.00      1.00      9762



In [63]:
import joblib  # or just `import joblib` if you're using `joblib` directly

def predict_names(tokens):
    clf = joblib.load("name_model.joblib")       # Make sure this model supports predict_proba()
    vec = joblib.load("name_vectorizer.joblib")

    features = [extract_features(tok, idx) for idx, tok in enumerate(tokens)]
    X_vec = vec.transform(features)
    probs = clf.predict_proba(X_vec)  # Get probabilities for each class

    # Class 1 is usually the "name" label
    for tok, prob in zip(tokens, probs):
        print(f"{tok:<15} → NAME prob: {prob[1]:.2f} (Non-name: {prob[0]:.2f})")

    # Get the most confident name predictions
    name_tokens = [tok for tok, prob in zip(tokens, probs) if prob[1] > 0.5]
    return " ".join(name_tokens)

# Example usage
tokens = ["Dr.", "CHARITHA", "SRI", "KUNAPAREDDY", "5", "years"]
print("Predicted Name:", predict_names(tokens))


Dr.             → NAME prob: 0.98 (Non-name: 0.02)
CHARITHA        → NAME prob: 0.68 (Non-name: 0.32)
SRI             → NAME prob: 0.82 (Non-name: 0.18)
KUNAPAREDDY     → NAME prob: 0.94 (Non-name: 0.06)
5               → NAME prob: 0.00 (Non-name: 1.00)
years           → NAME prob: 0.00 (Non-name: 1.00)
Predicted Name: Dr. CHARITHA SRI KUNAPAREDDY


In [45]:
import json

def find_label_mismatches(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    print("Checking for mismatched tokens and labels...\n")
    for i, item in enumerate(data):
        tokens = item.get("tokens", [])
        labels = item.get("labels", [])
        if len(tokens) != len(labels):
            print(f"❌ Mismatch at index {i}:")
            print(f"  Tokens ({len(tokens)}): {tokens}")
            print(f"  Labels ({len(labels)}): {labels}\n")

find_label_mismatches("name_training_data.json")


Checking for mismatched tokens and labels...



In [62]:
import json

input_file = 'extra_ner_name_college_dataset.json'       # your unstructured file
output_file = 'semi_compact_output.json'

# Load the existing pretty-printed JSON
with open(input_file, 'r') as infile:
    data = json.load(infile)  # This must be a list of dicts

# Custom writing logic
with open(output_file, 'w') as outfile:
    outfile.write('[\n')
    for idx, item in enumerate(data):
        json_str = json.dumps(item, indent=None)
        # Pretty-print with keys on separate lines
        parsed = json.loads(json_str)
        outfile.write('  {\n')
        for i, (k, v) in enumerate(parsed.items()):
            comma = ',' if i < len(parsed) - 1 else ''
            line = f'    "{k}": {json.dumps(v)}{comma}\n'
            outfile.write(line)
        comma = ',' if idx < len(data) - 1 else ''
        outfile.write(f'  }}{comma}\n')
    outfile.write(']\n')

print(f"Semi-compact JSON written to {output_file}")

Semi-compact JSON written to semi_compact_output.json


In [1]:
from transformers import pipeline

# Force PyTorch to avoid Keras issues
ner_pipeline = pipeline(
    "token-classification",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
    framework="pt"
)

def extract_entities(text):
    entities = ner_pipeline(text)

    name = []
    education = []
    skills = []

    for ent in entities:
        label = ent["entity_group"]
        word = ent["word"]

        if label == "PER":
            name.append(word)
        elif label == "ORG":
            education.append(word)
        elif label == "MISC":
            skills.append(word)

    return {
        "name": " ".join(name),
        "education_orgs": list(set(education)),
        "skills": list(set(skills))
    }


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [2]:
sample_text = """
CHARITHA SRI KUNAPAREDDY
Email: charitha@example.com

Education:
MCA from VR Siddhartha Engineering College, Andhra Pradesh
B.Sc from Triveni Mahila Degree College

Skills: Python, SQL, Machine Learning, Pandas, FastAPI
"""

result = extract_entities(sample_text)
print(result)


{'name': '', 'education_orgs': ['MCA', 'Tri', '##dhartha Engineering College', 'Learning', 'FastAP', 'VR', 'Mahila Degree College', 'CHARITHA SRI KUNAP'], 'skills': ['SQL', 'Machine', 'Pandas', 'Python']}


In [3]:
from transformers import pipeline

# Define models to compare
model_names = {
    "BERT-NER (dslim)": "dslim/bert-base-NER",
    "RoBERTa-NER (Jean-Baptiste)": "Jean-Baptiste/roberta-large-ner-english",
    "Multilingual-BERT (Davlan)": "Davlan/bert-base-multilingual-cased-ner-hrl"
}

# Sample resume snippet (can be longer)
resume_text = """
CHARITHA SRI KUNAPAREDDY
Email: charitha.sri@example.com
Phone: 9876543210

Education:
MCA - VR Siddhartha Engineering College
BSc - Triveni Mahila Degree College

Skills: Python, SQL, HTML, CSS, Java, FastAPI

Experience:
Intern at Salesforce Catalyst
"""

# Run inference across all models
def extract_entities_from_model(model_name):
    print(f"\n{'='*30}\n🔍 Results from: {model_name}\n{'='*30}")
    ner = pipeline(
        "token-classification",
        model=model_names[model_name],
        aggregation_strategy="simple",
        framework="pt"
    )
    entities = ner(resume_text)
    for ent in entities:
        print(f"{ent['word']:25} → {ent['entity_group']}")

# Compare all models
for model in model_names:
    extract_entities_from_model(model)



🔍 Results from: BERT-NER (dslim)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


CHARITHA SRI KUN          → ORG
MCA                       → ORG
VR Siddhartha Engineering College → ORG
Triveni Mahila Degree College → ORG
Python                    → MISC
S                         → MISC
CS                        → MISC
Java                      → MISC
FastAP                    → MISC
Salesforce Catalyst       → ORG

🔍 Results from: RoBERTa-NER (Jean-Baptiste)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


CHARITHA SRI              → PER
 KUNAPAREDDY              → ORG
 charitha                 → PER
sri                       → PER
 VR Siddhartha Engineering College → ORG
 Triveni Mahila Degree College → ORG
 Python                   → MISC
 SQL                      → MISC
 HTML                     → MISC
 CSS                      → MISC
 Java                     → MISC
 FastAPI                  → MISC
 Salesforce Catalyst
     → ORG

🔍 Results from: Multilingual-BERT (Davlan)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


MCA                       → ORG
VR Siddhartha Engineering College → ORG
BS                        → ORG
Triveni Mahila Degree College → ORG


In [7]:
import fitz  # PyMuPDF
import re
from transformers import pipeline

# ---------- Load Hugging Face NER Model ----------
ner_pipeline = pipeline(
    "token-classification",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
    framework="pt"
)

# ---------- Extract Text from PDF ----------
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    return full_text

# ---------- Split Resume into Sections ----------
def split_resume_sections(text):
    sections = {}
    current = "general"
    sections[current] = []

    lines = text.split("\n")
    for line in lines:
        clean = line.strip()
        if not clean:
            continue

        # Match section headers
        if re.match(r"^(education|education details|experience|skills|projects|summary|objective|certifications?)[:\s]*$", clean.lower()):
            current = clean.lower().strip(": ")
            sections[current] = []
        else:
            sections.setdefault(current, []).append(clean)
    print(sections)
    return sections

# ---------- Apply NER to Header + Education ----------
def extract_name_and_education_sectional(text):
    sections = split_resume_sections(text)
    
    header_text = "\n".join(sections.get("general", [])[:5])  # top lines only
    edu_text = "\n".join(sections.get("education", []))

    entities_header = ner_pipeline(header_text)
    entities_edu = ner_pipeline(edu_text)

    name_tokens = [e['word'] for e in entities_header if e["entity_group"] == "PER"]
    org_tokens = [e['word'] for e in entities_edu if e["entity_group"] == "ORG"]

    name = " ".join(name_tokens).strip()
    education = list(set(org_tokens))

    return name, education

# ---------- Run the Pipeline ----------
def parse_pdf_resume(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    name, education = extract_name_and_education_sectional(text)
    
    print("👤 Name:", name)
    print("🎓 Education Orgs:", education)


Device set to use cpu


In [8]:
parse_pdf_resume("Charitha Resume.pdf")


{'general': [], 'summary': ['Languages: English, Hindi, Telugu.', 'Masters of Computer Applications - MCA', 'VR Siddhartha Engineering College', 'Pursuing MCA', 'Aug 2020 - Mar 2024', 'Bachelors of Science - Mathematics, Statistics Computer Science', 'Triveni Mahila Degree College', 'CGPA : 7.0', 'Salesforce developer catalyst', 'Learned new emerging technologies currently used in the industry, focusing on job-oriented skills.', 'Gained insights into the working environment of the industry and its specific requirements.', 'Developed essential skills such as communication, interpersonal abilities, and other critical skills', 'necessary for the job interview process.', 'Saloon management system website', 'Developed an intuitive online web-based management application with appointment scheduling', 'functionality, enabling users to easily access information about services, offers, and stylists. The', 'system allows users to seamlessly book appointments with their preferred stylist and canc

In [13]:
import fitz  # PyMuPDF
import re
from transformers import pipeline
from nltk.corpus import stopwords
from nltk import download

# Download stopwords
download('stopwords')
STOP_WORDS = set(stopwords.words("english"))

# ---------- Header Group Mapping ----------
HEADER_GROUPS = {
    "about": ["about", "summary", "about me", "objective"],
    "education": ["education", "education details"],
    "skills": ["technical skills", "skills", "expertise", "strengths and expertise"],
    "experience": ["experience", "professional experience", "projects", "project experience"],
    "additional_information": ["additional information", "more about me", "certifications"]
}

# ---------- Helper Functions ----------
def normalize_text(text):
    return re.sub(r'\s+', ' ', text.strip().lower())

def match_standard_header(text):
    norm = normalize_text(text)
    for std_key, variants in HEADER_GROUPS.items():
        for variant in variants:
            if norm.startswith(variant):
                return std_key
    return None

def clean_text(text):
    words = re.findall(r'\b\w+\b', text)
    filtered = [w for w in words if w.lower() not in STOP_WORDS]
    return " ".join(filtered)

# ---------- Extract and Structure Resume Data ----------
def extract_sections_as_json(pdf_path, spacing_threshold=20):
    doc = fitz.open(pdf_path)
    page = doc[0]

    blocks = page.get_text("blocks")
    blocks = [b for b in blocks if b[4].strip()]
    blocks.sort(key=lambda b: b[1])  # top to bottom

    # Topmost block
    topmost_block = min(blocks, key=lambda b: b[1])
    result = {"header": topmost_block[4].strip()}

    # Group into visual sections
    sections = []
    current_section = []
    prev_y1 = None

    for block in blocks:
        x0, y0, x1, y1, text, *_ = block
        if prev_y1 is not None and (y0 - prev_y1) > spacing_threshold:
            if current_section:
                sections.append(current_section)
                current_section = []
        current_section.append(block)
        prev_y1 = y1

    if current_section:
        sections.append(current_section)

    # Flatten and scan for headers
    all_blocks = [b for section in sections for b in section]
    all_blocks.sort(key=lambda b: b[1])

    current_key = None
    section_data = {}

    for block in all_blocks:
        raw_text = block[4].strip()
        std_key = match_standard_header(raw_text)
        if std_key:
            current_key = std_key
            if current_key not in section_data:
                section_data[current_key] = []
        elif current_key:
            section_data[current_key].append(raw_text)

    # Clean and assign
    for key, texts in section_data.items():
        full_text = " ".join(texts)
        cleaned = clean_text(full_text)
        result[key] = cleaned

    return result

# ---------- Load NER Pipeline ----------
ner_pipeline = pipeline(
    "token-classification",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
    framework="pt"
)

# ---------- Extract Name and Education via NER ----------
def extract_name_and_education_from_json(resume_json):
    header_text = resume_json.get("header", "")
    print(header_text)
    education_text = resume_json.get("education", "")

    entities_header = ner_pipeline(header_text)
    
    entities_edu = ner_pipeline(education_text)

    name_tokens = [e['word'] for e in entities_header if e["entity_group"] == "PER"]
    print(name_tokens)
    org_tokens = [e['word'] for e in entities_edu if e["entity_group"] == "ORG"]

    name = " ".join(name_tokens).strip()
    education_orgs = list(set(org_tokens))

    return name, education_orgs

# ---------- Main Pipeline Function ----------
def parse_pdf_resume(pdf_path):
    resume_json = extract_sections_as_json(pdf_path)
    name, education_orgs = extract_name_and_education_from_json(resume_json)

    print("🧾 Parsed Resume Data (JSON):")
    print(resume_json)
    print("\n👤 Name:", name)
    print("🎓 Education Orgs:", education_orgs)
    return {
        "parsed_json": resume_json,
        "name": name,
        "education_orgs": education_orgs
    }

# ---------- Example Usage ----------
result = parse_pdf_resume("Mayukha Resume.pdf")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


OSError: nickmuchi/roberta-base-resume-ner is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

[{'entity_group': 'LOC',
  'score': np.float32(0.9568997),
  'word': ' Pappulamillu Center',
  'start': 0,
  'end': 19},
 {'entity_group': 'LOC',
  'score': np.float32(0.99787545),
  'word': ' Vijayawada',
  'start': 21,
  'end': 31},
 {'entity_group': 'ORG',
  'score': np.float32(0.96347255),
  'word': 'MAYUKHA KUNAPAREDDY',
  'start': 40,
  'end': 59}]

In [7]:
import fitz  # PyMuPDF
import re
from transformers import pipeline
from nltk.corpus import stopwords
from nltk import download

# Download stopwords
download('stopwords')
STOP_WORDS = set(stopwords.words("english"))

# ---------- Header Group Mapping ----------
HEADER_GROUPS = {
    "about": ["about", "summary", "about me", "objective"],
    "education": ["education", "education details"],
    "skills": ["technical skills", "skills", "expertise", "strengths and expertise"],
    "experience": ["experience", "professional experience", "projects", "project experience"],
    "additional_information": ["additional information", "more about me", "certifications"]
}

# ---------- Helper Functions ----------
def normalize_text(text):
    return re.sub(r'\s+', ' ', text.strip().lower())

def match_standard_header(text):
    norm = normalize_text(text)
    for std_key, variants in HEADER_GROUPS.items():
        for variant in variants:
            if norm.startswith(variant):
                return std_key
    return None

def clean_text(text):
    words = re.findall(r'\b\w+\b', text)
    filtered = [w for w in words if w.lower() not in STOP_WORDS]
    return " ".join(filtered)

# ---------- Extract and Structure Resume Data ----------
def extract_sections_as_json(pdf_path, spacing_threshold=20):
    doc = fitz.open(pdf_path)
    page = doc[0]

    blocks = page.get_text("blocks")
    blocks = [b for b in blocks if b[4].strip()]
    blocks.sort(key=lambda b: b[1])

    topmost_block = min(blocks, key=lambda b: b[1])
    result = {"header": topmost_block[4].strip()}

    sections = []
    current_section = []
    prev_y1 = None

    for block in blocks:
        x0, y0, x1, y1, text, *_ = block
        if prev_y1 is not None and (y0 - prev_y1) > spacing_threshold:
            if current_section:
                sections.append(current_section)
                current_section = []
        current_section.append(block)
        prev_y1 = y1

    if current_section:
        sections.append(current_section)

    all_blocks = [b for section in sections for b in section]
    all_blocks.sort(key=lambda b: b[1])

    current_key = None
    section_data = {}

    for block in all_blocks:
        raw_text = block[4].strip()
        std_key = match_standard_header(raw_text)
        if std_key:
            current_key = std_key
            if current_key not in section_data:
                section_data[current_key] = []
        elif current_key:
            section_data[current_key].append(raw_text)

    for key, texts in section_data.items():
        full_text = " ".join(texts)
        cleaned = clean_text(full_text)
        result[key] = cleaned

    return result

# ---------- Apply NER from Different Models ----------
def extract_name_and_education_with_model(ner_pipeline, resume_json, label_fallback=True):
    header_text = resume_json.get("header", "")
    education_text = resume_json.get("education", "")

    entities_header = ner_pipeline(header_text)
    entities_edu = ner_pipeline(education_text)

    name_tokens = [e['word'] for e in entities_header if e["entity_group"] == "PER"]

    if not name_tokens and label_fallback:
        fallback_orgs = [
            e['word'] for e in entities_header
            if e["entity_group"] == "ORG"
            and e['word'].isupper()
            and 1 <= len(e['word'].split()) <= 3
        ]
        name_tokens = fallback_orgs

    name = " ".join(name_tokens).strip()
    org_tokens = [e['word'] for e in entities_edu if e["entity_group"] == "ORG"]
    education_orgs = list(set(org_tokens))

    return name, education_orgs

# ---------- Load Models ----------
print("🔁 Loading NER models...")
ner_roberta_general = pipeline(
    "token-classification",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
    framework="pt"
)


# ---------- Main Runner ----------
def compare_models_on_resume(pdf_path):
    resume_json = extract_sections_as_json(pdf_path)

    name_general, edu_general = extract_name_and_education_with_model(ner_roberta_general, resume_json)
   

    print("🧾 Structured Resume JSON:")
    print(resume_json)
    print("\n=== 📦 MODEL COMPARISON ===")
    print("🔹 Jean-Baptiste/roberta-large-ner-english")
    print("  👤 Name:", name_general)
    print("  🎓 Education:", edu_general)

   

    return {
        "structured_json": resume_json,
        "model_general": {"name": name_general, "education": edu_general}
    }

# ---------- Example Usage ----------
result = compare_models_on_resume("Mayukha Resume.pdf")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔁 Loading NER models...


Device set to use cpu


🧾 Structured Resume JSON:
{'header': 'Pappulamillu Center, Vijayawada, 520007\nMAYUKHA KUNAPAREDDY', 'skills': 'Power Systems Team Leadership Chip Designing Circuit Designing Communication Control Systems Electronics Time Management Stimulation Software Renewable Energy Systems Adaptibility AutoCad', 'education': 'Diploma Electrical Electronics Engineering Govt Polytechnic College Vijayawada August 2022 Present GPA 7 23 Active Participant Planning Execution College Events SSC G E C High School Patamata Vijayawada July 2021 May 2022 GPA 9 2 Actively engaged variety extracurricular activities including yoga essay writing competitions science fairs showcasing well rounded skill set enthusiasm persona l growth creative exploration', 'experience': 'PROJECT Implementation 3 phase Distribution Line Fault Detector Govt Polytechnic College Vijayawada study focuses detecting pole faults power distribution systems using real time monitoring advanced diagnostic techniques proposed approach enhance