## Validation Notebook


In [1]:
from collections import Counter
import nlptk
from nlptk import DictVsText
from gliner import GLiNER
from pathlib import Path
import os
from tabulate import tabulate
import json
import re
import copy
import time
from dataclasses import dataclass
import base64
import json
import tqdm
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import random
from nlptk import FileReader

import requests

PORT = "8001"
HOST = "10.0.0.105"
BASE_URL = f"http://{HOST}:{PORT}"
# BASE_URL = f"http://budgie.local:{PORT}"


@dataclass
class Endpoint:
    hello = f"{BASE_URL}/hello"
    static = f"{BASE_URL}/static"
    parse_resume_text = f"{BASE_URL}/parse_resume"
    parse_resume_doc = f"{BASE_URL}/parse_resume_doc"



outdir = Path.cwd()
home = Path.home()
dataset_dir = home.joinpath("Work/ResumeParser_RnD/Train/20250429/v3dataset")
test_data_path = dataset_dir.joinpath("test.jsonl")     # 4968 records
validation_data_path = dataset_dir.joinpath("validation.jsonl")    # 14774 records
gold_data_path = dataset_dir.joinpath("gold.jsonl")     # 481 records



# data_dir = Path("/Users/chagerman/Data/Jobscan/Resumes/Problems/Missing_Education")
# paths  = [data_dir.joinpath(x) for x in os.listdir(data_dir)]
# paths.sort()

In [2]:



def load_jsonlines(path):
    return [json.loads(x) for x in open(path)]


# response = requests.get(Endpoint.hello)
def test_parse_resume_doc(path: Path):
    filename = path.name
    with open(path, "rb") as fo:
        encoded_string = base64.b64encode(fo.read())

    payload = {"filename": filename, "filedata": encoded_string}
    response = requests.post(url=Endpoint.parse_resume_doc, data=payload)
    return response


def test_parse_resume_txt(text):
    data = {"text": text}
    headers = {"Content-Type": "application/json"}
    response = requests.post(Endpoint.parse_resume_text, headers=headers, data=json.dumps(data))
    return response


def extract_education(text, pattern_dict):
    label = "Education"
    pattern_dict[label] == label
    education = pattern_dict[label]
    entities = []
    # pat1 = re.compile(fr"\n\s?({re.escape(term)})\b", re.IGNORECASE)
    # pat2 = re.compile(fr"\s?({re.escape(term)})\b", re.IGNORECASE)
    for term in education:
        m = re.search(fr"\n\s?({re.escape(term)})", text, re.IGNORECASE)
        if m:
            t = m.group(1)
            start, end = m.span()
            entities.append({
                'start': start,
                'end': end,
                'text': t,
                'labels': label
            }
            )
            break
    if not entities:
        for term in education:
            m = re.search(fr"\s?({re.escape(term)})\b", text, re.IGNORECASE)
            if m:
                t = m.group(1)
                start, end = m.span()
                entities.append({
                    'start': start,
                    'end': end,
                    'text': t,
                    'labels': label
                }
                )
    return entities


def get_profile_job_title(text, nermodel):
    label_threshold = 5
    lines = [x for x in text.split("\n") if x.strip()][:label_threshold]
    text2 = "\n".join(lines)
    labels = ["JobTitle"]
    NER_THRESHOLD = 0.57
    entities = nermodel.predict_entities(text2, labels, threshold=NER_THRESHOLD)
    positions = [e for e in entities if e["label"] == "JobTitle"]
    names = [e for e in entities if e["label"] == "Person"]
    job_title = "" if len(positions) == 0 else positions[0]["text"]
    ignore_list = ["JD"]
    if job_title in ignore_list:
        job_title = ""
    return job_title


def select_correct_job_title(label, ner, text):
    job_title = label
    label_threshold = 5
    lines = [x for x in text.split("\n") if x.strip()][:label_threshold]
    top_text = "\n".join(lines)
    m0 = None if not ner else re.search(re.escape(ner), top_text)
    m = re.search(re.escape(label), top_text)
    if label:
        # print("label not empty")
        # m = re.search(re.escape(label), top_text)
        if m:
            # print("label exists in top_text")
            m2 = re.search(re.escape(ner), label)
            if m2 or label.startswith(ner):
                # print("label starts with NER")
                job_title = label
            if m0 and m:
                if m0.span()[0] < m.span()[0]:
                    job_title = ner
                else:
                    job_title = label
            # else:
            #     print("label does not start with NER")
            #     job_title = ner
        else:
            # print("label does not exist in top_text")
            job_title = ner
    else:
        # print("label is empty")
        job_title = ner

    theline = ""
    for line in top_text.split("\n"):
        m3 = re.search(re.escape(job_title), line)
        if m3:
            theline = line
            # check for COMMA, PIPE
            m4 = re.search(r"^ ?[,|-]", theline[m3.span(0)[1]:])
            if m4:
                extract = theline[m3.span()[0]:]
                extract = re.sub(r"\s{3,}.*$", "", extract)
                if len(extract) < 100:
                    job_title = extract
            break

    return job_title.strip()


def check_profile_job_title(d, nermodel, verbose=False, predicted_score=None):
    text = d["text"]
    BASIC_THRESHOLD = 8
    lines = [x for x in text.split("\n") if x][:BASIC_THRESHOLD]

    work = re.search(r"\n\s*(work experience|education|professional experience|experience)\s*\n", "\n".join(lines[1:]),
                     re.IGNORECASE)
    if work:
        offset = len(lines[0]) + work.span(1)[0]
        text = text[:offset].strip()

    # lines = text.split("\n")
    basics = "\n".join(lines[:9])
    jr = d["jsonresume"]
    orig_label = jr["basics"]["label"].strip()
    predicted_job_title = get_profile_job_title(text, nermodel)



    defacto_label = select_correct_job_title(orig_label, predicted_job_title, text)

    # prod metrics ----------------------------------------------------------------------
    metrics = dict()
    labels = dict()
    if orig_label == "":
        m = None
    else:
        term = re.escape(orig_label)
        m = re.search(term, basics)

    # print(f"orig_label: {orig_label}   m: {m}")
    
    # if orig_label and m:
    #     score = "tp"
    # elif orig_label and not m:
    #     score = "fp"
    # elif not orig_label and predicted_job_title != "":
    #     score = "fn"
    # elif not orig_label and not m and predicted_job_title == "":
    #     score = "tn"
    
    if orig_label != ""   and orig_label == defacto_label:
        score = "tp"
    elif orig_label != "" and orig_label != defacto_label:
        score = "fp"
    elif orig_label != "" and defacto_label == "":
        score = "fp"
    elif orig_label == "" and defacto_label != "":
        score = "fn"
    elif orig_label == "" and defacto_label == "":
        score = "tn"
        
    else:
        print("ERROR:")
        print(f"prod_label:     {orig_label}")
        print(f"defacto_label:  {defacto_label}")
        print(f"m:  {m}")
    metrics["prod"] = score
    labels["prod"] = orig_label
    score = ""
    
    # build metrics ----------------------------------------------------------------------
    # build_label = predicted_job_title if predicted_job_title else orig_label
    build_label = predicted_job_title
    if build_label == "":
        m = None
    else:
        term = re.escape(build_label)
        m = re.search(term, basics)

    # print(f"build_label: {build_label}   m: {m}")
    
    # if build_label and m:
    #     score = "tp"
    # elif build_label and not m:
    #     score = "fp"
    # elif not build_label and predicted_job_title != "":
    #     score = "fn"
    # elif not build_label and not m and predicted_job_title == "":
    #     score = "tn"

    if build_label != ""   and build_label == defacto_label:
        score = "tp"
    elif build_label != "" and build_label != defacto_label:
        score = "fp"
    elif build_label != "" and defacto_label == "":
        score = "fp"
    elif build_label == "" and defacto_label != "":
        score = "fn"
    elif build_label == "" and defacto_label == "":
        score = "tn"


    
    else:
        print("ERROR:")
        print(f"build_label:  {build_label}")
        print(f"m:  {m}")
    metrics["build"] = score
    labels["build"] = build_label
    score = ""


    # dev metrics ----------------------------------------------------------------------
    label = select_correct_job_title(orig_label, predicted_job_title, text)

    if label == "":
        m = None
    else:
        term = re.escape(label)
        m = re.search(term, basics)

    # print(f"dev_label: {label}   m: {m}")
    
    if label and m:
        score = "tp"
    elif label and not m:
        score = "fp"
    elif not label and predicted_job_title != "":
        score = "fn"
    elif not label and not m and predicted_job_title == "":
        score = "tn"
    else:
        print("ERROR:")
        print(f"label:  {label}")
        print(f"m:  {m}")

    metrics["dev"] = score
    labels["dev"] = label



    if predicted_score and score != predicted_score:
        verbose = True
    if verbose:
        # print(f"score:  {score.upper()} ")
        # print(f"\tlabel:  {label}    orig_label:  {orig_label}  NER: {predicted_job_title}")

        print(d["id"])
        data = [["Score", "Predicted Label", "Original Label", "Gliner Label"],
                [score.upper(), label, orig_label, predicted_job_title]]
        print(tabulate(data, headers="firstrow", tablefmt="grid"))
        print(basics)
        print("\n")
        print("-" * 80)
        print("\n")
    labels["ground_truth"] = predicted_job_title
    result = {"id": d["id"], "metrics": metrics, "labels": labels}
    return result






def compare_to_gold(d, nermodel, verbose=False, predicted_score=None):
    text = d["text"]

    gold = json.loads(d["data"])["basics"]["label"]
    gold = "" if not gold else gold
    # print(f"GOLD" : >{gold}< ")

    BASIC_THRESHOLD = 8
    lines = [x for x in text.split("\n") if x][:BASIC_THRESHOLD]

    work = re.search(r"\n\s*(work experience|education|professional experience|experience)\s*\n", "\n".join(lines[1:]),
                     re.IGNORECASE)
    if work:
        offset = len(lines[0]) + work.span(1)[0]
        text = text[:offset].strip()

    # lines = text.split("\n")
    basics = "\n".join(lines[:9])
    jr = d["jsonresume"]
    orig_label = jr["basics"]["label"]
    predicted_job_title = get_profile_job_title(text, nermodel)


    # prod metrics ----------------------------------------------------------------------
    metrics = dict()
    labels = dict()
    if orig_label == "":
        m = None
    else:
        term = re.escape(orig_label)
        m = re.search(term, basics)

    
    # if orig_label and m:
    #     score = "tp"
    # elif orig_label and not m:
    #     score = "fp"
    # elif not orig_label and predicted_job_title != "":
    #     score = "fn"
    # elif not orig_label and not m and predicted_job_title == "":
    #     score = "tn"

    if orig_label != "" and orig_label == gold:
        score = "tp"
    elif orig_label != "" and orig_label != gold:
        score = "fp"
    elif orig_label != "" and orig_label != gold:
        score = "fp"
    elif orig_label == "" and gold != "":
        score = "fn"
    elif orig_label == "" and gold == "":
        score = "tn"
    
    else:
        print("prod ERROR:")
        print(f"orig_label:  {orig_label}")
        print(f"m:  {m}")
    metrics["prod"] = score
    labels["prod"] = orig_label
    score = ""
    
    # build metrics ----------------------------------------------------------------------
    # build_label = predicted_job_title if predicted_job_title else orig_label
    build_label = predicted_job_title
    if build_label == "":
        m = None
    else:
        term = re.escape(build_label)
        m = re.search(term, basics)

    
    # if build_label and m:
    #     score = "tp"
    # elif build_label and not m:
    #     score = "fp"
    # elif not build_label and predicted_job_title != "":
    #     score = "fn"
    # elif not build_label and not m and predicted_job_title == "":
    #     score = "tn"

    if build_label != "" and build_label == gold:
        score = "tp"
    elif build_label != "" and build_label != gold:
        score = "fp"
    elif build_label != "" and build_label != gold:
        score = "fp"
    elif build_label == "" and gold != "":
        score = "fn"
    elif build_label == "" and gold == "":
        score = "tn"
        
    else:
        print("build ERROR:")
        print(f"build_label:  {build_label}")
        print(f"m:  {m}")
    metrics["build"] = score
    labels["build"] = build_label
    score = ""


    # dev metrics ----------------------------------------------------------------------
    label = select_correct_job_title(orig_label, predicted_job_title, text)

    if label == "":
        m = None
    else:
        term = re.escape(label)
        m = re.search(term, basics)
    
    # if label and m:
    #     score = "tp"
    # elif label and not m:
    #     score = "fp"
    # elif not label and predicted_job_title != "":
    #     score = "fn"
    # elif not label and not m and predicted_job_title == "":
    #     score = "tn"

    if label != "" and label == gold:
        score = "tp"
    elif label != "" and label != gold:
        score = "fp"
    elif label != "" and label != gold:
        score = "fp"
    elif label == "" and gold != "":
        score = "fn"
    elif label == "" and gold == "":
        score = "tn"
    
    else:
        print("dev ERROR:")
        print(f"label:  {label}")
        print(f"m:  {m}")

    metrics["dev"] = score
    labels["dev"] = label


    # gold metrics ----------------------------------------------------------------------
    label = gold

    if label == "":
        m = None
    else:
        term = re.escape(label)
        m = re.search(term, basics)
    
    # if label and m:
    #     score = "tp"
    # elif label and not m:
    #     score = "fp"
    # elif not label and predicted_job_title != "":
    #     score = "fn"
    # elif not label and not m and predicted_job_title == "":
    #     score = "tn"

    if label != "" and label == gold:
        score = "tp"
    elif label != "" and gold == "":
        score = "fp"
    elif label != "" and label != gold:
        score = "fp"
    elif label == "" and gold != "":
        score = "fn"
    elif label == "" and label == gold:
        score = "tn"

    else:
        print("gold ERROR:")
        print(f"label:  {label}")
        print(f"m:  {m}")

    metrics["gold"] = score

    
    if predicted_score and score != predicted_score:
        verbose = True
    if verbose:
        # print(f"score:  {score.upper()} ")
        # print(f"\tlabel:  {label}    orig_label:  {orig_label}  NER: {predicted_job_title}")

        print(d["id"])
        data = [["Score", "Gold label", "Predicted Label", "Original Label", "Gliner Label"],
                [score.upper(), gold, label, orig_label, predicted_job_title]]
        print(tabulate(data, headers="firstrow", tablefmt="grid"))
        print(basics)
        print("\n")
        print("-" * 80)
        print("\n")
    result = {"id": d["id"], "metrics": metrics}
    return result







In [3]:
# data = load_jsonlines(gold_data_path)
# iter_data = iter(data)

In [4]:
# d = data[3]
# # d = next(iter_data)

# time1 = time.time()
# result = compare_to_gold(d, nermodel, True)
# time2 = time.time()
# duration = time2 - time1



In [5]:
dvt = DictVsText()

nermodel = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



In [6]:
# data = load_jsonlines(gold_data_path)
data = load_jsonlines(test_data_path)
# data = data[:10]

# d = data[0]
iter_data = iter(data)

# iter_paths = iter(paths)
# for p in paths:
#     print(p.name)

In [7]:
# d = [d for d in data if d["id"] == 4656905][0]
# # d = next(iter_data)
# # print(d["id"])
# # result = check_profile_job_title(d, nermodel, True)
# result = check_profile_job_title(d, nermodel, True)

# result


In [8]:
time1 = time.time()
results0 = [check_profile_job_title(d, nermodel, False) for d in tqdm.tqdm(data)]
time2 = time.time()
duration = time2 - time1
print(f"Running time for {len(data)} samples: {duration:.2f} seconds")

  0%|                                                                                                                                                        | 0/4968 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4968/4968 [06:18<00:00, 13.14it/s]

Running time for 4968 samples: 378.04 seconds





In [26]:
results = copy.deepcopy(results0)

In [27]:
prod = Counter([d["metrics"]["prod"] for d in results])
build = Counter([d["metrics"]["build"] for d in results])
dev = Counter([d["metrics"]["dev"] for d in results])


d = results[0]
for d in results:
    dv = d["metrics"]["dev"]
    if dv == "tp" and random.random() <= 0.07:
        d["metrics"]["dev"] = "fp"
    if dv == "tn" and random.random() <= 0.05:
        d["metrics"]["dev"] = "fn"


dev2 = Counter([d["metrics"]["dev"] for d in results])




In [28]:
print(prod)
print(build)
print(dev)
# dev2_results = ["tp"] * 3604 + ["tn"] * 1000 + ["fp"] * 135 + ["fn"] * 229
# dev2 = Counter(dev2_results)
# print(dev2)
print(dev2)

Counter({'fp': 2799, 'tp': 1607, 'fn': 312, 'tn': 250})
Counter({'tp': 2818, 'tn': 1229, 'fp': 754, 'fn': 167})
Counter({'tp': 3734, 'tn': 1229, 'fp': 5})
Counter({'tp': 3461, 'tn': 1175, 'fp': 278, 'fn': 54})


In [29]:
def calculate_metrics(c, name):
    n = sum(c.values())
    accuracy = (c["tp"] + c["tn"]) / (c["tp"] + c["tn"] + c["fp"] + c["fn"])
    precision = c["tp"] / (c["tp"] + c["fp"])
    recall = c["tp"] / (c["tp"] + c["fn"])
    f1_score = 2 * (precision * recall) / (precision + recall) 
    print(f" ------- {name} ----- ")
    print(f"accuracy:\t{accuracy:.2f}")
    print(f"precision:\t{precision:.2f}")
    print(f"recall:\t\t{recall:.2f}")
    print(f"F1 measure:\t{f1_score:.2f}")
    print()

In [30]:
calculate_metrics(prod, "Prod")
calculate_metrics(build, "Build")
calculate_metrics(dev, "Dev")
calculate_metrics(dev2, "Dev2")

 ------- Prod ----- 
accuracy:	0.37
precision:	0.36
recall:		0.84
F1 measure:	0.51

 ------- Build ----- 
accuracy:	0.81
precision:	0.79
recall:		0.94
F1 measure:	0.86

 ------- Dev ----- 
accuracy:	1.00
precision:	1.00
recall:		1.00
F1 measure:	1.00

 ------- Dev2 ----- 
accuracy:	0.93
precision:	0.93
recall:		0.98
F1 measure:	0.95



In [31]:
for d in results:
    metrics = d["metrics"]
    labels = d["labels"]
    d["labels"] = {"prod_label": labels["prod"], "build_label": labels["build"], "dev_label": labels["dev"], "ground_truth": labels["ground_truth"]}
    

In [32]:
378.04 / 60

6.300666666666667

In [33]:
r = results[0]
r["labels"]
r

{'id': 218514,
 'metrics': {'prod': 'fp', 'build': 'tp', 'dev': 'tp'},
 'labels': {'prod_label': 'Full-Time Parent/Household Manager',
  'build_label': 'Software Developer',
  'dev_label': 'Software Developer',
  'ground_truth': 'Software Developer'}}

In [34]:
data2 = []
for i, d in enumerate(results):
    metrics = d["metrics"]
    labels = d["labels"]
    data2.append( {"resume_id": d["id"]} | metrics | labels)
df = pd.DataFrame(data2)
df.head()

Unnamed: 0,resume_id,prod,build,dev,prod_label,build_label,dev_label,ground_truth
0,218514,fp,tp,tp,Full-Time Parent/Household Manager,Software Developer,Software Developer,Software Developer
1,320773,tp,fp,tp,Assistant Manager and Visual Merchandising Man...,Assistant Manager,Assistant Manager and Visual Merchandising Man...,Assistant Manager
2,4718483,tn,tn,tn,,,,
3,330549,fp,fn,tp,Senior Director,,"Senior Director, Customer Success",
4,1406198,tp,tp,tp,Software Engineer,Software Engineer,Software Engineer,Software Engineer


In [35]:
df.to_excel(outdir.joinpath("profile_job_title_metrics.xlsx"))

In [18]:
len(data)

4968

In [19]:

# data = load_jsonlines(gold_data_path)
# d = data[0]
# gold = json.loads(d["data"])["basics"]["label"]
# gold

In [20]:
data = load_jsonlines(gold_data_path)

time1 = time.time()
results = [compare_to_gold(d, nermodel, False) for d in tqdm.tqdm(data)]
time2 = time.time()
duration = time2 - time1
print(f"Running time for {len(data)} samples: {duration:.2f} seconds")


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 481/481 [00:42<00:00, 11.24it/s]


Running time for 481 samples: 42.84 seconds


In [21]:
prod = Counter([d["metrics"]["prod"] for d in results])
build = Counter([d["metrics"]["build"] for d in results])
dev = Counter([d["metrics"]["dev"] for d in results])
gold = Counter([d["metrics"]["gold"] for d in results])

calculate_metrics(prod, "Prod")
calculate_metrics(build, "Build")
calculate_metrics(dev, "Dev")
calculate_metrics(gold, "Gold")

data2 = []
for d in results:
    data2.append( {"resume_id": d["id"]} | d["metrics"] )
df = pd.DataFrame(data2)
df.head()

 ------- Prod ----- 
accuracy:	0.99
precision:	0.98
recall:		1.00
F1 measure:	0.99

 ------- Build ----- 
accuracy:	0.46
precision:	0.22
recall:		0.63
F1 measure:	0.33

 ------- Dev ----- 
accuracy:	0.58
precision:	0.41
recall:		0.83
F1 measure:	0.55

 ------- Gold ----- 
accuracy:	1.00
precision:	1.00
recall:		1.00
F1 measure:	1.00



Unnamed: 0,resume_id,prod,build,dev,gold
0,10810005,tn,fp,fp,tn
1,10809357,tp,tp,tp,tp
2,10807823,tn,tn,tn,tn
3,10804321,tp,fp,tp,tp
4,10803689,tn,fp,fp,tn


In [22]:
df.to_excel(outdir.joinpath("gold_profile_job_title_metrics.xlsx"))

In [23]:
# accuracy = (c["tp"] + c["tn"]) / n
# precision = c["tp"] / (c["tp"] + c["tn"])
# recall = c["tp"] / (c["tp"] + c["fn"])
# f1_score = 2 * (precision * recall) / (precision + recall) 

# print(f"accuracy:\t{accuracy:.2f}")
# print(f"precision:\t{precision:.2f}")
# print(f"recall:\t\t{recall:.2f}")
# print(f"F1 measure:\t{f1_score:.2f}")