In [16]:
import re, json
import pandas as pd

# Load the cleaned teacher dataset; later cells read its "clean_text" and "entities" columns.
df = pd.read_parquet("../data/processed/teachers_db_cleaned.parquet")

def normalize_entities(entities):
    """Return ``(entity_group, word)`` pairs with the surface form tidied up.

    For every entity dict, periods and commas are removed from
    ``ent["word"]``, runs of whitespace are collapsed to a single space
    (which also trims both ends), and the result is title-cased.

    Args:
        entities: iterable of dicts, each with ``"word"`` and
            ``"entity_group"`` keys.

    Returns:
        list of ``(entity_group, cleaned_word)`` tuples, in input order.
    """
    cleaned = []
    for ent in entities:
        # Remove punctuation *before* trimming: stripping first left a
        # trailing or doubled space behind for words such as "Acme ." or
        # "Acme , Inc".
        word = ent["word"].replace(".", "").replace(",", "")
        # split()/join collapses whitespace runs and trims both ends.
        word = " ".join(word.split()).title()
        cleaned.append((ent["entity_group"], word))
    return cleaned

# Regex for each recognised section header, keyed by the section name used in
# the returned spans. Patterns are written in lowercase and matched
# case-insensitively by find_sections.
SECTION_HEADERS = {
    "corporate": r"(?:^|\b)(corporate experience)\b",
    "academic_exp": r"(?:^|\b)(academic experience)\b",
    "academic_bg": r"(?:^|\b)(academic background)\b",
}

def find_sections(text: str):
    """Locate the known section headers in ``text`` and return their spans.

    Only the first occurrence of each header in ``SECTION_HEADERS`` is used.
    A section runs from the start of its header to the start of the next
    header found (or to the end of the text for the last one). Text that
    precedes the first header is not covered by any span.

    Args:
        text: the document to scan. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float) is treated as empty text.

    Returns:
        dict mapping section key -> ``(start, end)`` character offsets into
        ``text``. When no header is found the dict is
        ``{"intro": (0, len(text))}``.
    """
    if not isinstance(text, str):
        text = ""

    # Search the original string case-insensitively instead of searching
    # text.lower(): lowercasing can change the string length (e.g. "İ"
    # becomes two code points), which would shift every offset after it
    # away from positions in the original text.
    hits = []
    for key, pat in SECTION_HEADERS.items():
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            hits.append((m.start(), key))
    hits.sort()

    # Each section ends where the next header begins.
    spans = {}
    for i, (start, key) in enumerate(hits):
        end = hits[i + 1][0] if i + 1 < len(hits) else len(text)
        spans[key] = (start, end)

    # If no header was found, the whole text is "intro".
    if not spans:
        spans["intro"] = (0, len(text))
    return spans


In [17]:
# assumes df["entities"] (with start/end) and normalize_entities() exist in this kernel
def bucket_entities(row):
    """Group a row's entities by the text section their start offset falls in.

    Section spans come from ``find_sections(row["clean_text"])``. Every
    section starts with empty ``"ORG"`` and ``"LOC"`` lists; an entity is
    appended to the list for its ``entity_group`` in the first section whose
    half-open span ``[start, end)`` contains the entity's ``"start"``
    (0 when the key is missing). Entities outside every span are dropped.
    Finally each section receives a ``"normalized"`` entry built by
    ``normalize_entities`` from its ORG entities followed by its LOC entities.

    Returns:
        dict: section key -> {entity_group: [entity dicts], "normalized": [...]}.
    """
    section_spans = find_sections(row["clean_text"])
    buckets = {name: {"ORG": [], "LOC": []} for name in section_spans}

    for entity in row["entities"]:
        offset = entity.get("start", 0)
        # First section (in span order) that contains the offset, if any.
        owner = next(
            (name for name, (lo, hi) in section_spans.items() if lo <= offset < hi),
            None,
        )
        if owner is not None:
            buckets[owner].setdefault(entity["entity_group"], []).append(entity)

    # Normalize inside each bucket: ORG mentions first, then LOC.
    for groups in buckets.values():
        mentions = [ent for label in ("ORG", "LOC") for ent in groups.get(label, [])]
        groups["normalized"] = normalize_entities(mentions)
    return buckets

# Run bucket_entities on every row (axis=1) and keep the per-section buckets in a new column.
df["section_buckets"] = df.apply(bucket_entities, axis=1)


In [18]:
def to_properties(buckets):
    """Flatten section buckets into named property lists.

    For each of the ``"corporate"``, ``"academic_bg"`` and ``"academic_exp"``
    sections present in ``buckets``, the section's ``"normalized"``
    ``(label, text)`` pairs are filtered by label, de-duplicated and sorted.
    Sections that are absent contribute no keys; other sections (such as
    ``"intro"``) are ignored.

    Returns:
        dict: property name -> sorted list of unique entity texts.
    """
    # (section key, ((output property name, entity label), ...)) in output order.
    layout = (
        ("corporate", (
            ("Corporate Experience – Organization", "ORG"),
            ("Corporate Experience – Location", "LOC"),
        )),
        ("academic_bg", (
            ("Academic Background – Organization", "ORG"),
            # (optional) degrees later
        )),
        ("academic_exp", (
            ("Academic Experience – Organization", "ORG"),
            ("Academic Experience – Location", "LOC"),
        )),
    )

    props = {}
    for section, columns in layout:
        if section not in buckets:
            continue
        pairs = list(buckets[section]["normalized"])
        for prop_name, wanted in columns:
            props[prop_name] = sorted({text for label, text in pairs if label == wanted})
    return props

# NOTE(review): final_cleanup_buckets and filter_junk are applied to "section_buckets" in
# later cells, after this line; the values computed here come from the uncleaned
# "normalized" lists.
df["properties"] = df["section_buckets"].apply(to_properties)


In [19]:
# JSONL export: one JSON object (one professor's properties) per line, UTF-8,
# with non-ASCII characters written as-is.
# NOTE(review): df["properties"] is computed before the cleanup cells further
# down, so this file holds the pre-cleanup values.
out_jsonl = "../data/processed/professor_properties.jsonl"
with open(out_jsonl, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(props, ensure_ascii=False) + "\n" for props in df["properties"]
    )
print("Saved →", out_jsonl)


Saved → ../data/processed/professor_properties.jsonl


In [20]:
# Inspect structure of one example: the bucket dict produced for the first row
df["section_buckets"].iloc[0]


{'corporate': {'ORG': [{'end': 418,
    'entity_group': 'ORG',
    'score': 0.899153470993042,
    'start': 414,
    'word': 'A & am'},
   {'end': 428,
    'entity_group': 'ORG',
    'score': 0.9602067470550537,
    'start': 420,
    'word': 'M Studio'},
   {'end': 479,
    'entity_group': 'ORG',
    'score': 0.696322500705719,
    'start': 476,
    'word': '##ixi'},
   {'end': 531,
    'entity_group': 'ORG',
    'score': 0.9880414009094238,
    'start': 514,
    'word': 'Becquerel Capital'},
   {'end': 574,
    'entity_group': 'ORG',
    'score': 0.5379857420921326,
    'start': 571,
    'word': 'The'}],
  'LOC': [{'end': 435,
    'entity_group': 'LOC',
    'score': 0.9997650980949402,
    'start': 430,
    'word': 'Spain'},
   {'end': 476,
    'entity_group': 'LOC',
    'score': 0.7149252891540527,
    'start': 471,
    'word': 'Vidiv'},
   {'end': 487,
    'entity_group': 'LOC',
    'score': 0.9997214674949646,
    'start': 481,
    'word': 'Mexico'},
   {'end': 539,
    'entity_gro

In [21]:
# Count how many rows contain each section key (rows with a missing bucket value are skipped).
from collections import Counter
Counter([k for row in df["section_buckets"].dropna() for k in row.keys()])

Counter({'academic_bg': 1031,
         'academic_exp': 968,
         'corporate': 874,
         'intro': 141})

In [22]:
# Peek at the first five normalized (label, text) pairs in the first row's "corporate" section.
df["section_buckets"].iloc[0]["corporate"]["normalized"][:5]

[('ORG', 'A & Am'),
 ('ORG', 'M Studio'),
 ('ORG', '##Ixi'),
 ('ORG', 'Becquerel Capital'),
 ('ORG', 'The')]

In [23]:
# Total number of entries in the "normalized" lists, summed over every section of every row.
sum(len(v.get("normalized", [])) for row in df["section_buckets"].dropna() for v in row.values())

28342

In [24]:
# Hand-written replacements applied to normalized ORG strings (exact match).
POST_ORG_MAP = {
    "A & Am": "A&M Studio",
    "M Studio": "A&M Studio",
    "##Ixi": "Vidivixi",
}
# Normalized strings that are discarded whatever their label.
DROP_TOKENS = {"The", "Design", "Academic Exp", "& Am", "Vidiv"}

def final_cleanup_buckets(buckets):
    """Drop junk tokens and apply ORG replacements to every section, in place.

    For each section dict in ``buckets`` the ``"normalized"`` list (treated
    as empty when missing) is rebuilt: pairs whose text is in ``DROP_TOKENS``
    are removed, and ORG texts found in ``POST_ORG_MAP`` are replaced by the
    mapped value. Other labels are never remapped.

    Returns:
        The same ``buckets`` object that was passed in, after mutation.
    """
    for section in buckets.values():
        section["normalized"] = [
            (label, POST_ORG_MAP.get(text, text) if label == "ORG" else text)
            for label, text in section.get("normalized", [])
            if text not in DROP_TOKENS
        ]
    return buckets

# final_cleanup_buckets edits each bucket dict in place and returns it, so the column
# is reassigned to the same (now modified) objects.
df["section_buckets"] = df["section_buckets"].apply(final_cleanup_buckets)


In [25]:
# Re-check the first row's "corporate" section (up to ten pairs) after final_cleanup_buckets.
df["section_buckets"].iloc[0]["corporate"]["normalized"][:10]


[('ORG', 'A&M Studio'),
 ('ORG', 'A&M Studio'),
 ('ORG', 'Vidivixi'),
 ('ORG', 'Becquerel Capital'),
 ('LOC', 'Spain'),
 ('LOC', 'Mexico'),
 ('LOC', 'Mexico'),
 ('LOC', 'Hub'),
 ('LOC', 'Hong Kong')]

In [26]:
def filter_junk(buckets):
    """Remove known junk strings from every section's ``"normalized"`` list.

    Sections without a ``"normalized"`` key are left untouched. The bucket
    dicts are modified in place.

    Returns:
        The same ``buckets`` object that was passed in.
    """
    noise = frozenset(("The", "Design", "Hub", "Vidiv", "Academ", "Academic", "Exp"))
    for section in buckets.values():
        if "normalized" not in section:
            continue
        survivors = []
        for entry in section["normalized"]:
            # entry is a (label, text) pair; only the text is checked.
            if entry[1] in noise:
                continue
            survivors.append(entry)
        section["normalized"] = survivors
    return buckets

# filter_junk edits each bucket dict in place and returns it, so the column is
# reassigned to the same (now filtered) objects.
df["section_buckets"] = df["section_buckets"].apply(filter_junk)


In [27]:
# Rebuild "properties" from the buckets as they are now. final_cleanup_buckets and
# filter_junk rewrote the "normalized" lists after the column was first computed, so
# without this refresh the saved file would pair cleaned buckets with stale properties.
# NOTE(review): the JSONL export written earlier in the notebook is not refreshed here.
df["properties"] = df["section_buckets"].apply(to_properties)

output_path = "../data/processed/teachers_db_structured.parquet"
df.to_parquet(output_path, index=False)
print(f"Saved structured dataset to {output_path}")


Saved structured dataset to ../data/processed/teachers_db_structured.parquet
