In [2]:
import csv
import pandas as pd

# ---------- Extract ----------
# load the raw job descriptions (path is relative to the working directory)
df = pd.read_json("input/job-descriptions.json")

# ---------- Transform ----------
# standardize column names: trim surrounding whitespace, then lowercase
df.columns = df.columns.str.strip().str.lower()

# normalize job code: strip leading zeros and store as an integer.
# A code consisting only of zeros ("0", "000") strips down to "", which the
# int conversion rejects, so that case is mapped back to "0" before casting.
df["code"] = (
    df["code"]
    .astype(str)
    .str.lstrip("0")
    .replace("", "0")
    .astype(int)
)
df

Unnamed: 0,jurisdiction,code,title,description
0,sanbernardino,1297,Assistant Chief Probation Officer,"Definition\nUnder general direction, assists t..."
1,ventura,9111,Apcd Public Information Specialist,Definition\nUnder direction of the Public Info...
2,sanbernardino,10019,Assistant Director of Human Resources,"Definition\nUnder general direction, assists i..."
3,sanbernardino,1410,Assistant District Attorney,Definition\nUnder general administrative direc...
4,ventura,2181,Assistant Chief Probation Officer,Definition\nDEFINITION:\nUnder general directi...
5,ventura,80,Appraiser Trainee,Definition\nUnder general supervision (Trainee...
6,sdcounty,3697,Associate Meteorologist,.\nCLASSIFICATION PURPOSE AND DISTINGUISHING C...
7,sdcounty,265,Assistant Sheriff,.\nCLASSIFICATION PURPOSE AND DISTINGUISHING C...


In [3]:
def truncate_description(desc, max_len=144):
    r"""Reduce a description to its first clause, capped at ``max_len`` characters.

    Steps, in order:
      1. every newline is replaced by a single space;
      2. a leading ". " (i.e. an original leading ".\n") is collapsed to a
         single leading space, which is deliberately kept;
      3. the text is cut at the first comma, if there is one;
      4. the result is truncated to at most ``max_len`` characters.

    Args:
        desc: the raw description. Non-string values (for example None or
            NaN standing in for a missing description) are returned unchanged
            instead of raising.
        max_len: maximum length of the returned text; defaults to 144.

    Returns:
        The shortened description, or ``desc`` itself when it is not a string.
    """
    if not isinstance(desc, str):
        # nothing to truncate; leave missing values for the caller to handle
        return desc

    text = desc.replace("\n", " ")
    if text.startswith(". "):
        text = " " + text[2:]  # ".\n" -> " "

    # split(",", 1)[0] is the whole text when there is no comma, so a single
    # slice covers both the "first clause" and the "no comma" cases
    return text.split(",", 1)[0][:max_len]


# shorten every description to its truncated first clause
df["description"] = df["description"].apply(truncate_description)

# order rows by jurisdiction, then by code within each jurisdiction,
# and renumber the index from 0 (match expected order)
df = (
    df.sort_values(by=["jurisdiction", "code"])
    .reset_index(drop=True)
)
df

Unnamed: 0,jurisdiction,code,title,description
0,sanbernardino,1297,Assistant Chief Probation Officer,Definition Under general direction
1,sanbernardino,1410,Assistant District Attorney,Definition Under general administrative direction
2,sanbernardino,10019,Assistant Director of Human Resources,Definition Under general direction
3,sdcounty,265,Assistant Sheriff,CLASSIFICATION PURPOSE AND DISTINGUISHING CHA...
4,sdcounty,3697,Associate Meteorologist,CLASSIFICATION PURPOSE AND DISTINGUISHING CHA...
5,ventura,80,Appraiser Trainee,Definition Under general supervision (Trainee
6,ventura,2181,Assistant Chief Probation Officer,Definition DEFINITION: Under general direction
7,ventura,9111,Apcd Public Information Specialist,Definition Under direction of the Public Infor...


In [4]:
# columns written to the CSV, in output order (also used as the header row)
OUTPUT_COLUMNS = ["jurisdiction", "code", "title", "description"]

# select columns
df = df[OUTPUT_COLUMNS]

# ---------- Load ----------
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default, which may not be able to encode every character
# in the description text.
with open(
    "output/job_descriptions_clean.csv", "w", newline="", encoding="utf-8"
) as f:
    # the header row is written with every field quoted ...
    header_writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
    header_writer.writerow(OUTPUT_COLUMNS)
    # ... while data rows only quote fields that require it
    data_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
    # plain tuples in column order; df was already narrowed to OUTPUT_COLUMNS
    data_writer.writerows(df.itertuples(index=False, name=None))

print("✅ job_descriptions_clean.csv created")

✅ job_descriptions_clean.csv created
