In [None]:
# Summary:
# - We show 3 ways to map messy CRF free-text to SDTM codes (sex, race).
#   1) Fuzzy / dictionary: explicit synonym lists + simple typo matching (fast, explainable).
#   2) KNN: uses TF-IDF feature vectors and labels of nearest examples.
#   3) Random Forest: tree ensemble on TF-IDF features (often stronger with more data).
# - Workflow: try fuzzy first (if confident) -> else fallback to ML predictions.
# - This notebook uses tiny dummy data for demonstration only. For production use lots more labeled samples.


In [5]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-win_amd64.whl (8.9 MB)
     ---------------------------------------- 8.9/8.9 MB 10.5 MB/s eta 0:00:00
Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
     ------------------------------------- 308.4/308.4 kB 18.6 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.8.0
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl (41.3 MB)
     ---------------------------------------- 41.3/41.3 MB 9.3 MB/s eta 0:00:00
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.2 scikit-learn-1.7.2 scipy-1.15.3 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 25.3
[notice] To update, run: C:\Users\VATTI VAMSHI\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [6]:
# Cell 2: imports & helpers
import re
import difflib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Helper Functions-

def clean_text(s):
    """Simple normalizer: lower case, remove punctuations, collapse spaces."""
    if s is None:
        return ""
    s = str(s).lower().strip()
    # keep alphanumeric and spaces, replace other chars with space
    s = re.sub(r"[^a-z0-9\s\-]", " ", s)
    s = re.sub(r"[\-\_]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [8]:
#  Create dummy dataset

data = [
    # (free_text, sex_label, race_label)
    ("male", "M", "White"),
    ("man", "M", "White"),
    ("men", "M", "White"),
    ("m", "M", "White"),
    ("Male (biological)", "M", "White"),
    ("female", "F", "Asian"),
    ("female (cis)", "F", "Asian"),
    ("woman", "F", "Black"),
    ("girl", "F", "Black"),
    ("lady", "F", "Black"),
    ("f", "F", "Asian"),
    ("femaile", "F", "White"),    # typo
    ("malee", "M", "Asian"),      # typo
    ("doesn't say - M", "M", "Black"),
    ("Prefer not to say", None, "White"),  # ambiguous sex
    ("", None, "Asian"),          # blank sex
    ("non-binary", "O", "Other"),
    ("nb", "O", "Other"),
    ("trans male", "M", "Other"),
    ("trans female", "F", "Other"),
    # race examples (messy)
    ("Caucasian", "M", "White"),
    ("white", "F", "White"),
    ("Whtie", "M", "White"),      # typo
    ("European", "F", "White"),
    ("asian", "M", "Asian"),
    ("Indian", "F", "Asian"),
    ("chinese", "M", "Asian"),
    ("Black african", "F", "Black"),
    ("african american", "M", "Black"),
    ("blk", "F", "Black"),
    ("mixed race", "O", "Other"),
    ("other", "O", "Other"),
    ("pacific islander", "O", "Other"),
]

df = pd.DataFrame(data, columns=["free_text", "sex_label", "race_label"])
df["clean"] = df["free_text"].apply(clean_text)

# show dataset
df


Unnamed: 0,free_text,sex_label,race_label,clean
0,male,M,White,male
1,man,M,White,man
2,men,M,White,men
3,m,M,White,m
4,Male (biological),M,White,male biological
5,female,F,Asian,female
6,female (cis),F,Asian,female cis
7,woman,F,Black,woman
8,girl,F,Black,girl
9,lady,F,Black,lady


In [9]:
# Cell 4: fuzzy / dictionary mapping
sex_synonyms = {
    "male": "M", "man": "M", "men": "M", "m": "M", "trans male": "M", "trans_male": "M",
    "female": "F", "woman": "F", "girl": "F", "lady": "F", "f": "F", "trans female": "F", "trans_female": "F",
    "non binary": "O", "non-binary": "O", "nb": "O", "other": "O", "prefer not to say": None
}

race_synonyms = {
    "white": "White", "caucasian": "White", "european": "White", "whtie": "White",
    "asian": "Asian", "indian": "Asian", "chinese": "Asian",
    "black": "Black", "african": "Black", "african american": "Black", "blk": "Black",
    "mixed": "Other", "mixed race": "Other", "other": "Other", "pacific islander": "Other"
}



In [10]:
def fuzzy_map(value, synonyms_dict, cutoff=0.6):
    """Try dictionary + token matching + difflib close matches. Return canonical value or None."""
    if value is None:
        return None
    v = clean_text(value)
    if v == "":
        return None
    # exact full-match
    if v in synonyms_dict:
        return synonyms_dict[v]
    # token-level exact
    tokens = v.split()
    for t in tokens:
        if t in synonyms_dict:
            return synonyms_dict[t]
    # close-match on full string
    keys = list(synonyms_dict.keys())
    cand = difflib.get_close_matches(v, keys, n=1, cutoff=cutoff)
    if cand:
        return synonyms_dict[cand[0]]
    # close-match on tokens
    for t in tokens:
        cand = difflib.get_close_matches(t, keys, n=1, cutoff=cutoff)
        if cand:
            return synonyms_dict[cand[0]]
    return None

# Apply fuzzy mapping and show
df["fuzzy_sex"] = df["clean"].apply(lambda x: fuzzy_map(x, sex_synonyms))
df["fuzzy_race"] = df["clean"].apply(lambda x: fuzzy_map(x, race_synonyms))
df[["free_text", "clean", "sex_label", "fuzzy_sex", "race_label", "fuzzy_race"]]

Unnamed: 0,free_text,clean,sex_label,fuzzy_sex,race_label,fuzzy_race
0,male,male,M,M,White,
1,man,man,M,M,White,
2,men,men,M,M,White,
3,m,m,M,M,White,
4,Male (biological),male biological,M,M,White,
5,female,female,F,F,Asian,
6,female (cis),female cis,F,F,Asian,Asian
7,woman,woman,F,F,Black,
8,girl,girl,F,F,Black,
9,lady,lady,F,F,Black,


In [11]:
# vectorize and train ML models for sex and race
# Sex classifier (drop ambiguous labels for training)
df_sex = df.dropna(subset=["sex_label"]).copy()
X_sex_raw = df_sex["clean"].values
y_sex = df_sex["sex_label"].values

In [12]:
vectorizer_sex = TfidfVectorizer(ngram_range=(1,2), min_df=1)
X_sex = vectorizer_sex.fit_transform(X_sex_raw)

In [13]:
# Ensure we have at least two samples per class for stratify; with this toy data it's okay.
Xsx_train, Xsx_test, ysx_train, ysx_test = train_test_split(X_sex, y_sex, test_size=0.30, random_state=42, stratify=y_sex)

In [14]:
knn_sex = KNeighborsClassifier(n_neighbors=3)
knn_sex.fit(Xsx_train, ysx_train)
rf_sex = RandomForestClassifier(n_estimators=200, random_state=42)
rf_sex.fit(Xsx_train, ysx_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
# Evaluate sex models
ys_knn = knn_sex.predict(Xsx_test)
ys_rf  = rf_sex.predict(Xsx_test)
print("Sex - KNN accuracy:", accuracy_score(ysx_test, ys_knn))
print("Sex - RandomForest accuracy:", accuracy_score(ysx_test, ys_rf))
print("\nSex - KNN classification report:\n", classification_report(ysx_test, ys_knn, zero_division=0))
print("Sex - RF classification report:\n", classification_report(ysx_test, ys_rf, zero_division=0))


Sex - KNN accuracy: 0.5
Sex - RandomForest accuracy: 0.5

Sex - KNN classification report:
               precision    recall  f1-score   support

           F       0.44      1.00      0.62         4
           M       1.00      0.25      0.40         4
           O       0.00      0.00      0.00         2

    accuracy                           0.50        10
   macro avg       0.48      0.42      0.34        10
weighted avg       0.58      0.50      0.41        10

Sex - RF classification report:
               precision    recall  f1-score   support

           F       0.44      1.00      0.62         4
           M       1.00      0.25      0.40         4
           O       0.00      0.00      0.00         2

    accuracy                           0.50        10
   macro avg       0.48      0.42      0.34        10
weighted avg       0.58      0.50      0.41        10



In [16]:
# Race classifier (drop ambiguous: none - in our data race_label is present for most rows)
df_race = df.dropna(subset=["race_label"]).copy()
X_race_raw = df_race["clean"].values
y_race = df_race["race_label"].values


In [17]:
vectorizer_race = TfidfVectorizer(ngram_range=(1,2), min_df=1)
X_race = vectorizer_race.fit_transform(X_race_raw)

In [18]:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_race, y_race, test_size=0.30, random_state=42, stratify=y_race)


In [19]:
knn_race = KNeighborsClassifier(n_neighbors=3)
knn_race.fit(Xr_train, yr_train)
rf_race = RandomForestClassifier(n_estimators=200, random_state=42)
rf_race.fit(Xr_train, yr_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Evaluate race models
yr_knn = knn_race.predict(Xr_test)
yr_rf  = rf_race.predict(Xr_test)
print("\nRace - KNN accuracy:", accuracy_score(yr_test, yr_knn))
print("Race - RandomForest accuracy:", accuracy_score(yr_test, yr_rf))
print("\nRace - KNN classification report:\n", classification_report(yr_test, yr_knn, zero_division=0))
print("Race - RF classification report:\n", classification_report(yr_test, yr_rf, zero_division=0))



Race - KNN accuracy: 0.4
Race - RandomForest accuracy: 0.3

Race - KNN classification report:
               precision    recall  f1-score   support

       Asian       0.00      0.00      0.00         3
       Black       0.00      0.00      0.00         2
       Other       0.33      0.50      0.40         2
       White       0.43      1.00      0.60         3

    accuracy                           0.40        10
   macro avg       0.19      0.38      0.25        10
weighted avg       0.20      0.40      0.26        10

Race - RF classification report:
               precision    recall  f1-score   support

       Asian       0.00      0.00      0.00         3
       Black       0.00      0.00      0.00         2
       Other       0.00      0.00      0.00         2
       White       0.30      1.00      0.46         3

    accuracy                           0.30        10
   macro avg       0.07      0.25      0.12        10
weighted avg       0.09      0.30      0.14        10



In [22]:
#unified prediction helper
def predict_all(text, fuzzy_first=True):
    """
    Returns a dict with:
      - fuzzy_sex, knn_sex, rf_sex
      - fuzzy_race, knn_race, rf_race
    fuzzy_first: not used to change outputs here, but typical workflow would consider fuzzy value as preferred if present.
    """
    t_clean = clean_text(text)
    out = {}
    # fuzzy
    out["fuzzy_sex"] = fuzzy_map(t_clean, sex_synonyms)
    out["fuzzy_race"] = fuzzy_map(t_clean, race_synonyms)
    # ML predictions (wrap in try in case text vectorizer can't handle empty)
    try:
        vx_sex = vectorizer_sex.transform([t_clean])
        out["knn_sex"] = knn_sex.predict(vx_sex)[0]
        out["rf_sex"]  = rf_sex.predict(vx_sex)[0]
    except Exception:
        out["knn_sex"] = None
        out["rf_sex"] = None
    try:
        vx_race = vectorizer_race.transform([t_clean])
        out["knn_race"] = knn_race.predict(vx_race)[0]
        out["rf_race"]  = rf_race.predict(vx_race)[0]
    except Exception:
        out["knn_race"] = None
        out["rf_race"] = None
    return out

# quick test
print(predict_all("M"))
print(predict_all("Whtie"))
print(predict_all("trans female"))


{'fuzzy_sex': 'M', 'fuzzy_race': None, 'knn_sex': 'F', 'rf_sex': 'F', 'knn_race': 'Other', 'rf_race': 'White'}
{'fuzzy_sex': None, 'fuzzy_race': 'White', 'knn_sex': 'M', 'rf_sex': 'M', 'knn_race': 'White', 'rf_race': 'White'}
{'fuzzy_sex': 'F', 'fuzzy_race': None, 'knn_sex': 'F', 'rf_sex': 'F', 'knn_race': 'Asian', 'rf_race': 'Other'}


In [23]:
# CRF test strings (mix of typos, abbreviations, mixed text, blank, ambiguous)
test_texts = [
    "male",
    "man",
    "Men",
    "M",
    "MALEE",            # typo
    "female",
    "Femaile",         # typo
    "woman (self-identified)",
    "girl",
    "lady",
    "non-binary",
    "nb",
    "trans male",
    "trans-female",
    "Prefer not to say",
    "",
    "White",
    "Whtie",           # typo
    "Caucasian/EU",
    "Asian - Indian",
    "chinese",
    "black african",
    "african american",
    "blk",
    "mixed race",
    "pacific islander",
    "unknown",
    "patient says male but wrote 'lady' in notes"  # mixed/confusing
]

rows = []
for t in test_texts:
    preds = predict_all(t)
    rows.append({
        "input": t,
        "fuzzy_sex": preds["fuzzy_sex"],
        "knn_sex": preds["knn_sex"],
        "rf_sex": preds["rf_sex"],
        "fuzzy_race": preds["fuzzy_race"],
        "knn_race": preds["knn_race"],
        "rf_race": preds["rf_race"],
    })

df_tests = pd.DataFrame(rows)
# pretty print small table
pd.set_option("display.max_colwidth", 100)
df_tests


Unnamed: 0,input,fuzzy_sex,knn_sex,rf_sex,fuzzy_race,knn_race,rf_race
0,male,M,M,M,,White,White
1,man,M,M,M,,White,White
2,Men,M,F,F,,White,White
3,M,M,F,F,,Other,White
4,MALEE,M,F,F,,White,White
5,female,F,F,F,,Asian,Asian
6,Femaile,F,F,F,,White,White
7,woman (self-identified),F,F,F,,Black,Black
8,girl,F,F,F,,Black,Black
9,lady,F,F,F,,White,White


In [25]:
# Cell 8: save results & quick notes
out_path = "sdtm_crfmapping.csv"
try:
    df_tests.to_csv(out_path, index=False)
    print("Saved predictions to:", out_path)
except Exception as e:
    print("Could not save file (likely environment path issue):", e)

# Short notes for users:
notes = """
Notes:
- Fuzzy mapping is preferred when it returns a clear value because it's deterministic and explainable.
- ML models (KNN/RF) need more labeled data to generalize well.
- Mixed/confusing strings (e.g., 'patient says male but wrote lady') need manual review or more complex rule/logic to resolve.
- To improve: expand synonym dictionaries, add many labeled CRF examples, consider sequence models / context features or ensemble voting.
"""
print(notes)


Saved predictions to: sdtm_crfmapping.csv

Notes:
- Fuzzy mapping is preferred when it returns a clear value because it's deterministic and explainable.
- ML models (KNN/RF) need more labeled data to generalize well.
- Mixed/confusing strings (e.g., 'patient says male but wrote lady') need manual review or more complex rule/logic to resolve.
- To improve: expand synonym dictionaries, add many labeled CRF examples, consider sequence models / context features or ensemble voting.

