In [9]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# -------------------------
# NLTK setup
# -------------------------
# Fetch each NLTK resource in turn (same names and order as before).
for resource in ('punkt', 'stopwords', 'punkt_tab', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# English stop words held as a set for the membership test in clean_text.
stop_words = set(stopwords.words('english'))

# -------------------------
# Load CSV
# -------------------------
df = pd.read_csv("D:\\document_references_fhir_raw.csv")  # adjust filename

# The rest of this cell reads the 'patient_uuid' and 'note_text' columns,
# so fail fast with a clear message if the export does not contain them.
_missing_columns = {'patient_uuid', 'note_text'} - set(df.columns)
if _missing_columns:
    raise KeyError(f"Input CSV is missing required column(s): {sorted(_missing_columns)}")
# -------------------------
# NLP preprocessing
# -------------------------
def clean_text(text):
    """Normalize a note into a lower-case, stop-word-free token string.

    Lower-cases the text, replaces every character that is not a letter,
    digit or whitespace with a space, tokenizes with ``word_tokenize`` and
    drops tokens found in the module-level ``stop_words`` set.

    Note that '.' is among the replaced characters, so a decimal such as
    "1.2" comes out as the two tokens "1" and "2".

    Args:
        text: Raw note text. ``None`` and float NaN are treated as an empty
            note; any other value is coerced with ``str``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` for a
        missing note.
    """
    # str(float('nan')) is 'nan' and str(None) is 'None'; without this guard
    # a missing note would be turned into a bogus word token.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # keep numbers for labs
    tokens = word_tokenize(text)
    return ' '.join(t for t in tokens if t not in stop_words)

# Normalize each raw note into a keyword-searchable string.
cleaned_notes = df['note_text'].apply(clean_text)
df['cleaned_text'] = cleaned_notes

# Quick look at the first few rows.
print(df.head())

# -------------------------
# CKD stage assignment function
# -------------------------
def assign_ckd_stage(egfr):
    """Map an eGFR value to a CKD stage label.

    Args:
        egfr: Numeric eGFR value, or a missing-value marker (``None``,
            float NaN or ``pd.NA``).

    Returns:
        ``1``, ``2``, ``'3a'``, ``'3b'``, ``4`` or ``5`` for a numeric input
        (lower bounds 90, 60, 45, 30, 15, then everything below 15), or
        ``None`` when the value is missing.
    """
    # pd.isna covers None, NaN and pd.NA. Without it, NaN only yields None by
    # failing every comparison, and pd.NA raises when used as an `if` condition.
    if pd.isna(egfr):
        return None
    # Thresholds are tested from highest to lowest, so each branch only needs
    # its lower bound.
    if egfr >= 90:
        return 1
    if egfr >= 60:
        return 2
    if egfr >= 45:
        return '3a'
    if egfr >= 30:
        return '3b'
    if egfr >= 15:
        return 4
    return 5


# -------------------------
# CKD keywords and symptoms
# -------------------------
# Terms are written the way they appear in a raw note. They are normalized
# with clean_text before matching because the search target (cleaned_text)
# has already had stop words such as "of" removed, so a raw phrase like
# 'shortness of breath' could otherwise never match.
ckd_terms = ['chronic kidney disease', 'ckd', 'egfr', 'creatinine', 'renal insufficiency', 'kidney failure', 'dialysis']
ckd_symptoms = ['fatigue', 'swelling', 'edema', 'nausea', 'vomiting', 'shortness of breath', 'high blood pressure', 'hypertension']


def _build_term_pattern(terms):
    """Return a regex alternation of the terms, normalized the same way as cleaned_text."""
    normalized = (clean_text(term) for term in terms)
    # Skip terms that clean to '' -- an empty alternative would match every row.
    return '|'.join(re.escape(term) for term in normalized if term)


# Vectorized search using regex for speed
df['ckd_mentions'] = df['cleaned_text'].str.contains(_build_term_pattern(ckd_terms), case=False, regex=True)
df['ckd_symptoms'] = df['cleaned_text'].str.contains(_build_term_pattern(ckd_symptoms), case=False, regex=True)
df['ckd_with_symptoms'] = df['ckd_mentions'] & df['ckd_symptoms']



# Save results
# NOTE: this write happens before the egfr / creatinine / ckd_stage columns
# are added further down in the cell.
df.to_csv("ckd_nlp_results_with_staging.csv", index=False)

# -------------------------
# Extract numeric lab values
# -------------------------
def extract_egfr(text):
    """Pull the first eGFR reading out of a note.

    Searches case-insensitively for "egfr" at a word start, optionally
    followed by ':' or '=', and then a run of digits.

    Args:
        text: Note text; any value is coerced with ``str``.

    Returns:
        int | None: The matched digits as an integer (only the whole-number
        part is captured), or ``None`` when nothing matches.
    """
    found = re.search(r'\b(eGFR|egfr)\s*[:=]?\s*(\d+)', str(text), flags=re.IGNORECASE)
    if found is None:
        return None
    return int(found[2])

def extract_creatinine(text):
    """Pull the first creatinine reading out of a note.

    Searches case-insensitively for "creatinine" or "cr" at a word start,
    optionally followed by ':' or '=', and then a number with an optional
    decimal part.

    Args:
        text: Note text; any value is coerced with ``str``.

    Returns:
        float | None: The matched number, or ``None`` when nothing matches.
    """
    found = re.search(r'\b(creatinine|cr)\s*[:=]?\s*(\d+(\.\d+)?)', str(text), flags=re.IGNORECASE)
    if found is None:
        return None
    return float(found[2])

# Lab values are parsed from the raw note text.
df['egfr'] = df['note_text'].apply(extract_egfr)
df['creatinine'] = df['note_text'].apply(extract_creatinine)

# Assign CKD stage based on eGFR
df['ckd_stage'] = df['egfr'].apply(assign_ckd_stage)

# The staging file is first written earlier in this cell, before 'egfr',
# 'creatinine' and 'ckd_stage' exist. Write it again now that they do, so the
# file contains the staging its name promises.
df.to_csv("ckd_nlp_results_with_staging.csv", index=False)

# Preview
print(df[['patient_uuid', 'egfr', 'ckd_stage']].head())

# -------------------------
# Select relevant columns
# -------------------------
# One shared column list keeps the two exports in sync; the "with notes"
# variant only adds the raw note text right after the patient id.
result_columns = ['patient_uuid', 'ckd_mentions', 'ckd_symptoms', 'ckd_with_symptoms', 'egfr', 'creatinine']
output_w_notes_df = df[['patient_uuid', 'note_text'] + result_columns[1:]]
output_no_notes_df = df[result_columns]

# Save results
output_w_notes_df.to_csv("ckd_nlp_results_with_labs.csv", index=False)
output_no_notes_df.to_csv("ckd_nlp_results_no_notes.csv", index=False)

# Preview. This line used to print `output_df`, which is never defined and
# raised NameError. The no-notes frame is shown so raw note text is not echoed.
print(output_no_notes_df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tonim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tonim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tonim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tonim\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


                           patient_uuid                 document_reference_id  \
0  16ec7045-4e43-cb9e-0445-f43732e3c63c  442748b2-7618-75f6-3640-026ca9100895   
1  16ec7045-4e43-cb9e-0445-f43732e3c63c  46c4500c-0955-39f7-35e5-dba7ac84b321   
2  16ec7045-4e43-cb9e-0445-f43732e3c63c  7ca96006-8459-01bf-f10f-512bc23d3332   
3  16ec7045-4e43-cb9e-0445-f43732e3c63c  33262ab9-30a4-518d-cec9-5a325a937318   
4  16ec7045-4e43-cb9e-0445-f43732e3c63c  391d6089-7983-c098-678b-ed475e2b7d1e   

   title                                          note_text  \
0    NaN  \n2019-06-08\n\n# Chief Complaint\n- Frequent ...   
1    NaN  \n2019-06-05\n\n# Chief Complaint\n- Frequent ...   
2    NaN  \n2019-06-16\n\n# Chief Complaint\n- Frequent ...   
3    NaN  \n2019-06-23\n\n# Chief Complaint\n- Frequent ...   
4    NaN  \n2019-06-30\n\n# Chief Complaint\n- Frequent ...   

                                        cleaned_text  
0  2019 06 08 chief complaint frequent urination ...  
1  2019 06 05 chief comp

In [3]:
import nltk

# Diagnostic: print the value of nltk.data.path.
nltk_search_dirs = nltk.data.path
print(nltk_search_dirs)


['C:\\Users\\tonim/nltk_data', 'C:\\Users\\tonim\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data', 'C:\\Users\\tonim\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data', 'C:\\Users\\tonim\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data', 'C:\\Users\\tonim\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']
