In [1]:
import ast
import os
import re
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import requests
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import precision_recall_fscore_support

In [87]:
# Load the notes table (x1) and the ground-truth table (y1) from the working directory.
x1 = pd.read_csv('comprehensive_medical_notes.csv')  # handed to EntityResolutionPipeline.set_data in a later cell
y1 = pd.read_csv('new_ground_truth.csv')

In [88]:
def evaluate_pairs(pred_pairs, ground_truth_clusters):
    """Score predicted match pairs against ground-truth clusters.

    Every unordered pair of members inside a ground-truth cluster counts as a
    true match. Precision, recall and F1 are computed over the union of
    predicted and true pairs, so pairs that are neither predicted nor true
    never enter the calculation.

    Parameters
    ----------
    pred_pairs : iterable of 2-tuples
        Predicted matches. Order inside a pair does not matter: (a, b) and
        (b, a) are treated as the same pair.
    ground_truth_clusters : iterable of sequences
        Each element lists the record ids that belong to one entity.

    Returns
    -------
    dict
        Keys 'precision', 'recall' and 'f_score'.
    """
    # Store every pair as (smaller, larger) so orientation cannot cause a
    # predicted pair to miss its ground-truth counterpart.
    predicted = {(min(a, b), max(a, b)) for a, b in pred_pairs}

    true_pairs = set()
    for cluster in ground_truth_clusters:
        members = list(cluster)
        for pos, first in enumerate(members):
            for second in members[pos + 1:]:
                true_pairs.add((min(first, second), max(first, second)))

    all_pairs = predicted | true_pairs
    if not all_pairs:
        # Nothing predicted and nothing to find: report zeros, in line with
        # the zero_division=0 setting used for the metric call below.
        return {'precision': 0.0, 'recall': 0.0, 'f_score': 0.0}

    # Both label vectors iterate the same set object, so positions line up.
    y_true = [1 if pair in true_pairs else 0 for pair in all_pairs]
    y_pred = [1 if pair in predicted else 0 for pair in all_pairs]

    p, r, f1, _ = precision_recall_fscore_support(
        y_true, y_pred,
        average='binary',
        zero_division=0
    )
    return {'precision': p, 'recall': r, 'f_score': f1}

In [89]:
class EntityResolutionPipeline:
    """Token-blocking stage for entity resolution over patient notes.

    Intended call order: ``set_data`` -> (optionally) ``clean_data`` ->
    ``create_blocks`` -> ``filter_blocks``. The input frame must provide the
    columns ``patient_id`` and ``important_notes``.
    """

    def __init__(self):
        self.data = None                  # copy of the frame given to set_data
        self.cleaned_data = None          # copy of the data plus a 'cleaned_title' column
        self.blocks = defaultdict(list)   # token -> patient_ids whose text contains it
        self.filtered_blocks = {}         # result of the most recent filter_blocks call
        self.candidate_pairs = set()      # not populated by any method of this class

    @staticmethod
    def _notes_as_text(notes):
        """Return the notes column as strings, mapping missing values to ''."""
        # Without this, str() would turn missing entries into the literal
        # tokens 'nan' / 'None', which would then form a block of their own.
        return notes.astype(object).where(notes.notna(), '').map(str)

    def set_data(self, data):
        """Store a copy of ``data`` and derive the text column used for blocking.

        Returns the stored raw copy (without the derived column).
        """
        self.data = data.copy()
        self.cleaned_data = data.copy()
        self.cleaned_data['cleaned_title'] = self._notes_as_text(
            self.cleaned_data['important_notes']
        )
        return self.data

    def clean_data(self):
        """Rebuild ``cleaned_data`` from the stored raw data and return it.

        Uses the same text coercion as ``set_data`` so ``create_blocks`` always
        receives strings.
        """
        self.cleaned_data = self.data.copy()
        self.cleaned_data['cleaned_title'] = self._notes_as_text(
            self.cleaned_data['important_notes']
        )
        return self.cleaned_data

    def create_blocks(self):
        """Group patient ids by the whitespace-separated tokens of their text.

        Tokens of length 2 or less are ignored. A token that occurs several
        times in one row contributes that row's patient id only once.

        Returns the token -> list of patient ids mapping (also kept on
        ``self.blocks``).
        """
        self.blocks = defaultdict(list)

        patient_ids = self.cleaned_data['patient_id'].tolist()
        texts = self.cleaned_data['cleaned_title'].tolist()
        for patient_id, text in zip(patient_ids, texts):
            # dict.fromkeys removes repeats while keeping first-seen order.
            for token in dict.fromkeys(text.split()):
                if len(token) > 2:
                    self.blocks[token].append(patient_id)

        return self.blocks

    def filter_blocks(self, tau):
        """Keep blocks with more than one record and fewer than ``tau`` records.

        The result is stored on ``self.filtered_blocks`` and returned as a
        plain dict of token -> list of patient ids.
        """
        filtered_blocks = {
            token: records
            for token, records in self.blocks.items()
            if 1 < len(records) < tau
        }
        self.filtered_blocks = filtered_blocks
        return filtered_blocks

In [None]:
# Build a pipeline over the notes frame loaded earlier as x1 (data is an alias, not a copy).
data = x1
pipeline = EntityResolutionPipeline()
# set_data stores its own copies of the frame; as the last expression of the
# cell, its return value (the stored raw copy) is what the notebook displays.
pipeline.set_data(data)

In [91]:
# Module-level placeholder. run_pipeline does not write to it; callers take the
# blocks from the 'filtered_blocks_global' key of the returned dict.
filtered_blocks_global = []


def run_pipeline(tau, alpha, clean_the_data=True, data_path='compiled_important_notes.csv'):
    """Run the blocking stage on a notes CSV and report timings.

    Parameters
    ----------
    tau : int
        Exclusive upper bound on block size, forwarded to
        ``EntityResolutionPipeline.filter_blocks``.
    alpha : float
        Accepted so existing calls keep working; the blocking stage
        implemented here does not use it.
    clean_the_data : bool, optional
        When true, ``clean_data`` is called before blocking.
    data_path : str, optional
        CSV with ``patient_id`` and ``important_notes`` columns.

    Returns
    -------
    dict
        ``'blocking': {'time': seconds}``, ``'total_time'`` (seconds since the
        function started, data loading included) and
        ``'filtered_blocks_global'`` (token -> list of patient ids).
    """
    start_time = time.time()

    pipeline = EntityResolutionPipeline()
    pipeline.set_data(pd.read_csv(data_path))
    if clean_the_data:
        pipeline.clean_data()

    # Blocking
    blocking_start = time.time()
    pipeline.create_blocks()
    # filter_blocks returns a single dict, so it must not be tuple-unpacked.
    filtered_blocks = pipeline.filter_blocks(tau)
    blocking_time = time.time() - blocking_start

    return {
        'blocking': {
            'time': blocking_time,
        },
        'total_time': time.time() - start_time,
        'filtered_blocks_global': filtered_blocks,
    }

In [None]:
best_results = run_pipeline(50, 0.7, False)

df = pd.read_csv('comprehensive_medical_notes.csv')
filtered_blocks_global = best_results['filtered_blocks_global']

# A patient's text is the same in every block it appears in, so build the
# patient -> "first five notes joined by spaces" lookup once instead of
# re-filtering and re-grouping the whole frame for each block.
# Missing notes among those five are skipped; str.join cannot handle them.
notes_by_patient = (
    df.groupby('patient_id')['important_notes']
    .apply(lambda notes: ' '.join(notes.head(5).dropna().astype(str)))
    .to_dict()
)

block_rows = []
for token, patient_ids in filtered_blocks_global.items():
    # Ids without any row in the notes file contribute no text.
    block_notes = [notes_by_patient[pid] for pid in patient_ids if pid in notes_by_patient]
    block_rows.append({
        'block_list_str': ','.join(map(str, patient_ids)),
        'all_notes': ' '.join(block_notes),
    })

# Naming the columns keeps the CSV header present even when there are no blocks,
# so the file stays readable by pd.read_csv.
result_df = pd.DataFrame(block_rows, columns=['block_list_str', 'all_notes'])
result_df.to_csv('filtered_blocks_notes.csv', index=False)

In [None]:

# Draw 3% of the patient ids as the test cohort; random_state pins the sample.
all_patients = pd.read_csv('compiled_important_notes.csv')['patient_id'].astype(int)

test_ids = all_patients.sample(frac=0.03, random_state=50).tolist()

# block_list_str holds the comma-joined patient ids of a block; split it back
# into one list per block.
blocks_df = pd.read_csv('filtered_blocks_notes.csv')
blocks_df['patient_list'] = blocks_df['block_list_str'].str.split(',')

# One row per (block, patient): explode the list column and name it patient_id.
exploded = (
    blocks_df
    .explode('patient_list')
    .rename(columns={'patient_list': 'patient_id'})
)
# The split yields strings; cast so they compare equal to the integer test_ids.
exploded['patient_id'] = exploded['patient_id'].astype(int)

# Keep sampled patients only. A patient can appear in several blocks;
# drop_duplicates keeps the first such row, so each patient carries the ids and
# notes of exactly one block.
test_set = exploded[exploded['patient_id'].isin(test_ids)][
    ['patient_id', 'block_list_str', 'all_notes']
].drop_duplicates('patient_id').reset_index(drop=True)

test_set.to_csv('er_test_set.csv', index=False)


In [None]:
# Chat-completions settings. The key is read from the PERPLEXITY_API_KEY
# environment variable so it never has to be typed into (and saved with) the
# notebook; it falls back to an empty string when the variable is unset.
API_KEY = os.environ.get('PERPLEXITY_API_KEY', '')
API_URL = 'https://api.perplexity.ai/chat/completions'
MODEL   = 'llama-3.1-sonar-large-128k-online'
INPUT_CSV  = 'er_test_set.csv'
OUTPUT_CSV = 'er_test_icd9_codes.csv'

# One row per test patient, with the patient's block notes in 'all_notes'.
df = pd.read_csv(INPUT_CSV)

def diagnose_icd9(notes: str, timeout: float = 60.0) -> str:
    """Ask the chat-completions endpoint for one ICD-9 code for ``notes``.

    Parameters
    ----------
    notes : str
        Clinical note text, sent unchanged as the user message.
    timeout : float, optional
        Seconds handed to ``requests.post`` so a stalled connection raises
        instead of blocking the notebook indefinitely.

    Returns
    -------
    str
        The stripped reply up to (not including) its first '.', e.g. a reply
        of "428.0" yields "428".

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx status
        (via ``raise_for_status``).
    KeyError, IndexError
        If the JSON reply lacks ``choices[0].message.content``.
    """
    system_msg = {
        "role": "system",
        "content": (
            # The trailing space matters: adjacent literals are concatenated
            # as-is, and without it the two sentences would run together.
            "You are a board‑certified medical coder. It is crucial that you are providing the right range in ICD‑9 codes. "
            "Return exactly one ICD‑9 code for these clinical notes. Only include the code and no other words with nothing following or before it."
        )
    }
    user_msg = {
        "role": "user",
        "content": notes
    }

    payload = {
        "model": MODEL,
        "messages": [system_msg, user_msg],
        "max_tokens": 10,
        "temperature": 0.0,
        "stream": False
    }
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    resp = requests.post(API_URL, json=payload, headers=headers, timeout=timeout)
    resp.raise_for_status()
    content = resp.json()['choices'][0]['message']['content'].strip()
    # Keep only the part before the decimal point of the returned code.
    integer_code = content.split('.')[0]
    return integer_code

# Query the model once per test patient. A failed lookup is recorded as None so
# one bad row cannot abort the run, but the reason is printed instead of being
# swallowed (an invalid API key, for example, would otherwise silently produce
# a file full of empty codes).
results = []
for pid, notes in zip(df['patient_id'], df['all_notes']):
    code = None
    if isinstance(notes, str) and notes.strip():
        try:
            code = diagnose_icd9(notes)
        except Exception as exc:
            print(f"patient_id={pid}: ICD-9 lookup failed ({type(exc).__name__}: {exc})")
    else:
        # An empty notes cell is read back from CSV as a missing value, not text.
        print(f"patient_id={pid}: no notes to send, skipping request")
    results.append({"patient_id": pid, "icd9_code": code})

out_df = pd.DataFrame(results)
out_df.to_csv(OUTPUT_CSV, index=False)

In [None]:
# Model predictions (patient_id, icd9_code) and the ground-truth diagnoses to score them against.
er_df = pd.read_csv("er_test_icd9_codes.csv")
ground_truth_df = pd.read_csv("new_ground_truth.csv")

def safe_eval(val):
    """Parse ``val`` as a Python literal; return ``[]`` when it cannot be parsed.

    ``val`` is normally the string form of a list, e.g. "['428', '401']".
    Anything ``ast.literal_eval`` rejects (including non-string input such as
    a missing value) yields an empty list. A value that parses to a literal
    other than a list is returned unchanged.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input.
        return []

# Turn the stored text of each diagnosis cell into a Python object ([] when it cannot be parsed).
ground_truth_df['diagnosis'] = ground_truth_df['diagnosis'].apply(safe_eval)


# Left join keeps every prediction row, including patients without ground truth.
merged_df = pd.merge(er_df, ground_truth_df, on='patient_id', how='left')
# Compare codes as text; missing predictions become strings too.
merged_df['icd9_code'] = merged_df['icd9_code'].astype(str)

def _hundreds_bucket(code):
    """Return ``int(float(code)) // 100``, or None if ``code`` is not a finite number."""
    try:
        return int(float(code)) // 100
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinities, which int() cannot convert.
        return None


def is_code_in_same_range(icd9_code, diagnosis_list):
    """Tell whether the predicted code shares a hundreds bucket with any diagnosis.

    Two codes are in the same "range" when their integer parts floor-divided
    by 100 are equal (e.g. 428 and 410.9 both fall in bucket 4).

    Parameters
    ----------
    icd9_code : str or number
        Predicted code. Non-numeric values (such as 'V45' or 'nan') never match.
    diagnosis_list : iterable
        Ground-truth codes. Entries that are not numeric are skipped.

    Returns
    -------
    bool
        True on the first diagnosis in the same bucket; False otherwise,
        including when ``diagnosis_list`` is not iterable.
    """
    predicted_bucket = _hundreds_bucket(icd9_code)
    if predicted_bucket is None:
        return False
    try:
        diagnoses = iter(diagnosis_list)
    except TypeError:
        return False
    return any(_hundreds_bucket(diag) == predicted_bucket for diag in diagnoses)

# Row-wise match flag. Rows whose diagnosis is not a list count as a miss
# rather than being handed to the range check.
merged_df['is_match'] = merged_df.apply(
    lambda row: is_code_in_same_range(row['icd9_code'], row['diagnosis']) if isinstance(row['diagnosis'], list) else False,
    axis=1
)

# Mean of the boolean column = share of prediction rows flagged as a match.
accuracy = merged_df['is_match'].mean()
print(f"Accuracy: {accuracy:.2%}")
print(merged_df[['patient_id', 'icd9_code', 'diagnosis', 'is_match']])
