In [None]:
import pandas as pd
import numpy as np

In [None]:
# Directory holding the per-condition diagnosis extracts. Change this single
# constant to point the notebook at a different data location.
DATA_DIR = '/data/volume02/yuba/HC_STRUC'


def _read_diagnosis_tables(prefix, data_dir=DATA_DIR):
    """Read the ICD and SNOMED extracts for one condition.

    Reads ``<data_dir>/<prefix>_full_icd.csv`` first and then
    ``<data_dir>/<prefix>_full_snomed.csv``.

    Returns:
        tuple: ``(icd_df, snomed_df)`` as returned by ``pd.read_csv``.
    """
    icd_df = pd.read_csv(f'{data_dir}/{prefix}_full_icd.csv')
    snomed_df = pd.read_csv(f'{data_dir}/{prefix}_full_snomed.csv')
    return icd_df, snomed_df


lynch_icd_df, lynch_snomed_df = _read_diagnosis_tables('lynch')

crc_icd_df, crc_snomed_df = _read_diagnosis_tables('crc')

ec_icd_df, ec_snomed_df = _read_diagnosis_tables('ec')

ov_icd_df, ov_snomed_df = _read_diagnosis_tables('ov')

pc_icd_df, pc_snomed_df = _read_diagnosis_tables('pancreatic')

rc_icd_df, rc_snomed_df = _read_diagnosis_tables('rc')

gc_icd_df, gc_snomed_df = _read_diagnosis_tables('gastric_cancer')

**THE DATABASE CONSISTS OF 2 DIFFERENT DIAGNOSIS TABLES (ICD AND SNOMED)**
- Each pair of tables must be passed through the function below, which de-duplicates and merges them into a single full diagnosis dataframe

In [None]:
def unique_patients(icd_df, snomed_df, id_col = 'PATIENT_IDENTIFIER', enc_col = 'ENCOUNTER_KEY', int_cols=None):
    """Stack the ICD and SNOMED diagnosis tables with one row per encounter.

    Each table is first reduced to one row per (``id_col``, ``enc_col``) pair,
    keeping the first occurrence. Encounters present in both tables are kept
    from the ICD table only; the matching SNOMED rows are removed before the
    two tables are concatenated.

    Args:
        icd_df: ICD diagnosis table containing ``id_col`` and ``enc_col``.
        snomed_df: SNOMED diagnosis table containing ``id_col`` and ``enc_col``.
        id_col: Name of the patient identifier column.
        enc_col: Name of the encounter key column.
        int_cols: Optional column names to cast to nullable ``Int64`` in the
            result; names missing from the result are ignored.

    Returns:
        pd.DataFrame: ICD rows followed by the non-overlapping SNOMED rows,
        with a fresh integer index.

    Note:
        The printed message reports the number of distinct patients with an
        overlapping encounter; only the overlapping SNOMED rows are removed,
        not the patients themselves.
    """
    keys = [id_col, enc_col]

    # One row per (patient, encounter) in each source table.
    icd_rows = icd_df.drop_duplicates(subset=keys, keep='first').copy()
    snomed_rows = snomed_df.drop_duplicates(subset=keys, keep='first').copy()

    # Encounters recorded in both tables.
    shared = icd_rows[keys].merge(snomed_rows[keys], on=keys, how='inner')

    n_patients = shared[id_col].nunique()
    print(f"{n_patients} unique patient(s) has overlapping encounters and will be dropped.")

    if not shared.empty:
        # Anti-join: keep only SNOMED rows whose key pair is absent from `shared`.
        tagged = snomed_rows.merge(shared, on=keys, how='left', indicator=True)
        snomed_rows = tagged.loc[tagged['_merge'] == 'left_only'].drop(columns=['_merge'])

    combined = pd.concat([icd_rows, snomed_rows], ignore_index=True)

    for name in int_cols or []:
        if name in combined.columns:
            combined[name] = combined[name].astype('Int64')

    return combined

In [None]:
# Keyword arguments shared by every merge below: the key columns used for
# de-duplication and the column cast to nullable Int64 afterwards.
_merge_kwargs = dict(id_col='PATIENT_IDENTIFIER', enc_col='ENCOUNTER_KEY', int_cols=['DIAGNOSIS_CODE'])

crc_df = unique_patients(crc_icd_df, crc_snomed_df, **_merge_kwargs)

ec_df = unique_patients(ec_icd_df, ec_snomed_df, **_merge_kwargs)

ov_df = unique_patients(ov_icd_df, ov_snomed_df, **_merge_kwargs)

pc_df = unique_patients(pc_icd_df, pc_snomed_df, **_merge_kwargs)

rc_df = unique_patients(rc_icd_df, rc_snomed_df, **_merge_kwargs)

gc_df = unique_patients(gc_icd_df, gc_snomed_df, **_merge_kwargs)

lynch_df = unique_patients(lynch_icd_df, lynch_snomed_df, **_merge_kwargs)


**CREATE MAPPINGS FOR LYNCH SYNDROME-ASSOCIATED DIAGNOSES**

In [None]:
# Category label -> list of ICD code prefixes. The labels are written verbatim
# into the 'Lynch_Type' column by assign_cancer_category_prefix, so renaming a
# key changes downstream output.
# NOTE(review): "C221" contains no dot, so a prefix match only hits codes that
# are stored undotted (e.g. "C221", not "C22.1") - confirm the code format.
# NOTE(review): "Gliolastoma" looks like a misspelling of "Glioblastoma"; it is
# left unchanged here because the key is a runtime value.
icd_prefix_map = {
    "Colon Cancer": ["C18", "C19", "C20"],
    "Endometrial Cancer": ["C54", "C55"],
    "Small Intestine": ["C17"],
    "Gastric Cancer": ["C16"],
    "Ovarian Cancer": ["C56", "C57", "C48"],
    "Pancreatic Cancer": ["C25"],
    "Urothelial Carcinoma": ["C65", "C66", "C67", "C68"],
    "Cholangiocarcinoma": ["C221"],
    "Gliolastoma": ["C71"],
    "Sebaceous Gland Tumor": ["C44"]
}

# Category label -> list of integer codes matched exactly (via isin) against the
# SNOMED code column. Uses the same category labels as icd_prefix_map.
# Several codes are listed more than once within a category (e.g. 1217953013,
# 2160191011, 1228484019, 307599002); duplicates do not affect membership tests.
snomed_map = {
    "Colon Cancer": [363406005, 1286877004, 3746849018, 1228486017, 239932014, 3288335015, 3288737014, 
                     3288993017, 3288598016, 3288712016, 3289030012, 3288530014, 1228536014, 3290094014, 
                     1217954019, 1217953013, 3035992013, 3036828014, 3037442018, 2535205018, 380130011, 2535206017, 
                     2900489012, 2160192016, 1217953013, 1228535013, 1216462014, 1228624010, 1228487014, 403809019, 
                     239912013, 2643505019, 2643506018, 2643508017, 2643507010, 2915311014, 3790880010, 403832015, 
                     239933016, 288678012, 379090011, 2643851011, 510572019, 510954015],
    "Endometrial Cancer": [371973000, 1229105017, 3012523015, 403928010, 240022012, 379735014, 3499291018, 3499294014, 
                           3636679013, 207103018, 192334017, 3333071010, 3333074019, 3333068019, 3333065016, 3499293015, 
                           3499295010, 3312499013, 3852376016, 3636681010, 3333083012, 2994977010, 510605012, 205166013],
    "Small Intestine": [363509000, 2160191011, 446866018, 3481647011, 3439176010, 510704017, 510592014, 510728015, 379065014,
                       2160191011, 1228534012, 511040016, 379135019, 379136018, 48233017, 510427012, 482634011, 510428019, 
                       3325842015, 1210556012, 510426015, 1217991013, 482804016],
    # NOTE(review): 1288485018 differs by one digit from 1228485018 (first entry)
    # and 36635502019 has 11 digits where its neighbours have 10 - possible
    # typos, confirm against the code source.
    "Gastric Cancer": [1228485018, 1228483013, 2772105011, 288632016, 1228484019, 511086019, 380125012, 1229145014, 380124011, 
                      3306822012, 1228484019, 511086019, 380125012, 1228483013, 511088018, 380126013, 3040651019, 1288485018, 
                      63369016, 450902017, 3325637014, 3307808019, 3307809010, 1210719018, 380127016, 239904017, 2160194015, 
                      36635502019, 3662514010, 1210596014, 511083010],
    "Ovarian Cancer": [363443007, 363444001, 94091004, 1228569014, 3035729016, 3012579018, 3012586014, 3012567018, 3012633017,
                       1228570010, 3012577016, 3012630019, 3012583018, 3012589019, 2643597016, 510637013, 3307072018, 3012796019, 
                       3307070014, 3307071013, 510864011, 1210694017, 2647879018, 1210573014, 413201016, 482686019],
    "Pancreatic Cancer": [363418001, 255088001, 1228546011, 510885011, 2990560014, 3800437019, 3760303017, 1228545010, 
                          510886012, 288723014, 3800446013, 510890014, 1228547019, 151015, 510884010],
    "Urothelial Carcinoma": [363458004, 363457009, 1228587012, 45320010, 215394019, 183328018, 1219788012, 3753977019,
                            3526066013, 3754974014, 3323177015, 3323175011, 510460010, 403951010, 240061016, 1786810016, 
                            1228584017],
    "Cholangiocarcinoma": [70179006, 116557016, 116560011, 455768011, 395734011, 1228489012, 1210559017, 3326323019, 3970195012],
    "Gliolastoma": [1163375002, 105759017, 2842007014, 105761014, 1487417014, 2840665013, 413144010, 2839701019, 413145011, 58840015, 
                   291833017, 3475170019, 3475171015, 3475174011, 3475169015, 2973132012, 1232485016],
    "Sebaceous Gland Tumor": [307599002, 92707003, 307599002, 90974013, 450909014, 347405018]
}

In [None]:
def assign_cancer_category_prefix(df, icd_prefix_map, snomed_map, icd_cols=None, snomed_col='DIAGNOSIS_CODE'):
    """Return a copy of ``df`` with a ``Lynch_Type`` category column added.

    A row receives a category when any of its ICD columns starts with one of
    that category's prefixes, or when its SNOMED code is in that category's
    code list. Categories are applied in dictionary order, so a later match
    overwrites an earlier one, and SNOMED matches overwrite ICD matches.
    Rows matching nothing keep ``pd.NA``.

    Args:
        df: Diagnosis frame; it is not modified.
        icd_prefix_map: Mapping of category label -> list of ICD code prefixes.
        snomed_map: Mapping of category label -> list of SNOMED codes.
        icd_cols: ICD columns to inspect; defaults to
            ``['ICD9_PRI_DIAG', 'ICD9CM_PRI_DIAG']``. Missing columns are skipped.
        snomed_col: Column holding the SNOMED code. When every code in
            ``snomed_map`` is an ``int``, this column is coerced to nullable
            ``Int64`` in the returned frame (unparseable values become missing).

    Returns:
        pd.DataFrame: The labelled copy.
    """
    icd_cols = ['ICD9_PRI_DIAG', 'ICD9CM_PRI_DIAG'] if icd_cols is None else icd_cols

    labelled = df.copy()
    labelled['Lynch_Type'] = pd.NA

    # --- ICD: prefix match across every available ICD column ---------------
    available_icd_cols = [name for name in icd_cols if name in labelled.columns]
    for label, prefixes in icd_prefix_map.items():
        wanted = tuple(prefixes)
        hit = pd.Series(False, index=labelled.index)
        for name in available_icd_cols:
            # Missing codes become '' so they can never match a prefix.
            hit = hit | labelled[name].fillna('').astype(str).str.startswith(wanted)
        labelled.loc[hit, 'Lynch_Type'] = label

    # --- SNOMED: exact code membership ------------------------------------
    if snomed_col not in labelled.columns:
        return labelled

    every_code_is_int = all(
        isinstance(code, int)
        for code_list in snomed_map.values()
        for code in code_list
    )
    if every_code_is_int:
        labelled[snomed_col] = pd.to_numeric(labelled[snomed_col], errors='coerce').astype('Int64')

    for label, code_list in snomed_map.items():
        labelled.loc[labelled[snomed_col].isin(code_list), 'Lynch_Type'] = label

    return labelled

In [None]:
# Label every merged Lynch diagnosis row with its cancer category, using the
# function's default ICD columns and SNOMED code column.
lynch_categoried_df = assign_cancer_category_prefix(lynch_df, icd_prefix_map, snomed_map)

In [None]:
# Preview the first five labelled rows.
lynch_categoried_df.head()

In [None]:
# Parse ENCOUNTER_DATE into datetimes so date components can be extracted.
lynch_categoried_df = lynch_categoried_df.assign(
    ENCOUNTER_DATE=pd.to_datetime(lynch_categoried_df['ENCOUNTER_DATE'])
)

In [None]:
# Calendar year of each encounter, used by the per-year counts in the filters.
lynch_categoried_df = lynch_categoried_df.assign(
    year=lynch_categoried_df['ENCOUNTER_DATE'].dt.year
)

In [None]:
# Filter for "primary cancer + N additional Lynch syndrome-associated diagnoses".

def filter_additional_count(df, id_col, category_col, primary_category, min_other_count, year_col='year', return_primary_only=True, report_years=range(2017, 2022)):
    """Select patients with a primary cancer plus enough other categorised diagnoses.

    A patient is eligible when at least one of their rows has
    ``category_col == primary_category`` and at least ``min_other_count`` of
    their rows carry a different, non-missing category. Rows whose category is
    missing are ignored when counting "other" diagnoses; previously they were
    counted, because ``NA != primary_category`` evaluates to True.

    NOTE(review): "other" diagnoses are counted per row, not per distinct
    category, so repeated rows of the same other cancer each count - confirm
    this matches the clinical criterion.

    Args:
        df: Frame containing ``id_col`` and ``category_col``.
        id_col: Patient identifier column.
        category_col: Column holding the cancer category label.
        primary_category: Category the patient must have at least once.
        min_other_count: Minimum number of rows with another category.
        year_col: Column used for the printed per-year patient counts; the
            report is skipped when the column is absent from ``df``.
        return_primary_only: When True, return only the eligible patients'
            rows of ``primary_category``; otherwise return all their rows.
        report_years: Years listed in the printed report (years with no
            patient print 0). Pass ``None`` to list the years found in the data.

    Returns:
        pd.DataFrame: The selected rows with a fresh integer index.
    """
    categories = df[category_col]
    patient_ids = df[id_col]

    # fillna/astype keep the masks plain booleans even for nullable dtypes.
    is_primary = (categories == primary_category).fillna(False).astype(bool)
    # Missing categories are neither primary nor "another Lynch type".
    is_other = categories.notna() & ~is_primary

    has_primary = is_primary.groupby(patient_ids).any()
    other_count = is_other.groupby(patient_ids).sum()

    qualifies = has_primary & (other_count >= min_other_count)
    eligible_ids = qualifies[qualifies].index

    print(f"{len(eligible_ids)} patient(s) with '{primary_category}' and >= {min_other_count} other types.")

    keep = patient_ids.isin(eligible_ids)
    if return_primary_only:
        keep &= is_primary

    result_df = df[keep].reset_index(drop=True)

    if year_col in df.columns:
        # Distinct patients per year among the returned rows.
        yearly_counts = result_df.groupby(year_col)[id_col].nunique()
        if report_years is not None:
            yearly_counts = yearly_counts.reindex(report_years, fill_value=0)

        for year, count in yearly_counts.items():
            print(f"Year {year}: {count}")

    return result_df
    
    

In [None]:
# Filter for the combined colorectal cancer (CRC) criterion.

def filter_crc_complex(lynch_categoried_df, id_col, category_col, date_col, year_col='year', report_years=range(2017, 2022), min_gap_months=6):
    """Select 'Colon Cancer' rows of patients meeting the combined CRC criterion.

    A patient qualifies when they have a 'Colon Cancer' row and either
    (a) at least one row with another, non-missing category, or
    (b) two 'Colon Cancer' rows dated at least ``min_gap_months`` apart.

    Fixes over the previous implementation:
      * (b) compared only consecutive dates, so Jan/Apr/Aug (two cases seven
        months apart) was missed. Comparing the earliest and latest CRC date
        detects every such pair.
      * Rows with a missing category were counted as "another Lynch type",
        because ``NA != 'Colon Cancer'`` evaluates to True.

    Args:
        lynch_categoried_df: Frame with ``id_col``, ``category_col`` and
            ``date_col``; it is not modified.
        id_col: Patient identifier column.
        category_col: Column holding the cancer category label.
        date_col: Encounter date column (parsed with ``pd.to_datetime``).
        year_col: Column used for the per-year report; derived from
            ``date_col`` when absent.
        report_years: Years listed in the printed report (years with no
            patient print 0). Pass ``None`` to list the years found in the data.
        min_gap_months: Minimum spacing, in calendar months, between two CRC
            rows for criterion (b).

    Returns:
        pd.DataFrame: 'Colon Cancer' rows of qualifying patients, with a fresh
        integer index.
    """
    df = lynch_categoried_df.copy()
    df[date_col] = pd.to_datetime(df[date_col])

    categories = df[category_col]
    # fillna/astype keep the masks plain booleans even for nullable dtypes.
    is_crc = (categories == 'Colon Cancer').fillna(False).astype(bool)
    is_other = categories.notna() & ~is_crc

    # (a) patients with at least one other categorised diagnosis.
    other_ids = set(df.loc[is_other, id_col].dropna())

    # (b) patients whose earliest and latest CRC dates are far enough apart;
    # this holds exactly when some pair of CRC rows is that far apart.
    crc_dates = df.loc[is_crc].groupby(id_col)[date_col]
    first_crc = crc_dates.min()
    last_crc = crc_dates.max()
    far_apart = last_crc >= first_crc + pd.DateOffset(months=min_gap_months)
    repeat_ids = set(far_apart[far_apart].index)

    eligible_ids = other_ids | repeat_ids

    subset = df[df[id_col].isin(eligible_ids) & is_crc].reset_index(drop=True)

    unique_pat = subset[id_col].nunique()

    print(f"{unique_pat} patient(s) with 'Colon Cancer' and "
          f"either >= 1 other Lynch type or >= 2 CRC cases >= {min_gap_months} months apart.")

    if year_col not in subset.columns:
        subset[year_col] = subset[date_col].dt.year

    # Distinct patients per year among the returned rows.
    yearly_counts = subset.groupby(year_col)[id_col].nunique()
    if report_years is not None:
        yearly_counts = yearly_counts.reindex(report_years, fill_value=0)

    for year, count in yearly_counts.items():
        print(f"Year {year}: {count}")

    return subset

**Assigning Diagnosis column for easier filtering**

# Multi Cancer Criteria Filtering 

**Filter 1: Endometrial Cancer and a Second Lynch syndrome-associated cancer in the same person**

In [None]:
# All rows of patients with Endometrial Cancer plus >= 1 other diagnosis row.
ec_s = filter_additional_count(
    lynch_categoried_df,
    id_col='PATIENT_IDENTIFIER',
    category_col='Lynch_Type',
    primary_category='Endometrial Cancer',
    min_other_count=1,
    return_primary_only=False,
)

In [None]:
# Distinct patient identifiers returned by Filter 1, wrapped in a one-column
# DataFrame; the bare last expression displays it.
ec_s_pat = ec_s['PATIENT_IDENTIFIER'].unique()
ec_s_pat_df = pd.DataFrame({'PATIENT_IDENTIFIER': ec_s_pat})
ec_s_pat_df

**Filter 2: Ovarian Cancer and a second Lynch syndrome-associated cancer in the same person**

In [None]:
# All rows of patients with Ovarian Cancer plus >= 1 other diagnosis row.
ov_s = filter_additional_count(
    lynch_categoried_df,
    id_col='PATIENT_IDENTIFIER',
    category_col='Lynch_Type',
    primary_category='Ovarian Cancer',
    min_other_count=1,
    return_primary_only=False,
)

In [None]:
# Distinct patient identifiers returned by Filter 2, wrapped in a one-column
# DataFrame; the bare last expression displays it.
ov_s_pat = ov_s['PATIENT_IDENTIFIER'].unique()
ov_s_pat_df = pd.DataFrame({'PATIENT_IDENTIFIER': ov_s_pat})
ov_s_pat_df

**Filter 3: Gastric Cancer and history of a second Lynch syndrome-associated cancer in the same person**

In [None]:
# All rows of patients with Gastric Cancer plus >= 1 other diagnosis row.
gc_s = filter_additional_count(
    lynch_categoried_df,
    id_col='PATIENT_IDENTIFIER',
    category_col='Lynch_Type',
    primary_category='Gastric Cancer',
    min_other_count=1,
    return_primary_only=False,
)

In [None]:
# Distinct patient identifiers returned by Filter 3, wrapped in a one-column
# DataFrame; the bare last expression displays it.
gc_s_pat = gc_s['PATIENT_IDENTIFIER'].unique()
gc_s_pat_df = pd.DataFrame({'PATIENT_IDENTIFIER': gc_s_pat})
gc_s_pat_df

**Filter 4: Pancreatic Cancer and 2 additional cases of Lynch syndrome-associated cancer in the same person**

In [None]:
# All rows of patients with Pancreatic Cancer plus >= 2 other diagnosis rows.
pc_add = filter_additional_count(
    lynch_categoried_df,
    id_col='PATIENT_IDENTIFIER',
    category_col='Lynch_Type',
    primary_category='Pancreatic Cancer',
    min_other_count=2,
    return_primary_only=False,
)

In [None]:
# Distinct patient identifiers returned by Filter 4, wrapped in a one-column
# DataFrame; the bare last expression displays it.
pc_add_pat = pc_add['PATIENT_IDENTIFIER'].unique()
pc_add_pat_df = pd.DataFrame({'PATIENT_IDENTIFIER': pc_add_pat})
pc_add_pat_df

**Filter 5: Renal Cancer (coded here as the 'Urothelial Carcinoma' category) and 2 additional cases of Lynch syndrome-associated cancer in the same person**

In [None]:
# All rows of patients with Urothelial Carcinoma plus >= 2 other diagnosis rows.
rc_add = filter_additional_count(
    lynch_categoried_df,
    id_col='PATIENT_IDENTIFIER',
    category_col='Lynch_Type',
    primary_category='Urothelial Carcinoma',
    min_other_count=2,
    return_primary_only=False,
)

In [None]:
# Distinct patient identifiers returned by Filter 5, wrapped in a one-column
# DataFrame; the bare last expression displays it.
rc_add_pat = rc_add['PATIENT_IDENTIFIER'].unique()
rc_add_pat_df = pd.DataFrame({'PATIENT_IDENTIFIER': rc_add_pat})
rc_add_pat_df

**Filter 6: Colon Cancer and an additional case of Lynch syndrome-associated cancer in the same person (including a second CRC case at least 6 months apart)**

In [None]:
# Colon Cancer rows of patients meeting the combined CRC criterion.
crc_complex = filter_crc_complex(
    lynch_categoried_df,
    id_col='PATIENT_IDENTIFIER',
    category_col='Lynch_Type',
    date_col='ENCOUNTER_DATE',
)

In [None]:
# Distinct patient identifiers returned by Filter 6, wrapped in a one-column
# DataFrame; the bare last expression displays it.
crc_complex_pat = crc_complex['PATIENT_IDENTIFIER'].unique()
crc_complex_pat_df = pd.DataFrame({'PATIENT_IDENTIFIER': crc_complex_pat})
crc_complex_pat_df

**END**

**TESTING FUNCTION USING DUMMY DATA**

In [None]:
# Hand-built cohort for exercising filter_crc_complex; each patient's comment
# states the expected outcome.
dummy_data = [
    
    # Patient 1: CRC twice, 1 month apart (< 6 months) -> should not qualify
    {'PATIENT_IDENTIFIER': '1', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-01-01'},
    {'PATIENT_IDENTIFIER': '1', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-02-01'},
    
    # Patient 2: CRC twice, 7 months apart (> 6 months) -> should qualify
    {'PATIENT_IDENTIFIER': '2', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-03-01'},
    {'PATIENT_IDENTIFIER': '2', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-10-01'},
    
    # Patient 3: CRC once + another Lynch type -> should qualify
    {'PATIENT_IDENTIFIER': '3', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-02-01'},
    {'PATIENT_IDENTIFIER': '3', 'Lynch_Type': 'Endometrial Cancer', 'ENCOUNTER_DATE': '2025-04-05'},
    
    # Patient 4: no CRC -> should not qualify
    {'PATIENT_IDENTIFIER': '4', 'Lynch_Type': 'Gastric Cancer', 'ENCOUNTER_DATE': '2025-02-01'},
    
    
    # Patient 5: CRC twice, > 6 months apart -> should qualify
    {'PATIENT_IDENTIFIER': '5', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-02-01'},
    {'PATIENT_IDENTIFIER': '5', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-12-05'},
    
    # Patient 6: CRC five times, dates given out of order, several > 6 months apart -> should qualify
    {'PATIENT_IDENTIFIER': '6', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-02-01'},
    {'PATIENT_IDENTIFIER': '6', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-12-05'},
    {'PATIENT_IDENTIFIER': '6', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2023-12-05'},
    {'PATIENT_IDENTIFIER': '6', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2021-12-05'},
    {'PATIENT_IDENTIFIER': '6', 'Lynch_Type': 'Colon Cancer', 'ENCOUNTER_DATE': '2025-01-05'},
    
]

dummy_df = pd.DataFrame(dummy_data)

In [None]:
# Run the CRC filter on the dummy cohort and check the flagged patients against
# the expectations written next to each dummy patient (2, 3, 5 and 6 qualify).
ts = filter_crc_complex(dummy_df, id_col='PATIENT_IDENTIFIER', category_col='Lynch_Type', date_col='ENCOUNTER_DATE')

assert set(ts['PATIENT_IDENTIFIER']) == {'2', '3', '5', '6'}, "filter_crc_complex flagged an unexpected set of dummy patients"

In [None]:
# Flagged patient keys: union of the patients returned by the six filters.
# A patient can satisfy several filters, so duplicates are removed to leave
# one row per flagged patient (first occurrence kept, index renumbered).
lynch_flagged_df = (
    pd.concat(
        [ec_s_pat_df, ov_s_pat_df, gc_s_pat_df, pc_add_pat_df, rc_add_pat_df, crc_complex_pat_df],
        ignore_index=True,
    )
    .drop_duplicates(subset='PATIENT_IDENTIFIER')
    .reset_index(drop=True)
)

In [None]:
# Write the flagged patient keys to a CSV in the current working directory.
# No `index` argument is passed, so the pandas default applies and the row
# index is written as an unnamed first column.
lynch_flagged_df.to_csv('lynch_flagged_df.csv')