In [None]:
import pandas as pd
import numpy as np

# Load the patient and lab extracts. Raw strings keep the Windows backslashes
# literal, so no path segment depends on Python tolerating an invalid escape
# sequence such as "\F" or "\P" (a SyntaxWarning on recent Python versions).
patient = pd.read_csv(r"Z:\Ferretin Project\Ferritin Project\Patient.csv")
lab = pd.read_csv(r"Z:\Ferretin Project\Ferritin Project\Lab.csv")

# Inner join: keep only rows whose Patient_ID appears in both tables.
lab_patient_merged = pd.merge(patient, lab, on="Patient_ID", how='inner')

# Both inputs carry a Site_ID column, which the merge suffixes as _x (patient side)
# and _y (lab side). Collapse them into one column, preferring the patient-side
# value and falling back to the lab-side value where the former is missing.
lab_patient_merged['Site_ID'] = lab_patient_merged['Site_ID_x'].combine_first(lab_patient_merged['Site_ID_y'])
# Rebind instead of inplace=True so the statement reads as a plain transformation.
lab_patient_merged = lab_patient_merged.drop(columns=['Site_ID_x', 'Site_ID_y'])

# Parse the record timestamp and derive the calendar year used by the yearly summaries.
lab_patient_merged['DateCreated'] = pd.to_datetime(lab_patient_merged['DateCreated'])
lab_patient_merged['YearCreated'] = lab_patient_merged['DateCreated'].dt.year

# Age can only be computed where both the record date and the birth year are present;
# every other row keeps Age = NaN.
mask = lab_patient_merged[['DateCreated', 'BirthYear']].notna().all(axis=1)
lab_patient_merged['Age'] = np.nan

# Only the birth year is available, so the birthday is approximated as June 1 of that year.
# BirthYear is cast to a whole number before being turned into text: a column that contains
# missing values is stored as float, and str(1980.0) would produce "1980.0-06-01", which is
# not a valid date string.
birth_year = pd.to_numeric(lab_patient_merged.loc[mask, 'BirthYear']).astype(int)
approx_birth_date = pd.to_datetime(birth_year.astype(str) + '-06-01')

# Age in completed years, approximated as whole 365-day blocks (leap days are ignored).
lab_patient_merged.loc[mask, 'Age'] = (
    (lab_patient_merged.loc[mask, 'DateCreated'] - approx_birth_date).dt.days // 365
)

# Keep only records where the patient is at least 6 years old.
# Rows with a missing Age fail the comparison and are dropped as well.
is_age_eligible = lab_patient_merged['Age'].ge(6)
lab_patient_merged = lab_patient_merged.loc[is_age_eligible]

In [None]:
# Hemoglobin dataframe: lab rows whose Code_calc is "718-7", reduced to the
# columns needed for the prevalence analysis.
hgb_columns = ['Patient_ID', 'Sex', 'Site_ID', 'TestResult_calc', 'YearCreated', 'Age']
Hgb = lab_patient_merged.loc[lab_patient_merged['Code_calc'].eq("718-7")]
Hgb_short = Hgb.loc[:, hgb_columns].copy()

# Results that cannot be read as numbers become NaN instead of raising.
hgb_numeric_result = pd.to_numeric(Hgb_short['TestResult_calc'], errors='coerce')
Hgb_short['TestResult_calc'] = hgb_numeric_result

print ("# of patients in Hgb dataframe:", Hgb_short['Patient_ID'].nunique())

# WHO anemia classification: a result below the cutoff for the patient's age band
# is flagged as anemic. The cutoff depends on sex only from age 15 upward.
hgb_age = Hgb_short['Age']
hgb_sex = Hgb_short['Sex']
hgb_result = Hgb_short['TestResult_calc']

conditions = [
    hgb_age.between(6, 11) & hgb_result.lt(11.5),                # ages 6-11, either sex
    hgb_age.between(12, 14) & hgb_result.lt(12),                 # ages 12-14, either sex
    hgb_age.ge(15) & hgb_sex.eq('Female') & hgb_result.lt(12),   # females, 15 and above
    hgb_age.ge(15) & hgb_sex.eq('Male') & hgb_result.lt(13),     # males, 15 and above
]

# Every condition maps to 1 (anemic); rows matching none of them get 0.
# NOTE(review): rows with a missing result or a Sex other than 'Female'/'Male'
# match no condition (or, for Sex, no 15+ condition) and are therefore counted as 0.
choices = [1, 1, 1, 1]
Hgb_short['anemia'] = np.select(conditions, choices, default=0)

In [None]:
# Order rows by year with anemic (1) rows ahead of non-anemic (0) rows, so that the
# first row seen for each (year, patient) pair is an anemic one whenever one exists.
Hgb_short_sorted = Hgb_short.sort_values(
    by=['YearCreated', 'anemia'],
    ascending=[True, False],
)
# One row per patient per year: a patient who was anemic at least once in a year
# is represented by an anemic row for that year.
Hgb_short_unique = Hgb_short_sorted.drop_duplicates(
    subset=['YearCreated', 'Patient_ID'],
    keep='first',
)

# Yearly anemia prevalence = mean of the 0/1 flag over the retained rows.
anemia_by_year = (
    Hgb_short_unique
    .groupby('YearCreated')['anemia']
    .mean()
)
print(anemia_by_year)

In [None]:
# Anemia prevalence for each (year, sex) combination, exported to CSV.
anemia_grouped_by_year_sex = Hgb_short_unique.groupby(['YearCreated', 'Sex'])
anemia_by_year_by_sex = anemia_grouped_by_year_sex['anemia'].mean()
anemia_by_year_by_sex.to_csv('anemia_by_year_by_sex.csv')

In [None]:
# Ferritin dataframe: lab rows whose Code_calc is "2276-4", reduced to the
# columns needed for the prevalence analysis.
ferritin_columns = ['Patient_ID', 'Sex', 'Site_ID', 'TestResult_calc', 'YearCreated', 'Age']
ferritin_lab = lab_patient_merged.loc[lab_patient_merged['Code_calc'].eq("2276-4")]
ferritin_lab_short = ferritin_lab.loc[:, ferritin_columns].copy()

# Results that cannot be read as numbers become NaN instead of raising.
ferritin_numeric_result = pd.to_numeric(ferritin_lab_short['TestResult_calc'], errors='coerce')
ferritin_lab_short['TestResult_calc'] = ferritin_numeric_result


def _any_result_below_15(results):
    """Return True when at least one value in `results` is below 15."""
    return (results < 15).any()


# Iron deficiency: every row of a (year, patient) group is flagged 1 when any
# ferritin result of that patient in that year is below 15, otherwise 0.
ferritin_by_patient_year = ferritin_lab_short.groupby(['YearCreated', 'Patient_ID'])['TestResult_calc']
ferritin_lab_short['iron_deficiency'] = ferritin_by_patient_year.transform(_any_result_below_15).astype(int)

In [None]:
# Cohort sizes (distinct patients) overall and per test type.
print("# of patients in overall data:", lab_patient_merged['Patient_ID'].nunique())
print("# of patients in hemoglobin dataframe:", Hgb_short['Patient_ID'].nunique())
print("# of patients in ferritin dataframe:", ferritin_lab_short['Patient_ID'].nunique())

# Find the overlap of individuals with both ferritin and hemoglobin data.
# Only the distinct patient IDs are compared: merging the two row-level frames on
# Patient_ID pairs every ferritin row with every hemoglobin row of the same patient,
# which multiplies the row count for no benefit when all we need is a patient count.
ferritin_patient_ids = pd.Index(ferritin_lab_short['Patient_ID'].dropna().unique())
hgb_patient_ids = pd.Index(Hgb_short['Patient_ID'].dropna().unique())

# `overlap` now holds one row per shared patient (Patient_ID column only).
overlap = pd.DataFrame({'Patient_ID': ferritin_patient_ids.intersection(hgb_patient_ids)})
overlap_count = overlap['Patient_ID'].nunique()

# Share of ferritin patients who also have a hemoglobin result. An empty ferritin
# frame yields NaN rather than a ZeroDivisionError.
ferritin_patient_count = len(ferritin_patient_ids)
if ferritin_patient_count:
    percentage_overlap = (overlap_count / ferritin_patient_count) * 100
else:
    percentage_overlap = float('nan')

print("% of patients in both ferritin and anemia dataframe:", percentage_overlap)
print("# of patients with both ferritin and hemoglobin dataframes:", overlap_count)


In [None]:
# Order rows by year with iron-deficient (1) rows ahead of the others, then keep the
# first row of each (year, patient) pair so every patient appears once per year.
ferritin_lab_short_sorted = ferritin_lab_short.sort_values(
    by=['YearCreated', 'iron_deficiency'],
    ascending=[True, False],
)
ferritin_lab_short_unique = ferritin_lab_short_sorted.drop_duplicates(
    subset=['YearCreated', 'Patient_ID'],
    keep='first',
)

# Yearly iron-deficiency prevalence = mean of the 0/1 flag over the retained rows.
iron_deficiency_by_year = (
    ferritin_lab_short_unique
    .groupby('YearCreated')['iron_deficiency']
    .mean()
)
print(iron_deficiency_by_year)

In [None]:
# Iron-deficiency prevalence for each (year, sex) combination, printed and exported to CSV.
# NOTE(review): this rebinds `iron_deficiency_by_year`, so from here on the name refers
# to a series indexed by (YearCreated, Sex) rather than by YearCreated alone.
iron_deficiency_grouped_by_year_sex = ferritin_lab_short_unique.groupby(['YearCreated', 'Sex'])
iron_deficiency_by_year = iron_deficiency_grouped_by_year_sex['iron_deficiency'].mean()
print(iron_deficiency_by_year)
iron_deficiency_by_year.to_csv('iron_deficiency_by_year.csv')

In [None]:
import matplotlib.pyplot as plt

# Prevalence of each condition per (year, sex); these are the series drawn below.
anemia_by_year_sex = Hgb_short_unique.groupby(['YearCreated', 'Sex'])['anemia'].mean()
iron_deficiency_by_year_sex = ferritin_lab_short_unique.groupby(['YearCreated', 'Sex'])['iron_deficiency'].mean()

# Figure: line plot.
plt.figure(figsize=(13, 9))
# Sex is encoded by line style (dotted = Male, solid = Female); the condition is
# encoded by marker shape and colour.
markers = {'Male': {'anemia': 'D', 'iron_deficiency': 'o', 'linestyle': ':'},  
           'Female': {'anemia': 'D', 'iron_deficiency': 'o', 'linestyle': '-'}} 
colors = {'anemia': 'black', 'iron_deficiency': 'green'}  
sizes = {'anemia': 10, 'iron_deficiency': 10} 

for sex in ['Male', 'Female']:
    # .xs(sex, level='Sex') selects one sex and leaves a series indexed by year.
    anemia_by_year_sex.xs(sex, level='Sex').plot(marker=markers[sex]['anemia'], color=colors['anemia'], 
                                                 linestyle=markers[sex]['linestyle'], markersize=sizes['anemia'], 
                                                 linewidth=2, label=f'Anemia ({sex})')

    iron_deficiency_by_year_sex.xs(sex, level='Sex').plot(marker=markers[sex]['iron_deficiency'], 
                                                          color=colors['iron_deficiency'], 
                                                          linestyle=markers[sex]['linestyle'], 
                                                          markersize=sizes['iron_deficiency'], 
                                                          linewidth=2, label=f'Iron Deficiency ({sex})')

plt.xlabel('Year', fontsize=26)
plt.ylabel('Prevalence', fontsize=26, labelpad=15) 
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)
plt.grid(True)
# Legend sits below the axes, in two columns.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), fontsize=20, ncol=2)
plt.subplots_adjust(bottom=0.2)
plt.tight_layout()

# Save BEFORE showing: once plt.show() has handed the figure off for display, pyplot
# may no longer treat it as the current figure, and a later savefig would write an
# empty canvas. This also matches the order used for the age-group figure.
plt.savefig('anemia_iron_deficiency_plot.tiff', dpi=1200, transparent=True)
plt.show()



In [None]:
# Tally of subgroups: iron deficiency only, anemia only, both, or neither.
# The outer merge keeps patient-years that appear in only one of the two frames;
# the flag coming from the missing side is NaN on those rows.
Both_tests = pd.merge(
    Hgb_short_unique,
    ferritin_lab_short_unique,
    on=['Patient_ID', 'YearCreated', 'Site_ID', 'Sex', 'Age'],
    suffixes=('_Hgb', '_Ferritin'),
    how='outer',
)

# Per-patient "ever" flags across all years. Anemia is weighted by 10 so that the sum
# of the two flags encodes the combination: 0 = neither, 1 = iron deficiency only,
# 10 = anemia only, 11 = both.
Both_tests['Ever_Anemia'] = Both_tests.groupby('Patient_ID')['anemia'].transform('max') * 10
Both_tests['Ever_Iron_Deficiency'] = Both_tests.groupby('Patient_ID')['iron_deficiency'].transform('max')

# One row per patient. The "ever" columns are constant within a patient, so which row
# is kept does not matter. .copy() gives an independent frame, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
unique_patients_max_score = Both_tests.drop_duplicates(subset=['Patient_ID'], keep='first').copy()
unique_patients_max_score['Total_Score_Ever'] = (
    unique_patients_max_score['Ever_Anemia'] + unique_patients_max_score['Ever_Iron_Deficiency']
)

# Count the unique number of patients for each 'Total_Score_Ever' value
tally_categories_ever = unique_patients_max_score.groupby('Total_Score_Ever')['Patient_ID'].nunique().reset_index()
tally_categories_ever.columns = ['Total_Score_Ever', 'Unique_Patients_Count']
category_mapping = {0: "None", 1: "Only Iron Deficiency", 10: "Only Anemia", 11: "Both Iron Deficiency and Anemia"}
tally_categories_ever['Category'] = tally_categories_ever['Total_Score_Ever'].map(category_mapping)
print(tally_categories_ever)

# This tally adds up to 94,017 and only includes patients who have at least one ferritin
# result AND at least one hemoglobin result recorded: a patient missing either test has a
# NaN score, and groupby drops NaN keys.

In [None]:
def categorize_age_group(age):
    """Map an age in years to its age-group label.

    The bands are contiguous, so fractional ages land in the band of their
    completed years (13.5 -> '5-13') instead of falling into a gap between
    two integer ranges.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str or None
        One of '5-13', '14-18', '19-35', '36-50', '51-79', 'over 79'.
        None when the age is below 5 or is NaN, so that such values are not
        mislabelled as 'over 79'.
    """
    # NaN fails every comparison, so `not age >= 5` catches both NaN and ages under 5.
    if not age >= 5:
        return None
    if age < 14:
        return '5-13'
    if age < 19:
        return '14-18'
    if age < 36:
        return '19-35'
    if age < 51:
        return '36-50'
    if age < 80:
        return '51-79'
    return 'over 79'

# Label each per-year ferritin row with its age group. assign() builds a new frame
# instead of writing a column into the result of drop_duplicates, which pandas flags
# with SettingWithCopyWarning; the name is rebound so later cells still see AgeGroup.
ferritin_lab_short_unique = ferritin_lab_short_unique.assign(
    AgeGroup=ferritin_lab_short_unique['Age'].apply(categorize_age_group)
)

# Female rows only, pivoted to a year x age-group table of iron-deficiency prevalence.
female_data = ferritin_lab_short_unique[ferritin_lab_short_unique['Sex'] == 'Female']
ID_by_year_age_female = female_data.groupby(['YearCreated', 'AgeGroup'])['iron_deficiency'].mean().unstack()
    
from cycler import cycler

# Six-entry palette (the CB_ prefix presumably stands for colour-blind friendly - confirm).
CB_color_cycle = [
    '#377eb8', '#ff7f00', '#4daf4a',
    '#f781bf', '#a65628', '#999999',
]
series_markers = ['o', 's', '^', 'd', '*', 'p']
series_linestyles = ['-', '--', '-.', ':', (0, (3, 1, 1, 1)), (0, (3, 5, 1, 5, 1, 5))]

# Pair each colour with one marker and one line style (element-wise, not a product),
# so the n-th plotted series gets the n-th colour, marker and line style together.
custom_cycler = cycler(
    color=CB_color_cycle,
    marker=series_markers,
    linestyle=series_linestyles,
)

# Figure: iron-deficiency prevalence over time for females, one line per age group.
plt.figure(figsize=(13, 9))
ax = plt.gca()
# Each successive line takes the next colour/marker/line-style combination.
ax.set_prop_cycle(custom_cycler)

age_groups = ID_by_year_age_female.columns
shared_line_style = {'linewidth': 2, 'markersize': 10}
for age_group in age_groups:
    group_prevalence = ID_by_year_age_female[age_group]
    plt.plot(
        group_prevalence.index,
        group_prevalence,
        label=f'Age {age_group}',
        **shared_line_style,
    )

# Axis labels, tick label sizes and grid.
plt.xlabel('Year', fontsize=26)
plt.ylabel('ID Prevalence', fontsize=26, labelpad=15)
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)
plt.grid(True)

# Legend sits below the axes, in two columns.
plt.legend(
    title='Age Group',
    title_fontsize=20,
    fontsize=20,
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    ncol=2,
)
plt.subplots_adjust(bottom=0.25)
plt.tight_layout()

# Write the file first, then display.
plt.savefig('ID_prevalence_age_groups.tiff', dpi=1200, transparent=True)
plt.show()
