## Data in Numbers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Diagnosis table in long format. The code below uses the columns
# 'patient_id', 'location' (a patient_id/location pair is counted as one eye)
# and 'name' (the diagnosis label).
patients_eys = pd.read_csv("diagnosis_diseaseOnset_eye_gender_race_ethnicity_dob.csv")

# Columns that together identify a single eye.
_EYE_KEYS = ['patient_id', 'location']


def _eyes_with(diagnosis):
    """Return the distinct (patient_id, location) pairs having a row named `diagnosis`."""
    matching_rows = patients_eys.loc[patients_eys['name'] == diagnosis]
    return matching_rows[_EYE_KEYS].drop_duplicates()


def _in_both(left, right):
    """Return the eyes that appear in both `left` and `right`."""
    return left.merge(right, on=_EYE_KEYS, how='inner')


def _only_in_left(left, right):
    """Return the eyes of `left` that have no match in `right` (anti-join)."""
    tagged = left.merge(right, on=_EYE_KEYS, how='left', indicator=True)
    unmatched = tagged[tagged['_merge'] == 'left_only']
    return unmatched.drop(columns=['_merge'])


# Cohort size.
unique_patients = len(patients_eys[['patient_id']].drop_duplicates())
print(f"Number of unique patients: {unique_patients}")

unique_eyes = len(patients_eys[_EYE_KEYS].drop_duplicates())
print(f"Number of unique eyes: {unique_eyes}")

# Eyes that ever carried each diagnosis label.
had_dry_amd = _eyes_with('dry_amd')
had_ga = _eyes_with('ga')
had_wet_amd = _eyes_with('wet_amd')

# NOTE: these are co-occurrence counts. No date is involved in the joins, so
# "dry to ga" means "the eye has both labels", not a verified temporal order.
dry_to_ga = _in_both(had_dry_amd, had_ga)
ga_to_wet = _in_both(had_ga, had_wet_amd)

print(f"dry: {len(had_dry_amd)}")
print(f"ga: {len(had_ga)}")
print(f"wet: {len(had_wet_amd)}")
print(f"dry to ga : {len(dry_to_ga)}")
print(f"ga to wet: {len(ga_to_wet)}")

# Eyes with both dry and wet AMD ...
had_dry_wet_amd = _in_both(had_dry_amd, had_wet_amd)

# ... restricted to those that never had a GA label.
had_dry_wet_no_ga = _only_in_left(had_dry_wet_amd, had_ga)

print(f"from dry (no ga) to wet: {len(had_dry_wet_no_ga)}")

# Eyes with wet AMD but neither dry AMD nor GA.
only_wet_amd = _only_in_left(_only_in_left(had_wet_amd, had_dry_amd), had_ga)

print(f"only had wet: {len(only_wet_amd)}")

# Eyes with GA that never had a wet AMD label.
had_ga_no_wet = _only_in_left(had_ga, had_wet_amd)

print(f"had ga no wet: {len(had_ga_no_wet)}")


## Mean gap between visits

In [None]:
import pandas as pd

# Load acuity.csv; only the 'patient_id' and 'date' columns are used below.
df = pd.read_csv("acuity.csv")

# Parse the 'date' column and keep one row per distinct (patient, date) pair,
# so several records on the same date count as a single visit.
df['date'] = pd.to_datetime(df['date'])
df = df[['patient_id', 'date']].drop_duplicates()

# diff() compares each row with the previous one, so the visits must be in
# chronological order within each patient.
df = df.sort_values(by=['patient_id', 'date'])

# Days since the same patient's previous visit (NaN for a patient's first visit).
df['visit_gap'] = df.groupby('patient_id')['date'].diff().dt.days

# Average within each patient first so that frequent visitors do not dominate.
# Patients with a single visit have a NaN mean and are skipped by the final mean().
mean_gaps_per_patient = df.groupby('patient_id')['visit_gap'].mean()

overall_mean_gap = mean_gaps_per_patient.mean()

# Convert days to months using 30 days per month. True division is required:
# floor division (//) truncates to whole months, so ".2f" would always print ".00".
print(f"Overall Mean of Visit Gaps: {overall_mean_gap / 30:.2f}  months")


## Cox Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter


In [None]:
import pandas as pd
import json
import glob
import os

# Read the merged metadata table (path is relative to the working directory)
# and parse its 'date' column into datetimes for later date arithmetic.
metadata_df = pd.read_csv("merged_output_full_outer.csv")
metadata_df = metadata_df.assign(date=pd.to_datetime(metadata_df["date"]))

# Quick preview of the loaded table.
print(metadata_df.head())


In [None]:
import pandas as pd
import json

# Load the biomarker JSON: a mapping of visit path -> biomarker dict.
# The encoding is pinned to UTF-8 (the standard JSON encoding) instead of
# relying on the platform default used by open().
json_path = "htd_analysis_dr_kent.json"
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Top-level area measurements copied through unchanged.
_AREA_KEYS = ("htd_6mm_area", "htd_3mm_area", "htd_1mm_area")

# Output column -> layer key under biomarkers["thicknesses"].
# Only the "central" value of each layer is kept.
_THICKNESS_LAYERS = {
    "inner_retina_thickness": "Inner Retina",
    "outer_retina_thickness": "Outer Retina",
    "rpedc_thickness": "RPEDC",
    "total_thickness": "total_thickness",
    "rnfl_thickness": "rnfl_thickness",
    "ez_rpe_thickness": "ez-rpe",
}


def _central_thickness(biomarkers, layer):
    """Return biomarkers["thicknesses"][layer]["central"], or None if any level is missing.

    Missing areas and a missing "central" value already map to None; treating a
    missing layer the same way keeps one incomplete visit from aborting the
    whole flattening step with a KeyError.
    """
    thicknesses = biomarkers.get("thicknesses") or {}
    layer_values = thicknesses.get(layer) or {}
    return layer_values.get("central")


# Flatten the nested JSON into one flat record per visit.
biomarker_records = []
for visit_path, biomarkers in data.items():
    record = {"path": visit_path}  # kept to link each record with the CSV rows
    for area_key in _AREA_KEYS:
        record[area_key] = biomarkers.get(area_key)
    for column, layer in _THICKNESS_LAYERS.items():
        record[column] = _central_thickness(biomarkers, layer)
    biomarker_records.append(record)

# Passing the columns explicitly keeps the schema (including 'path') stable
# even when the JSON contains no visits at all.
biomarker_df = pd.DataFrame(
    biomarker_records,
    columns=["path", *_AREA_KEYS, *_THICKNESS_LAYERS],
)

# Quick preview of the flattened table.
print(biomarker_df.head())


In [None]:
# (Re)load the merged metadata table; the path is relative to the working directory.
_metadata_csv = "merged_output_full_outer.csv"
metadata_df = pd.read_csv(_metadata_csv)

# Parse the 'date' column into datetimes.
_parsed_dates = pd.to_datetime(metadata_df["date"])
metadata_df["date"] = _parsed_dates

# Preview the first rows of the metadata.
print(metadata_df.head())


In [None]:
# Attach the biomarker columns to the metadata rows. merge() defaults to an
# inner join, so only rows whose 'path' occurs in both tables are kept.
# 'path' is needed only as the join key and is dropped right after.
merged_df = metadata_df.merge(biomarker_df, on="path").drop(columns=["path"])

# Report how many rows survived the join.
print(len(merged_df))


In [None]:
# The loading cells only parse the 'date' column, so make sure 'visit_date' is
# a real datetime before taking minima and differences. On string dates min()
# would sort lexicographically and the subtraction below would fail.
merged_df["visit_date"] = pd.to_datetime(merged_df["visit_date"])

# Make the cell safe to re-run: a leftover 'diagnosis_date' from a previous run
# would otherwise be duplicated by the merge as diagnosis_date_x / _y.
merged_df = merged_df.drop(columns=["diagnosis_date"], errors="ignore")

# Baseline date per patient & eye: the earliest visit present in merged_df.
# NOTE(review): nothing here filters on a Dry AMD diagnosis, so this is the
# first available visit used as a stand-in for the diagnosis date. Confirm that
# the upstream table only contains visits from the Dry AMD diagnosis onwards.
dry_amd_diagnosis = merged_df.groupby(["patient_id", "eye"])["visit_date"].min().reset_index()
dry_amd_diagnosis.rename(columns={"visit_date": "diagnosis_date"}, inplace=True)

# Broadcast the baseline date back onto every visit of the same patient & eye.
merged_df = merged_df.merge(dry_amd_diagnosis, on=["patient_id", "eye"])

# Time since the baseline date in months (30 days per month).
merged_df["time_to_event"] = (merged_df["visit_date"] - merged_df["diagnosis_date"]).dt.days / 30.0

# Label GA occurrence (Assumption: GA occurs if ez_rpe_thickness < 10 µm).
# to_numeric turns None into NaN so the comparison cannot raise on missing
# values. NaN < 10 is False, so visits without a measurement get event = 0,
# exactly as numeric NaN values were labelled before.
ez_rpe_thickness = pd.to_numeric(merged_df["ez_rpe_thickness"])
merged_df["event"] = (ez_rpe_thickness < 10).astype(int)

# Show relevant columns
print(merged_df[["patient_id", "eye", "time_to_event", "event", "htd_6mm_area", "ez_rpe_thickness"]].head())
