In [None]:
import csv
from collections import defaultdict

def read_csv_group_by_patient(csv_file):
    """Read a CSV file and group its rows by the ``patient_id`` column.

    Args:
        csv_file: Path of the CSV file to read. The file is decoded as UTF-8
            and a leading byte-order mark, if present, is stripped. The first
            row is used as the header.

    Returns:
        A ``defaultdict(list)`` mapping each ``patient_id`` value to the list
        of row dicts carrying that id, in file order. Rows whose
        ``patient_id`` is missing or empty are skipped.
    """
    grouped_data = defaultdict(list)

    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are kept exactly as written in the file.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            patient_id = row.get("patient_id")
            if not patient_id:
                # No usable id (column absent or value empty): nothing to group under.
                continue
            grouped_data[patient_id].append(row)

    return grouped_data

# Load the acuity CSV and group its rows by patient_id.
csv_file_path = "acuity.csv"  # Change to your actual file path
# grouped_patients maps each patient_id to the list of that patient's row dicts.
grouped_patients = read_csv_group_by_patient(csv_file_path)

from datetime import datetime


def parse_date(date_str, parse_format):
    """Convert a date string into a ``datetime`` so records sort chronologically.

    Args:
        date_str: Text to parse; may be ``None`` when the value is missing.
        parse_format: ``datetime.strptime`` format string, e.g. ``"%m/%d/%Y"``.

    Returns:
        The parsed ``datetime``, or ``datetime.min`` when ``date_str`` is
        missing or does not match ``parse_format``, so that such records sort
        before every valid date.
    """
    try:
        return datetime.strptime(date_str, parse_format)
    except (ValueError, TypeError):
        # ValueError: the text does not match parse_format.
        # TypeError: the input is not a string (e.g. None for a missing value).
        return datetime.min
# Preview: for the first 5 patient groups, print the records whose 'eye'
# column is 'OS', ordered by their 'date' column (oldest first).
for i, (patient_id, records) in enumerate(grouped_patients.items()):
    if i >= 5:  # Limit output to the first 5 patients
        break
    print(f"Patient ID: {patient_id}")
    filtered_records = [rec for rec in records if rec['eye'] == 'OS']
    # Dates that fail to parse come back as datetime.min and therefore sort first.
    sorted_records = sorted(filtered_records, key=lambda x: parse_date(x['date'], "%m/%d/%Y"))
    for record in sorted_records:  # Every matching record is printed (no per-patient cap)
        print(record)


In [None]:
# Load the onset CSV named below into a DataFrame.
import pandas as pd

# NOTE: this rebinds csv_file_path, which the first cell pointed at "acuity.csv".
csv_file_path = "diagnosis_diseaseOnset_eye_gender_race_ethnicity_dob.csv"  # Change to your actual file path

# No date parsing is requested here; 'on_set_date' is converted in a later cell.
on_sets = pd.read_csv(csv_file_path)


In [None]:
# Order rows chronologically. 'on_set_date' has not been converted to
# datetimes yet at this point, so sort on the parsed dates: a plain string
# sort is only chronological for ISO-style (YYYY-MM-DD) text.
on_sets = on_sets.sort_values('on_set_date', key=pd.to_datetime)

In [None]:
# Display the onset table as the cell output.
on_sets

In [None]:
# Parse the onset dates so date arithmetic can be done on the column.
on_sets['on_set_date'] = pd.to_datetime(on_sets['on_set_date'])

# Reference date: day 0 of quarter 1.
start_date = pd.Timestamp('2010-03-19')

# Whole days elapsed since start_date, bucketed into 90-day windows numbered
# from 1. A 90-day window only approximates a calendar quarter.
on_sets['quarter'] = (
    on_sets['on_set_date']
    .sub(start_date)
    .dt.days
    .floordiv(90)
    .add(1)
)

# One row per quarter number with how many onset rows fall into it.
quarter_counts = (
    on_sets
    .groupby('quarter')
    .size()
    .reset_index(name='count')
)



In [None]:
# Order the per-quarter counts by ascending quarter number.
quarter_counts = quarter_counts.sort_values('quarter')

In [None]:
# NOTE(review): the reason for the cutoff at quarter 11 is not recorded in this notebook — confirm.
quarter_counts = quarter_counts[quarter_counts['quarter'] > 11]  # Keeps only rows whose quarter number is greater than 11


In [None]:
# Display the filtered per-quarter counts as the cell output.
quarter_counts

In [None]:
# Standard deviation of the per-quarter counts (pandas' Series.std defaults to
# the sample standard deviation, ddof=1).
std = quarter_counts['count'].std()

In [None]:
# Arithmetic mean of the per-quarter counts.
mean = quarter_counts['count'].mean()

In [None]:
# Display (standard deviation, mean) of the per-quarter counts as a tuple.
std, mean

In [None]:
import pandas as pd

# The two input tables; 'path' is the column they are joined on.
csv_file_1, csv_file_2 = "path2date.csv", "id_eye_2_path.csv"

df1, df2 = (pd.read_csv(source) for source in (csv_file_1, csv_file_2))

# Full outer join: every row from either table is kept, and columns coming
# from the side without a matching 'path' are left empty.
df_merged = df1.merge(df2, how="outer", on="path")

# Persist the joined table without writing the DataFrame index as a column.
df_merged.to_csv("merged_output_full_outer.csv", index=False)



In [None]:
# Read back the CSV written by the outer-join cell.
merged_df = pd.read_csv("merged_output_full_outer.csv")

In [None]:
# Display the reloaded merged table as the cell output.
merged_df

In [None]:
# Load "path2date.csv" on its own (the same file the merge cell reads as df1).
pdd = pd.read_csv("path2date.csv")

In [None]:
# Number of rows in path2date.csv.
len(pdd)