In [10]:
import pandas as pd

In [11]:
def load_data():
    """Read the sleep and physiology CSV files into DataFrames.

    Returns:
        tuple: ``(sleep_data, physiology_data)`` on success. If either read
        raises, the error is printed and ``(None, None)`` is returned.
    """
    csv_paths = (
        '../Dataset/Aggregated_Sleep.csv',
        '../Dataset/Physiology_Formatted_Dataset_v2.csv',
    )
    try:
        # The generator is consumed left to right, so the sleep file is read
        # first and the physiology file is only attempted if that succeeds.
        sleep_frame, physio_frame = (pd.read_csv(path) for path in csv_paths)
    except Exception as err:
        print("Error loading data:", err)
        return None, None
    else:
        return sleep_frame, physio_frame

def preprocess_data(sleep_data, physiology_data):
    """Parse the timestamp columns and derive a calendar-date column for each frame.

    Both DataFrames are modified in place:
      * ``sleep_data['window_start']`` is converted with ``pd.to_datetime`` and
        ``sleep_data['sleep_date']`` is added from its ``.dt.date``.
      * ``physiology_data['Window']`` is converted with ``pd.to_datetime`` and
        ``physiology_data['physio_date']`` is added from its ``.dt.date``.

    Args:
        sleep_data: DataFrame with a ``window_start`` column.
        physiology_data: DataFrame with a ``Window`` column.

    Returns:
        tuple: the same two (mutated) DataFrame objects, in the order given.
    """
    # (frame, timestamp column, derived date column)
    column_specs = (
        (sleep_data, 'window_start', 'sleep_date'),
        (physiology_data, 'Window', 'physio_date'),
    )

    # Pass 1: parse every timestamp column to datetime.
    for frame, timestamp_col, _ in column_specs:
        frame[timestamp_col] = pd.to_datetime(frame[timestamp_col])

    # Pass 2: extract the date component from the parsed timestamps.
    for frame, timestamp_col, date_col in column_specs:
        frame[date_col] = frame[timestamp_col].dt.date

    return sleep_data, physiology_data

def map_sleep_data(evening_sleep_mapping, nap_sleep_mapping, patient_id, physio_date, physio_time):
    """Find the sleep record to pair with one physiology window.

    Lookup order:
      1. The evening-sleep entry for ``(patient_id, physio_date - 1 day)``.
      2. Otherwise, the nap entry for ``(patient_id, physio_date)``, but only
         when ``physio_time.hour`` is 18 or later.

    Args:
        evening_sleep_mapping: DataFrame indexed by ``(patient_id, date)``.
        nap_sleep_mapping: DataFrame indexed by ``(patient_id, date)``.
        patient_id: Patient identifier used as the first index level.
        physio_date: Date of the physiology window; must support subtracting
            a ``pd.Timedelta``.
        physio_time: Timestamp-like object exposing an ``hour`` attribute.

    Returns:
        The result of ``.loc`` on the matching mapping, or ``None`` when
        neither lookup applies.
    """
    # Evening sleep from the night before takes priority.
    evening_key = (patient_id, physio_date - pd.Timedelta(days=1))
    if evening_key in evening_sleep_mapping.index:
        return evening_sleep_mapping.loc[evening_key]

    # Fall back to a same-day nap, restricted to evening physiology windows.
    nap_key = (patient_id, physio_date)
    if nap_key in nap_sleep_mapping.index:
        if physio_time.hour >= 18:
            return nap_sleep_mapping.loc[nap_key]

    return None

def aggregate_data(sleep_data, physiology_data):
    """Pair each physiology row with a sleep record and build the merged dataset.

    Sleep rows whose ``window_start`` hour is 18 form the evening mapping and
    rows whose hour is 12 form the nap mapping; both are indexed by
    ``(patient_id, sleep_date)``. Each physiology row is matched through
    ``map_sleep_data``; rows without a match are dropped.

    Each output record contains, in order:
      * ``patient_id`` and a descriptive ``window_period`` string,
      * every sleep column except ``patient_id``/``window_start``/``sleep_date``,
        suffixed with ``_sleep``,
      * every physiology column except
        ``patient_id``/``Window``/``day``/``physio_date``,
      * ``Sleep_Agitation``, ``Physio_Agitation`` and a combined ``Agitation``.

    Args:
        sleep_data: Preprocessed sleep DataFrame (needs ``window_start``,
            ``patient_id``, ``sleep_date`` and ``agitation``).
        physiology_data: Preprocessed physiology DataFrame (needs
            ``patient_id``, ``physio_date``, ``Window``, ``day`` and ``Agitation``).

    Returns:
        pd.DataFrame: one row per matched physiology row.
    """
    index_levels = ['patient_id', 'sleep_date']
    start_hour = sleep_data['window_start'].dt.hour
    evening_sleep_mapping = sleep_data[start_hour == 18].set_index(index_levels)
    nap_sleep_mapping = sleep_data[start_hour == 12].set_index(index_levels)

    # The column selections do not depend on the row, so compute them once.
    sleep_feature_cols = [
        col for col in sleep_data.columns
        if col not in ('patient_id', 'window_start', 'sleep_date')
    ]
    physio_feature_cols = [
        col for col in physiology_data.columns
        if col not in ('patient_id', 'Window', 'day', 'physio_date')
    ]

    records = []
    for _, physio_row in physiology_data.iterrows():
        sleep_row = map_sleep_data(
            evening_sleep_mapping,
            nap_sleep_mapping,
            physio_row['patient_id'],
            physio_row['physio_date'],
            physio_row['Window'],
        )
        if sleep_row is None:
            continue

        # sleep_row.name is the (patient_id, sleep_date) index key of the match.
        window_period = f"{sleep_row.name[1]} sleep - {physio_row['day']} physio day"

        record = {
            'patient_id': physio_row['patient_id'],
            'window_period': window_period,
        }
        record.update({f"{col}_sleep": sleep_row[col] for col in sleep_feature_cols})
        record.update({col: physio_row[col] for col in physio_feature_cols})

        sleep_flag = sleep_row['agitation']
        physio_flag = physio_row['Agitation']
        record['Sleep_Agitation'] = sleep_flag
        record['Physio_Agitation'] = physio_flag
        # NOTE(review): this is an exclusive-or -- the combined flag is 1 only
        # when exactly one source reports agitation, and 0 when both report 1.
        # Confirm that this is the intended labelling.
        only_sleep = sleep_flag == 1 and physio_flag == 0
        record['Agitation'] = 1 if (only_sleep or (sleep_flag == 0 and physio_flag == 1)) else 0

        records.append(record)

    return pd.DataFrame(records)

def main():
    """Run the pipeline: load, preprocess, merge, save and summarise.

    Returns early (without writing anything) when either input CSV could not
    be loaded, or when no physiology row could be paired with a sleep record.
    Otherwise the merged dataset is written to ``../Dataset/dataset_final.csv``
    and a short summary is printed.
    """
    sleep_data, physiology_data = load_data()
    if sleep_data is None or physiology_data is None:
        return

    sleep_data, physiology_data = preprocess_data(sleep_data, physiology_data)
    final_df = aggregate_data(sleep_data, physiology_data)

    # With zero matches the merged frame has no columns at all, so the
    # 'Agitation' / 'patient_id' lookups below would raise KeyError and the
    # output CSV would be overwritten with an empty file. Stop here instead.
    if final_df.empty:
        print("No physiology rows could be matched to sleep records; nothing was saved.")
        return

    # Save and print results
    final_df.to_csv('../Dataset/dataset_final.csv', index=False)
    print("Final dataset preview:", final_df.head())
    print("Shape of the final dataset:", final_df.shape)
    print("Agitation counts:", final_df['Agitation'].value_counts())
    print("Unique patients:", final_df['patient_id'].nunique())

In [12]:
# Entry-point guard: run the pipeline when this code is executed directly
# (as a script or as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

Final dataset preview:   patient_id                             window_period  mean_HR_sleep  \
0      1fbe4  2019-04-24 sleep - 2019-04-25 physio day      51.430403   
1      1fbe4  2019-04-24 sleep - 2019-04-25 physio day      51.430403   
2      1fbe4  2019-04-25 sleep - 2019-04-26 physio day      53.034420   
3      1fbe4  2019-04-25 sleep - 2019-04-26 physio day      53.034420   
4      1fbe4  2019-04-26 sleep - 2019-04-27 physio day      55.426370   

   HR_var_sleep  mean_RR_sleep  RR_var_sleep  WASO_sleep  SOL_sleep  \
0      3.981576      13.641026      1.406173        76.0        8.0   
1      3.981576      13.641026      1.406173        76.0        8.0   
2      5.088749      12.722826      1.384771        26.0       32.0   
3      5.088749      12.722826      1.384771        26.0       32.0   
4      6.080765      15.092466      1.834960        67.0        9.0   

   TIB_sleep  TST_sleep  ...  snoring_counts_sleep  agitation_sleep  \
0      564.0      443.0  ...            