In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyreadstat
from datetime import datetime, timedelta
import json
import csv
import glob

In [None]:
# Load the latest Mom2B survey export, keep only the columns this analysis
# uses, drop incomplete rows, and save the result to survey.csv.
survey_date = '30-Jan-2023'

# The export date is part of the file name; build the path from survey_date so
# the two cannot drift apart when a newer export is loaded.
survey_path = '/proj/sens2021503/mom2b/Survey_summary/Mom2B downloaded on {}.sav'.format(survey_date)

# read_sav already returns a pandas DataFrame plus a metadata object, so no
# extra pd.DataFrame(...) wrapping is needed.
df_survey, meta = pyreadstat.read_sav(survey_path)

df_survey = df_survey[['patientId', 'BP_date', 'pp6_13_EPDS_R']]

# Exclude rows with NaN values in 'BP_date' or 'pp6_13_EPDS_R'
df_survey = df_survey.dropna(subset=['BP_date', 'pp6_13_EPDS_R'])

# Write the CSV only after filtering so the file on disk matches the frame
# that the rest of the notebook works with.
df_survey.to_csv('survey.csv', index=False)

df_survey.head()


In [None]:
# Persist the current survey frame to survey.csv, leaving out the row index.
df_survey.to_csv(path_or_buf='survey.csv', index=False)

In [None]:
# Collect every accelerometer file of the patients in the survey frame into
# one table (one row per file) and save it to final.csv.
accel_pattern = "/proj/sens2021503/mom2b/decrypted-data/1njtfXoAe9nkpTD6Q1wvxttT/{}/accel_*.csv"

# Gather plain records and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies it on every iteration.
records = []

# unique(): a patient that appears in several survey rows must not have its
# files listed several times. dropna(): a missing id cannot name a directory.
for list_id in df_survey['patientId'].dropna().unique():
    # sorted(): glob returns matches in arbitrary order; sorting makes the
    # saved CSV reproducible between runs.
    for filename in sorted(glob.glob(accel_pattern.format(list_id))):
        records.append({'id': list_id, 'filename': filename})

# columns= keeps the expected schema even when no file was found at all.
df = pd.DataFrame(records, columns=['id', 'filename'])

df.to_csv('final.csv', index=False)
print('Done')


In [None]:
# Preview the first five rows of the file list (printed as plain text) and of
# the survey frame (shown as the cell's output).
print(df.head(5))
df_survey.head(5)

In [None]:
# Count the distinct, non-missing ids in the file list, then show the
# frame's (rows, columns) shape as the cell output.
distinct_ids = df['id'].dropna().unique()
unique_participant_ids = len(distinct_ids)

print("Number of unique participant_ids:", unique_participant_ids)
df.shape

In [None]:
# Keep only the accelerometer files whose month is on or before the month of
# the patient's BP_date.

# Extract the 'YYYY-MM' part of each file name. Names that do not match the
# pattern give NaN, become NaT below, and fail the comparison (i.e. are dropped).
file_month = df['filename'].str.extract(r'accel_(\d{4}-\d{2})\.csv', expand=False)

# assign() attaches the helper column to a new frame, so df itself is not
# mutated and the cell can be re-run safely.
merged_df = pd.merge(df.assign(file_month=file_month), df_survey, left_on='id', right_on='patientId')

# Convert 'file_month' and 'BP_date' to datetime; the explicit format avoids
# relying on format inference for the 'YYYY-MM' strings.
merged_df['file_month'] = pd.to_datetime(merged_df['file_month'], format='%Y-%m')
merged_df['BP_date'] = pd.to_datetime(merged_df['BP_date'])

# Compare at month granularity. '<=' keeps files from the BP_date month
# itself as well as from earlier months.
on_or_before_bp_month = merged_df['file_month'].dt.to_period('M') <= merged_df['BP_date'].dt.to_period('M')
filtered_df = merged_df[on_or_before_bp_month]

# Keep only the 'id' and 'filename' columns. drop_duplicates() removes rows
# multiplied by the merge when a patient has more than one survey row, and the
# result is an independent frame rather than a slice of filtered_df.
df = filtered_df[['id', 'filename']].drop_duplicates().reset_index(drop=True)
df.shape

In [None]:
# Persist the current file list to final.csv, leaving out the row index.
df.to_csv(path_or_buf='final.csv', index=False)

In [None]:
# Report how many distinct participants remain in the file list
# (missing ids are not counted).
unique_participant_ids = df['id'].nunique(dropna=True)

print("Number of unique participant_ids:", unique_participant_ids)

In [None]:
# Restrict the survey to patients whose id appears in the file list.
has_accel_files = df_survey['patientId'].isin(df['id'])

# .copy() makes the result an independent frame, so columns can be added to it
# later without pandas raising SettingWithCopyWarning.
df_survey = df_survey.loc[has_accel_files].copy()
print(df_survey.shape)
df_survey.head()

In [None]:
# Add a boolean 'depression_flag' column that is True where the
# 'pp6_13_EPDS_R' score is at or above the cut-off, then save the frame.
epds_cutoff = 12  # NOTE(review): presumably the EPDS screening threshold — confirm

# assign() returns a new frame instead of writing into df_survey, which may be
# a filtered slice; this avoids SettingWithCopyWarning.
df_survey = df_survey.assign(depression_flag=df_survey['pp6_13_EPDS_R'] >= epds_cutoff)
df_survey.to_csv('survey.csv', index=False)

df_survey.head()

In [None]:
# Pie chart of how many participants have / do not have the depression flag.

# Tie label and colour to the flag value itself. value_counts() orders by
# frequency, so positional labels would be swapped whenever the flagged group
# is the larger one, and would not match the data in length when only one
# group is present.
flag_style = {
    False: ('No Depression', 'lightcoral'),
    True: ('Depression', 'lightblue'),
}

# sort_index() gives a fixed wedge order (False first, then True).
sizes = df_survey['depression_flag'].value_counts().sort_index()
labels = [flag_style[bool(flag)][0] for flag in sizes.index]
colors = [flag_style[bool(flag)][1] for flag in sizes.index]
total_count = len(df_survey)

# autopct receives each wedge's percentage; convert it back to an absolute
# count so every wedge shows both the share and the number of participants.
plt.pie(sizes, labels=labels, autopct=lambda p: '{:.1f}%\n({:.0f})'.format(p, p * total_count / 100),
        startangle=90, colors=colors)
plt.title('Distribution of Depression Flags')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()