# NOTES

1. Dropping `minute_restless` from `fitbit_sleep_daily_summary.csv` because about 58% of its values are missing.
2. Dropping `survey_version_concept_id` and `survey_version_name` from `survey.csv` as they are fully empty.

In [1]:
# Imports

import pandas as pd
import numpy as np
import os
import dask.dataframe as dd

from datetime import date, datetime, timedelta

In [2]:
# Workspace settings are taken from the environment; each is None when the variable is unset.
dataset = os.environ.get('WORKSPACE_CDR')
my_bucket = os.environ.get('WORKSPACE_BUCKET')

In [3]:
def report_missingness(df: pd.DataFrame):
    """Summarise how much of each column of ``df`` is null.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with two columns: ``Column`` (the column
        label) and ``Missing Percentage`` (share of null entries, 0-100).
        Nothing is printed; the caller decides how to display the table.
    """
    # Mean of the boolean null mask is the null fraction; scale it to percent.
    null_fraction = df.isna().mean()
    percent_by_column = null_fraction.mul(100)

    summary = pd.DataFrame(
        {
            "Column": percent_by_column.index,
            "Missing Percentage": percent_by_column.values,
        }
    )
    return summary

In [4]:
%%time

# Load the three raw CSV exports (intraday steps, daily sleep summary, survey)
# from the working directory, one DataFrame per file.

fitbit_intraday_steps_df, fitbit_sleep_daily_summary_df, survey_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "fitbit_intraday_steps.csv",
        "fitbit_sleep_daily_summary.csv",
        "survey.csv",
    )
)

print("Loaded data successfully...")

Loaded data successfully...
CPU times: user 16.8 s, sys: 3.81 s, total: 20.6 s
Wall time: 22.2 s


In [5]:
# Show the column index of each freshly loaded frame, one labelled line per frame.
for label, frame in (
    ("Intraday steps features", fitbit_intraday_steps_df),
    ("Sleep daily features", fitbit_sleep_daily_summary_df),
    ("Survey features", survey_df),
):
    print(f"{label} = {frame.columns}")

Intraday steps features = Index(['person_id', 'date', 'sum_steps'], dtype='object')
Sleep daily features = Index(['person_id', 'sleep_date', 'is_main_sleep', 'minute_in_bed',
       'minute_asleep', 'minute_after_wakeup', 'minute_awake',
       'minute_restless', 'minute_deep', 'minute_light', 'minute_rem',
       'minute_wake'],
      dtype='object')
Survey features = Index(['person_id', 'survey_datetime', 'survey', 'question_concept_id',
       'question', 'answer_concept_id', 'answer', 'survey_version_concept_id',
       'survey_version_name'],
      dtype='object')


In [6]:
# Print the per-column missingness table of every raw frame, each followed by a divider line.
for df in (fitbit_intraday_steps_df, fitbit_sleep_daily_summary_df, survey_df):
    print(report_missingness(df), "-------------------", sep="\n")

      Column  Missing Percentage
0  person_id                 0.0
1       date                 0.0
2  sum_steps                 0.0
-------------------
                 Column  Missing Percentage
0             person_id            0.000000
1            sleep_date            0.000000
2         is_main_sleep            0.000000
3         minute_in_bed            0.000000
4         minute_asleep            0.000000
5   minute_after_wakeup            0.000000
6          minute_awake            0.000000
7       minute_restless           58.037069
8           minute_deep           41.962931
9          minute_light           41.962931
10           minute_rem           41.962931
11          minute_wake           41.962931
-------------------
                      Column  Missing Percentage
0                  person_id            0.000000
1            survey_datetime            0.000000
2                     survey            0.000000
3        question_concept_id            0.000000
4          

In [7]:
# Drop the columns flagged as too sparse to use: `minute_restless` (~58% missing in the
# report above) and the two survey-version columns (recorded as fully empty in the NOTES).

print("Dropping the columns minute_restless, survey_version_concept_id, survey_version_name...")

# errors="ignore" keeps this cell re-runnable: on a second execution the columns are
# already gone, and a plain drop would raise KeyError instead of being a no-op.
fitbit_sleep_daily_summary_df = fitbit_sleep_daily_summary_df.drop(
    columns=["minute_restless"], errors="ignore"
)
survey_df = survey_df.drop(
    columns=["survey_version_concept_id", "survey_version_name"], errors="ignore"
)

Dropping the columns minute_restless, survey_version_concept_id, survey_version_name...


In [8]:
# Reduce every date/datetime column to plain `datetime.date` values so the three
# frames can later be compared and joined on calendar days.

fitbit_intraday_steps_df["date"] = pd.to_datetime(fitbit_intraday_steps_df["date"]).dt.date

fitbit_sleep_daily_summary_df["sleep_date"] = pd.to_datetime(
    fitbit_sleep_daily_summary_df["sleep_date"]
).dt.date

# Survey timestamps look like 2017-12-11 17:36:06+00:00; only the date part is kept
# (as `survey_date`) and the original `survey_datetime` column is discarded.
survey_df = (
    survey_df
    .assign(survey_date=pd.to_datetime(survey_df["survey_datetime"]).dt.date)
    .drop(columns=["survey_datetime"])
)

print("Completed datetime conversions...")

Completed datetime conversions...


In [9]:
%%time

# Build a lookup table mapping every surveyed person_id to the list of calendar dates
# in the look-back window that ends on (and includes) their survey date.
# The window is 365 * 2 = 730 days, i.e. roughly two years (leap days are not
# compensated for). An earlier comment said "one year"; the code has always used 730 days.
LOOKBACK_DAYS = 365 * 2

# One row per person: the first row in file order, which is the same row the previous
# per-person boolean-mask lookup selected, but found in a single pass over the frame
# instead of one full scan per person.
# NOTE(review): "first row" is not necessarily the earliest survey, nor the
# "Overall Health" survey used later in the merge - confirm this is the intended anchor.
first_survey_rows = survey_df.drop_duplicates(subset="person_id", keep="first")

patient_dates = dict()

# `survey_day` (rather than `date`) avoids shadowing the `datetime.date` import.
for pid, survey_day in zip(first_survey_rows["person_id"], first_survey_rows["survey_date"]):
    window_start = survey_day - timedelta(days=LOOKBACK_DAYS)
    patient_dates[pid] = [
        stamp.date() for stamp in pd.date_range(start=window_start, end=survey_day)
    ]

print("Completed populating the data structure...")

Completed populating the data structure...
CPU times: user 58.2 s, sys: 321 ms, total: 58.5 s
Wall time: 58.5 s


## Fitbit Intraday Steps

In [10]:
%%time

def func(group, date_column_name="date"):
    """Return one person's step rows that fall inside their survey look-back window.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to one ``person_id`` (only the first row's id is read).
    date_column_name : str
        Column holding the ``datetime.date`` values tested against the window.

    Returns
    -------
    pd.DataFrame
        The in-window rows, sorted by ``date_column_name`` with a fresh 0..n-1 index.
        Empty when the person has no entry in ``patient_dates``.
    """
    pid = group["person_id"].iloc[0]
    # Default to an empty window: a person without a survey has no dates to keep,
    # whereas passing None to isin() would raise a TypeError.
    required_dates = patient_dates.get(pid, [])
    in_window = group[date_column_name].isin(required_dates)
    return group[in_window].sort_values(date_column_name).reset_index(drop=True)


# Iterate over the groups explicitly instead of appending to a global list from inside
# groupby.apply: apply may invoke the function more than once on the first group in
# older pandas (duplicating that person's rows) and newer pandas deprecates handing
# the grouping column to the applied function.
filtered_steps_data_list = [
    func(person_rows)
    for _, person_rows in fitbit_intraday_steps_df.groupby("person_id")
]

CPU times: user 1min 7s, sys: 528 ms, total: 1min 7s
Wall time: 1min 7s


In [11]:
%%time

# Stack the per-person step frames into one frame with a fresh 0..n-1 index.
filtered_steps_data = pd.concat(objs=filtered_steps_data_list, axis=0, ignore_index=True)

CPU times: user 5.21 s, sys: 53.9 ms, total: 5.26 s
Wall time: 5.26 s


In [12]:
# Number of distinct people that still have step rows after the window filter.
filtered_steps_data["person_id"].nunique()

12011

## Fitbit Sleep Summary

In [13]:
%%time

def func(group, date_column_name="sleep_date"):
    """Return one person's sleep rows that fall inside their survey look-back window.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to one ``person_id`` (only the first row's id is read).
    date_column_name : str
        Column holding the ``datetime.date`` values tested against the window.

    Returns
    -------
    pd.DataFrame
        The in-window rows, sorted by ``date_column_name`` with a fresh 0..n-1 index.
        Empty when the person has no entry in ``patient_dates``.
    """
    pid = group["person_id"].iloc[0]
    # Default to an empty window: a person without a survey has no dates to keep,
    # whereas passing None to isin() would raise a TypeError.
    required_dates = patient_dates.get(pid, [])
    in_window = group[date_column_name].isin(required_dates)
    return group[in_window].sort_values(date_column_name).reset_index(drop=True)


# Iterate over the groups explicitly instead of appending to a global list from inside
# groupby.apply: apply may invoke the function more than once on the first group in
# older pandas (duplicating that person's rows) and newer pandas deprecates handing
# the grouping column to the applied function.
filtered_sleep_data_list = [
    func(person_rows)
    for _, person_rows in fitbit_sleep_daily_summary_df.groupby("person_id")
]

CPU times: user 1min 7s, sys: 1.03 s, total: 1min 8s
Wall time: 1min 8s


In [14]:
%%time

# Stack the per-person sleep frames into one frame with a fresh 0..n-1 index.
filtered_sleep_data = pd.concat(objs=filtered_sleep_data_list, axis=0, ignore_index=True)

CPU times: user 6.34 s, sys: 165 ms, total: 6.51 s
Wall time: 6.5 s


In [15]:
# Number of distinct people that still have sleep rows after the window filter.
filtered_sleep_data["person_id"].nunique()

11591

# Merging

In [16]:
# Keep only the survey rows that answer the general-health question.
new_survey_df = survey_df.loc[survey_df["question"] == "Overall Health: General Health"]

In [17]:
%%time

"""
Now that we have all 3 dataframes i.e., survey data, filtered steps data, and filtered sleep data, 
we will merge these based on the person IDs and dates using the following strategy.
"""

intermediate_steps_sleep_df = pd.merge(filtered_steps_data, filtered_sleep_data, 
                                       how="inner", left_on = ["person_id", "date"], right_on = ["person_id", "sleep_date"])
final_df = pd.merge(intermediate_steps_sleep_df, new_survey_df, how="left", on="person_id")

CPU times: user 9.53 s, sys: 1.97 s, total: 11.5 s
Wall time: 11.6 s


In [18]:
# Summarise the merged frame: row count, column dtypes and memory footprint.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4226814 entries, 0 to 4226813
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   person_id            int64  
 1   date                 object 
 2   sum_steps            int64  
 3   sleep_date           object 
 4   is_main_sleep        bool   
 5   minute_in_bed        int64  
 6   minute_asleep        int64  
 7   minute_after_wakeup  int64  
 8   minute_awake         int64  
 9   minute_deep          float64
 10  minute_light         float64
 11  minute_rem           float64
 12  minute_wake          float64
 13  survey               object 
 14  question_concept_id  int64  
 15  question             object 
 16  answer_concept_id    float64
 17  answer               object 
 18  survey_date          object 
dtypes: bool(1), float64(5), int64(7), object(6)
memory usage: 616.7+ MB


# Dropping question and answer concept IDs

In [19]:
# Per-column missing-value percentages of the merged frame.
report_missingness(final_df)

Unnamed: 0,Column,Missing Percentage
0,person_id,0.0
1,date,0.0
2,sum_steps,0.0
3,sleep_date,0.0
4,is_main_sleep,0.0
5,minute_in_bed,0.0
6,minute_asleep,0.0
7,minute_after_wakeup,0.0
8,minute_awake,0.0
9,minute_deep,38.969825


In [20]:
# Remove the two concept-ID columns; the `question` and `answer` text columns stay.
# Rebinding (instead of inplace=True) together with errors="ignore" makes the cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
final_df = final_df.drop(
    columns=["question_concept_id", "answer_concept_id"], errors="ignore"
)

In [21]:
# Write the merged dataset to `final_data.csv` in the working directory, without the row index.
# NOTE(review): `my_bucket` is read from the environment earlier but is not used in the
# visible cells - confirm whether this file should also be copied to the workspace bucket.
final_df.to_csv("final_data.csv", index=False)