In [1]:
# Imports

import pandas as pd
import numpy as np

In [2]:
def report_missingness(df: pd.DataFrame) -> dict:
    missing_percentage = df.isnull().mean() * 100

    # Create a DataFrame to display the results
    missing_info = pd.DataFrame({
        'Column': missing_percentage.index,
        'Missing Percentage': missing_percentage.values
    })

    # Print the result
    return missing_info

In [3]:
df = pd.read_csv("final_data.csv")

In [4]:
%%time

# Strategy for imputing NaNs: Impute mean (of each patient) of each columns for NaN values

final_df = df.groupby("person_id").transform(lambda x: x.fillna(x.mean()))
final_df = pd.concat([df[["person_id", "answer", "date", "sleep_date", "survey_date"]], final_df], axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


CPU times: user 32.3 s, sys: 1.85 s, total: 34.2 s
Wall time: 34.2 s


In [5]:
report_missingness(final_df)

Unnamed: 0,Column,Missing Percentage
0,person_id,0.0
1,answer,0.0
2,date,0.0
3,sleep_date,0.0
4,survey_date,0.0
5,sum_steps,0.0
6,is_main_sleep,0.0
7,minute_in_bed,0.0
8,minute_asleep,0.0
9,minute_after_wakeup,0.0


# This implies that there are patients who not have any values for 'minute_deep', 'minute_light', 'minute_rem', and 'minute_wake'

In [6]:
final_df = final_df.dropna().reset_index(drop=True)

In [7]:
report_missingness(final_df)

Unnamed: 0,Column,Missing Percentage
0,person_id,0.0
1,answer,0.0
2,date,0.0
3,sleep_date,0.0
4,survey_date,0.0
5,sum_steps,0.0
6,is_main_sleep,0.0
7,minute_in_bed,0.0
8,minute_asleep,0.0
9,minute_after_wakeup,0.0


In [8]:
final_df.to_csv("overall_data.csv", index=False)