In [1]:
# Imports

import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Global NumPy seed: fixes the legacy RNG state so that the later shuffle of
# person ids yields the same order on every run.

SEED = 0
np.random.seed(seed=SEED)

In [3]:
# Workspace settings are read from the environment; each value is None when
# the corresponding variable is not set.
dataset = os.environ.get('WORKSPACE_CDR')
my_bucket = os.environ.get('WORKSPACE_BUCKET')

In [4]:
# Load the input table from a CSV file; the relative path is resolved against
# the current working directory.
data = pd.read_csv("overall_data.csv")

In [5]:
# Distinct raw values of the answer column, before any encoding.
data["answer"].unique()

array(['General Health: Excellent', 'General Health: Good',
       'General Health: Very Good', 'General Health: Fair', 'PMI: Skip',
       'General Health: Poor'], dtype=object)

1. The *answer* column has dtype object (strings), so it must be encoded as numbers.
2. The *is_main_sleep* column has dtype bool, so it must be converted to a numeric type.

In [6]:
# Ordinal codes for the survey answers: 5 (Excellent) down to 1 (Poor),
# with 0 reserved for a skipped question.
answer_option_map = dict([
    ("General Health: Excellent", 5),
    ("General Health: Very Good", 4),
    ("General Health: Good", 3),
    ("General Health: Fair", 2),
    ("General Health: Poor", 1),
    ("PMI: Skip", 0),
])

# Replace each answer string with its code. Series.map yields NaN for any
# value that is not a key of the mapping.
data["answer"] = data.answer.map(answer_option_map)

In [7]:
# Row count per encoded answer value.
data["answer"].value_counts()

4    1547212
3    1147015
5     523857
2     365262
0      63062
1      51742
Name: answer, dtype: int64

In [8]:
# Keep only rows whose encoded answer is not 0, i.e. discard the rows that
# were mapped from "PMI: Skip".
data = data.loc[data["answer"].ne(0)]

In [9]:
# Row count per answer value after the filter; code 0 should no longer appear.
data["answer"].value_counts()

4    1547212
3    1147015
5     523857
2     365262
1      51742
Name: answer, dtype: int64

In [10]:
# Alternative (currently disabled): encode the answer column with sklearn's LabelEncoder

# le = LabelEncoder()
# data["answer"] = le.fit_transform(data["answer"])

In [11]:
# Cast the boolean is_main_sleep flag to an 8-bit integer
# (False becomes 0, True becomes 1).

data["is_main_sleep"] = data.is_main_sleep.astype(np.int8)

# Aggregating each patient's year of data into a single row

In [12]:
# Remove the three date columns and the is_main_sleep flag from the table.
# NOTE(review): `features` is reassigned in a later cell before it is read,
# so the list defined here looks unused - confirm before relying on it.

features = [
    "answer",
    "sum_steps",
    "minute_in_bed",
    "minute_asleep",
    "minute_after_wakeup",
    "minute_awake",
    "minute_deep",
    "minute_light",
    "minute_rem",
    "minute_wake",
]

data = data.drop(["date", "sleep_date", "survey_date", "is_main_sleep"], axis=1)

In [13]:
# Collapse all rows of each person into one row per person_id:
# data_one holds the per-column mean, data_two the per-column median.
data_one = data.groupby("person_id").mean()

# reset_index is chained so that person_id becomes an ordinary column of
# data_two again (data_one keeps person_id as its index).
data_two = data.groupby("person_id").median().reset_index()

# Splitting the data using 80-20 split

In [14]:
# Re-seed right before shuffling so the permutation does not depend on any
# random draws made by earlier cells.
np.random.seed(seed=SEED)

# Fraction of persons assigned to the training split.
train_test_split_ratio = 0.8

# Distinct person ids, in order of first appearance in `data`.
all_pids = data.person_id.unique()

# Shuffle a copy in place; all_pids itself keeps its original order.
all_shuffled_pids = all_pids.copy()
np.random.shuffle(all_shuffled_pids)

In [15]:
# Split the *shuffled* person ids: the leading train_test_split_ratio fraction
# goes to train, the remainder to test. Slicing the unshuffled all_pids here
# would make the split follow the row order of the input file and leave the
# seeded shuffle without any effect.
split_index = int(len(all_shuffled_pids) * train_test_split_ratio)

train_pids = all_shuffled_pids[:split_index]
test_pids = all_shuffled_pids[split_index:]

In [16]:
# Route each aggregated (median) row to train or test by its person_id.
train_data = data_two.loc[data_two.person_id.isin(train_pids)]
test_data = data_two.loc[data_two.person_id.isin(test_pids)]

In [17]:
# Columns kept for the exported arrays, in this order. person_id is not in
# the list, so it is not carried into the selected frames.
features = [
    "answer",
    "sum_steps",
    "minute_in_bed",
    "minute_asleep",
    "minute_awake",
    "minute_deep",
    "minute_light",
    "minute_rem",
    "minute_wake",
]

train_data = train_data.loc[:, features]
test_data = test_data.loc[:, features]

In [18]:
scaler = StandardScaler()

# Every selected column except "answer" is standardized.
columns_to_normalize = ["sum_steps", "minute_in_bed", "minute_asleep", "minute_awake",
                        "minute_deep", "minute_light", 
                        "minute_rem", "minute_wake"]

# Work on copies so the unscaled train_data / test_data stay available.
train_data_norm = train_data.copy()
test_data_norm = test_data.copy()

# Learn the per-column mean and standard deviation from the training split only.
train_data_norm[columns_to_normalize] = scaler.fit_transform(train_data_norm[columns_to_normalize])

# Apply the training statistics to the test split. Calling fit_transform here
# would re-fit the scaler on the test data, so train and test would be scaled
# differently and test-set statistics would leak into preprocessing.
test_data_norm[columns_to_normalize] = scaler.transform(test_data_norm[columns_to_normalize])

In [19]:
# Persist both normalized splits as .npy files in the working directory.
train_file_name = "train.npy"
test_file_name = "test.npy"

# Plain NumPy arrays, with columns in the DataFrames' column order.
train = train_data_norm.to_numpy()
test = test_data_norm.to_numpy()

np.save(train_file_name, train)
np.save(test_file_name, test)