In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import random

import dask
import dask.array as da
import dask.dataframe as dd

In [None]:
%%time
import polars as pl
# Lazily scan the training series, parse the timestamp strings with the format
# below, then materialise the result as a pandas frame.
train_series_path = '/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet'
timestamp_expr = pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")

train_series = (
    pl.scan_parquet(train_series_path)
    .with_columns([timestamp_expr])
    .collect()
    .to_pandas()
)

### Memory reduction
The following function has been taken from the notebook 
<a 
   href=https://www.kaggle.com/code/renatoreggiani/feat-eng-ideas-60-memory-reduction-cmi>
    🛠Feat Eng ideas & 60% memory reduction 🛠- CMI
</a>

In [None]:
from pandas.api.types import is_datetime64_ns_dtype
import gc

import warnings
# Silence every warning category for the rest of the session; this also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings("ignore")

def reduce_mem_usage(df):
    """
    Downcast the numeric columns of a dataframe to reduce its memory usage.

    Only plain NumPy integer and float columns are converted:

    * signed integers go to the narrowest of int8/int16/int32 that holds
      the column's min and max (bounds inclusive);
    * unsigned integers go to the narrowest of uint8/uint16/uint32;
    * floats wider than 32 bits go to float32 when their range fits.

    A column is never widened. Every other dtype (bool, object, datetime of
    any resolution, category, other extension dtypes) is left untouched, so
    the function can safely be applied again to a frame it already processed.
    If a ``series_id`` column is present it is converted to ``category``.

    The memory footprint before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; among NumPy dtypes only
        # signed ints ('i'), unsigned ints ('u') and floats ('f') are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            limits = np.finfo(np.float32)
            # Comparisons with NaN/inf are False, so all-NaN or infinite
            # columns keep their dtype. float16 is never chosen because of
            # its low precision.
            if col_type.itemsize > 4 and c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            continue

        candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
        for candidate in candidates:
            # Candidates are ordered by width: stop once they no longer save memory.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = np.iinfo(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    # The identifier column is optional so the helper also works on frames
    # that do not carry it.
    if 'series_id' in df.columns:
        df['series_id'] = df['series_id'].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Decreased by {decrease:.2f}%')

    return df

In [None]:
# Downcast the numeric columns of the training series; reduce_mem_usage
# modifies the frame it receives and returns that same frame.
train_series = reduce_mem_usage(train_series)

In [None]:
train_series.head()

In [None]:
# Parse the event timestamps once and derive the calendar parts from that
# single expression; the parsed value replaces the original "timestamp" column.
train_events_path = '/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv'
event_timestamp = pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")

train_events = (
    pl.scan_csv(train_events_path)
    .with_columns([
        event_timestamp,
        event_timestamp.dt.year().alias("year"),
        event_timestamp.dt.month().alias("month"),
        event_timestamp.dt.day().alias("day"),
        event_timestamp.dt.hour().alias("hour"),
    ])
    .collect()
    .to_pandas()
)

In [None]:
train_events.head()

# Preprocess the data

## Sort the datasets by the timestamp

In [None]:
# Order every reading chronologically and rebuild a 0..n-1 index.
# NOTE(review): the sort key is the timestamp alone, so rows that belong to
# different series_id values end up interleaved — confirm this is intended.
train_series = train_series.sort_values('timestamp').reset_index(drop=True)
train_series.head()

In [None]:
# Order the events chronologically and rebuild a 0..n-1 index.
# NOTE(review): as with the series frame, the sort ignores series_id, so
# consecutive rows may come from different series — confirm this is intended.
train_events = train_events.sort_values('timestamp').reset_index(drop=True)
train_events.head()

## Sleep hours distribution

Create the dataframe with the sleep hours as follows: <br>
- Compute the difference between consecutive rows of the timestamp column
- Extract the hour and day components from the difference
- Encode the events and compute their difference
- Create a pandas dataframe named train_sleep 

Since there are missing values in the dataframe that can make the difference large, it is good to remove the rows with day > 0 (assuming nobody has slept for more than 24 hours) <br>
After the difference, the events are equal to -1 and +1, corresponding to hours of sleep and hours of wakefulness, respectively. <br>
For our purpose, only the events = -1 will be kept.

In [None]:
### Difference between the rows of the timestamp column
### Difference between consecutive events of the same series
# train_events is ordered by timestamp only, so neighbouring rows can belong to
# different series. Differencing inside each series_id guarantees that every
# interval is measured between two events of the same recording.
events_by_series = train_events.groupby('series_id', sort=False)
event_gap = events_by_series.timestamp.diff().dt.components   # one row per event, NaN where no previous timestamp

train_events_hours = event_gap['hours']   # Hour component of the gap (whole hours; minutes are dropped)
train_events_days  = event_gap['days']    # Day component of the gap
# Encode the events and difference them per series: -1 marks onset -> wakeup
# (a sleep interval), +1 marks wakeup -> onset.
train_events_event = (train_events.event.map({'onset':1, 'wakeup':0})
                      .groupby(train_events.series_id, sort=False).diff())

In [None]:
### Create the pandas dataframe
### Assemble the sleep-interval frame
# Each Series becomes one row of the intermediate frame; transposing turns them
# into columns named after `sleep_columns`. Rows with any missing value are dropped.
sleep_columns = ['month', 'day', 'days_sleep', 'hours_sleep', 'event']
sleep_rows = [train_events.month, train_events.day, train_events_days, train_events_hours, train_events_event]
train_sleep = pd.DataFrame(sleep_rows, index=sleep_columns).T.dropna()

# Keep only intervals shorter than a day whose event difference is -1.
is_sleep_interval = (train_sleep.days_sleep == 0) & (train_sleep.event == -1)
train_sleep = train_sleep[is_sleep_interval].reset_index(drop=True)

In [None]:
train_sleep.head()

In [None]:
train_sleep.describe()

## Interpolation of the timestamp

In [None]:
train_events.head(15)

In [None]:
# Fill the missing event timestamps by interpolating along the row order and
# write the result back to the same column.
# NOTE(review): after the timestamp sort the missing values presumably sit at
# the end of the frame, where interpolation can only repeat the last known
# timestamp — confirm the filled values are meaningful.
train_events['timestamp'] = train_events.timestamp.interpolate()

In [None]:
train_events.head(15)

## Merge the event and series datasets

In [None]:
train_series.shape

In [None]:
# Keep only the timestamp and the event label.
# NOTE(review): series_id is not carried along in this selection.
train_events_tomerge = train_events[['timestamp', 'event']]
train_events_tomerge.head()

In [None]:
train_events_tomerge.isna().sum()

In [None]:
# Join on (series_id, timestamp). The same wall-clock timestamp can occur in
# several series, so matching on the timestamp alone would attach an event to
# rows of unrelated recordings and could duplicate rows of train_series.
events_for_merge = (
    train_events[['series_id', 'timestamp', 'event']]
    # Give the key the same dtype as in train_series so the merged key column
    # keeps that dtype instead of falling back to a wider common type.
    .astype({'series_id': train_series.series_id.dtype})
)
train_series_merged = train_series.merge(events_for_merge, how='left', on=['series_id', 'timestamp'])

In [None]:
train_series_merged.head()

## Fill the missing values in the event column

Using the method 'ffill', the missing values will be replaced with the previous non-missing value

In [None]:
# Propagate the last seen event label forward within each series, so a label
# never leaks from one recording into rows of another one that happen to follow
# it in the frame. ffill() replaces fillna(method='ffill'), which is deprecated
# in recent pandas releases.
train_series_merged['event'] = (
    train_series_merged.groupby('series_id', observed=True)['event'].ffill()
)

With this approach, the first rows with missing values are not replaced because there is no value before them. <br>
Checking the train_events dataset, the first event is 'onset', which corresponds to the beginning of sleep.  <br>
For this reason, the state before 'onset' is 'wakeup'. <br>
Let's fill the remaining rows with missing values with 'wakeup'. <br>

In [None]:
# Rows with no earlier event are still empty after the forward fill; label them 'wakeup'.
train_series_merged['event'] = train_series_merged.event.fillna('wakeup')

Check for missing values in the event column

In [None]:
train_series_merged.event.isna().sum()

# Explore the data

## Data information

In [None]:
train_events.info()

## Explore the train_events dataset

In [None]:
train_events.describe()

In [None]:
# Count the distinct series identifiers (missing values, if any, count as one value).
n_event_series = train_events.series_id.nunique(dropna=False)
print('Number of unique identifiers for each series of accelerometer data:', n_event_series)

### Hours distribution

In [None]:
# Drop incomplete event rows once and reuse the result for both plot layers.
events_complete = train_events.dropna()

plt.figure(figsize=(10,4))
plt.title('Hour distribution with onset and wakeup')
# Histogram per event type, one bin per hour of the day.
sns.histplot(
    x=events_complete.hour,
    hue=events_complete.event,
    stat='density',
    bins=24,
    binrange=(-0.5, 23.5),
)
# Overall density of the event hour drawn on top of the histogram.
sns.kdeplot(events_complete.hour, bw_adjust=0.45)

In [None]:
# Whole-hour sleep durations, one histogram bin per hour from 0 to 18.
sleep_hours = train_sleep.hours_sleep

plt.figure(figsize=(10,4))
plt.title('Sleep hours distribution')
sns.histplot(x=sleep_hours, stat='density', bins=19, binrange=(-0.5, 18.5))
sns.kdeplot(sleep_hours, bw_adjust=2, color='red')

In [None]:
# Mean sleep duration for each value of the `day` column, computed once.
mean_sleep_by_day = train_sleep.groupby('day').hours_sleep.mean()

plt.figure(figsize=(15,3))
plt.title('Average sleep hours per day')
sns.barplot(x=mean_sleep_by_day.index, y=mean_sleep_by_day.values)

In [None]:
# Mean sleep duration for each value of the `month` column, computed once.
mean_sleep_by_month = train_sleep.groupby('month').hours_sleep.mean()

plt.figure(figsize=(10,3))
plt.title('Average sleep hours per month')
sns.barplot(x=mean_sleep_by_month.index, y=mean_sleep_by_month.values)

The average sleep duration is about 8 hours, with no clear difference between days or between months. It could have been hypothesized that sleep would be longer in the summer months and on weekends: since parents are not working on those days and can sleep more, the children's sleeping hours would increase as well. This pattern is not visible in the data, so the hypothesis is not supported. Note that the daily plot groups by day of the month rather than day of the week, so the weekend effect is only tested indirectly.

## Explore the train_series_merged dataset

In [None]:
train_series.info()

In [None]:
train_series.isna().sum()

In [None]:
# Count the distinct series identifiers (missing values, if any, count as one value).
n_series = train_series.series_id.nunique(dropna=False)
print(f'In the dataset there are {n_series} accelerometer series')

In [None]:
# Mean z-angle of every series, computed once and reused for both axes.
mean_anglez = train_series.groupby('series_id').anglez.mean()

plt.figure(figsize=(14,3))
plt.title('Average z-angle per series')
# One bar per series: the identifiers go on x and the means on y. Passing the
# means as both x and y would plot the values against themselves.
sns.barplot(x=mean_anglez.index, y=mean_anglez.values)
# The identifiers are too many to be readable as tick labels, so hide them.
g = plt.xticks(ticks=[])

Take 5 random series_id and explore them

In [None]:
# Draw 5 distinct series. random.choices samples with replacement and could
# return the same id more than once; random.sample never repeats an element.
# A dedicated seeded generator makes the selection, and therefore the plots and
# the commentary that follow, reproducible across runs without touching the
# global random state.
series_rng = random.Random(42)
series_id_selected = series_rng.sample(list(train_series_merged.series_id.unique()), k=5)
series_id_selected

Extract a subset of the dataset with the selected series id

In [None]:
# Boolean mask of the rows that belong to one of the selected series.
in_selection = train_series_merged.series_id.isin(series_id_selected)
train_series_sub = train_series_merged.loc[in_selection].reset_index(drop=True)
train_series_sub.shape

## Convert from datetime to timestamp

In [None]:
# Convert the datetime column to its integer representation and divide by 1e9.
# NOTE(review): the result is in seconds only if the column has nanosecond
# resolution — confirm the dtype produced by the polars -> pandas conversion.
train_series_sub['timestamp_new'] = train_series_sub.timestamp.astype(int).div(10**9)

In [None]:
train_series_sub.head()

In [None]:
%%time
# Readings of the first selected series, coloured by the event label.
first_series = train_series_sub[train_series_sub.series_id == series_id_selected[0]]

plt.figure(figsize=(12,3))
sns.scatterplot(data=first_series, x='timestamp', y='anglez', hue='event')

In [None]:
%%time
# Filter the first selected series once instead of once per plotted column.
first_series = train_series_sub[train_series_sub.series_id == series_id_selected[0]]

plt.figure(figsize=(12,3))
# Product of the two signals over time, coloured by the event label.
sns.scatterplot(
    x=first_series.timestamp,
    y=first_series.anglez * first_series.enmo,
    hue=first_series.event,
)

In [None]:
%%time
for ids in series_id_selected:
    # Rows of the current series, filtered once per iteration.
    series_frame = train_series_sub[train_series_sub.series_id == ids]

    plt.figure(figsize=(12,3))
    plt.title(f'z-angle distribution with events for {ids} id')
    # Histogram of the absolute z-angle, split by event label.
    sns.histplot(
        data=series_frame,
        x=series_frame.anglez.abs(),
        hue='event',
        fill=True,
        bins=100,
        alpha=0.3,
    )
    plt.show()

In [None]:
%%time
for ids in series_id_selected:
    # Rows of the current series, filtered once per iteration.
    series_frame = train_series_sub[train_series_sub.series_id == ids]

    plt.figure(figsize=(12,3))
    # This loop plots the ENMO column, so the title names ENMO rather than
    # the z-angle.
    plt.title(f'ENMO distribution with events for {ids} id')
    sns.histplot(data=series_frame, x='enmo', hue='event', fill=True, bins=100, alpha=0.3)
    # Counts span several orders of magnitude, hence the logarithmic y axis.
    plt.yscale('log')
    plt.show()

From the z-angle distribution it follows that: <br>
- the wakeup probability decreases as the absolute value of the angle increases. 
- the onset probability is almost constant for all the angle values, except for a few peaks

From the ENMO distribution, the count of both events decreases as the ENMO increases

In [None]:
%%time
# Filter the first selected series once and reuse it for the data and the x values.
first_series = train_series_sub[train_series_sub.series_id == series_id_selected[0]]

# NOTE(review): np.sin interprets its argument in radians; if anglez is stored
# in degrees it should be converted first (np.deg2rad) — confirm the unit.
angle_weighted_enmo = np.sin(first_series.anglez) * first_series.enmo

plt.figure(figsize=(12,3))
# The plotted quantity is sin(anglez) * enmo, so the title says so.
plt.title('sin(z-angle) * ENMO distribution with events')
sns.histplot(data=first_series, x=angle_weighted_enmo,
             hue='event', fill=True, bins=100, alpha=0.3)
plt.yscale('log')

In the ENMO, all the zero values correspond to zero or negative values rounded to zero.

In [None]:
train_series_merged.memory_usage().sum() / 1024**2

# Preprocess the test dataset

In [None]:
%%time
# Lazily scan the test series, parse the timestamp strings with the format
# below, then materialise the result as a pandas frame.
test_series_path = '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet'
test_timestamp_expr = pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")

test_series = (
    pl.scan_parquet(test_series_path)
    .with_columns([test_timestamp_expr])
    .collect()
    .to_pandas()
)

In [None]:
# Downcast the numeric columns of the test series; reduce_mem_usage modifies
# the frame it receives and returns that same frame.
test_series = reduce_mem_usage(test_series)

# Save the datasets

In [None]:
# Write the merged training frame and the downcast test frame as parquet files
# into the notebook's working directory.
train_series_merged.to_parquet('/kaggle/working/train_series_preprocessed.parquet')
test_series.to_parquet('/kaggle/working/test_series_preprocessed.parquet')

Because the dataset is large, the model is built in a separate notebook to keep run times short: <a href=https://www.kaggle.com/code/enricomanosperti/detect-sleep-states-preprocessing-and-eda> Detect Sleep States: Model</a> <br>
The saved dataset is <a href=https://www.kaggle.com/datasets/enricomanosperti/sleep-states-preprocessed-dataset> Sleep States Preprocessed Dataset</a>
