<a href="https://www.kaggle.com/code/yaaangzhou/zzz-baseline-xgb-model-for-beginners?scriptVersionId=142576817" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Created by Yang Zhou**

**[ZZz]Baseline Model for beginners**

**11 Sep 2023**

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">[ZZz]Baseline Model for beginners</center>
<p><center style="color:#949494; font-family: consolas; font-size: 20px;">Detect sleep onset and wake from wrist-worn accelerometer data</center></p>

***

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">Previous Work</center>

1. In my previous work, I completed the parte of EDA and the extraction of training data. This Notebook will only contain the part of building the model.
2. My preivous Notebook:
- [Memory Reduce, Preprocessing and EDA](https://www.kaggle.com/code/yaaangzhou/zzz-memory-reduce-preprocessing-and-eda)
- [Clean dataset for modeling](https://www.kaggle.com/code/yaaangzhou/zzz-clean-dataset-for-modeling)
- From this [url](https://www.kaggle.com/datasets/yaaangzhou/zzz52-series-train-data) you can find the final dataset for modeling.

# 0. Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import datetime as dt
from copy import deepcopy
from functools import partial
from itertools import combinations
from itertools import groupby

# Model Selection
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score

# Models
import optuna
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

# 1. Load Data

In [2]:
train = pd.read_parquet("/kaggle/input/zzz52-series-train-data/57series_data.parquet")
test  = pd.read_parquet("/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet")

sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv')

# 2. Preprocessing

In [3]:
def data_preprocessing(df):
    df["timestamp"] = pd.to_datetime(df["timestamp"],utc=True)
    df["hour"] = df["timestamp"].dt.hour
    # feature cross
    df["anglez_times_enmo"] = abs(df["anglez"]) * df["enmo"]
    # "rolling" features
    periods = 50
    df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).fillna(method="bfill")
    df["enmo_diff"]   = df.groupby('series_id')['enmo'].diff(periods=periods).fillna(method="bfill")
    df["anglez_rolling"] = df["anglez"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    df["enmo_rolling"]   = df["enmo"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    df["anglez_diff_rolling"] = df["anglez_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    df["enmo_diff_rolling"]   = df["enmo_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    
    return df

In [4]:
train = data_preprocessing(train)
test = data_preprocessing(test)

In [5]:
train.sample(3)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling
10650476,a9a2f7fac455,556436,2017-11-29 01:34:40+00:00,22.944,0.3445,1,1,7.904208,66.614105,0.2954,-10.493716,0.107918,8.235094,-0.099216
15418987,7df249527c63,136087,2017-11-14 17:45:35+00:00,0.1309,0.0577,1,17,0.007553,-23.998201,0.0555,-11.98451,0.078106,-21.026764,0.028206
4672092,55a47ff9dc8a,22332,2018-02-17 01:01:00+00:00,-2.2113,0.023,1,1,0.05086,-4.2325,-0.0184,0.674746,0.025424,-0.782752,0.000212


In [6]:
test.sample(3)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling
55,038441c925bb,55,2018-08-14 19:34:35+00:00,-79.993202,0.0135,19,1.079908,-82.629906,-0.0082,-80.011252,0.013508,-86.856677,-0.007742
93,038441c925bb,93,2018-08-14 19:37:45+00:00,-79.988197,0.0128,19,1.023849,-0.0103,-0.0007,-80.065726,0.01309,-24.111232,-0.002082
389,0402a003dae9,89,2018-12-18 17:52:25+00:00,-8.8888,0.0623,17,0.553772,-35.908199,-0.1167,-34.508904,0.074056,-19.298324,-0.015162


In [7]:
features = ["hour","anglez_times_enmo","anglez","anglez_diff","anglez_rolling","anglez_diff_rolling","enmo","enmo_diff","enmo_rolling","enmo_diff_rolling"]

# 3. Modeling

In [8]:
%%time

classifier = RandomForestClassifier(n_estimators=50,
                                    max_depth=4,
                                    min_samples_leaf=100)

X_train = train[features]
y_train = train["awake"]

classifier.fit(X_train, y_train)

CPU times: user 40min 28s, sys: 41.2 s, total: 41min 10s
Wall time: 41min 10s


In [9]:
X_test = test[features]
test["score"] = classifier.predict_proba(X_test)[:,1]

In [10]:
test["not_awake"] = 1-test["score"]
# smoothing of the predictions
smoothing_length = 2*250
test["smooth"] = test["not_awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")
# re-binarize
test["smooth"] = test["smooth"].round()

# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    lstCV = zip(df.series_id, df.smooth)
    lstPOI = []
    for (c, v), g in groupby(lstCV, lambda cv: 
                            (cv[0], cv[1]!=0 and not pd.isnull(cv[1]))):
        llg = sum(1 for item in g)
        if v is False: 
            lstPOI.extend([0]*llg)
        else: 
            lstPOI.extend(['onset']+(llg-2)*[0]+['wakeup'] if llg > 1 else [0])
    return lstPOI

test["event"] = get_event(test)

In [11]:
sample_submission = test.loc[test["event"] != 0][["series_id","step","event","score"]].copy().reset_index(drop=True).reset_index(names="row_id")
sample_submission.to_csv('submission.csv', index=False)