<a href="https://www.kaggle.com/code/yaaangzhou/detect-sleep-baseline-modeling?scriptVersionId=142426173" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Created by Yang Zhou**

**BaseLine Modeling📈**

**8 Sep 2023**

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">[Detect Sleep]BaseLine Modeling📈</center>
<p><center style="color:#949494; font-family: consolas; font-size: 20px;">Detect sleep onset and wake from wrist-worn accelerometer data</center></p>

***

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">Overview</center>

# 0. Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import datetime as dt
from copy import deepcopy
from functools import partial
from itertools import combinations
from itertools import groupby

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Statistical Tests
from scipy.stats import f_oneway

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Model Selection
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

# Models
import optuna
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from lightgbm import LGBMClassifier, LGBMRegressor



import warnings
warnings.filterwarnings('ignore')

In [2]:
# Adjusting plot style

rc = {
    "axes.facecolor": "#F8F8F8",
    "figure.facecolor": "#F8F8F8",
    "axes.edgecolor": "#000000",
    "grid.color": "#EBEBE7" + "30",
    "font.family": "serif",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4
}

sns.set(rc=rc)
palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

from colorama import Style, Fore
blk = Style.BRIGHT + Fore.BLACK
mgt = Style.BRIGHT + Fore.MAGENTA
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL

# 1. Load Data

In [3]:
train = pd.read_parquet("/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet")
test  = pd.read_parquet("/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet")

sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv')

# 2. Preprocessing

In [4]:
train["timestamp"] = pd.to_datetime(train["timestamp"],utc=True)
train["hour"] = train["timestamp"].dt.hour

test["timestamp"] = pd.to_datetime(test["timestamp"],utc=True)
test["hour"] = test["timestamp"].dt.hour

In [5]:
train.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,hour
0,08db4255286f,0,2018-11-05 14:00:00+00:00,-30.845301,0.0447,1,14
1,08db4255286f,1,2018-11-05 14:00:05+00:00,-34.181801,0.0443,1,14
2,08db4255286f,2,2018-11-05 14:00:10+00:00,-33.877102,0.0483,1,14
3,08db4255286f,3,2018-11-05 14:00:15+00:00,-34.282101,0.068,1,14
4,08db4255286f,4,2018-11-05 14:00:20+00:00,-34.385799,0.0768,1,14


In [6]:
X = train[["anglez","enmo","hour"]]
y = train["awake"]
pred = test[["anglez","enmo","hour"]]

del train

In [7]:
X.head()

Unnamed: 0,anglez,enmo,hour
0,-30.845301,0.0447,14
1,-34.181801,0.0443,14
2,-33.877102,0.0483,14
3,-34.282101,0.068,14
4,-34.385799,0.0768,14


In [8]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: awake, dtype: int64

# 3. Modeling

In [9]:
xgb_cv_scores, xgb_preds = list(), list()
lgbm_cv_scores, lgbm_preds = list(), list()
rf_cv_scores, rf_preds = list(), list()

kf = KFold(n_splits=2, random_state=42, shuffle=True)

for i, (train_ix, test_ix) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = y.iloc[train_ix], y.iloc[test_ix]
    
    print('---------------------------------------------------------------')
    
    ## XGBoost
    xgb_md = XGBClassifier().fit(X_train, Y_train)
    xgb_pred = xgb_md.predict(X_test)   
    xgb_score_fold = average_precision_score(Y_test, xgb_pred)
    print('Fold', i+1, '==> XGBoost oof score is ==>', xgb_score_fold)
    xgb_cv_scores.append(xgb_score_fold)
    
    ## Pred
    xgb_pred_test = xgb_md.predict_proba(pred)[:, 1]
    xgb_preds.append(xgb_pred_test)
    
    ## LGBM
    lgbm_md = LGBMClassifier().fit(X_train, Y_train)
    lgbm_pred = lgbm_md.predict(X_test) 
    lgbm_score_fold = average_precision_score(Y_test, lgbm_pred)
    print('Fold', i+1, '==> LGBM oof score is ==>', lgbm_score_fold)
    lgbm_cv_scores.append(lgbm_score_fold)

    ## Pred
    lgbm_pred_test = lgbm_md.predict_proba(pred)[:, 1]
    lgbm_preds.append(lgbm_pred_test)
    
    ## RF
    rf_md = RandomForestClassifier().fit(X_train, Y_train)
    rf_pred = rf_md.predict(X_test) 
    rf_score_fold = average_precision_score(Y_test, rf_pred)
    print('Fold', i+1, '==> RF oof score is ==>', rf_score_fold)
    rf_cv_scores.append(rf_score_fold)

    ## Pred
    rf_pred_test = rf_md.predict_proba(pred)[:, 1]
    rf_preds.append(rf_pred_test)
    
print('---------------------------------------------------------------')
print('Average Score of XGBoost model is:', np.mean(xgb_cv_scores))
print('Average Score of LGBM model is:', np.mean(lgbm_cv_scores))
print('Average Score of RF model is:', np.mean(rf_cv_scores))

---------------------------------------------------------------
Fold 1 ==> XGBoost oof score is ==> 0.9279365205327874
Fold 1 ==> LGBM oof score is ==> 0.9223327038710719
Fold 1 ==> RF oof score is ==> 0.9508757898456589
---------------------------------------------------------------
Fold 2 ==> XGBoost oof score is ==> 0.9276036343656224
Fold 2 ==> LGBM oof score is ==> 0.9221822886005724
Fold 2 ==> RF oof score is ==> 0.9506316324495628
---------------------------------------------------------------
Average Score of XGBoost model is: 0.9277700774492049
Average Score of LGBM model is: 0.9222574962358221
Average Score of RF model is: 0.9507537111476109


# 4. Submission

In [10]:
test_predss = np.average(np.array(xgb_cv_scores), axis=0)

In [11]:
test['score'] = test_predss
test["not_awake"] = 1-test["score"]
# exponential smoothing of the predictions
test["smooth"] = test["not_awake"].ewm(span = 100).mean()
# re-binarize
test["smooth"] = test["smooth"].round()

# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    lstCV = zip(df.series_id, df.smooth)
    lstPOI = []
    for (c, v), g in groupby(lstCV, lambda cv: 
                            (cv[0], cv[1]!=0 and not pd.isnull(cv[1]))):
        llg = sum(1 for item in g)
        if v is False: 
            lstPOI.extend([0]*llg)
        else: 
            lstPOI.extend(['onset']+(llg-2)*[0]+['wakeup'] if llg > 1 else [0])
    return lstPOI

test["event"] = get_event(test)

In [12]:
submission = test.loc[test["event"] != 0][["series_id","step","event","score"]].copy().reset_index(drop=True).reset_index(names="row_id")
submission.to_csv('submission.csv', index=False)