<a href="https://www.kaggle.com/code/yaaangzhou/zzz-baseline-xgb-model-for-beginners?scriptVersionId=142622512" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Created by Yang Zhou**

**[ZZz]Baseline Model for beginners**

**11 Sep 2023**

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">[ZZz]Baseline Model for beginners</center>
<p><center style="color:#949494; font-family: consolas; font-size: 20px;">Detect sleep onset and wake from wrist-worn accelerometer data</center></p>

***

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">Previous Work</center>

1. In my previous work, I completed the EDA and the extraction of the training data. This notebook contains only the model-building part.
2. My previous notebooks:
- [Memory Reduce, Preprocessing and EDA](https://www.kaggle.com/code/yaaangzhou/zzz-memory-reduce-preprocessing-and-eda)
- [Clean dataset for modeling](https://www.kaggle.com/code/yaaangzhou/zzz-clean-dataset-for-modeling)
- From this [url](https://www.kaggle.com/datasets/yaaangzhou/zzz52-series-train-data) you can find the final dataset for modeling.

# 0. Imports

In [1]:
import datetime as dt
import warnings
from copy import deepcopy
from functools import partial
from itertools import combinations
from itertools import groupby

import numpy as np
import pandas as pd

# Model Selection
from sklearn.cluster import KMeans
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score

# Models
import optuna
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# 1. Load Data

In [2]:
# Submission template shipped with the competition data.
sample_submission = pd.read_csv(
    '/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv'
)

# Pre-extracted training rows (built in the earlier notebooks linked above)
# and the competition's test series.
train = pd.read_parquet(
    "/kaggle/input/zzz52-series-train-data/57series_data.parquet"
)
test = pd.read_parquet(
    "/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet"
)

# 2. Preprocessing

In [3]:
def data_preprocessing(df, periods=50):
    """Add hour-of-day and windowed accelerometer features to ``df``.

    The frame is modified in place (new columns are added and ``timestamp``
    is converted to a UTC datetime) and the same object is returned.

    All windowed features are computed separately for every ``series_id``:
    lagged differences, centred rolling means and the edge back/forward
    fills never read values from a neighbouring series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``series_id``, ``timestamp``, ``anglez`` and ``enmo``.
        Rows are assumed to be in time order within each series (not
        checked here).
    periods : int, default 50
        Lag used for the differences and window length of the centred
        rolling means, in rows.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``hour``, ``anglez_times_enmo``, ``*_diff``,
        ``*_rolling`` and ``*_diff_rolling`` columns added.

    Notes
    -----
    A series with no more than ``periods`` rows cannot produce a complete
    lag/window, so its windowed features may stay NaN instead of being
    filled from another series.
    """
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["hour"] = df["timestamp"].dt.hour
    # feature cross
    df["anglez_times_enmo"] = df["anglez"].abs() * df["enmo"]

    def _lagged_diff(column):
        # Difference against the value `periods` rows earlier in the same
        # series; the leading gap is back-filled from within that series.
        return df.groupby("series_id", sort=False)[column].transform(
            lambda values: values.diff(periods=periods).bfill()
        )

    def _centered_mean(column):
        # Centred rolling mean restricted to one series; incomplete windows
        # at both ends are filled with the nearest complete one.
        return df.groupby("series_id", sort=False)[column].transform(
            lambda values: values.rolling(periods, center=True).mean().bfill().ffill()
        )

    # "rolling" features
    df["anglez_diff"] = _lagged_diff("anglez")
    df["enmo_diff"] = _lagged_diff("enmo")
    df["anglez_rolling"] = _centered_mean("anglez")
    df["enmo_rolling"] = _centered_mean("enmo")
    # These read the *_diff columns assigned just above.
    df["anglez_diff_rolling"] = _centered_mean("anglez_diff")
    df["enmo_diff_rolling"] = _centered_mean("enmo_diff")

    return df

In [4]:
# Build the same engineered columns on the training and the test rows.
train, test = data_preprocessing(train), data_preprocessing(test)

In [5]:
# Spot-check three random training rows after feature engineering.
train.sample(n=3)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling
3091457,31011ade7c0a,246557,2017-08-30 03:11:25+00:00,54.4828,0.0,1,3,0.0,119.781906,-0.0383,19.444922,0.01144,42.189915,-0.00556
3537037,3452b878e596,127297,2019-06-04 01:18:05+00:00,2.7177,0.0,0,1,0.0,-12.6855,-0.0015,8.597876,0.005514,-9.810602,0.003324
16874216,18b61dd5aae8,271376,2018-01-07 14:24:40+00:00,-19.2248,0.0144,1,14,0.276837,51.988701,0.0144,-28.522126,0.01908,35.566186,0.00398


In [6]:
# Spot-check three random test rows after feature engineering.
test.sample(n=3)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling
115,038441c925bb,115,2018-08-14 19:39:35+00:00,-80.028603,0.013,19,1.040372,-0.005104,-0.0008,-80.125804,0.01311,-0.112607,-0.000276
417,0402a003dae9,117,2018-12-18 17:54:45+00:00,-50.928699,0.0227,17,1.156082,-25.8841,-0.0454,-43.029748,0.060942,-18.542974,0.00174
58,038441c925bb,58,2018-08-14 19:34:50+00:00,-80.008102,0.0139,19,1.112113,-82.806099,-0.0084,-80.011634,0.013482,-81.897974,-0.007294


In [7]:
# Model inputs, in the column order handed to the classifiers.
features = [
    "hour",
    "anglez_times_enmo",
    # arm-angle signal and its windowed variants
    "anglez",
    "anglez_diff",
    "anglez_rolling",
    "anglez_diff_rolling",
    # ENMO signal and its windowed variants
    "enmo",
    "enmo_diff",
    "enmo_rolling",
    "enmo_diff_rolling",
]

# 3. Modeling

In [8]:
# Training design matrix and its binary target column.
X, y = train[features], train["awake"]

# The same feature columns for the rows that will be scored.
pred = test[features]

In [9]:
%%time

xgb_cv_scores, xgb_preds = list(), list()
lgbm_cv_scores, lgbm_preds = list(), list()
rf_cv_scores, rf_preds = list(), list()

kf = KFold(n_splits=3, random_state=42, shuffle=True)

for i, (train_ix, test_ix) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = y.iloc[train_ix], y.iloc[test_ix]
    
    print('---------------------------------------------------------------')
    
    ## XGBoost
    xgb_md = XGBClassifier().fit(X_train, Y_train)
    xgb_pred = xgb_md.predict(X_test)   
    xgb_score_fold = average_precision_score(Y_test, xgb_pred)
    print('Fold', i+1, '==> XGBoost oof score is ==>', xgb_score_fold)
    xgb_cv_scores.append(xgb_score_fold)
    
    ## Pred
    xgb_pred_test = xgb_md.predict_proba(pred)[:, 1]
    xgb_preds.append(xgb_pred_test)
    
    ## LGBM
    lgbm_md = LGBMClassifier().fit(X_train, Y_train)
    lgbm_pred = lgbm_md.predict(X_test) 
    lgbm_score_fold = average_precision_score(Y_test, lgbm_pred)
    print('Fold', i+1, '==> LGBM oof score is ==>', lgbm_score_fold)
    lgbm_cv_scores.append(lgbm_score_fold)

    ## Pred
    lgbm_pred_test = lgbm_md.predict_proba(pred)[:, 1]
    lgbm_preds.append(lgbm_pred_test)
    
    ## RF
    #rf_md = RandomForestClassifier().fit(X_train, Y_train)
    #rf_pred = rf_md.predict(X_test) 
    #rf_score_fold = average_precision_score(Y_test, rf_pred)
    #print('Fold', i+1, '==> RF oof score is ==>', rf_score_fold)
    #rf_cv_scores.append(rf_score_fold)

    ## Pred
    #rf_pred_test = rf_md.predict_proba(pred)[:, 1]
    #rf_preds.append(rf_pred_test)
    
print('---------------------------------------------------------------')
print('Average Score of XGBoost model is:', np.mean(xgb_cv_scores))
print('Average Score of LGBM model is:', np.mean(lgbm_cv_scores))
# print('Average Score of RF model is:', np.mean(rf_cv_scores))

---------------------------------------------------------------
Fold 1 ==> XGBoost oof score is ==> 0.9564518980199761
Fold 1 ==> LGBM oof score is ==> 0.9522723309930069
---------------------------------------------------------------
Fold 2 ==> XGBoost oof score is ==> 0.9563358085359192
Fold 2 ==> LGBM oof score is ==> 0.9523297909132913
---------------------------------------------------------------
Fold 3 ==> XGBoost oof score is ==> 0.9565268143958596
Fold 3 ==> LGBM oof score is ==> 0.952432410008366
---------------------------------------------------------------
Average Score of XGBoost model is: 0.9564381736505849
Average Score of LGBM model is: 0.9523448439715546
CPU times: user 2h 13min 56s, sys: 2min 43s, total: 2h 16min 39s
Wall time: 1h 26min 22s


In [10]:
# Average each model's per-fold test probabilities, then blend the two
# models with equal weight.
xgb_mean = np.asarray(xgb_preds).mean(axis=0)
lgbm_mean = np.asarray(lgbm_preds).mean(axis=0)
test_predss = (xgb_mean + lgbm_mean) / 2

In [11]:
# Span of the exponential smoothing window, in rows.
SMOOTHING_SPAN = 100

test['score'] = test_predss
test["not_awake"] = 1 - test["score"]
# Exponential smoothing of the predictions, restarted for every series so the
# first rows of one series are not pulled towards the tail of the previous one.
test["smooth"] = test.groupby("series_id", sort=False)["not_awake"].transform(
    lambda probs: probs.ewm(span=SMOOTHING_SPAN).mean()
)
# re-binarize
test["smooth"] = test["smooth"].round()

# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    """Label the first and last row of every non-zero run of ``df.smooth``.

    Rows are walked in order and split into runs of consecutive rows that
    share the same ``series_id`` and the same flag, where the flag is true
    when ``smooth`` is neither null nor zero. A flagged run of two or more
    rows gets ``'onset'`` on its first row and ``'wakeup'`` on its last;
    every other row (including single-row flagged runs) gets ``0``.

    Parameters
    ----------
    df : object exposing ``series_id`` and ``smooth`` as equally long iterables
        Typically a pandas DataFrame.

    Returns
    -------
    list
        One entry per input row: ``'onset'``, ``'wakeup'`` or ``0``.
    """
    def _run_key(pair):
        series, value = pair
        # Null check first so missing values never reach the comparison;
        # bool() normalises NumPy booleans, which an `is False` identity
        # test would not recognise.
        flagged = bool(not pd.isnull(value) and value != 0)
        return series, flagged

    events = []
    for (_, flagged), run in groupby(zip(df.series_id, df.smooth), _run_key):
        run_length = sum(1 for _ in run)
        if flagged and run_length > 1:
            events.extend(['onset'] + [0] * (run_length - 2) + ['wakeup'])
        else:
            events.extend([0] * run_length)
    return events

# One label per test row: 'onset' / 'wakeup' on run boundaries, 0 elsewhere.
test["event"] = get_event(test)

In [12]:
# Keep only the rows labelled as an event, renumber them from zero and expose
# that counter as the row_id column.
submission = (
    test.loc[test["event"].ne(0), ["series_id", "step", "event", "score"]]
    .copy()
    .reset_index(drop=True)
    .reset_index(names="row_id")
)
submission.to_csv('submission.csv', index=False)

In [13]:
# Display the event rows that were written to submission.csv.
submission

Unnamed: 0,row_id,series_id,step,event,score
