<a href="https://www.kaggle.com/code/yaaangzhou/zzz-baseline-xgb-model-for-beginners?scriptVersionId=142661949" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Created by Yang Zhou**

**[ZZz]Baseline Model for beginners**

**11 Sep 2023**

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">[ZZz]Baseline Model for beginners</center>
<p><center style="color:#949494; font-family: consolas; font-size: 20px;">Detect sleep onset and wake from wrist-worn accelerometer data</center></p>

***

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">Previous Work</center>

1. In my previous work, I completed the parte of EDA and the extraction of training data. This Notebook will only contain the part of building the model.
2. My preivous Notebook:
- [Memory Reduce, Preprocessing and EDA](https://www.kaggle.com/code/yaaangzhou/zzz-memory-reduce-preprocessing-and-eda)
- [Clean dataset for modeling](https://www.kaggle.com/code/yaaangzhou/zzz-clean-dataset-for-modeling)
- From this [url](https://www.kaggle.com/datasets/yaaangzhou/zzz52-series-train-data) you can find the final dataset for modeling.

# 0. Imports

In [1]:
# Misc
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import gc
from copy import deepcopy
from functools import partial
from itertools import combinations
from itertools import groupby

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.model_selection import cross_validate
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score

# Import libraries for Hypertuning
import optuna

#Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from catboost import Pool



# Useful line of code to set the display option so we could see all the columns in pd dataframe
pd.set_option('display.max_columns', None)

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)



# 1. Load Data

In [2]:
train = pd.read_parquet("/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train_multi.parquet")
test  = pd.read_parquet("/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet")

sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv')

# 2. Preprocessing

In [3]:
def data_preprocessing(df):
    df["timestamp"] = pd.to_datetime(df["timestamp"],utc=True)
    df["hour"] = df["timestamp"].dt.hour
    df["month"] = df["timestamp"].dt.month
    df["weekday"] = df["timestamp"].dt.weekday
    # feature cross
    df["anglez_times_enmo"] = abs(df["anglez"]) * df["enmo"]
    # "rolling" features
    periods = 50
    df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).fillna(method="bfill")
    df["enmo_diff"]   = df.groupby('series_id')['enmo'].diff(periods=periods).fillna(method="bfill")
    df["anglez_rolling"] = df["anglez"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    df["enmo_rolling"]   = df["enmo"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    df["anglez_diff_rolling"] = df["anglez_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    df["enmo_diff_rolling"]   = df["enmo_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    
    return df

In [4]:
train = data_preprocessing(train)
test = data_preprocessing(test)

In [5]:
test.sample(3)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,hour,month,weekday,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling
323,0402a003dae9,23,2018-12-18 17:46:55+00:00,-46.281502,0.0753,17,12,1,3.484997,78.409401,-0.0682,-24.476246,0.091334,75.273025,-0.065472
387,0402a003dae9,87,2018-12-18 17:52:15+00:00,-2.2949,0.0942,17,12,1,0.21618,49.869202,-0.057,-34.309766,0.075324,-21.37401,-0.011952
131,038441c925bb,131,2018-08-14 19:40:55+00:00,-80.158798,0.0134,19,8,1,1.074128,-0.186501,0.0007,-65.240312,0.022324,-15.355904,-0.009668


In [6]:
features = train.columns.drop(['series_id','step','timestamp','awake'])
features

Index(['anglez', 'enmo', 'hour', 'month', 'weekday', 'anglez_times_enmo',
       'anglez_diff', 'enmo_diff', 'anglez_rolling', 'enmo_rolling',
       'anglez_diff_rolling', 'enmo_diff_rolling'],
      dtype='object')

# 3. Modeling

In [7]:
X_train = train[features]
y_train = train["awake"]

X_test = test[features]

del train

In [8]:
class Classifier:
    def __init__(self, n_estimators=100, device="cpu", random_state=42):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_model()
        self.models_name = list(self._define_model().keys())
        self.len_models = len(self.models)
        
    def _define_model(self):
        
        xgb_1 = {
            'n_estimators': self.n_estimators,
            'eval_metric': 'map',
            'verbosity': 0,
            'random_state': self.random_state,
            'scale_pos_weight': 2/3
        }
        
        if self.device == 'gpu':
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['predictor'] = 'gpu_predictor'
       
        models = {
            'xgb_1': xgb.XGBClassifier(**xgb_1),
            'lgbm_1': lgb.LGBMClassifier(random_state=self.random_state),
            'rf': RandomForestClassifier(n_estimators=50, random_state=self.random_state),
            'lr': LogisticRegression(max_iter=50, random_state=self.random_state, n_jobs=-1),
        }
        
        return models

In [9]:
# Config
random_state = 42
random_state_list =[42]
n_estimators = 100
device = 'cpu'
early_stopping_rounds = 100
verbose = False

In [10]:
%%time

X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2)

# Initialize an array for storing test predictions
classifier = Classifier(n_estimators=n_estimators, device=device, random_state=random_state)
test_predss = np.zeros((X_test.shape[0]))
oof_predss = np.zeros((X_train.shape[0]))

del X_train

ensemble_score = []
weights = []
models_name = [_ for _ in classifier.models_name if ('xgb' in _) or ('lgb' in _) or ('cat' in _)]
trained_models = dict(zip(models_name, [[] for _ in range(classifier.len_models)]))
score_dict = dict(zip(classifier.models_name, [[] for _ in range(len(classifier.models_name))]))

models = classifier.models

# Store oof and test predictions for each base model
oof_preds = []
test_preds = []

# Loop over each base model and fit it
for name, model in models.items():
    if name in ['xgb', 'lgb', 'cat', 'lgb2']:
        model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], early_stopping_rounds=early_stopping_rounds, verbose=verbose)
    else:
        model.fit(X_train_, y_train_)
         
    if name in trained_models.keys():
        trained_models[f'{name}'].append(deepcopy(model))

    test_pred = model.predict_proba(X_test)[:, 1]
    #y_val_pred = model.predict_proba(X_val)[:, 1]

    #score = average_precision_score(y_val, y_val_pred)
    #score_dict[name].append(score)
        
   # print(f'{name} [SEED-{random_state}] Precision score: {score:.5f}')
        
    #oof_preds.append(y_val_pred)
    test_preds.append(test_pred)
    
test_predss = np.average(np.array(test_preds), axis=0)
#oof_predss[X_val.index] = np.average(np.array(oof_preds), axis=0)
    
gc.collect()
del X_train_, X_val, y_train_, y_val 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CPU times: user 3h 29min 51s, sys: 2min 1s, total: 3h 31min 52s
Wall time: 2h 30min 41s


In [11]:
test['score'] = test_predss
test["not_awake"] = 1-test["score"]
# exponential smoothing of the predictions
smoothing_length = 400
test["smooth"] = test["not_awake"].ewm(span = 100).mean()
# re-binarize
test["smooth"] = test["smooth"].round()

# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    lstCV = zip(df.series_id, df.smooth)
    lstPOI = []
    for (c, v), g in groupby(lstCV, lambda cv: 
                            (cv[0], cv[1]!=0 and not pd.isnull(cv[1]))):
        llg = sum(1 for item in g)
        if v is False: 
            lstPOI.extend([0]*llg)
        else: 
            lstPOI.extend(['onset']+(llg-2)*[0]+['wakeup'] if llg > 1 else [0])
    return lstPOI

test["event"] = get_event(test)

In [12]:
sample_submission = test.loc[test["event"] != 0][["series_id","step","event","score"]].copy().reset_index(drop=True).reset_index(names="row_id")
sample_submission.to_csv('submission.csv', index=False)

In [13]:
sample_submission

Unnamed: 0,row_id,series_id,step,event,score
