In [12]:
import pandas as pd
import numpy as np

train_events = pd.read_csv("child-mind-institute-detect-sleep-states/train_events.csv")

# finding the nan's 
series_has_NaN = train_events.groupby('series_id')['step'].apply(lambda x: x.isnull().any())
print(series_has_NaN.value_counts())
no_NaN_series = series_has_NaN[~series_has_NaN].index.tolist()
full_series = series_has_NaN.index.tolist()
print(len(no_NaN_series))

step
True     240
False     37
Name: count, dtype: int64
37
277


In [1]:
import numpy as np
import pandas as pd
from itertools import groupby
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from copy import deepcopy
from modelClass import Classifier


pd.set_option('display.max_columns', None)

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


train = pd.read_parquet("child-mind-institute-detect-sleep-states/Zzzs_train.parquet")
test  = pd.read_parquet("child-mind-institute-detect-sleep-states/test_series.parquet")

# parse the timestamp and create an "hour" feature
train["timestamp"] = pd.to_datetime(train["timestamp"],utc=True)
train["hour"] = train["timestamp"].dt.hour

test["timestamp"] = pd.to_datetime(test["timestamp"],utc=True)
test["hour"] = test["timestamp"].dt.hour

train.head()

def create_features(df):
    # parse the timestamp and create an "hour" feature
    df["timestamp"] = pd.to_datetime(df["timestamp"],utc=True)
    df["hour"] = (df["timestamp"].dt.hour).astype('int8')
    df['minute'] = df['timestamp'].dt.minute

    # Calculate the half-hour periods
    df['half_hour'] = (df['hour'] * 2 + (df['minute'] // 30)).astype('int8')
    
    df.drop(columns=['minute'], inplace=True)
    
    # feature cross
    df["anglez_times_enmo"] = abs(df["anglez"]) * df["enmo"].astype('float16')
    # "rolling" features
    periods = 50
    df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).fillna(method="bfill").astype('float16')
    df["enmo_diff"]   = df.groupby('series_id')['enmo'].diff(periods=periods).fillna(method="bfill").astype('float16')
    df["anglez_rolling"] = df["anglez"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_rolling"]   = df["enmo"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_diff_rolling"] = df["anglez_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_diff_rolling"]   = df["enmo_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    
    return df

features = ["hour","anglez_times_enmo", "half_hour",
           "anglez","anglez_diff","anglez_rolling","anglez_diff_rolling",
           "enmo","enmo_diff","enmo_rolling","enmo_diff_rolling"]
train = create_features(train)
test = create_features(test)

X_train = train[features]
y_train = train["awake"].astype('int8')
X_test = test[features]

random_state = 42
random_state_list =[42]
n_estimators = 90
device = 'cpu'
early_stopping_rounds = 50
verbose = False
optuna_lgb = False


X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2)

# Initialize an array for storing test predictions
classifier = Classifier(n_estimators=n_estimators, device=device, random_state=random_state)
test_predss = np.zeros((X_test.shape[0]))
oof_predss = np.zeros((X_train.shape[0]))

del X_train

models_name = [_ for _ in classifier.models_name if ('xgb' in _) or ('lgb' in _) or ('cat' in _) or ('rf' in _) or ('lr' in _)]
score_dict = dict(zip(classifier.models_name, [[] for _ in range(len(classifier.models_name))]))

models = classifier.models

# Store oof and test predictions for each base model
oof_preds = []
test_preds = []

# Loop over each base model and fit it
for name, model in models.items():
    if name in ['xgb']:
        model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], early_stopping_rounds=early_stopping_rounds, verbose=verbose)
    else:
        model.fit(X_train_, y_train_)

    test_pred = model.predict_proba(X_test)[:, 1]
    y_val_pred = model.predict_proba(X_val)[:, 1]

    score = average_precision_score(y_val, y_val_pred)
    score_dict[name].append(score)
        
    print(f'{name} [SEED-{random_state}] Precision score: {score:.5f}')
        
    oof_preds.append(y_val_pred)
    test_preds.append(test_pred)
    
test_predss = np.average(np.array(test_preds), axis=0)
oof_predss[X_val.index] = np.average(np.array(oof_preds), axis=0)
    
del X_train_, X_val, y_val, y_train_

print(test_predss)


xgb_1 [SEED-42] Precision score: 0.99537
lgb [SEED-42] Precision score: 0.98917
[0.93511541 0.93511541 0.93511541 0.93511541 0.93511541 0.93511541
 0.93511541 0.93511541 0.93511541 0.93511398 0.93511541 0.93511541
 0.93511541 0.93511541 0.93511541 0.93511541 0.93511541 0.93511541
 0.93519671 0.93503534 0.935102   0.9352061  0.93500205 0.93513184
 0.93512635 0.9350576  0.9350576  0.93526326 0.93473055 0.93473055
 0.93473055 0.93473055 0.93474411 0.93478234 0.93478234 0.93481191
 0.93481191 0.93481617 0.93481617 0.93482913 0.93484883 0.93482428
 0.93482428 0.93484865 0.93457987 0.93445505 0.934801   0.93497242
 0.93491404 0.93494235 0.93494235 0.9348812  0.9348812  0.93492131
 0.93492131 0.93492131 0.93492131 0.93492131 0.93483408 0.93483408
 0.93483408 0.93483408 0.93483408 0.93483408 0.93483408 0.93483408
 0.9348229  0.9348229  0.93505039 0.9349363  0.9349363  0.93508457
 0.93495427 0.93509676 0.93508457 0.93506919 0.93506967 0.93511017
 0.93508511 0.93508522 0.93512719 0.93512719 0.93

In [2]:
# Add a "not_awake" column as the complement of the "score" column:
test['score'] = test_predss
test["not_awake"] = 1 - test["score"]

# Smoothing of the predictions:
smoothing_length = 400  # Define the length for smoothing
test["smooth"] = test["not_awake"].rolling(smoothing_length, center=True).mean().fillna(method="bfill").fillna(method="ffill")

# Re-binarize the "smooth" column:
test["smooth"] = test["smooth"].round()



In [5]:
test.loc[test["smooth"] != 0]

Unnamed: 0,series_id,step,timestamp,anglez,enmo,hour,half_hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_rolling,enmo_rolling,anglez_diff_rolling,enmo_diff_rolling,score,not_awake,smooth


In [9]:
# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    lstCV = zip(df.series_id, df.smooth)
    lstPOI = []
    for (c, v), g in groupby(lstCV, lambda cv: 
                            (cv[0], cv[1]!=0 and not pd.isnull(cv[1]))):
        llg = sum(1 for item in g)
        if v is False: 
            lstPOI.extend([0]*llg)
        else: 
            lstPOI.extend(['onset']+(llg-2)*[0]+['wakeup'] if llg > 1 else [0])
    return lstPOI

test["event"] = get_event(test)

print(test.loc[test["event"] != 0])


Empty DataFrame
Columns: [series_id, step, timestamp, anglez, enmo, hour, half_hour, anglez_times_enmo, anglez_diff, enmo_diff, anglez_rolling, enmo_rolling, anglez_diff_rolling, enmo_diff_rolling, score, not_awake, smooth, event]
Index: []


In [None]:

sample_submission = test.loc[test["event"] != 0]
sample_submission = sample_submission[["series_id", "step", "event", "score"]].copy()
sample_submission = sample_submission.reset_index(drop=True).reset_index(names="row_id")

# Save the sample submission DataFrame to a CSV file:
sample_submission.to_csv('submission.csv', index=False)