# Import libraries and prepare data

In [1]:
import numpy as np 
import pandas as pd 
from trendfilter import trend_filter

# Data

In [2]:
# Annotated sleep events; the cells below rely on its `series_id`, `step` and `event` columns.
train_events = pd.read_csv(filepath_or_buffer="train_events.csv")

In [3]:
# Per series: True when at least one of its events has no recorded step.
step_is_missing = train_events['step'].isnull()
series_has_NaN = step_is_missing.groupby(train_events['series_id']).any()

# Keep only the series whose events all carry a step.
no_NaN_series = [sid for sid, has_nan in series_has_NaN.items() if not has_nan]

# Also drop the two "truncated" event series seen in EDA (incomplete events data).
for incomplete_id in ('31011ade7c0a', 'a596ad0b82aa'):
    no_NaN_series.remove(incomplete_id)

In [4]:
def get_train_series(series):
    """Load one training series and label every row as awake (1) or asleep (0).

    Reads the rows of ``train_series.parquet`` belonging to ``series`` and the
    matching events from ``train_events.csv``, then derives an ``awake`` column
    from the events.

    Parameters
    ----------
    series : str
        Value of ``series_id`` to load.

    Returns
    -------
    pandas.DataFrame
        The series rows plus an integer ``awake`` column, with ``timestamp``
        converted to timezone-naive datetimes (the UTC offset is dropped, so
        the wall-clock time is kept as written).

    Raises
    ------
    ValueError
        If an event is neither ``"onset"`` nor ``"wakeup"``.
    """
    train_series = pd.read_parquet("train_series.parquet", filters=[('series_id', '=', series)])

    events = pd.read_csv("train_events.csv")
    # Explicit copy: the column assignments below must not write into a view.
    events = events[events['series_id'] == series].dropna().copy()
    events["step"] = events["step"].astype("int")
    # The value attached to an event is the state that holds *up to* that event:
    # awake (1) before an onset, asleep (0) before a wakeup.
    events["awake"] = events["event"].map({"onset": 1, "wakeup": 0})
    if events["awake"].isna().any():
        raise ValueError("unexpected event type in train_events.csv (expected 'onset' or 'wakeup')")

    train = pd.merge(train_series, events[['step', 'awake']], on='step', how='left')
    # Propagate each event's state backwards over the rows that precede it.
    train["awake"] = train["awake"].bfill()
    # Rows after the last event stay unlabeled. Per EDA
    # (train_events.groupby('series_id').tail(1)["event"].unique()) the last
    # event is always a "wakeup", so that tail is awake.
    train['awake'] = train['awake'].fillna(1).astype("int")

    # Drop a trailing UTC offset of either sign ("-0400", "+0100", "-04:00").
    # Splitting on the last "-" would instead cut into the date whenever the
    # offset is positive or absent.
    train['timestamp'] = pd.to_datetime(
        train['timestamp'].astype(str).str.replace(r'[+-]\d{2}:?\d{2}$', '', regex=True)
    )

    return train

In [5]:
# The model is fitted on a single series: the second entry of the cleaned list.
train_series_id = no_NaN_series[1]
train = get_train_series(train_series_id)

In [6]:
# Series to predict on.
test = pd.read_parquet("test_series.parquet")
# Convert timestamps to timezone-naive datetimes by dropping a trailing UTC
# offset of either sign ("-0400", "+0100", "-04:00"). Splitting on the last "-"
# would cut into the date whenever the offset is positive or absent, and the
# vectorised string method avoids a Python-level call per row.
test['timestamp'] = pd.to_datetime(
    test['timestamp'].astype(str).str.replace(r'[+-]\d{2}:?\d{2}$', '', regex=True)
)

In [7]:
# Distinct series present in the test set, in order of first appearance.
series_ids = test.series_id.unique()

### Features

In [8]:
def feature_extraction(df, w):
    """Build rolling-median features that look ``w`` rows back and ``w`` rows ahead.

    For every row ``t`` with ``w <= t < len(df) - w`` four features are produced:
    the median of ``enmo`` over the ``w`` rows ending at ``t``, the median of
    ``enmo`` over the ``w`` rows following ``t``, and the same two medians for the
    absolute row-to-row change of ``anglez``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``enmo`` and ``anglez`` columns.
    w : int
        Rolling window length in rows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df) - 2 * w, 4)``.
    """
    def _before_after(signal):
        # A rolling median at row i summarises rows i-w+1 .. i, so shifting the
        # slice by w rows turns the "before" window into the "after" window.
        rolled = signal.rolling(w).median()
        return rolled.iloc[w:-w].values, rolled.iloc[(2 * w):].values

    # Absolute change of anglez between consecutive rows (first row is compared to 0).
    angle_change = pd.Series(np.abs(np.diff(df.anglez, prepend=0)))

    enmo_before, enmo_after = _before_after(df.enmo)
    angle_before, angle_after = _before_after(angle_change)

    # One row per sample, one column per feature.
    X = np.array([enmo_before, enmo_after, angle_before, angle_after]).T
    return X

# Fit

In [9]:
# Rolling-window length (in rows); reused when extracting test features.
w = 5
X_train = feature_extraction(df=train, w=w)
# Feature rows exist only for rows w .. len(train)-w-1, so trim the labels to match.
y_train = train['awake'].iloc[w:-w]

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then standardise them with it.
# The fitted scaler is reused unchanged on the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [11]:
from sklearn.linear_model import LogisticRegression

# Binary awake/asleep classifier on the standardised rolling-median features.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

# Predict

In [12]:
def _longest_sleep_pair(onset_steps, wakeup_steps, lower, upper):
    """Return the (onset, wakeup) pair with the longest sleep inside ``[lower, upper)``.

    Both inputs are ascending integer arrays of step numbers. Each onset is
    paired with the first wakeup that follows it; the pair with the largest gap
    wins (the earliest one on ties). Returns ``None`` when the window holds no
    onset that is followed by a wakeup.
    """
    onsets = onset_steps[(onset_steps >= lower) & (onset_steps < upper)]
    wakeups = wakeup_steps[(wakeup_steps >= lower) & (wakeup_steps < upper)]

    best_pair = None
    best_length = 0
    i = j = 0
    while i < len(onsets) and j < len(wakeups):
        if onsets[i] < wakeups[j]:
            length = wakeups[j] - onsets[i]
            if length > best_length:
                best_pair = (onsets[i], wakeups[j])
                best_length = length
            i += 1
        else:
            # This wakeup precedes the current onset; move on to the next one.
            j += 1
    return best_pair


# Half-width, in steps, of the window searched around each midnight.
# NOTE(review): the original comment equated 10000 steps with 12 hours -
# confirm against the actual sampling interval.
night_range = 10000

# One dict per predicted event; turned into a DataFrame once, after the loop,
# so the frame gets a unique 0..n-1 index and is not rebuilt on every iteration.
event_records = []

for series_id in series_ids:
    test_data = test[test.series_id == series_id]
    # At least two feature rows are needed for a state transition to exist;
    # shorter series cannot produce events (and would yield an empty matrix).
    if len(test_data) - 2 * w < 2:
        continue

    X_test = scaler.transform(feature_extraction(test_data, w))
    pred = clf.predict(X_test)

    # Smooth the raw 0/1 predictions with an L1 trend filter, then re-binarise.
    x = np.linspace(0, len(pred), len(pred))
    tf = trend_filter(x, pred, l_norm=1, alpha_1=150)
    pred = np.round(tf['y_fit'])

    # Signed 64-bit steps so the window arithmetic below can never wrap around
    # if the stored step dtype is unsigned.
    steps = test_data['step'].to_numpy().astype(np.int64)
    # feature_extraction drops the first w rows, so pred[k] belongs to row w + k.
    pred_steps = steps[w:w + len(pred)]

    # 1 -> 0 is falling asleep, 0 -> 1 is waking up (1 = awake, as in training).
    state_change = pred[1:] - pred[:-1]
    pred_fall_asleep_step = pred_steps[np.where(state_change == -1)[0]]
    pred_wakeup_step = pred_steps[np.where(state_change == +1)[0]]

    clock = test_data['timestamp'].dt
    mask_midnight = ((clock.hour == 0) & (clock.minute == 0) & (clock.second == 0)).to_numpy()
    midnight = steps[mask_midnight]

    # Keep only the longest sleep phase found around each midnight.
    kept_onsets, kept_wakeups = set(), set()
    for night_step in midnight:
        best_pair = _longest_sleep_pair(pred_fall_asleep_step, pred_wakeup_step,
                                        night_step - night_range, night_step + night_range)
        if best_pair is not None:
            kept_onsets.add(best_pair[0])
            kept_wakeups.add(best_pair[1])

    # Event names follow the vocabulary of train_events.csv: "onset" / "wakeup".
    for event_name, candidates, kept in (('onset', pred_fall_asleep_step, kept_onsets),
                                         ('wakeup', pred_wakeup_step, kept_wakeups)):
        event_records.extend(
            {'series_id': series_id, 'step': int(step), 'event': event_name, 'score': 1.0}
            for step in candidates if step in kept
        )

output = pd.DataFrame(event_records, columns=['series_id', 'step', 'event', 'score'])


In [13]:
# The index is written out as `row_id`, so renumber the rows first: this
# guarantees a unique 0..n-1 counter however `output`'s index was built.
output.reset_index(drop=True).to_csv('submission.csv', index_label='row_id')