In [1]:
import pandas as pd
import numpy as np
import datetime
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler,StandardScaler
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed, Lambda
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
import plotly.graph_objects as go

2023-02-24 10:54:55.207972: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Task 1: Predict time used in seconds per hour

### Data and Model Preperation

In [2]:
df = pd.read_csv("../data/processed/new_lstm_dataset_local.csv", parse_dates=['Start', 'End'])
df['Duration'] = df['Duration'].apply(lambda x: pd.Timedelta(x))
df['Time_diff_sec'] = df['Duration'].apply(lambda x: x.total_seconds())
df = df.drop(columns='Duration')
delta = datetime.timedelta(hours=1)
df['sec_to_next_hr'] = df['Start'].apply(lambda x: ((x+delta).replace(microsecond=0, second=0, minute=0) - x).seconds)

In [3]:
def helper(row):
    delta = datetime.timedelta(hours=1)
    if row['Time_diff_sec'] < row['sec_to_next_hr']:
        row['End'] = row['Start'] + pd.to_timedelta(row['Time_diff_sec'], unit='S')
        return [row]
    row2 = row.copy()
    
    row['End'] = (row['Start']+delta).floor('H')
    row2['Start'] = row['End']
    row2['End'] = (row['End']+delta).floor('H')
    
    row2['Time_diff_sec'] = row['Time_diff_sec'] - row['sec_to_next_hr']
    row2['sec_to_next_hr'] = 3600
    row['Time_diff_sec'] = row['sec_to_next_hr']
    return [row] + helper(row2)

def func(row):
    if row['Time_diff_sec'] > row['sec_to_next_hr']:
        return pd.DataFrame(helper(row))
    return pd.DataFrame([row])

In [4]:
processed_df = pd.concat([func(row) for _, row in df.iterrows()], ignore_index=True)

df_1 = processed_df[processed_df['Value'] == 'firefox.exe'].reset_index()
df_1 = df_1[['Start', 'Time_diff_sec']]
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum()
fig = go.Figure([go.Scatter(x=temp.index, y=temp.values)], layout=layout)
fig.show()

#### For experiments, we keep our model structure the same

In [5]:
def train_model(X_train, y_train, num_epochs = 400, n_steps=1):
    feature_shape = X_train.shape[2]

    model = keras.Sequential()
    model.add(LSTM(32, return_sequences=True, input_shape=(n_steps, feature_shape)))
    # model.add(Dropout(0.2))

    # model.add(LSTM(32, return_sequences=True))
    # model.add(Dropout(0.2))

    model.add(LSTM(16, return_sequences=True))
    # model.add(Dropout(0.2))

    model.add(LSTM(16))
    # model.add(Dropout(0.2))

    # model.add(TimeDistributed(Dense(1)))
    model.add(Dense(32))
    model.add(Dense(16))
    model.add(Dense(1))
    opt = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=opt, loss='mse')

    history = model.fit(X_train, y_train,epochs=num_epochs, verbose=0)
    return model, history

In [6]:
def get_accuracy(pred, target, bound=10):
    return np.mean(abs(pred - target) < bound)

### Experiment 1: Vanilla Approach
- Input:
    - Weekday, Hour, Minute, Date, Month
- Activation Function:
    - LSTM: tanh
    - Dense: None(linear)
- Loss: 
    - Mean Squared Error
- Optimizer: 
    - Adam
- Learning_rate: 
    - 0.001
- Num_epoch: 
    - 100

In [7]:
def get_dataset(df, n_steps):
    temp = df.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
    # temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek)
    temp['hour'] = temp['Start'].apply(lambda x: x.hour)
    temp['minute'] = temp['Start'].apply(lambda x: x.minute)
    temp['date'] = temp['Start'].apply(lambda x: x.day)
    temp['month'] = temp['Start'].apply(lambda x: x.month)
    temp = temp.drop(columns='Start')
    # data = pd.get_dummies(temp).values
    data = temp.values
    
    X, y = [], []
    for i in range(len(data)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = data[i:i+n_steps, 1:], data[i+n_steps-1, 0:1]
        X.append(seq_x)
        y.append(seq_y)
    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), scaler

In [8]:
n_steps = 5
X, y, scaler = get_dataset(df_1, n_steps)

train_size = int(X.shape[0] * 0.8)
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

X_train[0]

array([[ 0., 12.,  0., 12., 12.],
       [ 0., 13.,  0., 12., 12.],
       [ 0., 14.,  0., 12., 12.],
       [ 0., 15.,  0., 12., 12.],
       [ 0., 16.,  0., 12., 12.]])

In [9]:
model, history = train_model(X_train, y_train, n_steps=n_steps)
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))

2023-02-24 10:55:09.899990: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
fig = go.Figure([
    go.Scatter(x=temp['Start'].iloc[n_steps:], y=temp['Time_diff_sec'].iloc[n_steps:].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[n_steps:train_size+n_steps], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_size+n_steps:], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.show()

In [11]:
loss = history.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
fig = go.Figure([
    go.Scatter(x=list(range(400)), y=loss, mode='lines',name = 'Training Loss')
], layout=layout)
fig.show()

In [12]:
train_loss = model.evaluate(X_train, y_train, verbose=0)
train_acc_1 = get_accuracy(train_pred, y_train, 5)
train_acc_2 = get_accuracy(train_pred, y_train, 10)
train_acc_3 = get_accuracy(train_pred, y_train, 60)
test_loss = model.evaluate(X_test, y_test, verbose=0)
test_acc_1 = get_accuracy(test_pred, y_test, 5)
test_acc_2 = get_accuracy(test_pred, y_test, 10)
test_acc_3 = get_accuracy(test_pred, y_test, 60)


print(f'Train Loss: {train_loss}')
print(f'Train Accuracy (abs diff within 5s): {train_acc_1}')
print(f'Train Accuracy (abs diff within 10s): {train_acc_2}')
print(f'Train Accuracy (abs diff within 60s): {train_acc_3}')
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy (abs diff within 5s): {test_acc_1}')
print(f'Test Accuracy (abs diff within 10s): {test_acc_2}')
print(f'Test Accuracy (abs diff within 60s): {test_acc_3}')

Train Loss: 0.010898370295763016
Train Accuracy (abs diff within 5s): 0.03066914498141264
Train Accuracy (abs diff within 10s): 0.0687732342007435
Train Accuracy (abs diff within 60s): 0.4275092936802974
Test Loss: 0.06262904405593872
Test Accuracy (abs diff within 5s): 0.007407407407407408
Test Accuracy (abs diff within 10s): 0.018518518518518517
Test Accuracy (abs diff within 60s): 0.2074074074074074


### Experiment 2: One Hot Encoded Approach
- Input:
    - **One-hot-encoded** Weekday, Hour, Minute, Date, Month
- Activation Function:
    - LSTM: tanh
    - Dense: None(linear)
- Loss: 
    - Mean Squared Error
- Optimizer: 
    - Adam
- Learning_rate: 
    - 0.001
- Num_epoch: 
    - 100

In [13]:
def get_dataset(df, n_steps):
    temp = df.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
    # temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek).astype('category')
    temp['hour'] = temp['Start'].apply(lambda x: x.hour).astype('category')
    temp['minute'] = temp['Start'].apply(lambda x: x.minute).astype('category')
    temp['date'] = temp['Start'].apply(lambda x: x.day).astype('category')
    temp['month'] = temp['Start'].apply(lambda x: x.month).astype('category')
    temp = temp.drop(columns='Start')
    data = pd.get_dummies(temp, columns=['weekday','hour','minute','date','month']).values
    # data = temp.values
    
    X, y = [], []
    for i in range(len(data)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = data[i:i+n_steps, 1:], data[i+n_steps-1, 0:1]
        X.append(seq_x)
        y.append(seq_y)
    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), scaler

In [14]:
n_steps = 5
X, y, scaler = get_dataset(df_1, n_steps)

train_size = int(X.shape[0] * 0.8)
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

X_train[0]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0

In [15]:
model, history = train_model(X_train, y_train, n_steps=n_steps)
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
fig = go.Figure([
    go.Scatter(x=temp['Start'].iloc[n_steps:], y=temp['Time_diff_sec'].iloc[n_steps:].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[n_steps:train_size+n_steps], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_size+n_steps:], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.show()

In [16]:
loss = history.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
fig = go.Figure([
    go.Scatter(x=list(range(400)), y=loss, mode='lines',name = 'Training Loss')
], layout=layout)
fig.show()

In [17]:
train_loss = model.evaluate(X_train, y_train, verbose=0)
train_acc_1 = get_accuracy(train_pred, y_train, 5)
train_acc_2 = get_accuracy(train_pred, y_train, 10)
train_acc_3 = get_accuracy(train_pred, y_train, 60)
test_loss = model.evaluate(X_test, y_test, verbose=0)
test_acc_1 = get_accuracy(test_pred, y_test, 5)
test_acc_2 = get_accuracy(test_pred, y_test, 10)
test_acc_3 = get_accuracy(test_pred, y_test, 60)


print(f'Train Loss: {train_loss}')
print(f'Train Accuracy (abs diff within 5s): {train_acc_1}')
print(f'Train Accuracy (abs diff within 10s): {train_acc_2}')
print(f'Train Accuracy (abs diff within 60s): {train_acc_3}')
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy (abs diff within 5s): {test_acc_1}')
print(f'Test Accuracy (abs diff within 10s): {test_acc_2}')
print(f'Test Accuracy (abs diff within 60s): {test_acc_3}')

Train Loss: 0.0017065085703507066
Train Accuracy (abs diff within 5s): 0.08736059479553904
Train Accuracy (abs diff within 10s): 0.17193308550185873
Train Accuracy (abs diff within 60s): 0.6468401486988847
Test Loss: 0.06325159966945648
Test Accuracy (abs diff within 5s): 0.011111111111111112
Test Accuracy (abs diff within 10s): 0.03333333333333333
Test Accuracy (abs diff within 60s): 0.15925925925925927


### Experiment 3: Remove missing data
- Input:
    - One-hot-encoded Weekday, Month, Hour, Minute, Date
- Activation Function:
    - LSTM: tanh
    - Dense: None(linear)
- Loss: 
    - Mean Squared Error
- Optimizer: 
    - Adam
- Learning_rate: 
    - 0.001
- Num_epoch: 
    - 100

In [18]:
def get_dataset(df, n_steps):
    temp = df.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
    temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek).astype('category')
    temp['hour'] = temp['Start'].apply(lambda x: x.hour).astype('category')
    temp['minute'] = temp['Start'].apply(lambda x: x.minute).astype('category')
    temp['date'] = temp['Start'].apply(lambda x: x.day).astype('category')
    temp['month'] = temp['Start'].apply(lambda x: x.month).astype('category')
    temp = temp.drop(columns='Start')
    data = pd.get_dummies(temp, columns=['weekday','hour','minute','date','month']).values
    # data = temp.values
    
    X, y = [], []
    for i in range(len(data)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = data[i:i+n_steps, 1:], data[i+n_steps-1, 0:1]
        X.append(seq_x)
        y.append(seq_y)
    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), temp, scaler

In [19]:
n_steps = 5
X, y, temp, scaler = get_dataset(df_1, n_steps)

train_size = int(X.shape[0] * 0.8)
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

X_train[0]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0

In [20]:
model, history = train_model(X_train, y_train, n_steps=n_steps)
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
fig = go.Figure([
    go.Scatter(x=temp['Start'].iloc[n_steps:], y=temp['Time_diff_sec'].iloc[n_steps:].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[n_steps:train_size+n_steps], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_size+n_steps:], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.update_xaxes(
    rangebreaks=[dict(values=pd.date_range('2022-12-23', '2023-01-08'))] # hide dates with no values
)
fig.show()

In [21]:
# Bar plot
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'},
    # xaxis=dict(type = "category"),
    barmode='overlay',
    bargap=0
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
fig = go.Figure([
    go.Bar(x=temp['Start'].iloc[n_steps:], y=temp['Time_diff_sec'].iloc[n_steps:].values, name='data', opacity=0.8),
    go.Bar(x=temp['Start'].iloc[n_steps:train_size+n_steps], y=train_pred[:,0], name='train prediction', opacity=0.8),
    go.Bar(x=temp['Start'].iloc[train_size+n_steps:], y=test_pred[:,0], name='test prediction', opacity=0.8)
], layout=layout)
fig.update_xaxes(
    rangebreaks=[dict(values=pd.date_range('2022-12-23', '2023-01-08'))] # hide dates with no values
)
fig.show()

In [22]:
loss = history.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
fig = go.Figure([
    go.Scatter(x=list(range(400)), y=loss, mode='lines',name = 'Training Loss')
], layout=layout)
fig.show()

In [23]:
train_loss = model.evaluate(X_train, y_train, verbose=0)
train_acc_1 = get_accuracy(train_pred, y_train, 5)
train_acc_2 = get_accuracy(train_pred, y_train, 10)
train_acc_3 = get_accuracy(train_pred, y_train, 60)
test_loss = model.evaluate(X_test, y_test, verbose=0)
test_acc_1 = get_accuracy(test_pred, y_test, 5)
test_acc_2 = get_accuracy(test_pred, y_test, 10)
test_acc_3 = get_accuracy(test_pred, y_test, 60)


print(f'Train Loss: {train_loss}')
print(f'Train Accuracy (abs diff within 5s): {train_acc_1}')
print(f'Train Accuracy (abs diff within 10s): {train_acc_2}')
print(f'Train Accuracy (abs diff within 60s): {train_acc_3}')
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy (abs diff within 5s): {test_acc_1}')
print(f'Test Accuracy (abs diff within 10s): {test_acc_2}')
print(f'Test Accuracy (abs diff within 60s): {test_acc_3}')

Train Loss: 0.004906594753265381
Train Accuracy (abs diff within 5s): 0.032552083333333336
Train Accuracy (abs diff within 10s): 0.07161458333333333
Train Accuracy (abs diff within 60s): 0.3932291666666667
Test Loss: 0.1495726853609085
Test Accuracy (abs diff within 5s): 0.0
Test Accuracy (abs diff within 10s): 0.0051813471502590676
Test Accuracy (abs diff within 60s): 0.031088082901554404


### Experiment 4: Change problem statement (from predict *current* hour's duration to *next* hour's duration)
- Input:
    - One-hot-encoded Weekday, Month, Hour, Minute, Date, **scaled current hour's duration**
- Activation Function:
    - LSTM: tanh
    - Dense: None(linear)
- Loss: 
    - Mean Squared Error
- Optimizer: 
    - Adam
- Learning_rate: 
    - 0.001
- Num_epoch: 
    - 100

In [24]:
def get_dataset(df, n_steps):
    temp = df.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
    temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
    # temp['prev_diff'] = temp['Time_diff_sec'].shift(1, fill_value=0)
    scaler = MinMaxScaler()
    scaler.fit(temp[['Time_diff_sec']])
    temp[['Time_diff_sec']] = scaler.transform(temp[['Time_diff_sec']])

    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek).astype('category')
    temp['hour'] = temp['Start'].apply(lambda x: x.hour).astype('category')
    temp['minute'] = temp['Start'].apply(lambda x: x.minute).astype('category')
    temp['date'] = temp['Start'].apply(lambda x: x.day).astype('category')
    temp['month'] = temp['Start'].apply(lambda x: x.month).astype('category')
    temp = temp.drop(columns='Start')
    data = pd.get_dummies(temp, columns=['weekday','hour','minute','date','month']).values
    # data = temp.values
    
    X, y = [], []
    for i in range(len(data)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = data[i:i+n_steps, :], data[i+n_steps, 0:1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y), scaler

In [25]:
n_steps = 5
X, y, scaler = get_dataset(df_1, n_steps)

train_size = int(X.shape[0] * 0.8)
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

X_train.shape

(768, 5, 66)

In [26]:
model, history = train_model(X_train, y_train, n_steps=n_steps)
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
fig = go.Figure([
    go.Scatter(x=temp['Start'].iloc[n_steps:], y=temp['Time_diff_sec'].iloc[n_steps:].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[n_steps:train_size+n_steps], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_size+n_steps:], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.update_xaxes(
    rangebreaks=[dict(values=pd.date_range('2022-12-23', '2023-01-08'))] # hide dates with no values
)
fig.show()

In [27]:
train_loss = model.evaluate(X_train, y_train, verbose=0)
train_acc_1 = get_accuracy(train_pred, y_train, 5)
train_acc_2 = get_accuracy(train_pred, y_train, 10)
train_acc_3 = get_accuracy(train_pred, y_train, 60)
test_loss = model.evaluate(X_test, y_test, verbose=0)
test_acc_1 = get_accuracy(test_pred, y_test, 5)
test_acc_2 = get_accuracy(test_pred, y_test, 10)
test_acc_3 = get_accuracy(test_pred, y_test, 60)


print(f'Train Loss: {train_loss}')
print(f'Train Accuracy (abs diff within 5s): {train_acc_1}')
print(f'Train Accuracy (abs diff within 10s): {train_acc_2}')
print(f'Train Accuracy (abs diff within 60s): {train_acc_3}')
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy (abs diff within 5s): {test_acc_1}')
print(f'Test Accuracy (abs diff within 10s): {test_acc_2}')
print(f'Test Accuracy (abs diff within 60s): {test_acc_3}')

Train Loss: 0.0037476203870028257
Train Accuracy (abs diff within 5s): 0.06640625
Train Accuracy (abs diff within 10s): 0.125
Train Accuracy (abs diff within 60s): 0.515625
Test Loss: 0.16831795871257782
Test Accuracy (abs diff within 5s): 0.010362694300518135
Test Accuracy (abs diff within 10s): 0.025906735751295335
Test Accuracy (abs diff within 60s): 0.15544041450777202


### Experiment 5: Different activation function (sigmoid) and more features
- Input:
    - One-hot-encoded Weekday, Month, Hour, Minute, Date, **is_weekend(binary), is_winter_holiday(binary), scaled current hour's duration**
- Activation Function:
    - LSTM: tanh
    - Dense: **sigmoid**
- Loss: 
    - Mean Squared Error
- Optimizer: 
    - Adam
- Learning_rate: 
    - 0.001
- Num_epoch: 
    - 100

In [28]:
def train_model_sig(X_train, y_train, num_epochs = 400, n_steps=1):
    feature_shape = X_train.shape[2]

    model = keras.Sequential()
    model.add(LSTM(32, return_sequences=True, input_shape=(n_steps, feature_shape)))
    # model.add(Dropout(0.2))

    # model.add(LSTM(32, return_sequences=True))
    # model.add(Dropout(0.2))

    model.add(LSTM(16, return_sequences=True))
    # model.add(Dropout(0.2))

    model.add(LSTM(16))
    # model.add(Dropout(0.2))

    # model.add(TimeDistributed(Dense(1)))
    model.add(Dense(32))
    model.add(Dense(16))
    model.add(Dense(1, activation='sigmoid'))
    # model.add(Lambda(get_range))
    opt = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=opt, loss='mse')

    history = model.fit(X_train, y_train,epochs=num_epochs, verbose=0)
    return model, history

def get_dataset(df, n_steps):
    temp = df.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
    temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
    # temp['prev_diff'] = temp['Time_diff_sec'].shift(1, fill_value=0)
    scaler = MinMaxScaler()
    scaler.fit(temp[['Time_diff_sec']])
    temp[['Time_diff_sec']] = scaler.transform(temp[['Time_diff_sec']])

    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek).astype('category')
    temp['hour'] = temp['Start'].apply(lambda x: x.hour).astype('category')
    temp['minute'] = temp['Start'].apply(lambda x: x.minute).astype('category')
    temp['date'] = temp['Start'].apply(lambda x: x.day).astype('category')
    temp['month'] = temp['Start'].apply(lambda x: x.month).astype('category')
    temp['is_weekend'] = temp['Start'].apply(lambda x: 1 if x == 5 or x == 6 else -1)
    temp['is_winter_holiday'] = temp['Start'].apply(lambda x: 1 if x > datetime.datetime(2022, 12, 12) or x < datetime.datetime(2023, 1, 8) else -1)

    # cal = USFederalHolidayCalendar()
    # holidays = cal.holidays(start=temp.iloc[0,0], end=temp.iloc[-1, 0])
    # temp['is_public_holiday'] = temp['Start'].apply(
    #     lambda x: 1 if x.floor('d') in holidays else -1
    #     )
    temp = temp.drop(columns='Start')
    data = pd.get_dummies(temp, columns=['weekday','hour','minute','date','month']).values
    # data = temp.values
    
    X, y = [], []
    for i in range(len(data)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = data[i:i+n_steps, :], data[i+n_steps, 0:1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y), scaler

In [29]:
n_steps = 5
X, y, scaler = get_dataset(df_1, n_steps)

train_size = int(X.shape[0] * 0.8)
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

X_train.shape

(768, 5, 68)

In [30]:
model, history = train_model_sig(X_train, y_train, n_steps=n_steps)
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
fig = go.Figure([
    go.Scatter(x=temp['Start'].iloc[n_steps:], y=temp['Time_diff_sec'].iloc[n_steps:].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[n_steps:train_size+n_steps], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_size+n_steps:], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.update_xaxes(
    rangebreaks=[dict(values=pd.date_range('2022-12-23', '2023-01-08'))] # hide dates with no values
)
fig.show()

In [35]:
# Bar plot
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'},
    # xaxis=dict(type = "category"),
    barmode='overlay',
    bargap=0
)
fig = go.Figure([
    go.Bar(x=temp['Start'].iloc[n_steps:], y=temp['Time_diff_sec'].iloc[n_steps:].values, name='data', opacity=0.8),
    go.Bar(x=temp['Start'].iloc[n_steps:train_size+n_steps], y=train_pred[:,0], name='train prediction', opacity=0.8),
    go.Bar(x=temp['Start'].iloc[train_size+n_steps:], y=test_pred[:,0], name='test prediction', opacity=0.8)
], layout=layout)
fig.update_xaxes(
    rangebreaks=[dict(values=pd.date_range('2022-12-23', '2023-01-08'))] # hide dates with no values
)
fig.show()

In [32]:
loss = history.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
fig = go.Figure([
    go.Scatter(x=list(range(400)), y=loss, mode='lines',name = 'Training Loss')
], layout=layout)
fig.show()

In [33]:
train_loss = model.evaluate(X_train, y_train, verbose=0)
train_acc_1 = get_accuracy(train_pred, y_train, 5)
train_acc_2 = get_accuracy(train_pred, y_train, 10)
train_acc_3 = get_accuracy(train_pred, y_train, 60)
test_loss = model.evaluate(X_test, y_test, verbose=0)
test_acc_1 = get_accuracy(test_pred, y_test, 5)
test_acc_2 = get_accuracy(test_pred, y_test, 10)
test_acc_3 = get_accuracy(test_pred, y_test, 60)


print(f'Train Loss: {train_loss}')
print(f'Train Accuracy (abs diff within 5s): {train_acc_1}')
print(f'Train Accuracy (abs diff within 10s): {train_acc_2}')
print(f'Train Accuracy (abs diff within 60s): {train_acc_3}')
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy (abs diff within 5s): {test_acc_1}')
print(f'Test Accuracy (abs diff within 10s): {test_acc_2}')
print(f'Test Accuracy (abs diff within 60s): {test_acc_3}')

Train Loss: 0.003875887719914317
Train Accuracy (abs diff within 5s): 0.84765625
Train Accuracy (abs diff within 10s): 0.8515625
Train Accuracy (abs diff within 60s): 0.8671875
Test Loss: 0.13839326798915863
Test Accuracy (abs diff within 5s): 0.7979274611398963
Test Accuracy (abs diff within 10s): 0.8031088082901554
Test Accuracy (abs diff within 60s): 0.8290155440414507
