In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
import plotly.graph_objects as go

2023-02-16 18:34:21.560755: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Task 1: Predict time used in seconds per hour

### Data and Model Preparation

In [2]:
# Read the processed activity log; 'Start' and 'End' are parsed into timestamps on load.
df = pd.read_csv("../data/processed/lstm_dataset_local.csv", parse_dates=['Start', 'End'])

# Turn each 'Duration' entry into a Timedelta, keep its length in seconds,
# then discard the original column.
df['Duration'] = df['Duration'].apply(pd.Timedelta)
df['Time_diff_sec'] = df['Duration'].apply(lambda duration: duration.total_seconds())
df = df.drop(columns='Duration')

# Whole seconds from each 'Start' to the top of the following hour
# (3600 when 'Start' already sits exactly on the hour).
delta = datetime.timedelta(hours=1)
df['sec_to_next_hr'] = df['Start'].apply(
    lambda start: ((start + delta).replace(minute=0, second=0, microsecond=0) - start).seconds
)

In [3]:
def helper(row):
    """Split one activity record at every hour boundary it crosses.

    Args:
        row: a pandas Series with at least the keys 'Start' (timestamp),
            'Time_diff_sec' (activity length in seconds) and 'sec_to_next_hr'
            (seconds from 'Start' to the next full hour). The caller's row is
            not modified.

    Returns:
        A list of Series in chronological order. Each one carries the original
        fields with 'Start', 'End', 'Time_diff_sec' and 'sec_to_next_hr'
        rewritten so that the segment lies within a single clock hour; the
        'Time_diff_sec' values add up to the input's 'Time_diff_sec'.
    """
    one_hour = datetime.timedelta(hours=1)
    segments = []
    # Work on a copy so the caller's row is left untouched.
    current = row.copy()

    # Iterative rather than recursive: an activity spanning many hours would
    # otherwise need one stack frame per hour.
    # Strict '>' means an activity ending exactly on the boundary is kept whole
    # instead of producing an extra zero-length segment in the next hour.
    while current['Time_diff_sec'] > current['sec_to_next_hr']:
        boundary = (current['Start'] + one_hour).floor('h')

        head = current.copy()
        head['End'] = boundary
        head['Time_diff_sec'] = current['sec_to_next_hr']
        segments.append(head)

        # What is left starts on the boundary and has a full hour ahead of it.
        current['Time_diff_sec'] = current['Time_diff_sec'] - current['sec_to_next_hr']
        current['Start'] = boundary
        current['sec_to_next_hr'] = 3600

    # Final (or only) segment: it ends when its remaining duration runs out.
    current['End'] = current['Start'] + pd.to_timedelta(current['Time_diff_sec'], unit='s')
    segments.append(current)
    return segments

def func(row):
    """Return a DataFrame holding `row`, split per clock hour when it overruns the hour.

    A row whose 'Time_diff_sec' exceeds its 'sec_to_next_hr' is handed to
    `helper` and becomes several rows; any other row is returned as a
    one-row frame, unchanged.
    """
    crosses_hour = row['Time_diff_sec'] > row['sec_to_next_hr']
    pieces = helper(row) if crosses_hour else [row]
    return pd.DataFrame(pieces)

In [4]:
# Split every activity at hour boundaries so each resulting row lies within one clock hour.
processed_df = pd.concat([func(row) for _, row in df.iterrows()], ignore_index=True)

# Hourly Firefox usage: total 'Time_diff_sec' per clock hour.
# Selecting the two needed columns with .loc avoids creating a throw-away 'index' column.
firefox_rows = processed_df['Value'] == 'firefox.exe'
df_1 = (
    processed_df.loc[firefox_rows, ['Start', 'Time_diff_sec']]
    .groupby(pd.Grouper(key='Start', freq='h'))  # 'h': the uppercase 'H' alias is deprecated
    .sum()
    .reset_index()
)

layout = go.Layout(
    title='Firefox used in seconds per hour',
    yaxis={'title':'Duration(s)'}
)
# df_1 is already aggregated per hour, so it is plotted directly instead of being grouped again.
temp = df_1.set_index('Start')['Time_diff_sec']
fig = go.Figure([go.Scatter(x=temp.index, y=temp.values)], layout=layout)
fig.show()

### Experiment 1:
- Input:
    - Weekday, Month, Hour, Minute, Date
- Activation Function:
    - LSTM: tanh
    - Dense: None(linear)
- Loss: 
    - Mean Squared Error
- Optimizer: 
    - Adam
- Learning_rate: 
    - 0.001
- Num_epoch: 
    - 100

In [5]:
def get_dataset(df, n_steps):
    """Build sliding-window samples of calendar features and scaled hourly targets.

    `df` is summed per clock hour on its 'Start' column. The first column that
    remains after dropping 'Start' is used as the target; weekday, hour,
    minute, day of month and month of each hourly bin are appended as
    features. Window i covers hourly rows i .. i+n_steps-1 and its target is
    the value of the LAST row of that window (row i+n_steps-1).

    Args:
        df: frame with a datetime 'Start' column. Assumes exactly one other
            (numeric) column, which becomes the target -- as with the
            ['Start', 'Time_diff_sec'] frame this notebook passes in.
        n_steps: number of consecutive hourly rows per window (>= 1).

    Returns:
        (X, y, temp, scaler) where X has shape (windows, n_steps, 5), y has
        shape (windows, 1) and is min-max scaled, temp is the hourly table
        (target + feature columns) and scaler is the fitted MinMaxScaler.

    Raises:
        ValueError: if n_steps < 1 or the hourly series has fewer than
            n_steps rows.

    NOTE: the scaler is fit on every target produced here. A caller that
    splits X/y into train and test afterwards therefore scales with statistics
    that include the held-out part.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    temp = df.groupby(pd.Grouper(key='Start', freq='h')).sum().reset_index()
    temp['weekday'] = temp['Start'].dt.dayofweek
    temp['hour'] = temp['Start'].dt.hour
    temp['minute'] = temp['Start'].dt.minute
    temp['date'] = temp['Start'].dt.day
    temp['month'] = temp['Start'].dt.month
    temp = temp.drop(columns='Start')
    data = temp.values

    # Every start index whose window still fits inside the series. The previous
    # bound (len - n_steps - 1) silently dropped the last two valid windows.
    n_windows = len(data) - n_steps + 1
    if n_windows < 1:
        raise ValueError(
            f"need at least n_steps={n_steps} hourly rows, got {len(data)}"
        )

    # Column 0 is the target, the remaining columns are the features.
    X = [data[i:i + n_steps, 1:] for i in range(n_windows)]
    y = [data[i + n_steps - 1, 0:1] for i in range(n_windows)]

    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), temp, scaler

def train_model(X_train, y_train, num_epochs = 100, n_steps=5):
    """Build and fit the stacked-LSTM regressor used in this experiment.

    Architecture: LSTM(32) -> LSTM(16) -> LSTM(16) -> Dense(32) -> Dense(16)
    -> Dense(1); the Dense layers are given no activation. Compiled with Adam
    (learning rate 0.001) and mean-squared-error loss.

    Args:
        X_train: array of shape (samples, timesteps, features).
        y_train: training targets, one row per sample.
        num_epochs: number of epochs passed to `model.fit`.
        n_steps: kept for backward compatibility only. The timestep count is
            read from `X_train` so the network always matches the data, even
            when the window length used to build `X_train` is not 5.

    Returns:
        (model, history): the fitted model and the object returned by
        `model.fit`.
    """
    # Take (timesteps, features) from the data instead of trusting a separate
    # argument that can drift out of sync with it.
    timesteps, feature_shape = X_train.shape[1], X_train.shape[2]

    model = keras.Sequential()
    # The first two LSTM layers return full sequences so the next LSTM receives
    # one vector per timestep; the last one returns only its final state.
    model.add(LSTM(32, return_sequences=True, input_shape=(timesteps, feature_shape)))
    model.add(LSTM(16, return_sequences=True))
    model.add(LSTM(16))

    model.add(Dense(32))
    model.add(Dense(16))
    model.add(Dense(1))

    opt = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=opt, loss='mse')

    history = model.fit(X_train, y_train, epochs=num_epochs, verbose=0)
    return model, history

In [6]:
# Window length: number of consecutive hourly rows per sample.
n_steps = 5
X, y, temp, scaler = get_dataset(df_1, n_steps)

# Ordered 80/20 split: the first 80% of the windows are used for training,
# the remaining windows are held out.
train_size = int(X.shape[0] * 0.8)
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# Show the first training window as a sanity check.
X_train[0]

array([[ 0., 12.,  0., 12., 12.],
       [ 0., 13.,  0., 12., 12.],
       [ 0., 14.,  0., 12., 12.],
       [ 0., 15.,  0., 12., 12.],
       [ 0., 16.,  0., 12., 12.]])

In [7]:
# Pass the window length of the data explicitly so model and data cannot disagree.
model, history = train_model(X_train, y_train, n_steps=X_train.shape[1])

# Undo the min-max scaling of the targets so predictions are on the original scale.
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))

layout = go.Layout(
    title='Firefox used in seconds per hour',
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='h'))['Time_diff_sec'].sum().reset_index()

# get_dataset pairs window i with the target at hourly row i + n_steps - 1, so
# prediction i has to be drawn at that row's timestamp, not at row i.
first_target = X_train.shape[1] - 1
train_stop = first_target + len(train_pred)
test_stop = train_stop + len(test_pred)

fig = go.Figure([
    go.Scatter(x=temp['Start'], y=temp['Time_diff_sec'].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[first_target:train_stop], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_stop:test_stop], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.show()

2023-02-16 18:34:30.762499: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
loss = history.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
# One x position per recorded epoch, so the curve stays correct when the
# number of training epochs is not 100.
fig = go.Figure([
    go.Scatter(x=list(range(len(loss))), y=loss, mode='lines', name='Training Loss')
], layout=layout)
fig.show()

In [9]:
# Loss of the fitted model on each split; targets are the min-max-scaled values,
# so the numbers are on the scaled target scale.
train_loss = model.evaluate(X_train, y_train, verbose=0)
test_loss = model.evaluate(X_test, y_test, verbose=0)

print(f'Train Loss: {train_loss}', f'Test Loss: {test_loss}', sep='\n')

Train Loss: 0.008156168274581432
Test Loss: 0.03122629038989544


### Experiment 2: Experiment 1 without month

In [10]:
def get_dataset(df, n_steps):
    """Build sliding-window samples for Experiment 2 (no month feature).

    `df` is summed per clock hour on its 'Start' column. The first column that
    remains after dropping 'Start' is used as the target; weekday, hour,
    minute and day of month of each hourly bin are appended as features.
    Window i covers hourly rows i .. i+n_steps-1 and its target is the value
    of the LAST row of that window (row i+n_steps-1).

    Args:
        df: frame with a datetime 'Start' column. Assumes exactly one other
            (numeric) column, which becomes the target -- as with the
            ['Start', 'Time_diff_sec'] frame this notebook passes in.
        n_steps: number of consecutive hourly rows per window (>= 1).

    Returns:
        (X, y, temp, scaler) where X has shape (windows, n_steps, 4), y has
        shape (windows, 1) and is min-max scaled, temp is the hourly table
        (target + feature columns) and scaler is the fitted MinMaxScaler.

    Raises:
        ValueError: if n_steps < 1 or the hourly series has fewer than
            n_steps rows.

    NOTE: the scaler is fit on every target produced here. A caller that
    splits X/y into train and test afterwards therefore scales with statistics
    that include the held-out part.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    temp = df.groupby(pd.Grouper(key='Start', freq='h')).sum().reset_index()
    temp['weekday'] = temp['Start'].dt.dayofweek
    temp['hour'] = temp['Start'].dt.hour
    temp['minute'] = temp['Start'].dt.minute
    temp['date'] = temp['Start'].dt.day
    temp = temp.drop(columns='Start')
    data = temp.values

    # Every start index whose window still fits inside the series. The previous
    # bound (len - n_steps - 1) silently dropped the last two valid windows.
    n_windows = len(data) - n_steps + 1
    if n_windows < 1:
        raise ValueError(
            f"need at least n_steps={n_steps} hourly rows, got {len(data)}"
        )

    # Column 0 is the target, the remaining columns are the features.
    X = [data[i:i + n_steps, 1:] for i in range(n_windows)]
    y = [data[i + n_steps - 1, 0:1] for i in range(n_windows)]

    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), temp, scaler

def train_model(X_train, y_train, num_epochs = 100, n_steps=5):
    """Build and fit the stacked-LSTM regressor used in this experiment.

    Architecture: LSTM(32) -> LSTM(16) -> LSTM(16) -> Dense(32) -> Dense(16)
    -> Dense(1); the Dense layers are given no activation. Compiled with Adam
    (learning rate 0.001) and mean-squared-error loss.

    Args:
        X_train: array of shape (samples, timesteps, features).
        y_train: training targets, one row per sample.
        num_epochs: number of epochs passed to `model.fit`.
        n_steps: kept for backward compatibility only. The timestep count is
            read from `X_train` so the network always matches the data, even
            when the window length used to build `X_train` is not 5.

    Returns:
        (model, history): the fitted model and the object returned by
        `model.fit`.
    """
    # Take (timesteps, features) from the data instead of trusting a separate
    # argument that can drift out of sync with it.
    timesteps, feature_shape = X_train.shape[1], X_train.shape[2]

    model = keras.Sequential()
    # The first two LSTM layers return full sequences so the next LSTM receives
    # one vector per timestep; the last one returns only its final state.
    model.add(LSTM(32, return_sequences=True, input_shape=(timesteps, feature_shape)))
    model.add(LSTM(16, return_sequences=True))
    model.add(LSTM(16))

    model.add(Dense(32))
    model.add(Dense(16))
    model.add(Dense(1))

    opt = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=opt, loss='mse')

    history = model.fit(X_train, y_train, epochs=num_epochs, verbose=0)
    return model, history

In [11]:
# Window length: number of consecutive hourly rows per sample.
n_steps = 5
X, y, temp, scaler = get_dataset(df_1, n_steps)

# Ordered 80/20 split: the first 80% of the windows are used for training,
# the remaining windows are held out.
train_size = int(X.shape[0] * 0.8)
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# Show the first training window as a sanity check.
X_train[0]

array([[ 0., 12.,  0., 12.],
       [ 0., 13.,  0., 12.],
       [ 0., 14.,  0., 12.],
       [ 0., 15.,  0., 12.],
       [ 0., 16.,  0., 12.]])

In [12]:
# Pass the window length of the data explicitly so model and data cannot disagree.
model, history = train_model(X_train, y_train, n_steps=X_train.shape[1])

# Undo the min-max scaling of the targets so predictions are on the original scale.
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))

layout = go.Layout(
    title='Firefox used in seconds per hour',
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='h'))['Time_diff_sec'].sum().reset_index()

# get_dataset pairs window i with the target at hourly row i + n_steps - 1, so
# prediction i has to be drawn at that row's timestamp, not at row i.
first_target = X_train.shape[1] - 1
train_stop = first_target + len(train_pred)
test_stop = train_stop + len(test_pred)

fig = go.Figure([
    go.Scatter(x=temp['Start'], y=temp['Time_diff_sec'].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[first_target:train_stop], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_stop:test_stop], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.show()

In [13]:
loss = history.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
# One x position per recorded epoch, so the curve stays correct when the
# number of training epochs is not 100.
fig = go.Figure([
    go.Scatter(x=list(range(len(loss))), y=loss, mode='lines', name='Training Loss')
], layout=layout)
fig.show()

In [14]:
# Loss of the fitted model on each split; targets are the min-max-scaled values,
# so the numbers are on the scaled target scale.
train_loss = model.evaluate(X_train, y_train, verbose=0)
test_loss = model.evaluate(X_test, y_test, verbose=0)

print(f'Train Loss: {train_loss}', f'Test Loss: {test_loss}', sep='\n')

Train Loss: 0.012885477393865585
Test Loss: 0.034654200077056885
