In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
import plotly.graph_objects as go

## Task 1: Predict time used in seconds per hour

### Data Preperation

In [2]:
df = pd.read_csv("../data/processed/lstm_dataset_local.csv", parse_dates=['Start', 'End'])
df['Duration'] = df['Duration'].apply(lambda x: pd.Timedelta(x))
df['Time_diff_sec'] = df['Duration'].apply(lambda x: x.total_seconds())
df = df.drop(columns='Duration')
delta = datetime.timedelta(hours=1)
df['sec_to_next_hr'] = df['Start'].apply(lambda x: ((x+delta).replace(microsecond=0, second=0, minute=0) - x).seconds)

In [3]:
def helper(row):
    delta = datetime.timedelta(hours=1)
    if row['Time_diff_sec'] < row['sec_to_next_hr']:
        row['End'] = row['Start'] + pd.to_timedelta(row['Time_diff_sec'], unit='S')
        return [row]
    row2 = row.copy()
    
    row['End'] = (row['Start']+delta).floor('H')
    row2['Start'] = row['End']
    row2['End'] = (row['End']+delta).floor('H')
    
    row2['Time_diff_sec'] = row['Time_diff_sec'] - row['sec_to_next_hr']
    row2['sec_to_next_hr'] = 3600
    row['Time_diff_sec'] = row['sec_to_next_hr']
    return [row] + helper(row2)

def func(row):
    if row['Time_diff_sec'] > row['sec_to_next_hr']:
        return pd.DataFrame(helper(row))
    return pd.DataFrame([row])

In [4]:
processed_df = pd.concat([func(row) for _, row in df.iterrows()], ignore_index=True)

df_1 = processed_df[processed_df['Value'] == 'firefox.exe'].reset_index()
df_1 = df_1[['Start', 'Time_diff_sec']]
df_1 = df_1.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum()
fig = go.Figure([go.Scatter(x=temp.index, y=temp.values)], layout=layout)
fig.show()

In [5]:
def get_dataset(df, n_steps):
    temp = df.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
    # temp = temp[(temp['Start'] < '2022-12-23') | (temp['Start'] > '2023-01-08')]
    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek)#.astype('category')
    temp['hour'] = temp['Start'].apply(lambda x: x.hour)#.astype('category')
    temp['minute'] = temp['Start'].apply(lambda x: x.minute)
    temp['date'] = temp['Start'].apply(lambda x: x.day)
    temp['month'] = temp['Start'].apply(lambda x: x.month)
    temp = temp.drop(columns='Start')
#     temp = pd.get_dummies(temp).values
    data = temp.values
    
    X, y = [], []
    for i in range(len(data)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = data[i:i+n_steps, 1:], data[i+n_steps-1, 0:1]
        X.append(seq_x)
        y.append(seq_y)
    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), temp, scaler

In [6]:
n_steps = 5
X, y, temp, scaler = get_dataset(df_1, n_steps)

train_size = int(X.shape[0] * 0.8)
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

X_train[0]

array([[ 0., 12.,  0., 12., 12.],
       [ 0., 13.,  0., 12., 12.],
       [ 0., 14.,  0., 12., 12.],
       [ 0., 15.,  0., 12., 12.],
       [ 0., 16.,  0., 12., 12.]])

### Model

In [7]:
def train_model(X_train, num_epochs = 100, n_steps=5):
    feature_shape = X_train.shape[2]

    model = keras.Sequential()
    model.add(LSTM(32, return_sequences=True, input_shape=(n_steps, feature_shape)))
    # model.add(Dropout(0.2))

    # model.add(LSTM(32, return_sequences=True))
    # model.add(Dropout(0.2))

    model.add(LSTM(16, return_sequences=True))
    # model.add(Dropout(0.2))

    model.add(LSTM(16))
    # model.add(Dropout(0.2))

    # model.add(TimeDistributed(Dense(1)))
    model.add(Dense(32))
    model.add(Dense(16))
    model.add(Dense(1))
    opt = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=opt, loss='mse')

    history = model.fit(X_train, y_train,epochs=num_epochs, verbose=0)
    return model, history

### Experiment 1:
- Input:
    - Weekday, Month, Hour, Minute, Date
- Activation Function:
    - LSTM: tanh
    - Dense: None(linear)
- Loss: 
    - Mean Squared Error
- Optimizer: 
    - Adam
- Learning_rate: 
    - 0.001
- Num_epoch: 
    - 100

In [8]:
model_1, history_1 = train_model(X_train)
train_pred = scaler.inverse_transform(model_1.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model_1.predict(X_test, verbose=0))
layout = go.Layout(
    title=go.layout.Title(text='Firefox used in seconds per hour<br><sup>Input: weekday, month, hour, minute, date, MinMaxScaler</sup>'),
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
fig = go.Figure([
    go.Scatter(x=temp['Start'], y=temp['Time_diff_sec'].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[:train_size], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_size:], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.show()

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2023-02-16 18:18:10.982730: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-16 18:18:10.983408: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-02-16 18:18:12.092513: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-02-16 18:18:13.796278: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-16 18:18:14.412021: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-16 18:18:14.810536: I tensorflow/core/grappler/o

In [9]:
loss = history_1.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
fig = go.Figure([
    go.Scatter(x=list(range(100)), y=loss, mode='lines',name = 'Training Loss')
], layout=layout)
fig.show()

In [10]:
model_1.evaluate(X_test, y_test)

2023-02-16 18:19:12.312989: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-16 18:19:12.535955: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-16 18:19:12.607866: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-16 18:19:12.686473: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




0.04316813498735428

## Task 2: Next app duration prediction

In [11]:
df = pd.read_csv("/Users/yikaimao/Desktop/DSC_180B/Intel-capstone/data/processed/lstm_dataset_local.csv")
df['diff_second'] = df['Duration'].apply(lambda x: pd.Timedelta(x).total_seconds())
scaler = MinMaxScaler(feature_range=(0, 1))
df['diff_second_scaled'] = scaler.fit_transform(df[['diff_second']])
df['weekday'] = df['Start'].apply(lambda x: pd.to_datetime(x).dayofweek).astype('category')
df['hour'] = df['Start'].apply(lambda x: pd.to_datetime(x).hour).astype('category')
df['exe'] = df['Value'].astype('category')
# df['target'] = df['diff_second'].shift(-1)
df = df.drop(columns=['Start', 'Value','Duration'])
# df = df.drop(len(df)-1)
X = df[['diff_second_scaled', 'weekday', 'hour', 'exe']]
X = pd.get_dummies(X).to_numpy()
y = df[['diff_second']].values

FileNotFoundError: [Errno 2] No such file or directory: '/Users/yikaimao/Desktop/DSC_180B/Intel-capstone/data/processed/lstm_dataset_local.csv'

In [None]:
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size, :], X[train_size:, :]
y_train, y_test = y[:train_size,:], y[train_size:, :]

In [None]:
lookback = 10
batch_size = 10

train_generator = TimeseriesGenerator(X_train, y_train, length=lookback, batch_size=batch_size)
test_generator = TimeseriesGenerator(X_test, y_test, length=lookback, batch_size=1)

### Model

In [None]:
feature_shape = X.shape[1]

model = keras.Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(lookback, feature_shape)))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128))
model.add(Dropout(0.2))

model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

num_epochs = 25
model.fit_generator(train_generator, epochs=num_epochs, verbose=1)