In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
import plotly.graph_objects as go

2023-02-12 15:14:13.442654: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Task 1: Predict time used in seconds per hour

### Data Preperation

In [2]:
df = pd.read_csv("/Users/yikaimao/Desktop/DSC_180B/Intel-capstone/data/processed/lstm_dataset_local.csv", parse_dates=['Start', 'End'])
df['Duration'] = df['Duration'].apply(lambda x: pd.Timedelta(x))
df['Time_diff_sec'] = df['Duration'].apply(lambda x: x.total_seconds())
df = df.drop(columns='Duration')
delta = datetime.timedelta(hours=1)
df['sec_to_next_hr'] = df['Start'].apply(lambda x: ((x+delta).replace(microsecond=0, second=0, minute=0) - x).seconds)

In [3]:
df.head(10)

Unnamed: 0,Start,Value,End,Time_diff_sec,sec_to_next_hr
0,2022-12-12 12:34:09.893,VsDebugConsole.exe,2022-12-12 12:34:12.896,3.003,1550
1,2022-12-12 12:34:12.896,firefox.exe,2022-12-12 12:52:55.024,1122.128,1547
2,2022-12-12 12:52:55.024,VsDebugConsole.exe,2022-12-12 17:06:02.811,15187.787,424
3,2022-12-12 17:06:02.811,explorer.exe,2022-12-12 17:06:14.814,12.003,3237
4,2022-12-12 17:06:14.814,VsDebugConsole.exe,2022-12-12 17:06:56.818,42.004,3225
5,2022-12-12 17:06:56.818,firefox.exe,2022-12-12 17:06:59.819,3.001,3183
6,2022-12-12 17:06:59.819,Teams.exe,2022-12-12 17:07:02.819,3.0,3180
7,2022-12-12 17:07:02.819,explorer.exe,2022-12-12 17:07:32.823,30.004,3177
8,2022-12-12 17:07:32.823,SearchApp.exe,2022-12-12 17:07:35.824,3.001,3147
9,2022-12-12 17:07:35.824,explorer.exe,2022-12-12 17:07:38.824,3.0,3144


In [4]:
def helper(row):
    delta = datetime.timedelta(hours=1)
    if row['Time_diff_sec'] < row['sec_to_next_hr']:
        row['End'] = row['Start'] + pd.to_timedelta(row['Time_diff_sec'], unit='S')
        return [row]
    row2 = row.copy()
    
    row['End'] = (row['Start']+delta).floor('H')
    row2['Start'] = row['End']
    row2['End'] = (row['End']+delta).floor('H')
    
    row2['Time_diff_sec'] = row['Time_diff_sec'] - row['sec_to_next_hr']
    row2['sec_to_next_hr'] = 3600
    row['Time_diff_sec'] = row['sec_to_next_hr']
    return [row] + helper(row2)

def func(row):
    if row['Time_diff_sec'] > row['sec_to_next_hr']:
        return pd.DataFrame(helper(row))
    return pd.DataFrame([row])

In [5]:
processed_df = pd.concat([func(row) for _, row in df.iterrows()], ignore_index=True)

processed_df.head(10)

Unnamed: 0,Start,Value,End,Time_diff_sec,sec_to_next_hr
0,2022-12-12 12:34:09.893,VsDebugConsole.exe,2022-12-12 12:34:12.896,3.003,1550
1,2022-12-12 12:34:12.896,firefox.exe,2022-12-12 12:52:55.024,1122.128,1547
2,2022-12-12 12:52:55.024,VsDebugConsole.exe,2022-12-12 13:00:00.000,424.0,424
3,2022-12-12 13:00:00.000,VsDebugConsole.exe,2022-12-12 14:00:00.000,3600.0,3600
4,2022-12-12 14:00:00.000,VsDebugConsole.exe,2022-12-12 15:00:00.000,3600.0,3600
5,2022-12-12 15:00:00.000,VsDebugConsole.exe,2022-12-12 16:00:00.000,3600.0,3600
6,2022-12-12 16:00:00.000,VsDebugConsole.exe,2022-12-12 17:00:00.000,3600.0,3600
7,2022-12-12 17:00:00.000,VsDebugConsole.exe,2022-12-12 17:06:03.787,363.787,3600
8,2022-12-12 17:06:02.811,explorer.exe,2022-12-12 17:06:14.814,12.003,3237
9,2022-12-12 17:06:14.814,VsDebugConsole.exe,2022-12-12 17:06:56.818,42.004,3225


In [6]:
df_1 = processed_df[processed_df['Value'] == 'firefox.exe'].reset_index()
df_1 = df_1[['Start', 'Time_diff_sec']]
df_1.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum()
fig = go.Figure([go.Scatter(x=temp.index, y=temp.values)], layout=layout)
fig.show()

In [8]:
def get_dataset(df, n_steps):
    temp = df.groupby(pd.Grouper(key='Start', freq='H')).sum().reset_index()
    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek)#.astype('category')
    temp['hour'] = temp['Start'].apply(lambda x: x.hour)#.astype('category')
    temp['minute'] = temp['Start'].apply(lambda x: x.minute)
    temp['date'] = temp['Start'].apply(lambda x: x.day)
    temp['month'] = temp['Start'].apply(lambda x: x.month)
    temp = temp.drop(columns='Start')
#     temp = pd.get_dummies(temp).values
    temp = temp.values
    
    X, y = [], []
    for i in range(len(temp)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = temp[i:i+n_steps, 1:], temp[i+n_steps-1, 0:1]
        X.append(seq_x)
        y.append(seq_y)
    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), scaler

In [9]:
n_steps = 5
X, y, scaler = get_dataset(df_1, n_steps)

In [10]:
train_size = int(X.shape[0] * 0.8)
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

In [11]:
X_train.shape

(602, 5, 5)

### Model

In [12]:
feature_shape = X_train.shape[2]

model = keras.Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(n_steps, feature_shape)))
# model.add(Dropout(0.2))

model.add(LSTM(32, return_sequences=True))
# model.add(Dropout(0.2))

model.add(LSTM(16, return_sequences=True))
# model.add(Dropout(0.2))

model.add(LSTM(16))
# model.add(Dropout(0.2))

# model.add(TimeDistributed(Dense(1)))
model.add(Dense(32))
model.add(Dense(16))
model.add(Dense(1))
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='mse')

num_epochs = 100
history = model.fit(X_train, y_train,epochs=num_epochs, verbose=2)

2023-02-12 15:15:23.481252: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
19/19 - 6s - loss: 0.0221 - 6s/epoch - 321ms/step
Epoch 2/100
19/19 - 0s - loss: 0.0205 - 150ms/epoch - 8ms/step
Epoch 3/100
19/19 - 0s - loss: 0.0209 - 194ms/epoch - 10ms/step
Epoch 4/100
19/19 - 0s - loss: 0.0208 - 155ms/epoch - 8ms/step
Epoch 5/100
19/19 - 0s - loss: 0.0199 - 140ms/epoch - 7ms/step
Epoch 6/100
19/19 - 0s - loss: 0.0197 - 145ms/epoch - 8ms/step
Epoch 7/100
19/19 - 0s - loss: 0.0201 - 152ms/epoch - 8ms/step
Epoch 8/100
19/19 - 0s - loss: 0.0200 - 192ms/epoch - 10ms/step
Epoch 9/100
19/19 - 0s - loss: 0.0198 - 155ms/epoch - 8ms/step
Epoch 10/100
19/19 - 0s - loss: 0.0198 - 145ms/epoch - 8ms/step
Epoch 11/100
19/19 - 0s - loss: 0.0190 - 139ms/epoch - 7ms/step
Epoch 12/100
19/19 - 0s - loss: 0.0192 - 140ms/epoch - 7ms/step
Epoch 13/100
19/19 - 0s - loss: 0.0190 - 181ms/epoch - 10ms/step
Epoch 14/100
19/19 - 0s - loss: 0.0195 - 180ms/epoch - 9ms/step
Epoch 15/100
19/19 - 0s - loss: 0.0185 - 155ms/epoch - 8ms/step
Epoch 16/100
19/19 - 0s - loss: 0.0184 - 151ms/

In [13]:
train_pred = scaler.inverse_transform(model.predict(X_train, verbose=0))
test_pred = scaler.inverse_transform(model.predict(X_test, verbose=0))

In [15]:
layout = go.Layout(
    title='Firefox used in seconds per hour',
#     xaxis={'title':'Date'},
    yaxis={'title':'Duration(s)'}
)
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum().reset_index()
fig = go.Figure([
    go.Scatter(x=temp['Start'], y=temp['Time_diff_sec'].values, name='data'),
    go.Scatter(x=temp['Start'].iloc[:train_size], y=train_pred[:,0], name='train prediction'),
    go.Scatter(x=temp['Start'].iloc[train_size:], y=test_pred[:,0], name='test prediction')
], layout=layout)
fig.show()
# import plotly.io as pio
# pio.write_image(fig, '../../../experiments/test1.png', width=985, height=525)

In [16]:
loss = history.history['loss']
layout = go.Layout(
    xaxis={'title':'Epoch'},
    yaxis={'title':'Loss'}
)
fig = go.Figure([
    go.Scatter(x=list(range(num_epochs)), y=loss, mode='lines',name = 'Training Loss')
], layout=layout)
fig.show()

In [17]:
model.evaluate(X_test, y_test)



0.03669638931751251

## Task 2: Next app duration prediction

In [86]:
df = pd.read_csv("/Users/yikaimao/Desktop/DSC_180B/Intel-capstone/data/processed/lstm_dataset_local.csv")
df['diff_second'] = df['Duration'].apply(lambda x: pd.Timedelta(x).total_seconds())
scaler = MinMaxScaler(feature_range=(0, 1))
df['diff_second_scaled'] = scaler.fit_transform(df[['diff_second']])
df['weekday'] = df['Start'].apply(lambda x: pd.to_datetime(x).dayofweek).astype('category')
df['hour'] = df['Start'].apply(lambda x: pd.to_datetime(x).hour).astype('category')
df['exe'] = df['Value'].astype('category')
# df['target'] = df['diff_second'].shift(-1)
df = df.drop(columns=['Start', 'Value','Duration'])
# df = df.drop(len(df)-1)
X = df[['diff_second_scaled', 'weekday', 'hour', 'exe']]
X = pd.get_dummies(X).to_numpy()
y = df[['diff_second']].values

In [87]:
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size, :], X[train_size:, :]
y_train, y_test = y[:train_size,:], y[train_size:, :]

In [88]:
lookback = 10
batch_size = 10

train_generator = TimeseriesGenerator(X_train, y_train, length=lookback, batch_size=batch_size)
test_generator = TimeseriesGenerator(X_test, y_test, length=lookback, batch_size=1)

### Model

In [90]:
feature_shape = X.shape[1]

model = keras.Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(lookback, feature_shape)))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128))
model.add(Dropout(0.2))

model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

num_epochs = 25
model.fit_generator(train_generator, epochs=num_epochs, verbose=1)

Epoch 1/25


  model.fit_generator(train_generator, epochs=num_epochs, verbose=1)


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fbb64954a30>