In [114]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
import plotly.graph_objects as go

## Task 1: Predict time used in seconds per hour

### Data Preperation

In [92]:
df = pd.read_csv("/Users/yikaimao/Desktop/DSC_180B/Intel-capstone/data/processed/lstm_dataset_local.csv", parse_dates=['Start', 'End'])
df['Duration'] = df['Duration'].apply(lambda x: pd.Timedelta(x))
df['Time_diff_sec'] = df['Duration'].apply(lambda x: x.total_seconds())
# df['Time_diff_min'] = df['Time_diff_sec'] / 60

In [93]:
delta = datetime.timedelta(hours=1)
df['sec_to_next_hr'] = df['Start'].apply(lambda x: ((x+delta).replace(microsecond=0, second=0, minute=0) - x).seconds)

In [103]:
def helper(row):
    delta = datetime.timedelta(hours=1)
    if row['Time_diff_sec'] < row['sec_to_next_hr']:
        row['End'] = row['Start'] + pd.to_timedelta(row['Time_diff_sec'], unit='S')
        return [row]
    row2 = row.copy()
    
    row['End'] = (row['Start']+delta).floor('H')
    row2['Start'] = row['End']
    row2['End'] = (row['End']+delta).floor('H')
    
    row2['Time_diff_sec'] = row['Time_diff_sec'] - row['sec_to_next_hr']
    row2['sec_to_next_hr'] = 3600
    row['Time_diff_sec'] = row['sec_to_next_hr']
    return [row] + helper(row2)

def func(row):
    if row['Time_diff_sec'] > row['sec_to_next_hr']:
        return pd.DataFrame(helper(row))
    return pd.DataFrame([row])

In [105]:
df = pd.concat([func(row) for _, row in df.iterrows()], ignore_index=True)
df

Unnamed: 0,Start,Value,End,Duration,Time_diff_sec,sec_to_next_hr
0,2022-12-12 12:34:09.893,VsDebugConsole.exe,2022-12-12 12:34:12.896,0 days 00:00:03.003000,3.003,1550
1,2022-12-12 12:34:12.896,firefox.exe,2022-12-12 12:52:55.024,0 days 00:18:42.128000,1122.128,1547
2,2022-12-12 12:52:55.024,VsDebugConsole.exe,2022-12-12 13:00:00.000,0 days 04:13:07.787000,424.000,424
3,2022-12-12 13:00:00.000,VsDebugConsole.exe,2022-12-12 14:00:00.000,0 days 04:13:07.787000,3600.000,3600
4,2022-12-12 14:00:00.000,VsDebugConsole.exe,2022-12-12 15:00:00.000,0 days 04:13:07.787000,3600.000,3600
...,...,...,...,...,...,...
4979,2023-01-13 01:29:11.877,LeagueClientUx.exe,2023-01-13 01:52:43.174,0 days 00:23:31.297000,1411.297,1848
4980,2023-01-13 01:52:43.174,firefox.exe,2023-01-13 02:00:00.000,0 days 00:31:02.745000,436.000,436
4981,2023-01-13 02:00:00.000,firefox.exe,2023-01-13 02:23:46.745,0 days 00:31:02.745000,1426.745,3600
4982,2023-01-13 02:23:45.919,LeagueClientUx.exe,2023-01-13 02:23:49.144,0 days 00:00:03.225000,3.225,2174


In [158]:
df_1 = df[df['Value'] == 'firefox.exe']
df_1

Unnamed: 0,Start,Value,End,Duration,Time_diff_sec,sec_to_next_hr
1,2022-12-12 12:34:12.896,firefox.exe,2022-12-12 12:52:55.024,0 days 00:18:42.128000,1122.128,1547
10,2022-12-12 17:06:56.818,firefox.exe,2022-12-12 17:06:59.819,0 days 00:00:03.001000,3.001,3183
15,2022-12-12 17:07:38.824,firefox.exe,2022-12-12 17:07:47.825,0 days 00:00:09.001000,9.001,3141
17,2022-12-12 17:07:59.827,firefox.exe,2022-12-12 17:08:05.828,0 days 00:00:06.001000,6.001,3120
20,2022-12-12 17:08:38.832,firefox.exe,2022-12-12 17:08:56.834,0 days 00:00:18.002000,18.002,3081
...,...,...,...,...,...,...
4975,2023-01-13 01:14:24.240,firefox.exe,2023-01-13 01:14:31.843,0 days 00:00:07.603000,7.603,2735
4977,2023-01-13 01:20:11.730,firefox.exe,2023-01-13 01:20:13.327,0 days 00:00:01.597000,1.597,2388
4980,2023-01-13 01:52:43.174,firefox.exe,2023-01-13 02:00:00.000,0 days 00:31:02.745000,436.000,436
4981,2023-01-13 02:00:00.000,firefox.exe,2023-01-13 02:23:46.745,0 days 00:31:02.745000,1426.745,3600


In [159]:
temp = df_1.groupby(pd.Grouper(key='Start', freq='H'))['Time_diff_sec'].sum()
fig = go.Figure([go.Scatter(x=temp.index, y=temp.values)])
fig.show()

In [172]:
def get_dataset(df, n_steps):
    temp = df[['Start', 'Time_diff_sec']].copy()
    temp['weekday'] = temp['Start'].apply(lambda x: x.dayofweek).astype('category')
    temp['hour'] = temp['Start'].apply(lambda x: x.hour).astype('category')
    temp = temp.drop(columns='Start')
    temp = pd.get_dummies(temp).values
    
    X, y = [], []
    for i in range(len(temp)-n_steps-1):
        # gather input and output parts of the pattern
        seq_x, seq_y = temp[i:i+n_steps, 1:], temp[i+n_steps-1, 0]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [180]:
n_steps = 5
X, y = get_dataset(df_1, n_steps)

In [185]:
a, b = get_dataset(df_1, 1)

In [181]:
y[0]

18.002

In [182]:
train_size = int(X.shape[0] * 0.8)
test_size = X.shape[1] - train_size
X_train, X_test = X[:train_size, :, :], X[train_size:, :, :]
y_train, y_test = y[:train_size], y[train_size:]

In [183]:
X_train.shape

(1025, 5, 29)

### Model

In [189]:
feature_shape = X_train.shape[2]

model = keras.Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(n_steps, feature_shape)))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

# model.add(LSTM(128))
# model.add(Dropout(0.2))

# model.add(TimeDistributed(Dense(1)))
model.add(Dense(1))
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss='mse')

num_epochs = 100
model.fit(X_train, y_train, epochs=num_epochs, batch_size=10, verbose=2)

Epoch 1/100
103/103 - 4s - loss: 169834.0781 - 4s/epoch - 43ms/step
Epoch 2/100
103/103 - 1s - loss: 166941.8906 - 687ms/epoch - 7ms/step
Epoch 3/100
103/103 - 1s - loss: 166638.1094 - 690ms/epoch - 7ms/step
Epoch 4/100
103/103 - 1s - loss: 166612.6719 - 708ms/epoch - 7ms/step
Epoch 5/100
103/103 - 1s - loss: 166600.9375 - 727ms/epoch - 7ms/step
Epoch 6/100
103/103 - 1s - loss: 166636.6250 - 725ms/epoch - 7ms/step
Epoch 7/100


KeyboardInterrupt: 

In [130]:
result = model.predict(X_test, verbose=0)

ValueError: in user code:

    File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/keras/engine/training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/keras/engine/training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/keras/engine/training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/keras/engine/training.py", line 1983, in predict_step
        return self(x, training=False)
    File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_8" is incompatible with the layer: expected shape=(None, 3987, 30), found shape=(None, 997, 30)


## Task 2: Next app duration prediction

In [86]:
df = pd.read_csv("/Users/yikaimao/Desktop/DSC_180B/Intel-capstone/data/processed/lstm_dataset_local.csv")
df['diff_second'] = df['Duration'].apply(lambda x: pd.Timedelta(x).total_seconds())
scaler = MinMaxScaler(feature_range=(0, 1))
df['diff_second_scaled'] = scaler.fit_transform(df[['diff_second']])
df['weekday'] = df['Start'].apply(lambda x: pd.to_datetime(x).dayofweek).astype('category')
df['hour'] = df['Start'].apply(lambda x: pd.to_datetime(x).hour).astype('category')
df['exe'] = df['Value'].astype('category')
# df['target'] = df['diff_second'].shift(-1)
df = df.drop(columns=['Start', 'Value','Duration'])
# df = df.drop(len(df)-1)
X = df[['diff_second_scaled', 'weekday', 'hour', 'exe']]
X = pd.get_dummies(X).to_numpy()
y = df[['diff_second']].values

In [87]:
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size
X_train, X_test = X[:train_size, :], X[train_size:, :]
y_train, y_test = y[:train_size,:], y[train_size:, :]

In [88]:
lookback = 10
batch_size = 10

train_generator = TimeseriesGenerator(X_train, y_train, length=lookback, batch_size=batch_size)
test_generator = TimeseriesGenerator(X_test, y_test, length=lookback, batch_size=1)

### Model

In [90]:
feature_shape = X.shape[1]

model = keras.Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(lookback, feature_shape)))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128))
model.add(Dropout(0.2))

model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

num_epochs = 25
model.fit_generator(train_generator, epochs=num_epochs, verbose=1)

Epoch 1/25


  model.fit_generator(train_generator, epochs=num_epochs, verbose=1)


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fbb64954a30>