In [2]:
import pandas as pd
import os
import json
from tensorflow import keras
from keras.layers import LSTM, Dense, Dropout, TimeDistributed
import plotly.graph_objects as go
import pandas as pd
import os
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

2023-02-16 17:30:55.706449: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def row_helper(row):
    """Split one usage record into pieces that each stay within a clock hour.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``'Start'`` (a ``pandas.Timestamp``), ``'Time_diff_sec'``
        (length of the record in seconds) and ``'sec_to_next_hr'`` (seconds
        from ``'Start'`` to the next full hour). ``'End'`` is overwritten.

    Returns
    -------
    list of pandas.Series
        Copies of ``row`` in chronological order. Every piece except the last
        ends on an hour boundary; the ``'Time_diff_sec'`` values of the pieces
        add up to the input's ``'Time_diff_sec'``. The input ``row`` itself is
        left untouched.
    """
    one_hour = datetime.timedelta(hours=1)
    pieces = []
    remaining = row.copy()  # work on a copy so the caller's row is not modified

    # Iterative rather than recursive so very long records cannot exhaust the
    # recursion limit. The strict '>' means a record that ends exactly on the
    # hour boundary is NOT split again (no zero-length trailing piece).
    while remaining['Time_diff_sec'] > remaining['sec_to_next_hr']:
        boundary = (remaining['Start'] + one_hour).floor('h')

        head = remaining.copy()
        head['End'] = boundary
        head['Time_diff_sec'] = remaining['sec_to_next_hr']
        pieces.append(head)

        # Whatever is left starts on the boundary with a full hour ahead of it.
        remaining['Time_diff_sec'] = remaining['Time_diff_sec'] - remaining['sec_to_next_hr']
        remaining['Start'] = boundary
        remaining['sec_to_next_hr'] = 3600

    remaining['End'] = remaining['Start'] + pd.to_timedelta(remaining['Time_diff_sec'], unit='s')
    pieces.append(remaining)
    return pieces

def clean_row(row):
    """Return ``row`` as a DataFrame, split per hour when it crosses an hour.

    A row whose ``'Time_diff_sec'`` is greater than its ``'sec_to_next_hr'``
    is handed to ``row_helper`` and the resulting pieces become the rows of
    the frame; any other row is returned unchanged as a one-row frame.
    """
    crosses_hour = row['Time_diff_sec'] > row['sec_to_next_hr']
    pieces = row_helper(row) if crosses_hour else [row]
    return pd.DataFrame(pieces)

def clean_dataset(file_path):
    """Load the usage CSV and split every record at hour boundaries.

    The CSV must contain ``'Start'`` and ``'End'`` (parsed as datetimes) and
    ``'Duration'`` (anything ``pandas.to_timedelta`` accepts). ``'Duration'``
    is replaced by ``'Time_diff_sec'`` (seconds), and ``'sec_to_next_hr'``
    (whole seconds from ``'Start'`` to the next full hour) is added.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per (record, clock hour) piece with a fresh 0..n-1 index.
        A CSV without data rows yields an empty frame with the same columns
        instead of failing inside ``pd.concat``.
    """
    df = pd.read_csv(file_path, parse_dates=['Start', 'End'])
    # Vectorised replacement for the per-element Timedelta/total_seconds calls.
    df['Time_diff_sec'] = pd.to_timedelta(df['Duration']).dt.total_seconds()
    df = df.drop(columns='Duration')

    one_hour = datetime.timedelta(hours=1)

    def seconds_until_next_hour(start):
        # Top of the following hour minus the start; `.seconds` keeps whole seconds only.
        next_hour = (start + one_hour).replace(microsecond=0, second=0, minute=0)
        return (next_hour - start).seconds

    df['sec_to_next_hr'] = df['Start'].apply(seconds_until_next_hour)

    pieces = [clean_row(row) for _, row in df.iterrows()]
    if not pieces:
        # pd.concat([]) raises "No objects to concatenate"; nothing to split here.
        return df
    return pd.concat(pieces, ignore_index=True)

def get_dataset(df, lookback):
    """Turn a time-ordered frame into sliding-window samples.

    Calendar features (``weekday``, ``hour``, ``minute``, ``date``, ``month``)
    are derived from ``'Start'``, which is then dropped. Of the remaining
    columns, the first one is the target and all others are input features.
    The target of a window is the first-column value at the window's LAST
    row, i.e. at the same time step as the last feature row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``'Start'`` column. The frame is not modified.
    lookback : int
        Number of consecutive rows per window.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, lookback, n_features)
    y : numpy.ndarray, shape (n_windows, 1)
        Targets scaled with the returned scaler.
    scaler : MinMaxScaler
        Fitted on every target produced here.
        NOTE(review): callers that split train/test afterwards therefore
        scale with statistics that include the test targets.

    Raises
    ------
    ValueError
        If ``lookback`` is smaller than 1 or ``df`` has fewer than
        ``lookback`` rows.
    """
    starts = pd.to_datetime(df['Start'])
    # assign() builds a new frame, so the caller's DataFrame keeps its columns.
    features = df.assign(
        weekday=starts.dt.dayofweek,
        hour=starts.dt.hour,
        minute=starts.dt.minute,
        date=starts.dt.day,
        month=starts.dt.month,
    ).drop(columns='Start')
    values = features.values

    # A window covers rows i .. i+lookback-1, so the last valid start index is
    # len - lookback (the previous bound silently dropped the last two windows).
    n_windows = len(values) - lookback + 1
    if lookback < 1 or n_windows < 1:
        raise ValueError(
            f'Need lookback >= 1 and at least lookback rows; '
            f'got lookback={lookback} with {len(values)} rows.'
        )

    X = [values[i:i + lookback, 1:] for i in range(n_windows)]
    y = [values[i + lookback - 1, 0:1] for i in range(n_windows)]

    scaler = MinMaxScaler()
    scaler.fit(y)
    y = scaler.transform(y)
    return np.array(X), np.array(y), scaler

In [4]:
class LSTM_2:
    """Stacked-LSTM regressor for the seconds per hour one executable was used.

    ``args`` is a dict. Keys read by this class:

    * ``'exe_name'``       -- value of the ``'Value'`` column to keep
    * ``'lookback'``       -- window length passed to ``get_dataset``
    * ``'learning_rate'``  -- Adam learning rate
    * ``'loss'``           -- loss passed to ``model.compile``
    * ``'epochs'``         -- number of training epochs
    * ``'file_path'``      -- optional; CSV to load instead of ``DEFAULT_FILE_PATH``
    """

    # Original hard-coded location, kept as the fallback for existing callers.
    DEFAULT_FILE_PATH = "/Users/yikaimao/Desktop/DSC_180B/Intel-capstone/data/processed/lstm_dataset_local.csv"

    def __init__(self, args) -> None:
        self.args = args

    def _hourly_usage(self, df):
        """Filter ``df`` to ``args['exe_name']`` and sum numeric columns per hour.

        ``'Start'`` comes first and ``'Time_diff_sec'`` second in the result,
        so that after ``get_dataset`` drops ``'Start'`` the target (its first
        remaining column) is ``'Time_diff_sec'`` -- the series ``evaluate``
        plots as ground truth.
        """
        # drop=True: otherwise the old row index becomes a leading 'index'
        # column, is summed per hour, and ends up being used as the target.
        selected = df[df['Value'] == self.args['exe_name']].reset_index(drop=True)
        if selected.empty:
            raise ValueError(f"No rows with Value == {self.args['exe_name']!r}")

        # numeric_only=True keeps datetime/string columns out of the sum.
        hourly = (
            selected
            .groupby(pd.Grouper(key='Start', freq='h'))
            .sum(numeric_only=True)
            .reset_index()
        )
        leading = ['Start', 'Time_diff_sec']
        trailing = [col for col in hourly.columns if col not in leading]
        return hourly[leading + trailing]

    def _prediction_times(self, n_train, n_test):
        """Return the ``'Start'`` values matching the train and test predictions.

        Sample ``k`` from ``get_dataset`` targets row ``k + lookback - 1`` of
        ``self.input_df``, so predictions are shifted by ``lookback - 1`` rows
        relative to the start of the frame.
        """
        offset = self.args['lookback'] - 1
        starts = self.input_df['Start']
        train_times = starts.iloc[offset:offset + n_train]
        test_times = starts.iloc[offset + n_train:offset + n_train + n_test]
        return train_times, test_times

    def train(self):
        """Load the data, build the network, fit it and plot the training loss.

        Sets ``input_df``, ``scaler``, ``train_size``, ``X_train``/``X_test``,
        ``y_train``/``y_test``, ``model`` and ``history`` on the instance.
        The first 80% of the windows (in time order) are used for training.
        """
        file_path = self.args.get('file_path', self.DEFAULT_FILE_PATH)
        print('Processing dataset...')
        df = clean_dataset(file_path)
        self.input_df = self._hourly_usage(df)
        X, y, self.scaler = get_dataset(self.input_df, self.args['lookback'])

        self.train_size = int(X.shape[0] * 0.8)
        self.X_train, self.X_test = X[:self.train_size, :, :], X[self.train_size:, :, :]
        self.y_train, self.y_test = y[:self.train_size], y[self.train_size:]

        feature_shape = self.X_train.shape[2]

        # Four stacked LSTM layers followed by three dense layers down to one output.
        self.model = keras.Sequential()
        self.model.add(LSTM(32, return_sequences=True, input_shape=(self.args['lookback'], feature_shape)))
        self.model.add(LSTM(32, return_sequences=True))
        self.model.add(LSTM(16, return_sequences=True))
        self.model.add(LSTM(16))
        self.model.add(Dense(32))
        self.model.add(Dense(16))
        self.model.add(Dense(1))
        opt = keras.optimizers.Adam(learning_rate=self.args['learning_rate'])
        self.model.compile(optimizer=opt, loss=self.args['loss'])

        print('Training model...')
        self.history = self.model.fit(self.X_train, self.y_train, epochs=self.args['epochs'], verbose=2)
        print('Finished training.')

        train_loss = self.model.evaluate(self.X_train, self.y_train)
        print(f'Total loss: {train_loss}')

        print('Plotting the loss over epoch')
        loss = self.history.history['loss']
        loss_layout = go.Layout(
            title='Loss plot',
            xaxis={'title':'Epoch'},
            yaxis={'title':'Loss'}
        )
        loss_fig = go.Figure([
            # x follows the recorded history so both axes always have equal length.
            go.Scatter(x=list(range(len(loss))), y=loss, mode='lines',name = 'training loss')
        ], layout=loss_layout)
        loss_fig.show()
        # TODO: saving the model, history and figures to disk is not implemented.

    def evaluate(self):
        """Print the test loss and plot predictions against the hourly ground truth.

        Requires ``train`` to have been called first. Predictions are mapped
        back to seconds with the scaler returned by ``get_dataset``.
        """
        test_loss = self.model.evaluate(self.X_test, self.y_test)
        print(f'Test loss: {test_loss}')

        print('Plotting the prediction and ground truth')
        train_pred = self.scaler.inverse_transform(self.model.predict(self.X_train, verbose=0))
        test_pred = self.scaler.inverse_transform(self.model.predict(self.X_test, verbose=0))

        # Place every prediction at the hour it actually predicts.
        train_times, test_times = self._prediction_times(len(train_pred), len(test_pred))

        # NOTE(review): the title names Firefox regardless of args['exe_name'].
        pred_layout = go.Layout(
            title='Firefox used in seconds per hour',
            yaxis={'title':'Duration(s)'}
        )
        pred_fig = go.Figure([
            go.Scatter(x=self.input_df['Start'], y=self.input_df['Time_diff_sec'].values, name='ground truth'),
            go.Scatter(x=train_times, y=train_pred[:,0], name='train prediction'),
            go.Scatter(x=test_times, y=test_pred[:,0], name='test prediction')
        ], layout=pred_layout)
        pred_fig.show()