In [1]:
from __future__ import division, print_function

import copy
import csv
import os
import random
import sys
from collections import Counter
from math import log

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
# torch packages
import torch.nn as nn
import unicodecsv
from pandas.api.types import is_string_dtype
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset

# Lift the row limit so displayed DataFrames/Series are not truncated.
pd.set_option('display.max_rows', None)
from collections import OrderedDict

from LSTM import Model


def seed_worker(worker_id):
    """Re-seed NumPy and ``random`` from torch's current initial seed.

    Args:
        worker_id: accepted for the caller's benefit; the body does not use it.
    """
    # Fold the torch seed into 32 bits before handing it to the other RNGs.
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

def generate_prefix_data(data, min_length, max_length):
    """Expand an event log into prefix traces (each prefix becomes a trace).

    For every prefix length ``n`` in ``min_length..max_length`` (inclusive),
    the first ``n`` rows of each case that has at least ``n`` rows are emitted
    with the case id rewritten to ``"<case id>_<n>"``. The original case id is
    kept in ``orig_case_id`` and ``n`` in ``prefix_nr``.

    Side effect: a ``case_length`` column (rows per case) is written onto the
    ``data`` frame that was passed in.

    Args:
        data: event log; must contain the columns named by the module-level
            ``case_id_col`` and ``activity_col``.
        min_length: shortest prefix length to generate.
        max_length: longest prefix length to generate.

    Returns:
        DataFrame with all prefixes stacked row-wise. ``case_length`` is
        capped at ``max_length`` in the result.
    """
    case_length = data.groupby(case_id_col)[activity_col].transform(len)
    data.loc[:, 'case_length'] = case_length.copy()

    def first_events(nr_events):
        # Copy so the column assignments below act on an independent frame
        # instead of a slice of ``data`` (avoids chained-assignment warnings).
        long_enough = data[data['case_length'] >= nr_events]
        return long_enough.groupby(case_id_col).head(nr_events).copy()

    # NOTE(review): this seed frame holds the ``min_length`` prefixes under
    # their original case ids with prefix_nr fixed to 1; the first loop
    # iteration emits the same rows again under suffixed ids. Confirm the
    # duplication is intended before changing it.
    seed = first_events(min_length)
    seed["prefix_nr"] = 1
    seed["orig_case_id"] = seed[case_id_col]

    # Collect the pieces and concatenate once; concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    pieces = [seed]
    for nr_events in range(min_length, max_length + 1):
        tmp = first_events(nr_events)
        tmp["orig_case_id"] = tmp[case_id_col]
        tmp[case_id_col] = tmp[case_id_col].apply(lambda x: "%s_%s" % (x, nr_events))
        tmp["prefix_nr"] = nr_events
        pieces.append(tmp)

    dt_prefixes = pd.concat(pieces, axis=0)
    dt_prefixes['case_length'] = dt_prefixes['case_length'].clip(upper=max_length)
    return dt_prefixes

def split_data_strict(data, train_ratio):
    """Temporal train/test split that drops training events overlapping the test period.

    Cases are ordered by their earliest timestamp; the first
    ``int(train_ratio * number_of_cases)`` cases form the training set and the
    rest the test set. Training rows whose timestamp is not strictly earlier
    than the earliest test timestamp are discarded.

    Args:
        data: event log with the columns named by the module-level
            ``case_id_col``, ``timestamp_col`` and ``sorting_cols``.
        train_ratio: fraction of cases assigned to the training set.

    Returns:
        Tuple ``(train, test)`` of DataFrames, each sorted by ``sorting_cols``.
    """
    def chronological(frame, by):
        # Stable sort so rows with equal keys keep their relative order.
        return frame.sort_values(by, ascending=True, kind='mergesort')

    data = chronological(data, sorting_cols)

    case_starts = data.groupby(case_id_col)[timestamp_col].min().reset_index()
    case_starts = chronological(case_starts, timestamp_col)
    n_train_cases = int(train_ratio * len(case_starts))
    train_ids = list(case_starts[case_id_col])[:n_train_cases]

    in_train = data[case_id_col].isin(train_ids)
    train = chronological(data[in_train], sorting_cols)
    test = chronological(data[~in_train], sorting_cols)

    # Everything in train at or after the first test event is dropped.
    split_ts = test[timestamp_col].min()
    return (train[train[timestamp_col] < split_ts], test)

def get_label_numeric(data, bins, labels):
    """Map each case's label to its integer position in ``labels``.

    Args:
        data: event log with the columns named by the module-level
            ``case_id_col`` and ``label_col``.
        bins: bin edges; only their count matters here — one label is mapped
            per consecutive pair of edges.
        labels: label names, indexed in step with the bin intervals.

    Returns:
        List with one integer per case, in the order produced by grouping on
        ``case_id_col``.
    """
    # One value per case: the group's first entry of the label column.
    per_case_label = data.groupby(case_id_col)[label_col].first()

    # labels[k] -> k for each of the len(bins) - 1 intervals.
    label_mapping = {
        labels[position]: position
        for position, _ in enumerate(zip(bins, bins[1:]))
    }

    return [label_mapping[name] for name in per_case_label.values.tolist()]


def groupby_pad_all(train, test, val, cols, activity_col):
    """Run ``groupby_pad`` on the train, test and validation frames.

    Returns:
        6-tuple ``(activity_train, activity_test, activity_val,
        labels_train, labels_test, labels_val)``.
    """
    padded_activities = []
    label_lists = []
    for split in (train, test, val):
        activity, split_labels = groupby_pad(split, cols, activity_col)
        padded_activities.append(activity)
        label_lists.append(split_labels)
    return tuple(padded_activities + label_lists)

def groupby_pad(prefixes, cols, activity_col):
    """Group ``prefixes`` per case and pad the activity sequences.

    Returns:
        Tuple ``(activity, label_lists)``: the padded activity tensor from
        ``pad_data`` and the per-case labels from ``groupby_caseID``.
    """
    sequences, label_lists = groupby_caseID(prefixes, cols, activity_col)
    # Activity sequences are the only part that needs padding.
    return pad_data(sequences), label_lists

def pad_data(data):
    """Zero-pad a list of 1-D sequences into one batch-first tensor.

    The first sequence is right-padded to the module-level
    ``max_prefix_length`` so that ``pad_sequence`` pads every other sequence
    to at least that length. The caller's list is left untouched: the widened
    first sequence is placed in a new list instead of overwriting ``data[0]``.

    Args:
        data: non-empty sequence of 1-D tensors.

    Returns:
        Tensor with one row per input sequence, padded with 0.
    """
    head = data[0]
    widened_head = nn.ConstantPad1d((0, max_prefix_length - head.shape[0]), 0)(head)
    batch = [widened_head] + list(data[1:])
    return pad_sequence(batch, batch_first=True, padding_value=0)

def groupby_caseID(data, cols, col):
    """Collect, per case, the ``col`` values as a tensor and the first label.

    Args:
        data: event log.
        cols: columns to keep before grouping; the body reads the
            module-level ``case_id_col`` and ``label_col`` as well as ``col``
            from this subset.
        col: column whose per-case values become one tensor each.

    Returns:
        Tuple ``(ans, label_lists)`` — a list of tensors and a list holding
        the first ``label_col`` value of every case, both in group order.
    """
    ans = []
    label_lists = []
    for _, case_events in data[cols].groupby(case_id_col, as_index=True):
        ans.append(torch.tensor(list(case_events[col])))
        label_lists.append(case_events[label_col].iloc[0])
    return ans, label_lists

def groupby_pad_all_num(train, test, val, cols, numerical_features_col):
    """Run ``groupby_pad_num`` on the train, test and validation frames.

    Returns:
        3-tuple of padded numerical feature tensors in the order
        ``(train, test, val)``.
    """
    return tuple(
        groupby_pad_num(split, cols, numerical_features_col)
        for split in (train, test, val)
    )

def pad_data_num(data, max_prefix_length):
    """Pad every 2-D sequence with zero rows and stack them into one tensor.

    Args:
        data: iterable of 2-D tensors; rows are appended at the bottom of
            each one, the second dimension is left as is.
        max_prefix_length: number of rows every sequence is padded to.

    Returns:
        Tensor stacking the padded sequences along a new leading dimension.
    """
    padded_sequences = []
    for seq in data:
        missing_rows = max_prefix_length - seq.shape[0]
        # Padding tuple is (left, right, top, bottom): only the bottom grows.
        bottom_pad = nn.ConstantPad2d((0, 0, 0, missing_rows), 0)
        padded_sequences.append(bottom_pad(seq))
    return torch.stack(padded_sequences, dim=0)

def groupby_pad_num(prefixes, cols, numerical_features_col):
    """Group numerical features per case and pad them to ``max_prefix_length``.

    Returns:
        The stacked, padded tensor produced by ``pad_data_num``.
    """
    # The padding target is the module-level max_prefix_length.
    return pad_data_num(
        groupby_caseID_num(prefixes, cols, numerical_features_col),
        max_prefix_length,
    )

def groupby_caseID_num(data, cols, numerical_features_col):
    """Build one ``torch.long`` tensor of numerical features per case.

    Args:
        data: event log with the column named by the module-level
            ``case_id_col``.
        cols: accepted but not used by the body.
        numerical_features_col: column selection applied to every case group.

    Returns:
        List of tensors, one per case, in group order.
    """
    ans_num = []
    for _, case_events in data.groupby(case_id_col, as_index=True):
        feature_values = case_events[numerical_features_col].values
        # NOTE(review): the tensor is created as torch.long, so fractional
        # feature values would not survive — confirm the inputs are
        # integer-valued.
        ans_num.append(torch.tensor(feature_values, dtype=torch.long))
    return ans_num

def create_index(log_df, column):
    """Create an index for a categorical attribute.

    Args:
        log_df: dataframe.
        column: column name.

    Returns:
        dict mapping every distinct value of ``column`` (rows equal to the
        string ``'none'`` are excluded) to its position in sorted order,
        starting at 0. Keys are inserted in that same order.
    """
    # Rows whose value is 'none' are left out of the index.
    kept_rows = log_df[log_df[column] != 'none'][[column]].values.tolist()
    distinct_values = sorted({row[0] for row in kept_rows})
    return {value: position for position, value in enumerate(distinct_values)}

def to_categorical(y, num_classes):
    """One-hot encode the class indices in ``y`` as a uint8 tensor.

    Args:
        y: index (or indices) selecting rows of the identity matrix.
        num_classes: size of the identity matrix, i.e. width of each row.
    """
    identity = np.eye(num_classes, dtype='uint8')
    one_hot_rows = identity[y]
    return torch.tensor(one_hot_rows)

def create_indexes(i, data):
    """Build the index structures for the categorical column ``i``.

    Args:
        i: column name.
        data: dataframe containing that column.

    Returns:
        Tuple ``(dyn_weights, dyn_index, index_dyn, no_cols)``:
        the one-hot matrix over all index positions, the value -> position
        mapping from ``create_index``, its inverse, and the number of groups
        obtained by grouping ``data`` on ``i``.
    """
    dyn_index = create_index(data, i)
    # Inverse lookup: position -> original value.
    index_dyn = dict((position, value) for value, position in dyn_index.items())
    dyn_weights = to_categorical(sorted(index_dyn), len(dyn_index))
    no_cols = len(data.groupby([i]))
    return dyn_weights, dyn_index, index_dyn, no_cols

def prepare_inputs(X_train, X_test):
    """Encode train and test frames with a freshly fitted ``ColumnEncoder``.

    Both frames are cast to ``str`` first; the encoder is fitted on the
    training frame only and then applied to the test frame.

    Side effect: the encoder is also stored in the module-level name ``ce``.

    Returns:
        Tuple ``(X_train_enc, X_test_enc, ce)``.
    """
    global ce
    ce = ColumnEncoder()
    train_as_str = X_train.astype(str)
    test_as_str = X_test.astype(str)
    return ce.fit_transform(train_as_str), ce.transform(test_as_str), ce

# https://towardsdatascience.com/using-neural-networks-with-embedding-layers-to-encode-high-cardinality-categorical-variables-c1b872033ba2
class ColumnEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode string columns, reserving code 0 for unseen values.

    ``fit`` records, for every string-typed column, a mapping from each
    distinct value (in sorted order) to 0..n-1. ``transform`` emits that
    number plus one, so 0 is left for values that were not present at fit
    time. ``inverse_transform`` reverses the encoding and yields ``None`` for
    codes outside 1..n.
    """

    def __init__(self):
        # Names of the columns encoded by this instance; set in fit().
        self.columns = None
        # Column name -> OrderedDict(value -> 0-based position).
        self.maps = dict()

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by integer codes."""
        X_copy = X.copy()
        for col in self.columns:
            mapping = self.maps[col]
            # encode value x of col via dict entry mapping[x]+1 if present, otherwise 0.
            # The column is replaced as a whole rather than written through
            # ``.loc[:, col]``: the latter sets values into the existing
            # string/object column, so the codes would not get an integer
            # dtype (and pandas may warn or raise on the dtype mismatch).
            X_copy[col] = X_copy[col].apply(lambda x: mapping.get(x, -1) + 1)
        return X_copy

    def get_maps(self):
        """Return the per-column value -> position mappings."""
        return self.maps

    def inverse_transform(self, X):
        """Return a copy of ``X`` with integer codes mapped back to the original values."""
        X_copy = X.copy()
        for col in self.columns:
            values = list(self.maps[col].keys())
            # find value in ordered list and map out of range values to None;
            # whole-column replacement for the same dtype reason as transform().
            X_copy[col] = [values[i - 1] if 0 < i <= len(values) else None for i in X_copy[col]]
        return X_copy

    def fit(self, X, y=None):
        """Learn the value -> position mapping of every string-typed column of ``X``."""
        # only apply to string type columns
        self.columns = [col for col in X.columns if is_string_dtype(X[col])]
        # Start from a clean slate so a refit does not keep mappings of
        # columns that are no longer part of ``self.columns``.
        self.maps = dict()
        for col in self.columns:
            self.maps[col] = OrderedDict({value: num for num, value in enumerate(sorted(set(X[col])))})
        return self
  
def to_categorical_all(train, test, val, num_classes):
    """One-hot encode the train, test and validation class indices.

    Each result is built from rows of a ``num_classes`` x ``num_classes``
    identity matrix with NumPy's default dtype.

    Returns:
        Tuple ``(train_OHE, test_OHE, val_OHE)`` of tensors.
    """
    def encode(indices):
        return torch.tensor(np.eye(num_classes)[indices])

    return encode(train), encode(test), encode(val)



In [6]:
# Column names; the helper functions defined earlier read these as
# module-level globals.
case_id_col = "CaseID"
activity_col = "ActivityID"
timestamp_col = "CompleteTimestamp"
label_col = "Pump_Adjustment_Bin"
sorting_cols = [timestamp_col, activity_col]
cat_cols = [activity_col]

# Prefix-length bounds; max_prefix_length is also the padding target used by
# pad_data / groupby_pad_num.
min_prefix_length = 1
max_prefix_length = 132

# Training hyper-parameters.
batch_size = 256
learning_rate = 0.001
dropout = 0.1
lstm_size = 100
num_classes = 3
epochs = 50

# Define the bin edges and labels
bins = [0, 10, 25, 90]
labels = ['few', 'medium', 'many']

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Read in the original data; the CSV column "Unnamed: 0" becomes the index.
df = pd.read_csv('data/event_data_context_alex2.csv', index_col="Unnamed: 0")

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,CaseID,CompleteTimestamp,Vessel,ActivityID,lifecycle:transition,event_id,Filter 1 DeltaP_mean,Filter 1 DeltaP_ema,Filter 1 DeltaP_max,Filter 1 DeltaP_min,...,Pump Circulation Flow_std,Pump Circulation Flow_sum,Pump Circulation Flow_dif,Tank Pressure_mean,Tank Pressure_ema,Tank Pressure_max,Tank Pressure_min,Tank Pressure_std,Tank Pressure_sum,Tank Pressure_dif
2,20K12001E,2020-10-14 13:38:51,R501,Filters,complete,2,0.049914,0.050471,0.37381,0.040615,...,0.099007,100.634529,-0.025133,0.14096,0.222641,0.296219,0.005969,0.09377,320.684044,0.014511
3,20K12001E,2020-10-14 16:43:32,R501,1Pass_Prewet,complete,3,0.335981,0.045441,0.950388,0.0,...,0.274353,4971.595623,0.08258,0.67188,0.216346,0.944405,0.003246,0.359341,8366.920742,-0.165722
5,20K12001E,2020-10-14 18:20:57,R501,Pump adjustment,complete,5,0.046075,0.045479,0.050554,0.044231,...,0.0,0.0,0.0,0.116169,0.216196,0.285838,0.033318,0.085035,34.966731,-0.182889
6,20K12001E,2020-10-14 18:31:04,R501,Pump stop,complete,6,0.389302,0.046063,0.991595,0.01993,...,0.368901,137.49055,0.472329,0.578629,0.217301,0.639907,0.217287,0.139646,174.167334,0.417988
7,20K12001E,2020-10-14 18:32:41,R501,Pump adjustment,complete,7,0.381825,0.032229,0.991595,0.01993,...,0.362463,131.129489,-0.155598,0.444962,0.22196,0.639907,0.217287,0.205316,133.933439,0.410252
