# Task 3: Time-Series Model

## Instructions on How to run the model

To execute the model, ensure you follow these steps:

1. Mount a Google drive to a specified path.
2. Navigate to the directory containing this file as your working directory.
3. Run all cells one by one except the one which trains the model.
4. Before running the cells, make sure the files of all trained models (the saved weights) are stored in the current directory.

---

## 0. Mounting a drive

We'll first mount the drive to the directory '/content/drive'. Afterward, we'll switch the working directory to the folder containing this file.

In [None]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab runtime so the dataset and saved models are reachable.
drive.mount('/content/drive')
# Work from the coursework folder (the same path that ROOT_FOLDER holds in the imports cell).
os.chdir("/content/drive/MyDrive/Deep Learning/Coursework/Task 3")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Import required libraries and load data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.layers import Input, Dense, concatenate, LSTM, GRU, Dropout, BatchNormalization
from keras.models import Sequential, load_model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.callbacks import EarlyStopping, ModelCheckpoint
from datetime import datetime, timedelta
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pytz
import warnings

# Silence all Python warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Base folder on the mounted Drive: the CSV is read from it and the .h5 checkpoints are written to / loaded from it.
ROOT_FOLDER = "/content/drive/MyDrive/Deep Learning/Coursework/Task 3"

In [None]:
# Load the raw user-log CSV into a DataFrame.

def load_data():
    """Read UserLog.csv from ROOT_FOLDER and return it as a DataFrame.

    The columns are named explicitly (date_time, event_type, cluster,
    duration, users). A short preview is printed as a side effect.
    """
    column_names = ['date_time', 'event_type', 'cluster', 'duration', 'users']
    user_log = pd.read_csv(f'{ROOT_FOLDER}/UserLog.csv', names=column_names)

    # Preview so the notebook output shows what was loaded
    print('The dataset has been loaded:')
    print('The first few rows of dataset: ')
    print(user_log.head())

    return user_log

df = load_data()

The dataset has been loaded:
The first few rows of dataset: 
                      date_time event_type cluster  duration  users
0  Fri Jan 01 00:00:00 GMT 2010      LOGIN    FELL   1261840      1
1  Fri Jan 01 00:00:00 GMT 2010      LOGIN    LAKE  10058927      2
2  Fri Jan 01 00:00:00 GMT 2010      LOGIN    SIDE   6868990      3
3  Fri Jan 01 00:00:00 GMT 2010      LOGIN    LAKE   2997017      4
4  Fri Jan 01 00:00:00 GMT 2010      LOGIN    LAKE   8919800      5


## 2. Data Preprocessing

The raw time-series data was inconsistent: some date_time entries were in BST while others were in GMT, so in this phase I standardized them to a single timezone. I also found stray white space in the event_type and cluster columns and stripped it. The pivotal step was creating a new column called "interval", which holds the difference in seconds between two consecutive timestamps. This column is used later to predict the date_time (Task-1).


In [None]:
# Time zones used to normalise the log timestamps

bst_timezone = pytz.timezone('Europe/London')
gmt_timezone = pytz.timezone('GMT')

def parse_date(timestamp_str):
    """Parse a log timestamp into a timezone-aware Europe/London datetime.

    Accepts strings shaped like 'Fri Jan 01 00:00:00 GMT 2010' where the zone
    label is either 'BST' or 'GMT'.

    The returned datetime always denotes the same instant as the input string.
    GMT-labelled values are converted with ``astimezone`` rather than by adding
    an hour by hand: adding an hour to an aware GMT value moves the instant
    itself, which would insert a spurious one-hour jump between GMT- and
    BST-labelled rows once intervals are computed.

    Raises:
        ValueError: if the string matches neither the BST nor the GMT layout.
    """
    try:
        naive = datetime.strptime(timestamp_str, '%a %b %d %H:%M:%S BST %Y')
    except ValueError:
        # Not labelled BST, so it must be labelled GMT (a second ValueError propagates)
        naive = datetime.strptime(timestamp_str, '%a %b %d %H:%M:%S GMT %Y')
        return gmt_timezone.localize(naive).astimezone(bst_timezone)

    # A BST label means daylight-saving time; is_dst=True only matters for the
    # ambiguous hour when the clocks go back, where it selects the BST reading.
    return bst_timezone.localize(naive, is_dst=True)

In [None]:
def preprocessing(df):
    """Clean, encode and scale the raw log, and derive the 'interval' column.

    Steps: parse date_time with parse_date, strip and label-encode event_type
    and cluster, robust-scale duration, sort by date_time, compute the gap in
    seconds to the previous row as 'interval', drop rows containing NaN, and
    min-max scale 'interval' to [0, 1].

    The columns of the passed frame are overwritten in place before sorting;
    the sorted frame is returned.

    NOTE(review): the fitted encoders and scalers are local to this function,
    so the transformations cannot be inverted by callers.
    """
    # Bring every timestamp into one timezone
    df['date_time'] = df['date_time'].apply(parse_date)

    # Strip surrounding whitespace, then turn each category into an integer code
    for column in ('event_type', 'cluster'):
        stripped = df[column].map(lambda text: text.strip())
        df[column] = LabelEncoder().fit_transform(stripped)

    # Robust-scale the login duration
    duration_column = df['duration'].to_numpy().reshape(-1, 1)
    df['duration'] = RobustScaler().fit_transform(duration_column).ravel()

    ordered = df.sort_values(by='date_time')

    # Seconds elapsed since the previous event; the first row has no predecessor (NaN)
    ordered['interval'] = ordered['date_time'].diff().dt.total_seconds()
    ordered = ordered.dropna()
    ordered['interval'] = ordered['interval'].astype(int)

    interval_values = ordered['interval'].values.reshape(-1, 1)
    ordered['interval'] = MinMaxScaler(feature_range=(0, 1)).fit_transform(interval_values)

    return ordered

df = preprocessing(df)

## 3. Predicting Date & Time

1. I've created a function called `window_sequencing`, which returns `the date_time used as the starting point for forecasting, X_train, y_train, X_val, y_val, X_test, and y_test`.
2. The model was built using several `GRU` layers with `BatchNormalization`; after training, it is stored in the file `date_time_model.h5`.
3. I've got following result metrics:

    R2 score: -0.01706
    
    Mean squared error: 0.00006

    Mean absolute error: 0.00098

In [None]:
# Window sequencing
def window_sequencing(col, size=10, train_test_split_size=0.2, val_test_size=0.4):
    """Turn a series into (window, next value) samples and split them chronologically.

    Args:
        col: pandas Series to window. It is expected to be a column of the
            global ``df``, because the anchor timestamp is looked up there.
        size: number of consecutive values in each input window.
        train_test_split_size: fraction of samples held out from training.
        val_test_size: fraction of the held-out samples used as the test set
            (the remainder is the validation set).

    Returns:
        Tuple ``(anchor_timestamp, X_train, y_train, X_val, y_val, X_test, y_test)``.
        Each X has shape (samples, size, 1). ``anchor_timestamp`` is the
        date_time of the row immediately before the first test target, i.e.
        the timestamp that predicted test intervals should be added to.

    Raises:
        ValueError: if ``col`` has no more than ``size`` values.
    """
    values = col.values
    if len(values) <= size:
        raise ValueError(f"need more than {size} values to build a window, got {len(values)}")

    # Row i is values[i:i+size]; the final window is dropped because nothing follows it.
    X = np.array(np.lib.stride_tricks.sliding_window_view(values, size)[:-1])
    # The target of window i is the value right after it.
    y = np.array(values[size:])

    # Split data into training, validation, and testing sets (order preserved)
    X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=train_test_split_size, shuffle=False)
    X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=val_test_size, shuffle=False)

    # The test targets are the last len(y_test) rows of df, so the anchor is the row just before them.
    anchor_timestamp = df.iloc[-len(y_test) - 1].date_time

    return anchor_timestamp, X_train.reshape(-1, size, 1), y_train, X_val.reshape(-1, size, 1), y_val, X_test.reshape(-1, size, 1), y_test

In [None]:
# Build the interval windows and keep the timestamp that forecasting starts from
(
    timestamp_validation_last,
    X_train, y_train,
    X_valid, y_valid,
    X_test, y_test,
) = window_sequencing(df['interval'])

# One line per array so the split sizes can be checked at a glance
shape_report = [
    f"X train shape : {X_train.shape}",
    f"y train shape : {y_train.shape}",
    f"X validation shape : {X_valid.shape}",
    f"y validation shape : {y_valid.shape}",
    f"X testing shape : {X_test.shape}",
    f"y testing shape : {y_test.shape}",
]
print("Following are the shapes of data:\n")
print("\n".join(shape_report))

Following are the shapes of data:

X train shape : (1967701, 10, 1)
y train shape : (1967701,)
X validation shape : (295155, 10, 1)
y validation shape : (295155,)
X testing shape : (196771, 10, 1)
y testing shape : (196771,)


In [None]:
# Stacked-GRU regressor: a window of 10 scaled intervals in, the next scaled interval out.
model1 = Sequential([
    GRU(units=512, input_shape=(10, 1), return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),

    GRU(units=256, return_sequences=True),
    BatchNormalization(),
    Dropout(0.4),

    GRU(units=128, return_sequences=True),
    BatchNormalization(),
    Dropout(0.4),

    GRU(units=256, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),

    # Last recurrent layer returns only its final state for the dense head
    GRU(units=128, return_sequences=False),
    BatchNormalization(),
    Dropout(0.3),

    Dense(units=32, activation='relu'),
    Dropout(0.2),

    Dense(units=1)
])

# Compile the model
model1.compile(optimizer=Adam(learning_rate=0.001), loss='huber_loss', metrics=['mae', 'mse'])

# Summarise the model
model1.summary()

# Checkpoint creation to save the model.
# ModelCheckpoint has no `metrics` argument, and 'huber_loss' is not a key Keras logs during fit
# (the loss is logged as 'loss' / 'val_loss'). Validation data is passed to fit() below, so keep
# only the weights with the lowest validation loss instead of overwriting the file every epoch.
checkpoint = ModelCheckpoint(
    f'{ROOT_FOLDER}/date_time_model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Model training
history = model1.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=40, batch_size=1024, callbacks=[checkpoint])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 10, 512)           791040    
                                                                 
 batch_normalization (Batch  (None, 10, 512)           2048      
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 10, 512)           0         
                                                                 
 gru_1 (GRU)                 (None, 10, 256)           591360    
                                                                 
 batch_normalization_1 (Bat  (None, 10, 256)           1024      
 chNormalization)                                                
                                                                 
 dropout_1 (Dropout)         (None, 10, 256)           0

In [None]:
# Load the date/time model from the .h5 checkpoint under ROOT_FOLDER
# (this lets the notebook be evaluated without re-running the training cell).
model = load_model(f'{ROOT_FOLDER}/date_time_model.h5')

In [None]:
# y_test comes from the MinMax-scaled 'interval' column, so the raw model outputs are
# already on the same scale and must be scored as they are. Rounding them to integers
# first would score a quantised copy of the predictions, not the model.
raw_predictions = model.predict(X_test)

# Integer version kept under the original name because the forecasting cell consumes it.
# NOTE(review): these are rounded *scaled* values, not seconds; turning them into real
# intervals needs the interval scaler fitted in preprocessing(), which is not exposed.
predictions = raw_predictions.round().astype(int)

print(f"\nR2 score: {r2_score(y_test, raw_predictions):.5f}")
print(f"Mean squared error: {mean_squared_error(y_test, raw_predictions):.5f}\n")
print(f"Mean absolute error: {mean_absolute_error(y_test, raw_predictions):.5f}")


R2 score: -0.01706
Mean squared error: 0.00006

Mean absolute error: 0.00098


In [None]:
# Walk forward from the anchor timestamp, adding one predicted interval per step
forecasted_dates_times = []
curr_timestamp = timestamp_validation_last

for predicted_step in predictions:
    # Each row of `predictions` holds a single value; .item() unwraps it to a Python number
    curr_timestamp = curr_timestamp + pd.Timedelta(seconds=predicted_step.item())
    forecasted_dates_times.append(curr_timestamp)

In [None]:
# Put the real timestamps of the test span next to the forecast timestamps
actual_test_times = df.iloc[-len(X_test):].date_time
df_intervals = pd.DataFrame({
    'test_date_time': actual_test_times,
    'predicted_date_time': forecasted_dates_times,
})
df_intervals.head(50)

Unnamed: 0,test_date_time,predicted_date_time
2262867,2010-12-01 16:35:40+00:00,2010-12-01 16:35:44+00:00
2262868,2010-12-01 16:35:44+00:00,2010-12-01 16:35:44+00:00
2262869,2010-12-01 16:35:50+00:00,2010-12-01 16:35:44+00:00
2262870,2010-12-01 16:35:56+00:00,2010-12-01 16:35:44+00:00
2262871,2010-12-01 16:35:58+00:00,2010-12-01 16:35:44+00:00
2262872,2010-12-01 16:36:04+00:00,2010-12-01 16:35:44+00:00
2262873,2010-12-01 16:36:13+00:00,2010-12-01 16:35:44+00:00
2262874,2010-12-01 16:36:15+00:00,2010-12-01 16:35:44+00:00
2262875,2010-12-01 16:36:19+00:00,2010-12-01 16:35:44+00:00
2262876,2010-12-01 16:36:23+00:00,2010-12-01 16:35:44+00:00


---

## 4. Predicting number of students

1. In this task, I've used `MinMaxScaler` on the `users` column.
2. I've used two `LSTM` layers with `50 units` each; the second one uses the `relu` activation function.
3. I've created a callback (`ModelCheckpoint`) which stores the model in the file `num_of_students.h5`.
4. The `'Adam'` optimizer was used in the compilation.
5. Later, I've tested the model by predicting on the testing dataset.
6. Once I got the output from the prediction, I used `inverse_transform` to get the actual values.
7. Lastly, I've converted the data into `int` format to get whole numbers of students.

In [None]:
# Scale the 'users' column into [0, 1]; the fitted scaler is kept so predictions can be inverted later
users_scaler = MinMaxScaler(feature_range=(0, 1))
users_column = df['users'].values.reshape(-1, 1)
df['users'] = users_scaler.fit_transform(users_column)

In [None]:
# Sliding-window samples: `seq_length` consecutive values predict the value that follows
seq_length = 5
def new_sequences(df, seq_length):
    """Return (windows, targets) built from a 1-D array.

    Despite its name, `df` is indexed like a 1-D array: windows[i] is
    df[i:i + seq_length] and targets[i] is df[i + seq_length]. When the input
    has no more than `seq_length` values, both results are empty arrays.
    """
    n_windows = len(df) - seq_length
    windows = [df[start:start + seq_length].tolist() for start in range(n_windows)]
    targets = [df[start + seq_length] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [None]:
# dataset for LSTM with 'users' as target variable
X, y = new_sequences(df['users'].values, seq_length)

# Chronological 80/20 split into training and testing sets.
# The cut point is taken from the number of windows (len(X)), not len(df): windowing
# yields seq_length fewer samples than rows, so len(df) would skew the ratio slightly.
n = len(X)
split_index = int(n * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [None]:
# Append a trailing feature axis -> [samples, time_steps, features]
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))

In [None]:
# Build the model layer by layer: two LSTM layers with dropout, then one regression output
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(seq_length, 1)))
model.add(Dropout(0.2))
model.add(LSTM(50, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))

# Model Compilation
opt = 'adam'
model.compile(loss='mean_squared_error', optimizer=opt)

# Checkpoint creation: keep only the weights with the lowest validation loss
checkpoint = ModelCheckpoint(
    f'{ROOT_FOLDER}/num_of_students.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)



In [None]:
# Train the model; validation_split holds out 10% of the training data, which is what
# provides the 'val_loss' that the checkpoint monitors.
fit_options = dict(
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    callbacks=[checkpoint],
    verbose=1,
)
# Note: despite its name, model2 receives the return value of fit(), not a model.
model2 = model.fit(X_train, y_train, **fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Reload the checkpointed model from its .h5 file
model = load_model(f'{ROOT_FOLDER}/num_of_students.h5')

# Predict on the test windows and map the scaled outputs back to user counts
predictions = users_scaler.inverse_transform(model.predict(X_test))

# Bring the test targets back to the original scale as well
test_y_inverse = users_scaler.inverse_transform(y_test.reshape(-1, 1))





In [None]:
# Keep only the final 100 test records and round them to whole numbers.
# NOTE(review): test_y_inverse and predictions are overwritten here, so any later cell
# that uses them (e.g. the metrics cell) sees only these 100 rounded records.
test_y_inverse = list(np.round(test_y_inverse.flatten()[-100:]).astype(int))
predictions = list(np.round(predictions.flatten()[-100:]).astype(int))

# Side-by-side table of actual vs. predicted values
test_y_inverse_df = pd.DataFrame({'actual': test_y_inverse})
predictions_df = pd.DataFrame({'predicted': predictions})
df_pred = pd.concat([test_y_inverse_df, predictions_df], axis=1)

print("Predicting the last 100 records in the test set:\n")

print(df_pred)

Predicting the last 100 records in the test set:

    actual  predicted
0       23         27
1       22         28
2       23         29
3       22         29
4       21         30
..     ...        ...
95       2         11
96       3         11
97       2         11
98       1         11
99       0         11

[100 rows x 2 columns]


In [None]:
# Regression error metrics for the actual / predicted values currently held in
# test_y_inverse and predictions
mse_value, mae_value, r2 = (
    metric(test_y_inverse, predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the metrics
print(f'Mean Squared Error: {mse_value}\n')
print(f'Mean Absolute Error: {mae_value}\n')
print(f'R^2 Score: {r2}\n')

Mean Squared Error: 78.75

Mean Absolute Error: 8.75

R^2 Score: -1.7002839155659797



---

## 5. Predicting Cluster

1. Firstly, I've created numerical features such as `hour, minute, day, month, year, day_of_week` from *date_time* to predict clusters.
2. I've used two `LSTM` layers and one `BatchNormalization` layer. The number of `units` per layer ranges from 64 to 32. The `relu` activation function was used.
3. The last layer has a `softmax` activation function because this is a classification task.
4. During compilation, the `'Adam'` optimizer was used.
5. Following are the metrics I've got:

    Accuracy: 0.113
    
    Precision: 0.059
    
    Recall: 0.113
    
    F1 Score: 0.045

In [None]:
# Break date_time into numeric calendar features (hour, minute, day, month, year, day_of_week)
for field in ('hour', 'minute', 'day', 'month', 'year'):
    df[field] = df['date_time'].map(lambda ts, field=field: getattr(ts, field))
# isoweekday() is 1 (Monday) .. 7 (Sunday); the modulo maps Sunday to 0
df['day_of_week'] = df['date_time'].map(lambda ts: ts.isoweekday() % 7)

df.drop(columns=['date_time'], inplace=True)


# Separate features and target variable
feature_columns = ['duration', 'year', 'month', 'day', 'hour', 'minute', 'day_of_week']

# Min-max scale the features (fitted on the full data set at this point)
scaler = MinMaxScaler()
X = scaler.fit_transform(df[feature_columns])

# Encode target variable; `encoder` is reused later to map predictions back
encoder = LabelEncoder()
y = encoder.fit_transform(df['cluster'])

In [None]:
# Chronological split: the first 80% of rows train the model, the rest test it
train_size = int(0.8 * len(df))
X_train, X_test = np.split(X, [train_size])
y_train, y_test = np.split(y, [train_size])

# Min-max scale using statistics from the training part only
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add a trailing axis so the LSTM receives (samples, n_features, 1)
X_train_reshaped = X_train_scaled[:, :, np.newaxis]
X_test_reshaped = X_test_scaled[:, :, np.newaxis]

In [None]:
# LSTM classifier: two recurrent layers, batch norm, and a softmax over the cluster labels
n_classes = len(np.unique(y))

model = Sequential()
model.add(LSTM(64, input_shape=X_train_reshaped.shape[1:], return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
# One output unit per distinct label found in y
model.add(Dense(n_classes, activation='softmax'))



In [None]:
# Integer class labels -> sparse categorical cross-entropy; accuracy is tracked as a metric
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# 'sparse_categorical_crossentropy' is the loss function's name, not a key Keras logs during
# fit (the loss is logged as 'loss'), and mode='min' has no effect unless save_best_only is set.
# This model is fitted without validation data, so monitor the training loss and keep only
# the best epoch.
checkpoint = ModelCheckpoint(
    f'{ROOT_FOLDER}/cluster_names.h5',
    monitor='loss',
    mode='min',
    save_best_only=True,
)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 7, 64)             16896     
                                                                 
 dropout (Dropout)           (None, 7, 64)             0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 batch_normalization (Batch  (None, 32)                128       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dense_1 (Dense)             (None, 37)                1221      
                                                        

In [None]:
# Train the cluster classifier for 10 epochs; no validation data is passed here,
# so only training metrics are available to the checkpoint callback.
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=512, verbose=1, callbacks=[checkpoint])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




In [None]:
# Reload the checkpointed classifier and take the most probable class for each test row
model = load_model(f'{ROOT_FOLDER}/cluster_names.h5')
y_pred = model.predict(X_test_reshaped).argmax(axis=1)





array([29, 29, 29, ...,  9,  9,  9])

In [None]:
# Classification metrics on the test split; per-class scores are weighted by class support
weighted = {'average': 'weighted'}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.11337431494039778
Precision: 0.05934853052605717
Recall: 0.11337431494039778
F1 Score: 0.04508880758307765


In [None]:
# Predict the next 100 cluster values
next_values = []
last_X = X_train_reshaped[-1:].copy()  # Use the last training sample as the initial input for prediction

for _ in range(100):
    # last_X already has shape (1, n_features, 1), so it can be passed to predict() directly
    class_probabilities = model.predict(last_X, verbose=0)

    # argmax over axis 1 yields a length-1 array; unwrap it to a plain int so that
    # next_values stays a flat list and the element assignment below gets a scalar
    next_value = int(np.argmax(class_probabilities, axis=1)[0])
    next_values.append(next_value)

    # Update the input for the next prediction
    # NOTE(review): the positions of last_X are the scaled feature columns of one row
    # (duration, year, month, ...), not consecutive time steps, and the value appended is
    # an unscaled class id. Shifting them like a time window looks unintended - confirm.
    last_X[0, :-1, 0] = last_X[0, 1:, 0]  # Shift the existing values to the left
    last_X[0, -1, 0] = next_value          # Append the predicted value at the end

# Convert predicted cluster values back to the labels `encoder` was fitted on
predicted_clusters = encoder.inverse_transform(next_values)

print("Predicted next 100 cluster values:")
print(predicted_clusters)

Predicted next 100 cluster values:
[29 15 28 28 28 15 15 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
 28 28 28 28]


---