In [1]:
import pandas as pd
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Step 1: Data Preparation
df_TS = pd.read_csv('/content/drive/MyDrive/UserLog.csv/UserLog.csv', names=['Date_Time','Event_type','Cluster','Duration','Total_users'])


Task 1: Predicting Login/Logout Times
For this task, predicting exact times could be complex due to the continuous nature of time. Instead, consider predicting the time until the next event as a regression problem.

Data Preparation:

X: Sequences of past events (e.g., event type, cluster, duration, total number of users logged in).

Y: Time until the next event (calculated as the difference between successive events).

In [4]:
df_TS.head()


Unnamed: 0,Date_Time,Event_type,Cluster,Duration,Total_users
0,Fri Jan 01 00:00:00 GMT 2010,LOGIN,FELL,1261840,1
1,Fri Jan 01 00:00:00 GMT 2010,LOGIN,LAKE,10058927,2
2,Fri Jan 01 00:00:00 GMT 2010,LOGIN,SIDE,6868990,3
3,Fri Jan 01 00:00:00 GMT 2010,LOGIN,LAKE,2997017,4
4,Fri Jan 01 00:00:00 GMT 2010,LOGIN,LAKE,8919800,5


In [5]:
df_TS.isna().sum()

Date_Time      0
Event_type     0
Cluster        0
Duration       0
Total_users    0
dtype: int64

In [6]:
# Check for duplicates
duplicates = df_TS[df_TS.duplicated()]

if not duplicates.empty:
    print("Duplicate rows found. Dropping them...")
    # Drop duplicates
    df_TS.drop_duplicates(inplace=True)
    print("Duplicates dropped.")
else:
    print("No duplicate rows found.")

Duplicate rows found. Dropping them...
Duplicates dropped.


In [7]:
import pytz

# Function to remove timezone information
def remove_timezone(dt):
    if dt.tzinfo is not None:
        # Convert to UTC first if datetime is timezone-aware
        dt = dt.astimezone(pytz.utc)
        # Remove timezone information
        return dt.replace(tzinfo=None)
    else:
        return dt



# Preprocessing steps:

Feature Engineering:

The features you've chosen are:

1. hour_of_day and day_of_week: These time-related features can capture cyclical patterns in login/logout activities. For instance, there might be more logins during weekdays and specific hours due to classes or work schedules at the university.
2. Event type: Knowing whether an event is a login or logout could provide insight into subsequent user behavior. Logouts might indicate the end of a session, potentially followed by a new login after a certain period.
3. Cluster: The specific cluster where the event occurred can indicate usage patterns, as some clusters might be busier or have different peak times.
4. Duration: The length of time a user was logged in might influence the timing of the next event, especially if longer sessions are followed by longer breaks.
5. Total number of users logged in: This gives a sense of how busy the system is, which might affect when the next login/logout event occurs.

Encoding:

In [8]:
from sklearn.preprocessing import LabelEncoder

# Convert 'Event type' and 'Cluster' into categorical variables
encoder_event_type = LabelEncoder()
df_TS['Event_type'] = encoder_event_type.fit_transform(df_TS['Event_type'])

encoder_cluster = LabelEncoder()
df_TS['Cluster'] = encoder_cluster.fit_transform(df_TS['Cluster'])

Feature Engineering:

In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.utils import to_categorical
import numpy as np
from keras.preprocessing.sequence import TimeseriesGenerator

# Convert 'Date_Time' column to datetime format
df_TS['Date_Time'] = pd.to_datetime(df_TS['Date_Time'])
# Extract 'hour_of_day' using apply() with a lambda function
df_TS['hour_of_day'] = df_TS['Date_Time'].apply(lambda x: x.hour)
# Using standard Python datetime objects, use weekday() method
df_TS['day_of_week'] = df_TS['Date_Time'].apply(lambda x: x.weekday())
# Apply the function to each datetime object in the column
df_TS['Date_Time_Naive'] = df_TS['Date_Time'].apply(remove_timezone)
# Calculate the time difference between successive events
df_TS['time_until_next_event'] = (df_TS['Date_Time_Naive'].shift(-1) - df_TS['Date_Time_Naive'])
# Convert 'Timedelta' to seconds
df_TS['time_until_next_event_seconds'] = df_TS['time_until_next_event'].apply(lambda x: x.total_seconds())
# Drop the last row as it won't have a target value
df_TS.dropna(inplace=True)
## converting float to int
df_TS['time_until_next_event_seconds']=df_TS['time_until_next_event_seconds'].astype(int)
## reset the index
df_TS.reset_index(drop=True, inplace=True)




Creating Lags


In [10]:
n_lags = 50  # Number of lags (previous steps) to include as features
for i in range(1, n_lags + 1):
    df_TS[f'lag_{i}'] = df_TS['time_until_next_event_seconds'].shift(i)
df_TS.dropna(inplace=True)  # Drop rows with NaN values resulting from lagging


Rolling Statistics

In [11]:
window_size = 3
df_TS['rolling_mean'] = df_TS['time_until_next_event_seconds'].rolling(window=window_size).mean()
df_TS['rolling_std'] = df_TS['time_until_next_event_seconds'].rolling(window=window_size).std()
# Drop the last row as it won't have a target value
df_TS.dropna(inplace=True)
## convert float to int
n_lags = 20  # Number of lags (previous steps) to include as features
for i in range(1, n_lags + 1):
    df_TS[f'lag_{i}'] = df_TS[f'lag_{i}'].astype(int)

df_TS.reset_index(drop=True, inplace=True)

In [12]:
df_TS

Unnamed: 0,Date_Time,Event_type,Cluster,Duration,Total_users,hour_of_day,day_of_week,Date_Time_Naive,time_until_next_event,time_until_next_event_seconds,...,lag_43,lag_44,lag_45,lag_46,lag_47,lag_48,lag_49,lag_50,rolling_mean,rolling_std
0,2010-01-01 13:25:55+00:00,0,25,11797657,11,13,4,2010-01-01 13:25:55,0 days 00:09:11,551,...,192.0,1429.0,1210.0,526.0,1261.0,0.0,0.0,0.0,390.333333,213.551711
1,2010-01-01 13:35:06+00:00,0,28,4761830,12,13,4,2010-01-01 13:35:06,0 days 00:06:08,368,...,39.0,192.0,1429.0,1210.0,526.0,1261.0,0.0,0.0,463.666667,91.784167
2,2010-01-01 13:41:14+00:00,0,9,353170,13,13,4,2010-01-01 13:41:14,0 days 00:03:02,182,...,2211.0,39.0,192.0,1429.0,1210.0,526.0,1261.0,0.0,367.000000,184.502033
3,2010-01-01 13:44:16+00:00,0,9,641013,14,13,4,2010-01-01 13:44:16,0 days 00:02:51,171,...,2051.0,2211.0,39.0,192.0,1429.0,1210.0,526.0,1261.0,240.333333,110.699292
4,2010-01-01 13:47:07+00:00,1,9,0,13,13,4,2010-01-01 13:47:07,0 days 00:02:51,171,...,1139.0,2051.0,2211.0,39.0,192.0,1429.0,1210.0,526.0,174.666667,6.350853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2458850,2010-12-31 22:39:09+00:00,1,15,0,1,22,4,2010-12-31 22:39:09,0 days 00:14:19,859,...,101.0,160.0,330.0,502.0,830.0,107.0,143.0,183.0,439.666667,391.031116
2458851,2010-12-31 22:53:28+00:00,0,9,1303274,2,22,4,2010-12-31 22:53:28,0 days 00:02:46,166,...,485.0,101.0,160.0,330.0,502.0,830.0,107.0,143.0,370.000000,425.418617
2458852,2010-12-31 22:56:14+00:00,0,28,436217,3,22,4,2010-12-31 22:56:14,0 days 00:07:16,436,...,302.0,485.0,101.0,160.0,330.0,502.0,830.0,107.0,487.000000,349.303593
2458853,2010-12-31 23:03:30+00:00,1,28,0,2,23,4,2010-12-31 23:03:30,0 days 00:11:42,702,...,378.0,302.0,485.0,101.0,160.0,330.0,502.0,830.0,434.666667,268.002488


###Task 1 - Given a sequence of events (in the format listed above) predict the next 100 login / logout times

In [47]:
df_TS.columns

Index(['Event_type', 'Cluster', 'Duration', 'Total_users', 'hour_of_day',
       'day_of_week', 'Date_Time_Naive', 'time_until_next_event',
       'time_until_next_event_seconds', 'lag_1', 'lag_2', 'lag_3', 'lag_4',
       'lag_5', 'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10', 'lag_11',
       'lag_12', 'lag_13', 'lag_14', 'lag_15', 'lag_16', 'lag_17', 'lag_18',
       'lag_19', 'lag_20', 'lag_21', 'lag_22', 'lag_23', 'lag_24', 'lag_25',
       'lag_26', 'lag_27', 'lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32',
       'lag_33', 'lag_34', 'lag_35', 'lag_36', 'lag_37', 'lag_38', 'lag_39',
       'lag_40', 'lag_41', 'lag_42', 'lag_43', 'lag_44', 'lag_45', 'lag_46',
       'lag_47', 'lag_48', 'lag_49', 'lag_50', 'rolling_mean', 'rolling_std'],
      dtype='object')

In [48]:
X=df_TS[['lag_1', 'lag_2','lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10','hour_of_day','day_of_week']]
Y=df_TS[['time_until_next_event_seconds']]

In [49]:
X

Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,hour_of_day,day_of_week
10,14,13,12,13,14,13,14,13,12,11,14,4
11,15,14,13,12,13,14,13,14,13,12,14,4
12,16,15,14,13,12,13,14,13,14,13,14,4
13,15,16,15,14,13,12,13,14,13,14,14,4
14,16,15,16,15,14,13,12,13,14,13,14,4
...,...,...,...,...,...,...,...,...,...,...,...,...
2458850,2,1,2,3,4,3,4,5,6,5,22,4
2458851,1,2,1,2,3,4,3,4,5,6,22,4
2458852,2,1,2,1,2,3,4,3,4,5,22,4
2458853,3,2,1,2,1,2,3,4,3,4,23,4


In [50]:
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator

sequence_length = 1
batch_size = 32

# Select the last 100 rows for the test set
X_test = X[-100:]
y_test = Y[-100:]

# Remove the last 100 rows from the dataset to create the training set
X_train = X[:-100]
y_train = Y[:-100]

# Normalizing both X_train and X_test
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Normalizing both y_train and y_test
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

# Create TimeseriesGenerator for training and validation
train_generator = TimeseriesGenerator(X_train_scaled, y_train_scaled, length=sequence_length, batch_size=batch_size)
val_generator = TimeseriesGenerator(X_test_scaled, y_test_scaled, length=sequence_length, batch_size=batch_size)


In [51]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

n_features = X_train_scaled.shape[1]  # Number of features in the dataset

# Define LSTM model architecture
lstm_model_1 = Sequential()
lstm_model_1.add(LSTM(units=50, return_sequences=True, input_shape=(sequence_length, X_train.shape[1])))
lstm_model_1.add(LSTM(units=50, return_sequences=False))
lstm_model_1.add(Dense(units=1))

# Compile the model
lstm_model_1.compile(optimizer='adam', loss='mean_absolute_error')


In [52]:
history = lstm_model_1.fit(train_generator, epochs = 5, validation_data = val_generator)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [53]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Make predictions on the test set
y_pred = lstm_model_1.predict(val_generator)

y_test_aligned = y_test_scaled[:len(y_pred)]


r2 = r2_score(y_test_aligned, y_pred)  # Assuming y_val_aligned is already aligned appropriately

# Calculate mean squared error
mse = mean_squared_error(y_test_aligned, y_pred)

# Calculate mean absolute error
mae = mean_absolute_error(y_test_aligned, y_pred)

print("R^2 Score:", r2)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

R^2 Score: -0.10081528487699254
Mean Squared Error: 0.00031798512688562984
Mean Absolute Error: 0.011286316940529149


In [54]:
import tensorflow as tf

# Define the file path where you want to save the model
model_path = 'task1_model.h5'

# Save the model
tf.keras.models.save_model(lstm_model_1, model_path)

print("Model saved successfully at:", model_path)

Model saved successfully at: task1_model.h5


  tf.keras.models.save_model(lstm_model_1, model_path)


### Task-2: Given a sequence of events (in the format listed above) predict the next 100 values for the number of students using the computers.

In [18]:
df_TS

Unnamed: 0,Date_Time,Event_type,Cluster,Duration,Total_users,hour_of_day,day_of_week,Date_Time_Naive,time_until_next_event,time_until_next_event_seconds,...,lag_43,lag_44,lag_45,lag_46,lag_47,lag_48,lag_49,lag_50,rolling_mean,rolling_std
0,2010-01-01 13:25:55+00:00,0,25,11797657,11,13,4,2010-01-01 13:25:55,0 days 00:09:11,551,...,192.0,1429.0,1210.0,526.0,1261.0,0.0,0.0,0.0,390.333333,213.551711
1,2010-01-01 13:35:06+00:00,0,28,4761830,12,13,4,2010-01-01 13:35:06,0 days 00:06:08,368,...,39.0,192.0,1429.0,1210.0,526.0,1261.0,0.0,0.0,463.666667,91.784167
2,2010-01-01 13:41:14+00:00,0,9,353170,13,13,4,2010-01-01 13:41:14,0 days 00:03:02,182,...,2211.0,39.0,192.0,1429.0,1210.0,526.0,1261.0,0.0,367.000000,184.502033
3,2010-01-01 13:44:16+00:00,0,9,641013,14,13,4,2010-01-01 13:44:16,0 days 00:02:51,171,...,2051.0,2211.0,39.0,192.0,1429.0,1210.0,526.0,1261.0,240.333333,110.699292
4,2010-01-01 13:47:07+00:00,1,9,0,13,13,4,2010-01-01 13:47:07,0 days 00:02:51,171,...,1139.0,2051.0,2211.0,39.0,192.0,1429.0,1210.0,526.0,174.666667,6.350853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2458850,2010-12-31 22:39:09+00:00,1,15,0,1,22,4,2010-12-31 22:39:09,0 days 00:14:19,859,...,101.0,160.0,330.0,502.0,830.0,107.0,143.0,183.0,439.666667,391.031116
2458851,2010-12-31 22:53:28+00:00,0,9,1303274,2,22,4,2010-12-31 22:53:28,0 days 00:02:46,166,...,485.0,101.0,160.0,330.0,502.0,830.0,107.0,143.0,370.000000,425.418617
2458852,2010-12-31 22:56:14+00:00,0,28,436217,3,22,4,2010-12-31 22:56:14,0 days 00:07:16,436,...,302.0,485.0,101.0,160.0,330.0,502.0,830.0,107.0,487.000000,349.303593
2458853,2010-12-31 23:03:30+00:00,1,28,0,2,23,4,2010-12-31 23:03:30,0 days 00:11:42,702,...,378.0,302.0,485.0,101.0,160.0,330.0,502.0,830.0,434.666667,268.002488


In [19]:
df_TS.dropna(inplace=True)

# Extract 'hour_of_day' using apply() with a lambda function
df_TS['hour_of_day'] = df_TS['Date_Time'].apply(lambda x: x.hour)

# Using standard Python datetime objects, use weekday() method
df_TS['day_of_week'] = df_TS['Date_Time'].apply(lambda x: x.weekday())

# Remove the original datetime column if it's no longer needed
df_TS.drop('Date_Time', axis=1, inplace=True)


In [20]:
# Convert 'Event type' and 'Cluster' into categorical variables
encoder_event_type = LabelEncoder()
df_TS['Event_type'] = encoder_event_type.fit_transform(df_TS['Event_type'])

encoder_cluster = LabelEncoder()
df_TS['Cluster'] = encoder_cluster.fit_transform(df_TS['Cluster'])


In [21]:

n_lags = 10  # Number of lags (previous steps) to include as features
for i in range(1, n_lags + 1):
    df_TS[f'lag_{i}'] = df_TS['Total_users'].shift(i)
df_TS.dropna(inplace=True)  # Drop rows with NaN values resulting from lagging


In [22]:
# Set the number of rows for the test set
test_size = 100

# Calculate the split point
split_point = len(df_TS) - test_size

# Split the data into training and testing sets
train = df_TS.iloc[:split_point, :]
test = df_TS.iloc[split_point:, :]

# Separate features and target variable for training and testing sets
X_train, y_train = train.drop('Total_users', axis=1), train['Total_users']
X_test, y_test = test.drop('Total_users', axis=1), test['Total_users']


In [23]:
n_lags = 10  # Number of lags (previous steps) to include as features
for i in range(1, n_lags + 1):
    df_TS[f'lag_{i}'] = df_TS[f'lag_{i}'].astype(int)
df_TS.dropna(inplace=True)  # Drop rows with NaN values resulting from lagging


Normalisation:

In [24]:
from sklearn.preprocessing import MinMaxScaler

# Identify non-numeric columns in X_train
non_numeric_columns = [col for col in X_train.columns if not pd.api.types.is_numeric_dtype(X_train[col])]

# Drop non-numeric columns from X_train and X_test
X_train_numeric = X_train.drop(columns=non_numeric_columns)
X_test_numeric = X_test.drop(columns=non_numeric_columns)

# Normalize the numeric data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)

# Use the same scaler for X_test
X_val_scaled = scaler.transform(X_test_numeric)

# Normalizing the target variable y
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))


In [25]:
sequence_length = 10
batch_size = 32

train_generator = TimeseriesGenerator(X_train_scaled, y_train_scaled, length=sequence_length, batch_size=batch_size)
val_generator = TimeseriesGenerator(X_val_scaled, y_test_scaled, length=sequence_length, batch_size=batch_size)


In [26]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_scaled.shape[0], X_train_scaled.shape[1])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')



In [27]:
# Train the model
history = model.fit(train_generator, epochs=5, validation_data=val_generator)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
predictions = model.predict(val_generator)




In [32]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error



y_val_aligned = y_test_scaled[:len(predictions)]
r2 = r2_score(y_val_aligned, predictions)  # Assuming y_val_aligned is already aligned appropriately

# Calculate mean squared error
mse = mean_squared_error(y_val_aligned, predictions)

# Calculate mean absolute error
mae = mean_absolute_error(y_val_aligned, predictions)

print("R^2 Score:", r2)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

R^2 Score: 0.6270232995248098
Mean Squared Error: 7.402081620110233e-06
Mean Absolute Error: 0.0021116281549135845


In [45]:
import tensorflow as tf

# Define the file path where you want to save the model
model_path = 'task2_model.h5'

# Save the model
tf.keras.models.save_model(model, model_path)

print("Model saved successfully at:", model_path)


Model saved successfully at: task2_model.h5


  tf.keras.models.save_model(model, model_path)


##Task 3 - Given a sequence of events (in the format listed above) predict the next 100 cluster names for either logins / logouts

In [None]:
df_TS.columns

In [35]:
X=df_TS[['lag_1', 'lag_2','lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7', 'lag_8', 'lag_9', 'lag_10']]
Y=df_TS[['Cluster']]

In [36]:
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator

sequence_length = 1
batch_size = 32

# Select the last 100 rows for the test set
X_test = X[-100:]
y_test = Y[-100:]

# Remove the last 100 rows from the dataset to create the training set
X_train = X[:-100]
y_train = Y[:-100]

# Normalizing both X_train and X_test
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Normalizing both y_train and y_test
scaler_y = MinMaxScaler(feature_range=(0, 1))
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

# Create TimeseriesGenerator for training and validation
train_generator = TimeseriesGenerator(X_train_scaled, y_train_scaled, length=sequence_length, batch_size=batch_size)
val_generator = TimeseriesGenerator(X_test_scaled, y_test_scaled, length=sequence_length, batch_size=batch_size)

In [37]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

n_features = X.shape[1]
n_clusters = len(df_TS['Cluster'].unique())

lstm_model_3 = Sequential()
lstm_model_3.add(LSTM(50, activation='relu', input_shape=(sequence_length, n_features)))
lstm_model_3.add(Dense(n_clusters, activation='softmax'))  # Use softmax for multi-class classification
lstm_model_3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [38]:
history = lstm_model_3.fit(train_generator, epochs = 5, validation_data = val_generator)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert the predicted probabilities to predicted class labels
y_pred_classes = np.argmax(y_pred, axis=1)

# Convert the true labels to class labels (if they are one-hot encoded)
y_test_classes = np.argmax(y_test_scaled, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test_classes, y_pred_classes)

# Calculate precision
precision = precision_score(y_test_classes, y_pred_classes, average='weighted')

# Calculate recall
recall = recall_score(y_test_classes, y_pred_classes, average='weighted')

# Calculate F1-score
f1 = f1_score(y_test_classes, y_pred_classes, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [44]:
import tensorflow as tf

# Define the file path where you want to save the model
model_path = 'task3_model.h5'

# Save the model
tf.keras.models.save_model(lstm_model_3, model_path)

print("Model saved successfully at:", model_path)


Model saved successfully at: task3_model.h5


  tf.keras.models.save_model(lstm_model_3, model_path)
