In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
import time

import random

import torch
import torch.nn as nn

import torch.optim as optim

import copy
import matplotlib.pyplot as plt
import math

import os
import torch.nn.functional as F


In [None]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
!python --version

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

if torch.cuda.is_available():
    print("GPU is available.")
else:
    print("No GPU detected.")

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Raw readings, read from a CSV file located next to the notebook.
csv_path = 'station_153211-2022-09_2023-01.csv'
data = pd.read_csv(csv_path)

In [None]:
# Adding / removing columns.
# Index the readings by their timestamp. `infer_datetime_format` is deprecated in
# recent pandas (format inference is the default), so it is no longer passed.
data['DateTime'] = pd.to_datetime(data['created_at'])
data = data.set_index('DateTime')
data.info()

# Keep readings with 06:00 <= time < 22:00. This is the same window that the
# 5-minute resampling grid uses further down; the previous test
# (`hour > 6` and `hour <= 22`) kept 07:00-22:59 instead, which dropped the first
# hour of the window and let an extra hour through at the end.
data = data[(data.index.hour >= 6) & (data.index.hour < 22)]

# The timestamp now lives in the index, so the source column is redundant.
data = data.drop(columns='created_at')

# Optional feature pruning, currently disabled:
# data = data.drop(columns=['unknown_count', 'offline_count', 'day_of_week',
#                           'is_weekend', 'hour_bin', 'month', 'is_holiday'])

data.info()

In [None]:
# Resample every day onto a regular 5-minute grid. Each grid point receives the
# column-wise mean of the k readings of the same day that are closest to it in time.
data = data[~data.index.duplicated(keep='first')]

# Calendar day of every reading, computed once rather than once per day.
reading_days = data.index.date

# Get unique dates in the dataset, in order of first appearance
# (dict keys preserve insertion order, avoiding a quadratic list scan).
unique_dates = list(dict.fromkeys(reading_days))

k = 10

# Collected grid points; the frame is built once at the end instead of being
# grown one row at a time.
grid_times = []
grid_rows = []

for date in unique_dates:

    # Filter data for the current date
    data_day = data[reading_days == date]

    start_date = data_day.index[0]
    end_date = data_day.index[-1]
    # '5min' is the non-deprecated spelling of the former '5T' alias.
    datetime_index = pd.date_range(start=start_date, end=end_date, freq='5min')

    # Filter datetime_index to include only hours between 6 am and 10 pm
    datetime_index = datetime_index[(datetime_index.hour >= 6) & (datetime_index.hour < 22)]

    for target_time in datetime_index:
        # Absolute time distance of every reading of the day to the grid point.
        time_difference = np.abs((data_day.index - target_time).to_numpy())
        # Positions of the k nearest readings; a stable sort keeps ties deterministic.
        nearest = np.argsort(time_difference, kind='stable')[:k]

        grid_times.append(target_time)
        grid_rows.append(data_day.iloc[nearest].mean(axis=0))

data_r = pd.DataFrame(grid_rows, columns=data.columns)
data_r.index = pd.DatetimeIndex(grid_times)

In [None]:
# Derive calendar features from the resampled timestamps, then discard the two
# station columns that are not used afterwards.
grid_index = data_r.index
data_r = (
    data_r
    .assign(**{
        'day_of_week': grid_index.dayofweek,
        'month': grid_index.month,
        'is_weekend': (grid_index.dayofweek >= 5).astype(int),
        # Index of the 5-minute slot within the day (0 at midnight).
        '5minute_bin': (grid_index.hour * 60 + grid_index.minute) // 5,
    })
    .drop(columns=['station_id', 'outlet_count'])
)

In [None]:
# Min-max scale every column of the resampled frame.
# NOTE(review): the scaler is fitted on the whole series, i.e. before the
# train/validation/test split performed later - consider fitting it on the
# training portion only to avoid leaking held-out ranges into the scaling.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_r)

df_normalized = pd.DataFrame(scaled_values, index=data_r.index, columns=data_r.columns)

# Value distribution of the target after scaling (printed) and before scaling
# (shown as the cell result).
print(df_normalized['occupied_count'].value_counts())
data_r['occupied_count'].value_counts()

In [None]:
# Keep only the target column, then cut the frame by row position into
# 80% training / 10% validation / 10% test, in the order the rows appear.
df_normalized = df_normalized[['occupied_count']]

n_rows = len(df_normalized)
split_train = int(n_rows * 0.8)
split_val = int(n_rows * 0.9)

train_data, val_data, test_data = (
    df_normalized.iloc[start:stop]
    for start, stop in ((None, split_train), (split_train, split_val), (split_val, None))
)

train_data.info()

In [None]:
from torch.utils.data import Dataset, DataLoader

class datasetMaker(Dataset):
    """Sliding-window dataset over a time-ordered array.

    Item ``i`` is the pair ``(seq, label)`` where ``seq`` holds rows
    ``i .. i + seq_len - 1`` and ``label`` holds the ``future_steps`` rows that
    immediately follow. Both are returned as ``torch.float`` tensors.

    Args:
        data: pandas DataFrame/Series (its ``.values`` are used) or anything
            ``np.asarray`` accepts, with time along the first axis.
        seq_len: number of input rows per window.
        future_steps: number of target rows following each window.
    """

    def __init__(self, data, seq_len=10, future_steps=5):
        # pandas objects expose ``.values``; plain arrays and lists are converted directly.
        self.data = data.values if hasattr(data, "values") else np.asarray(data)
        self.seq_len = seq_len
        self.future_steps = future_steps

    def __len__(self):
        # Number of complete (window, target) pairs; never negative, even when
        # the data is shorter than one window plus its targets.
        return max(0, len(self.data) - self.seq_len - self.future_steps + 1)

    def __getitem__(self, index):
        n_windows = len(self)
        if index < 0:
            # Support Python-style negative indexing.
            index += n_windows
        if not 0 <= index < n_windows:
            # Raising here (instead of returning truncated or empty slices) also
            # lets plain iteration over the dataset terminate.
            raise IndexError(f"index {index} is out of range for {n_windows} windows")

        split = index + self.seq_len
        seq = torch.tensor(self.data[index:split], dtype=torch.float)
        label = torch.tensor(self.data[split:split + self.future_steps], dtype=torch.float)
        return seq, label

    
# Window configuration, counted in rows of the resampled series.
future_steps = 36
seq_len = 576 # changes
batch_size = 64

# All three loaders share the same batching options.
# NOTE(review): shuffling and dropping the last partial batch is also applied
# to the validation and test loaders - confirm that this is intended.
loader_options = dict(batch_size=batch_size, shuffle=True, drop_last=True)

train_dataset = datasetMaker(train_data, seq_len, future_steps)
val_dataset = datasetMaker(val_data, seq_len, future_steps)
test_dataset = datasetMaker(test_data, seq_len, future_steps)

train_loader = DataLoader(dataset=train_dataset, **loader_options)
val_loader = DataLoader(dataset=val_dataset, **loader_options)
test_loader = DataLoader(dataset=test_dataset, **loader_options)

# Number of validation batches per epoch.
print(len(val_loader))

# Sanity checks on the training loader.
# The loop variables are deliberately not called `data`, so the notebook-level
# DataFrame `data` is not overwritten by a tensor when this cell runs.

# Shapes of a single batch.
for batch_seq, batch_label in train_loader:
    print(batch_seq.shape, batch_label.shape)
    break

# Value range of the inputs (min_value/max_value) and of the targets
# (min_valuel/max_valuel) over one full pass.
min_value = float('inf')
max_value = float('-inf')

min_valuel = float('inf')
max_valuel = float('-inf')

for batch_seq, batch_label in train_loader:
    min_value = min(min_value, batch_seq.min().item())
    max_value = max(max_value, batch_seq.max().item())

    min_valuel = min(min_valuel, batch_label.min().item())
    max_valuel = max(max_valuel, batch_label.max().item())

print("Minimum value:", min_value)
print("Maximum value:", max_value)

print("Minimum valuel:", min_valuel)
print("Maximum valuel:", max_valuel)


In [None]:
def train_epoch(epoch, optimizer, loss_function, model, train_loader, future_steps):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Args:
        epoch: index of the current epoch (not used by the computation).
        optimizer: optimizer that updates ``model``'s parameters.
        loss_function: callable ``(predictions, targets) -> scalar tensor``.
        model: network to train; batches are moved to the device of its parameters.
        train_loader: iterable yielding ``(inputs, targets)`` tensor pairs.
        future_steps: forecast horizon (not used by the computation).

    Returns:
        The loss averaged over the batches seen, as a Python float.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    # Follow the model's device instead of assuming CUDA, so CPU runs work too.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    model.train()
    total_loss = 0.0
    num_batches = 0

    for data, label in train_loader:
        if model_device is not None:
            data = data.to(model_device)
            label = label.to(model_device)

        optimizer.zero_grad()
        predictions = model(data)

        # Targets may carry an extra trailing feature axis, e.g. (batch, steps, 1)
        # against predictions of shape (batch, steps). Align them explicitly so
        # the loss compares element with element instead of broadcasting.
        label = label.reshape(predictions.shape)

        loss_value = loss_function(predictions, label)
        loss_value.backward()
        optimizer.step()

        total_loss += loss_value.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_loader produced no batches")
    return total_loss / num_batches

def validate_epoch(epoch, loss, model, val_loader, future_steps):
    """Evaluate ``model`` on ``val_loader`` without gradients and return the mean batch loss.

    Args:
        epoch: index of the current epoch (not used by the computation).
        loss: callable ``(predictions, targets) -> scalar tensor``.
        model: network to evaluate; it is switched to eval mode and batches are
            moved to the device of its parameters.
        val_loader: iterable yielding ``(inputs, targets)`` tensor pairs.
        future_steps: forecast horizon (not used by the computation).

    Returns:
        The loss averaged over the batches seen, as a Python float.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    # Follow the model's device instead of assuming CUDA, so CPU runs work too.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for data, label in val_loader:
            if model_device is not None:
                data = data.to(model_device)
                label = label.to(model_device)

            predictions = model(data)
            # Align targets such as (batch, steps, 1) with predictions of shape
            # (batch, steps) so the loss is computed element-wise.
            label = label.reshape(predictions.shape)

            total_loss += loss(predictions, label).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("val_loader produced no batches")
    return total_loss / num_batches

def a_proper_training(num_epoch, model, optimizer, loss_function, loader, future_steps,
                      validation_loader=None):
    """Train ``model`` for ``num_epoch`` epochs and keep the best validation snapshot.

    Args:
        num_epoch: number of epochs to run.
        model: network to train (updated in place).
        optimizer: optimizer bound to ``model``'s parameters.
        loss_function: loss used for both training and validation.
        loader: iterable of training batches.
        future_steps: forecast horizon, forwarded to the per-epoch helpers.
        validation_loader: iterable of validation batches. When omitted, the
            notebook-level ``val_loader`` is used, as earlier versions of this
            function did implicitly.

    Returns:
        Tuple ``(best_model, best_epoch, train_losses, val_losses)`` where
        ``best_model`` is a deep copy of the model at the epoch with the lowest
        validation loss (``None`` when ``num_epoch`` is 0).
    """
    if validation_loader is None:
        # Backward compatibility with callers that rely on the global loader.
        validation_loader = val_loader

    best_epoch = None
    best_model = None
    best_loss = None
    train_losses = []
    val_losses = []
    print("Begin Training")

    for epoch in range(num_epoch):
        start_time = time.time()  # Start time

        # Use the loader and loss passed in, not notebook globals.
        train_loss = train_epoch(epoch, optimizer, loss_function, model, loader, future_steps)
        val_loss = validate_epoch(epoch, loss_function, model, validation_loader, future_steps)
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # The first epoch always becomes the baseline; afterwards only a strict
        # improvement replaces the stored snapshot.
        if best_loss is None or val_loss < best_loss:
            best_loss = val_loss
            best_model = copy.deepcopy(model)
            best_epoch = epoch

        elapsed_time = time.time() - start_time

        print(f"Epoch {epoch + 1}/{num_epoch}: Train Loss = {train_loss} Val Loss = {val_loss} Elapsed_time = {elapsed_time}")

    return (best_model, best_epoch, train_losses, val_losses)


In [None]:
class MLP(nn.Module):
    """Five-layer fully connected network mapping an input window to a forecast.

    Layer widths are ``input_size -> 488 -> 256 -> 128 -> 32 -> output_size``.
    The first hidden layer uses ReLU; every later layer, including the output,
    uses a sigmoid, so outputs lie strictly between 0 and 1.

    Args:
        input_size: number of values in one (flattened) input window.
        output_size: number of values to predict.
    """

    def __init__(self, input_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 488)
        self.fc2 = nn.Linear(488, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 32)
        self.fc5 = nn.Linear(32, output_size)

    def forward(self, x):
        """Return predictions for ``x`` of shape ``(..., input_size)`` or ``(..., input_size, 1)``."""
        # Drop only a trailing singleton feature axis. A bare ``squeeze()`` would
        # also remove the batch axis whenever the batch holds a single sample.
        if x.dim() >= 2 and x.shape[-1] == 1 and self.fc1.in_features != 1:
            x = x.squeeze(-1)
        x = F.relu(self.fc1(x))  # Apply ReLU activation to the first hidden layer
        x = torch.sigmoid(self.fc2(x))  # Apply sigmoid activation to the second hidden layer
        x = torch.sigmoid(self.fc3(x))  # Apply sigmoid activation to the third hidden layer
        x = torch.sigmoid(self.fc4(x))  # Apply sigmoid activation to the fourth hidden layer
        x = torch.sigmoid(self.fc5(x))  # Apply sigmoid activation to the output layer
        return x


In [None]:
# Build the network from the window configuration rather than repeating the
# literal sizes, and place it on the device selected at the top of the notebook
# instead of requiring CUDA.
model = MLP(input_size=seq_len, output_size=future_steps).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.MSELoss()

# Train for 10 epochs and keep the snapshot with the lowest validation loss.
best_model, best_epoch, train_losses, val_losses = a_proper_training(
    10,
    model,
    optimizer,
    criterion,
    train_loader,
    future_steps
)

In [None]:
# Training and validation loss per epoch on a single set of axes.
fig, ax = plt.subplots()
ax.plot(train_losses, label="train")
ax.plot(val_losses, label="val")
ax.set_title("MSE Loss")
ax.legend()

In [None]:
import matplotlib.pyplot as plt
import math

# Qualitative check: plot predictions of the best snapshot against the true
# values for up to 16 windows of one validation batch.
# Loop and helper names are chosen so that notebook-level names such as `data`
# and `batch_size` are not overwritten by this cell.
best_model.eval()  # the snapshot used for prediction is the one put in eval mode
eval_device = next(best_model.parameters()).device

for val_seq, val_label in val_loader:

    val_seq = val_seq.to(eval_device)

    with torch.no_grad():
        predictions = best_model(val_seq)

    # Give the targets the same layout as the predictions so that every row is
    # a single 1-D curve when plotted.
    val_label = val_label.to(eval_device).reshape(predictions.shape)

    print("Minimum prediction value:", torch.min(predictions).item())
    print("Maximum prediction value:", torch.max(predictions).item())

    print("Minimum label value:", torch.min(val_label).item())
    print("Maximum label value:", torch.max(val_label).item())

    predictions = predictions.cpu().numpy()
    labels = val_label.cpu().numpy()

    num_rows = 4
    num_cols = 4
    # Never index past the end of a batch smaller than the grid.
    num_plots = min(num_rows * num_cols, len(predictions))

    # squeeze=False keeps `axes` two-dimensional whatever the grid size.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 15), squeeze=False)
    fig.suptitle("Predictions vs True Values")

    for i, ax in enumerate(axes.flat):
        if i >= num_plots:
            # Hide panels that have no sample to show.
            ax.axis("off")
            continue

        ax.plot(predictions[i], label="Predictions")
        ax.plot(labels[i], label="True Values")
        ax.set_title(f"Sequence {i+1}")
        ax.set_ylim(0, 1)

        ax.legend()

    plt.tight_layout()
    plt.show()
    break