In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import csv

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/dataset-filled/dataset_filled'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-filled/dataset_filled/500101216.csv
/kaggle/input/dataset-filled/dataset_filled/500101114.csv
/kaggle/input/dataset-filled/dataset_filled/500119046.csv
/kaggle/input/dataset-filled/dataset_filled/500119074.csv
/kaggle/input/dataset-filled/dataset_filled/500101092.csv
/kaggle/input/dataset-filled/dataset_filled/500119086.csv
/kaggle/input/dataset-filled/dataset_filled/500101042.csv
/kaggle/input/dataset-filled/dataset_filled/500119043.csv
/kaggle/input/dataset-filled/dataset_filled/500101093.csv
/kaggle/input/dataset-filled/dataset_filled/500101010.csv
/kaggle/input/dataset-filled/dataset_filled/500101008.csv
/kaggle/input/dataset-filled/dataset_filled/500101015.csv
/kaggle/input/dataset-filled/dataset_filled/500101040.csv
/kaggle/input/dataset-filled/dataset_filled/500101094.csv
/kaggle/input/dataset-filled/dataset_filled/500101037.csv
/kaggle/input/dataset-filled/dataset_filled/500119048.csv
/kaggle/input/dataset-filled/dataset_filled/500119089.csv
/kaggle/input/

In [2]:
# Import Libraries
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau



In [3]:
def create_dataset(dataset, lookback):
    """Transform a time series into a prediction dataset
    
    Args:
        dataset: A numpy array of time series, first dimension is the time steps
        lookback: Size of window for prediction
    """
    X, y = [], []
    for i in range(len(dataset)-lookback):
        feature = dataset[i:i+lookback]
        target = dataset[i+1:i+lookback+1]
        X.append(feature)
        y.append(target)
    return torch.tensor(X), torch.tensor(y)

In [4]:
def train_valid_split(timeseries):
    # train-valid split with 10/2~11/16 [0:2734], 11/17~11/30[2734:]
    # 10/2 ~ 10/11 [:711]
    train_size = 2734
    valid_size = len(timeseries) - train_size
    train, valid = timeseries[:train_size], timeseries[train_size:]
    print(len(train), len(valid))
    # window size approximately = 3 days
    lookback = 72 * 2
    X_train, y_train = create_dataset(train, lookback=lookback)
    X_valid, y_valid = create_dataset(valid, lookback=lookback)
    # X_test, y_test = create_dataset(test, lookback=lookback)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    return X_train, y_train, X_valid, y_valid

## Build a basic RNN model

In [5]:
class UBikeRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, drop_prob, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, dropout=drop_prob, batch_first=True)    
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
        
    def forward(self, x):
        # Initialize hidden state with zeros
        # h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
#         print(x.size(0)) # x.size(0) = batch size
        hidden = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(device)
        x = x.view(len(x),1,-1)
#         print(x.shape)
        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        out = self.sig(out)
        return out

In [6]:
class UBikeLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, drop_prob, output_dim):
        super().__init__()
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_layers = layer_dim
        self.output_size = output_dim
        
        # initialize LSTM   
        self.block = nn.Sequential(
            nn.LSTM(input_dim, hidden_dim, layer_dim, dropout=drop_prob, batch_first=True),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid(),
        )
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, dropout=drop_prob, batch_first=True)
        # the output of LSTM is classified by linear and sigmoid functions
        self.linear = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        h_0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(device) # hidden state
        c_0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(device) # internal state
        x = x.view(len(x),1,-1)
        lstm_out, (hn, cn)  = self.lstm(x, (h_0.detach(), c_0.detach()))
        hn = hn.view(-1, self.hidden_dim) # reshaping the data for Dense layer next
        
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.linear(lstm_out)
        out = self.sigmoid(out)
        
        return out

## Loss function

In [7]:
def getLoss(pred, label):
    loss = torch.mean(3 * torch.abs(pred-label) * (torch.abs(pred-1/3) + torch.abs(pred-2/3)))
#     loss = torch.mean(torch.square(pred-label))
#     print(f"loss = {loss}")
    return loss

## Configurations

In [8]:
config = {
    "batch_size": 32,
    "data_filepath": '/kaggle/input/stop001-sorted/500101001_sorted.csv',
    "inference_filepath": '/kaggle/input/stop001-1204-1210-inf/stop001_inf_1204_to_1210.csv',
    "epochs": 500,
    "learning_rate": 1e-4,
    "weight_decay": 5e-3,
    "save_dir": "/kaggle/working/",
    "model_name": "stop001-RNN-v1.ckpt",
    "early_stop": 150,
}
# model parameters
window_size = 72 * 2
input_dim = window_size
hidden_dim = 1024   # the hidden dim
layer_dim = 3   # the number of hidden layers
output_dim = window_size
drop_prob = 0.2
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Start Training!!!

In [9]:
def train(model, config, train_loader, valid_loader, device):
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay']) 
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

    n_epochs, best_loss, step, early_stop_count = config['epochs'], 10000, 0, 0
    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # tqdm is a package to visualize your training progress.
#         train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_loader:
            optimizer.zero_grad()               # Set gradient to zero.
            x, y = x.to(device), y.to(device)   # Move your data to device. 
            pred = model(x) 
            loss = getLoss(pred, y)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
#             train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
#             train_pbar.set_postfix({'loss': loss.detach().item()})
#             print(f'Epoch [{epoch+1}/{n_epochs}]')
#             print(f"loss: {loss.detach().item()}")

        mean_train_loss = sum(loss_record)/len(loss_record)

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = getLoss(pred, y)

            loss_record.append(loss.item())
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        # Note that step should be called after validate()
        scheduler.step(mean_valid_loss)
        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_dir'] + config['model_name']) # Save your best model
            if(epoch % 10 == 0):
                print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1
            
        if(epoch % 10 == 0):
            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        # writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                print('best loss {:.3f}...'.format(best_loss))
                return

## Inference part

In [10]:
def predict(timeseries, test_loader, model, device, output_path, stop_name):
    model.eval() # Set your model to evaluation mode.
    preds = []
    tots = []
    titles = []

    window_size = 72 * 2
#     input_dim = window_size
#     output_dim = window_size
    test = timeseries[-window_size:].tolist()
#     print(f"test: {test}")

    i=0
    for month,day,weekday,hr,mins,lat,lng,act, tot, title in test_loader:
#         print(i)
        i+=1
        tot = tot.tolist()
        title = list(title)
        x = torch.FloatTensor(test[-window_size:])
        x = x.view(-1,len(x))
        x = x.to(device)      
        tots = tots + tot
        titles = titles + title
        with torch.no_grad():  
#             model.hidden = (torch.zeros(1,1,model.hidden_dim),
#                             torch.zeros(1,1,model.hidden_dim))
            pred = model(x)  
            test.append(pred.tolist()[-1][-1])
            preds.append(pred.detach().cpu())

    preds = test[window_size:]
#     print(f"preds: {test}")
    print(len(preds), len(tots), len(titles))
    plt.plot(test)
    plt.show()
    plt.savefig(stop + ".png")
    assert len(preds) == len(tots)
    assert len(tots) == len(titles)
    prediction = [['id','sbi']]
    for (pred, tot, title) in zip(preds, tots, titles):
        title = title.split('_')
        real_title = title[0] + '_' + stop + '_' + title[2]
        prediction.append([real_title, pred*tot])
    
    with open(output_path, 'w', newline='') as file:
    # Step 4: Using csv.writer to write the list to the CSV file
        writer = csv.writer(file)
        writer.writerows(prediction) # Use writerows for nested list
    
    return

In [None]:
# Visualize input dataset
# Iterate through 112 stops
i=0
for dirname, _, filenames in os.walk('/kaggle/input/dataset-filled/dataset_filled'):
    for filename in filenames:
        stop = filename.split('.')[0]
#         if stop != '500101035': continue
        config["model_name"] = "stop" + stop + "-RNN-v1.ckpt"
        output_path = 'lstm_pred_' + stop + '.csv'
        print("Start training stop " + stop + "\n")
        df = pd.read_csv('/kaggle/input/dataset-filled/dataset_filled/' + filename)
        df_inf = pd.read_csv("/kaggle/input/stop001-1204-1210-inf/stop001_inf_1204_to_1210.csv")
        # print(len(df['ratio']))
        # print(df['ratio'][:50])
        timeseries = df['ratio'].values.astype('float32') 
        print(timeseries)
        X_train, y_train, X_valid, y_valid = train_valid_split(timeseries)
        train_loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=config['batch_size'])
        valid_loader = DataLoader(TensorDataset(X_valid, y_valid), shuffle=True, batch_size=config['batch_size'])
#         print(train_loader, valid_loader)
#         model = UBikeRNN(input_dim=input_dim, hidden_dim=hidden_dim, layer_dim=layer_dim, drop_prob=drop_prob, output_dim=output_dim).to(device)
        model = UBikeLSTM(input_dim=input_dim, hidden_dim=hidden_dim, layer_dim=layer_dim, drop_prob=drop_prob, output_dim=output_dim).to(device)
        train(model, config, train_loader, valid_loader, device)
        print("Start predicting stop " + stop + "\n")
        test_data = df_inf.values.tolist()
        print(len(test_data))
        test_loader = DataLoader(test_data, shuffle=False, batch_size=1)
        print(len(test_loader))
        predict(timeseries, test_loader, model, device, output_path, stop_name=stop)
        print("Finish predicting stop " + stop + "\n")
        i+=1
#         if i == 3:
#             break
        print(f"Prgress {i}/112\n")
# plt.plot(timeseries)
# plt.show()

Start training stop 500101216

[1.      1.      1.      ... 0.95833 0.95833 0.95833]
2734 1200


  return torch.tensor(X), torch.tensor(y)


torch.Size([2590, 144]) torch.Size([2590, 144])
torch.Size([1056, 144]) torch.Size([1056, 144])
Saving model with loss 0.247...
Epoch [1/500]: Train loss: 0.2768, Valid loss: 0.2465
