# This example shows the prediction of bike flow in New York City using the deep learning model ST-ResNet.

Find the details of the ST-ResNet model in the <a href="https://dl.acm.org/doi/10.5555/3298239.3298479">corresponding paper</a>.

Details of the dataset can be found <a href="https://github.com/FIBLAB/DeepSTN">here</a>.

### Import Modules and Define Parameters

In [1]:
import sys
import os
import numpy as np
import time
import torch
import torch.nn as nn
import torchvision.transforms as transforms

from geotorchai.models.grid import STResNet
from geotorchai.datasets.grid import BikeNYCDeepSTN
from geotorchai.utility import TorchAdapter
import matplotlib.pyplot as plt

# Import Apache Sedona
from sedona.spark import *

## Import PySpark
from pyspark.sql import SparkSession


## Import distributed modules
from torch.utils.data import DistributedSampler, DataLoader
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from pyspark.ml.torch.distributor import TorchDistributor

import warnings
# Ignore FutureWarning warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# MASTER_URL is not defined in any visible cell of this notebook, so a fresh
# kernel would fail with a NameError here. Keep an existing definition if one
# is present; otherwise read it from the environment and fall back to a local
# Spark master that uses all available cores.
try:
    MASTER_URL
except NameError:
    MASTER_URL = os.environ.get('MASTER_URL', 'local[*]')

# Build (or reuse) the Spark session, asking Spark to resolve the Sedona and
# GeoTools wrapper jars through spark.jars.packages.
config = SedonaContext.builder().master(MASTER_URL).config('spark.jars.packages',
           'org.apache.sedona:sedona-spark-shaded-3.4_2.12:1.4.1,'
           'org.datasyslab:geotools-wrapper:1.4.0-28.2').getOrCreate()

# Create the Sedona context from the session and keep a handle to its SparkContext.
sedona = SedonaContext.create(config)
sc = sedona.sparkContext

23/08/11 19:04:54 WARN Utils: Your hostname, Kanchans-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.1.6 instead (on interface en0)
23/08/11 19:04:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/kanchan/.ivy2/cache
The jars for the packages stored in: /Users/kanchan/.ivy2/jars
org.apache.sedona#sedona-spark-shaded-3.4_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1d2e36c3-45b5-4b04-a07d-2b25693d9e52;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/kanchan/.pyenv/versions/3.11.0/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-spark-shaded-3.4_2.12;1.4.1 in central
	found org.datasyslab#geotools-wrapper;1.4.0-28.2 in central
:: resolution report :: resolve 85ms :: artifacts dl 5ms
	:: modules in use:
	org.apache.sedona#sedona-spark-shaded-3.4_2.12;1.4.1 from central in [default]
	org.datasyslab#geotools-wrapper;1.4.0-28.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-1d2e36c3-45b5-4b04-a07d-2b25693d9e52
	confs: [default]
	0 artifacts copied, 2 already retrieved (0kB/3ms)
23/08/11 19:04:54 WARN NativeCodeLoader: Unable to load

## Get Path Prefix

In [3]:
from pathlib import Path

# When the ENV_WB environment variable is 'true', paths are anchored at the
# user's home directory (with a trailing slash); otherwise they stay relative
# to the current working directory.
if os.environ.get('ENV_WB', 'false') == 'true':
    PATH_PREFIX = str(Path.home()) + '/'
else:
    PATH_PREFIX = ''

print(PATH_PREFIX)




## Define parameters

In [4]:
# Step size passed to the Adam optimizer in train_model().
learning_rate = 0.0002
# Mini-batch size of the DataLoader each training process builds.
batch_size = 32
# Number of training epochs. train_model() reads this global, but it was never
# defined, so training failed with a NameError. The value 2 matches the
# notebook's note that training is performed for only 2 epochs.
epoch_nums = 2

# Root directory handed to the dataset loader.
PATH_TO_DATASET = PATH_PREFIX + "data/deepstn"

### Loading Dataset

In [5]:
## Load the complete BikeNYC (DeepSTN) dataset from PATH_TO_DATASET, with downloading enabled
full_dataset = BikeNYCDeepSTN(
    root=PATH_TO_DATASET,
    download=True,
)

## Keep the min-max difference of the normalized data; it is intended for
## later use when calculating actual (de-normalized) losses
min_max_diff = full_dataset.get_min_max_difference()

File downloading started...


100%|█████████████████████████| 17708640/17708640 [00:00<00:00, 36660662.92it/s]


File downloading finished
File downloading started...


100%|████████████████████████████████| 18224/18224 [00:00<00:00, 5532498.27it/s]

File downloading finished





### Method to Return Model

In [6]:
def get_model():
    """Build and return a fresh ST-ResNet model.

    The three input branches (closeness, period, trend) share the same flow
    count and map size and differ only in their sequence length. No external
    feature branch is used (``external_dim=None``).

    Returns:
        STResNet: A newly constructed model with 4 residual units.
    """
    # Settings shared by all three branches
    nb_flow = 2
    map_height, map_width = 21, 12
    nb_residual_unit = 4

    # Sequence length of each temporal branch
    len_closeness, len_period, len_trend = 3, 4, 4

    def branch_conf(seq_len):
        # (sequence length, flows, height, width) tuple for one branch
        return (seq_len, nb_flow, map_height, map_width)

    ## Define Model
    return STResNet(
        branch_conf(len_closeness),
        branch_conf(len_period),
        branch_conf(len_trend),
        external_dim=None,
        nb_residual_unit=nb_residual_unit,
    )

### Train the Model
The error will be high because training runs for only 2 epochs.

In [7]:
def train_one_epoch(model, train_loader, optimizer, loss_fn, device):
    """Run one optimization pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(x_closeness, x_period, x_trend)``.
        train_loader: Iterable of dict batches with the keys ``"x_closeness"``,
            ``"x_period"``, ``"x_trend"`` and ``"y_data"``.
        optimizer: Optimizer that updates the parameters of ``model``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: The loss averaged over all batches of the epoch, or ``nan`` when
        the loader yields no batches. (Previously only the last batch's loss
        was returned and an empty loader raised ``UnboundLocalError``.)
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for sample in train_loader:
        # Cast every tensor to float32 and move it to the training device.
        X_c = sample["x_closeness"].float().to(device)
        X_p = sample["x_period"].float().to(device)
        X_t = sample["x_trend"].float().to(device)
        Y_batch = sample["y_data"].float().to(device)

        # Force autograd on even if the caller is inside a no-grad context.
        with torch.set_grad_enabled(True):
            optimizer.zero_grad()

            # Forward pass
            outputs = model(X_c, X_p, X_t)
            loss = loss_fn(outputs, Y_batch)

            # Backward and optimize
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained on; report "no loss" rather than crashing.
        return float("nan")
    return total_loss / num_batches

In [8]:
def train_model(model, loader, device):
    """Train ``model`` on ``loader`` and print the training loss after each epoch.

    Uses MSE loss and an Adam optimizer. The learning rate and the number of
    epochs are read from the notebook-level globals ``learning_rate`` and
    ``epoch_nums``.

    Args:
        model: Module to train; forwarded to ``train_one_epoch``.
        loader: Batch iterable forwarded to ``train_one_epoch``. If it exposes a
            ``sampler`` with a ``set_epoch`` method (as ``DistributedSampler``
            does), that method is called at the start of every epoch.
        device: Device forwarded to ``train_one_epoch``.
    """
    ## Define hyper-parameters
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # DistributedSampler derives its shuffle order from the epoch number; without
    # set_epoch() every epoch would iterate the data in the same order.
    sampler = getattr(loader, "sampler", None)
    set_epoch = getattr(sampler, "set_epoch", None)

    for e in range(epoch_nums):
        if callable(set_epoch):
            set_epoch(e)
        epoch_loss = train_one_epoch(model, loader, optimizer, loss_fn, device)
        print('Epoch [{}/{}], Training Loss: {:.4f}'.format(e + 1, epoch_nums, epoch_loss))

In [9]:
def train_distributed(use_gpu):
    """Per-process training entry point for distributed training.

    Initializes the process group, wraps the model in DistributedDataParallel,
    builds a DataLoader over ``full_dataset`` with a DistributedSampler, and
    trains via ``train_model``.

    Args:
        use_gpu: If truthy, use the "nccl" backend and the device index from
            the ``LOCAL_RANK`` environment variable; otherwise use "gloo" on CPU.
    """
    backend = "nccl" if use_gpu else "gloo"
    dist.init_process_group(backend)
    try:
        device = int(os.environ["LOCAL_RANK"]) if use_gpu else "cpu"
        model = get_model().to(device)
        model_ddp = DDP(model)
        # The notebook's dataset is named `full_dataset`; the previous reference
        # to the undefined name `full_data` raised a NameError in every worker.
        sampler = DistributedSampler(full_dataset)
        loader = DataLoader(full_dataset, batch_size=batch_size, sampler=sampler)

        train_model(model_ddp, loader, device)
    finally:
        # Always release the process group, even if training fails.
        dist.destroy_process_group()

## Start Distributed Training

In [None]:
# Launch train_distributed in two local CPU processes; the trailing False is
# the use_gpu argument forwarded to train_distributed.
print("Starting training")
distributor = TorchDistributor(
    num_processes=2,
    local_mode=True,
    use_gpu=False,
)
distributor.run(train_distributed, False)