# Amazon Deforestation Forecasting Using AI/ML and IoT Data
## Part II: Model Training and Evaluation
## AAI-530 Team 8 Final Project

Team 8:  Tyler Foreman

University of San Diego, Applied Artificial Intelligence

Date:  February 26, 2023

GitHub Repository: https://github.com/t4ai/amazon-deforestation-forecasting

In [3]:
!pip install pytorch-forecasting

Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.0.0-py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.4/140.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.80 (from pytorch-forecasting)
  Downloading fastapi-0.109.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.1.4-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting optuna<4.0.0,>=3.1.0 (from pytorch-forecasting)
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-optimizer<3.0.0,>=2.5.1 (from pytorch-forecasting)

In [4]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting import TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss
from pytorch_forecasting.metrics import QuantileLoss

In [5]:
# Attach Google Drive to the Colab filesystem so the prepared dataset can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [7]:
# Base directory (on the mounted Drive) that holds the project's datasets;
# file names are appended to this string when loading.
root_dir = "/content/drive/MyDrive/USD/datasets/amazon_deforestation"

## Load the Prepared Dataset and Configure into TimeSeriesDataSet

Read in the dataset prepared in Stage I.  Load into TimeSeriesDataSet (input required by our TFT model).

In [8]:
# Read the prepared yearly deforestation table from the data directory
prepared_csv_path = root_dir + '/yearly_deforestation_merged_tft.csv'
yearly_deforestation_df = pd.read_csv(prepared_csv_path)

In [10]:
# Preview the first 20 rows of the loaded table
yearly_deforestation_df.head(n=20)

Unnamed: 0.1,Unnamed: 0,cluster,year,deforest_area_km,state,state_area,state_hydro_area,state_no_forest_area,cluster_area,cluster_no_forest_area,cluster_hydro_area,years_from_start
0,0,-1,2008.0,708.026741,AC,170799400.0,113.339475,70.051073,0.0,0.0,0.0,0
1,1,-1,2009.0,199.543029,RO,237780200.0,1766.08697,23295.406781,0.0,0.0,0.0,1
2,2,-1,2010.0,313.131808,AM,1589484000.0,51077.536558,48506.459703,0.0,0.0,0.0,2
3,3,-1,2011.0,190.86624,MT,513039600.0,1728.346,45068.972799,0.0,0.0,0.0,3
4,4,-1,2012.0,96.429847,PA,1238830000.0,44115.717775,66845.30003,0.0,0.0,0.0,4
5,5,-1,2013.0,126.674202,AM,1589484000.0,51077.536558,48506.459703,0.0,0.0,0.0,5
6,6,-1,2014.0,121.737476,AM,1589484000.0,51077.536558,48506.459703,0.0,0.0,0.0,6
7,7,-1,2015.0,139.550549,PA,1238830000.0,44115.717775,66845.30003,0.0,0.0,0.0,7
8,8,-1,2016.0,150.172549,RR,225342900.0,1753.155361,60902.109316,0.0,0.0,0.0,8
9,9,-1,2017.0,208.391069,MA,113737300.0,4818.959857,8166.20879,0.0,0.0,0.0,9


In [12]:
# Keep only rows that belong to a real cluster; the label -1 is the "null" category
# (presumably unassigned/noise points from the Stage I clustering - confirm).
has_cluster = yearly_deforestation_df["cluster"] != -1
yearly_deforestation_df = yearly_deforestation_df[has_cluster]

In [14]:
# Create a string (categorical) label for each cluster - TimeSeriesDataSet / the model
# need the group id as a categorical rather than a number.
# `assign` returns a new frame, so this does not write into a filtered slice
# (which would raise SettingWithCopyWarning), and the string concat is vectorized.
yearly_deforestation_df = yearly_deforestation_df.assign(
    cluster_name="cl_" + yearly_deforestation_df["cluster"].astype(str)
)

In [16]:
# Spot-check ten randomly chosen rows to confirm the new cluster_name column
yearly_deforestation_df.sample(n=10)

Unnamed: 0.1,Unnamed: 0,cluster,year,deforest_area_km,state,state_area,state_hydro_area,state_no_forest_area,cluster_area,cluster_no_forest_area,cluster_hydro_area,years_from_start,cluster_name
9460,9460,980,2008.0,0.123974,MT,513039600.0,1728.346,45068.972799,63431.67,9731.788083,0.0,0,cl_980
11505,11505,1243,2008.0,1.842468,RO,237780200.0,1766.08697,23295.406781,37727.35,0.0,0.0,0,cl_1243
2792,2792,246,2012.0,0.361022,AM,1589484000.0,51077.536558,48506.459703,354469.6,0.0,31686.552427,4,cl_246
8461,8461,854,2019.0,0.110711,AM,1589484000.0,51077.536558,48506.459703,3547.134,0.0,2.119625,11,cl_854
14015,14015,1641,2009.0,0.103684,AM,1589484000.0,51077.536558,48506.459703,14727.98,7035.082614,2273.450018,1,cl_1641
2288,2288,195,2017.0,0.081903,AM,1589484000.0,51077.536558,48506.459703,179874.8,1629.877536,32303.655315,9,cl_195
5720,5720,547,2013.0,6.099357,MT,513039600.0,1728.346,45068.972799,142781.0,17536.879879,0.0,5,cl_547
5356,5356,508,2014.0,0.130459,PA,1238830000.0,44115.717775,66845.30003,21232.79,0.0,0.0,6,cl_508
5697,5697,545,2011.0,3.268426,MT,513039600.0,1728.346,45068.972799,1752135.0,20960.511346,0.0,3,cl_545
11270,11270,1214,2008.0,0.073398,RO,237780200.0,1766.08697,23295.406781,8486.711,0.0,0.0,0,cl_1214


In [22]:
# Drop clusters that do not have enough time history.
# We need to give our model at least six years of readings per cluster.
MIN_YEARS_OF_HISTORY = 6

# Number of rows (yearly readings) in each row's own cluster, aligned to the frame's index.
# One grouped pass replaces re-filtering the whole frame once per cluster.
readings_per_cluster = (
    yearly_deforestation_df
    .groupby("cluster_name")["cluster_name"]
    .transform("size")
)
yearly_deforestation_df = yearly_deforestation_df[readings_per_cluster >= MIN_YEARS_OF_HISTORY]


In [26]:
# Count the distinct clusters that survived the history filter
remaining_cluster_names = yearly_deforestation_df["cluster_name"].unique()
len(remaining_cluster_names)

1379

### Create Train and Validation Datasets
Load the dataframe into a TimeSeriesDataSet and split at the prediction horizon (the final 3 years) to form the training and validation sets.

In [27]:
# Window sizes, in time steps (time_idx is years_from_start, so one step = one year)
max_prediction_length = 3  # forecast horizon
max_encoder_length = 3     # longest history window given to the encoder
min_encoder_length = 2     # shortest history window accepted

# Training only sees rows up to this time index; the last
# `max_prediction_length` steps are held back for validation.
last_time_idx = yearly_deforestation_df["years_from_start"].max()
training_cutoff = last_time_idx - max_prediction_length

# Column roles for the dataset
static_categorical_cols = ["cluster_name", "state"]
static_real_cols = [
    "state_area",
    "state_hydro_area",
    "state_no_forest_area",
    "cluster_area",
    "cluster_no_forest_area",
    "cluster_hydro_area",
]

# The target is normalized separately within each cluster
per_cluster_normalizer = GroupNormalizer(
    groups=["cluster_name"], transformation="softplus"
)

# Training dataset: rows at or before the cutoff
is_training_row = yearly_deforestation_df["years_from_start"] <= training_cutoff
training = TimeSeriesDataSet(
    yearly_deforestation_df[is_training_row],
    time_idx="years_from_start",
    target="deforest_area_km",
    group_ids=["cluster_name"],
    min_encoder_length=min_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=static_categorical_cols,
    static_reals=static_real_cols,
    time_varying_known_reals=["year"],
    time_varying_unknown_reals=["deforest_area_km"],
    target_normalizer=per_cluster_normalizer,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

# Validation dataset: same configuration as `training`, built from the full frame
# in prediction mode with randomization switched off.
validation = TimeSeriesDataSet.from_dataset(
    training,
    yearly_deforestation_df,
    predict=True,
    stop_randomization=True,
)

# Dataloaders for the model; validation uses a 10x larger batch than training.
# num_workers=0 keeps batch loading in the main process.
batch_size = 64
val_batch_size = batch_size * 10

train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=val_batch_size, num_workers=0)




## Experiment 1: Train TFT Model to predict new deforestation area by year for each cluster



In [None]:
# Seed the python / numpy / torch RNGs so weight initialisation and batch shuffling
# are repeatable from one run of this cell to the next.
pl.seed_everything(42, workers=True)

# Stop training once val_loss has failed to improve by at least 1e-4 for 5 checks in a row.
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=5, verbose=True, mode="min")
# Record the learning rate alongside the other logged metrics.
lr_logger = LearningRateMonitor()
# TensorBoard event files are written under ./lightning_logs
logger = TensorBoardLogger("lightning_logs")

trainer = pl.Trainer(
    max_epochs=45,
    # "auto" selects a GPU when the runtime has one and falls back to CPU otherwise,
    # instead of raising on a CPU-only runtime as a hard-coded 'gpu' does.
    accelerator="auto",
    devices=1,
    enable_model_summary=True,
    gradient_clip_val=0.1,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger)

# Build the Temporal Fusion Transformer from the training dataset's configuration.
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.001,
    hidden_size=160,
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=160,
    output_size=7,  # there are 7 quantiles by default: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]
    loss=QuantileLoss(),
    log_interval=10,
    reduce_on_plateau_patience=4)

# Fit on the training loader, evaluating on the validation loader (source of val_loss).
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader)
