In [1]:
import os, datetime
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import geopandas as gpd
import geojson
import shapely
import shapely.geometry

import xarray as xr
import rioxarray as rxr

In [2]:
import matplotlib.pyplot as plt
import contextily

In [3]:
from src import *

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
data = read_data('data/train_dataset_train_2.csv')
data_test = read_data('data/test_dataset_test_2.csv')
data_ts, data_id = process_data(data)
data_ts_test, data_id_test = process_data(data_test)

In [6]:
def add_diff(data_ts):
    diff = data_ts.T.diff()[1:].T
    diff.columns = [f'{x}_' for x in diff.columns]
    return pd.concat((data_ts, diff), axis=1)

In [7]:
# data_ts = add_diff(data_ts)
# data_ts_test = add_diff(data_ts_test)

In [8]:
# dates = pd.date_range(start=None, end='2021/08/27', periods=135)
# data_ts_modis = pd.DataFrame(np.load('data_modis.npz')['arr_0'].squeeze(-1), columns=dates)
# data_ts_modis_test = pd.DataFrame(np.load('data_modis_test.npz')['arr_0'].squeeze(-1), columns=dates)

In [77]:
data_ts_modis = pd.read_csv('data/train_dataset_modis.csv').fillna(0.0)
data_ts_modis_test = pd.read_csv('data/test_dataset_modis.csv').fillna(0.0)

In [78]:
# data_ts_modis = add_diff(data_ts_modis)
# data_ts_modis_test = add_diff(data_ts_modis_test)

In [79]:
data_ts_landsat = pd.read_csv('data/train_dataset_landsat.csv').fillna(0)
data_ts_landsat_test = pd.read_csv('data/test_dataset_landsat.csv').fillna(0)

In [80]:
# data_ts_landsat = add_diff(data_ts_landsat)
# data_ts_landsat_test = add_diff(data_ts_landsat_test)

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [82]:
def get_dataset(data_ts, data_id, data_test_ts, data_test_id):
    data_ts_train, data_ts_val, data_id_train, data_id_val = train_test_split(data_ts, data_id, test_size=0.1, random_state=1)
    # data_train = pd.concat((data_ts_train, data_id_train[['area', 'lat', 'lon']]), axis=1)
    # data_val = pd.concat((data_ts_val, data_id_val[['area', 'lat', 'lon']]), axis=1)
    # data_test = pd.concat((data_ts_test, data_id_test[['area', 'lat', 'lon']]), axis=1)
    data_train = data_ts_train
    data_val = data_ts_val
    data_test = data_ts_test

    dataset = dict()
    dataset['full'] = {'X': data_ts, 'y': data_id['crop']}
    dataset['train'] = {'X': data_train, 'y': data_id_train['crop']}
    dataset['val'] = {'X': data_val, 'y': data_id_val['crop']}
    dataset['test'] = {'X': data_test}
    return dataset

In [83]:
dataset_orig = get_dataset(data_ts, data_id, data_ts_test, data_id_test)
dataset_modis = get_dataset(data_ts_modis, data_id, data_ts_modis_test, data_id_test)
dataset_landsat = get_dataset(data_ts_landsat, data_id, data_ts_landsat_test, data_id_test)
dataset_concat = get_dataset(
    pd.concat((data_ts, data_ts_modis, data_ts_landsat), axis=1), data_id, 
    pd.concat((data_ts_test, data_ts_modis_test, data_ts_landsat_test), axis=1), data_id_test)

# models

In [245]:
SEED = 1

In [246]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [247]:
rf = RandomForestClassifier(n_estimators=200, random_state=SEED, verbose=False)
rf.fit(**dataset_orig['train'])
preds = rf.predict(dataset_orig['val']['X'])
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.9603060360196845

In [248]:
rf_ = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf_.fit(**dataset_modis['train'])
preds = rf_.predict(dataset_modis['val']['X'])
recall_score(preds, dataset_modis['val']['y'], average='macro')

0.5986918472170635

In [249]:
rf_ls = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf_ls.fit(**dataset_landsat['train'])
preds = rf_ls.predict(dataset_landsat['val']['X'])
recall_score(preds, dataset_landsat['val']['y'], average='macro')

0.6953205149240029

In [250]:
preds = (
    0.8*rf.predict_proba(dataset_orig['val']['X']) + 
    0.1*rf_.predict_proba(dataset_modis['val']['X']) +
    0.1*rf_ls.predict_proba(dataset_landsat['val']['X'])
).argmax(axis=1)

In [251]:
recall_score(preds, dataset_modis['val']['y'], average='macro')

0.9645251067787923

In [255]:
rf = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf.fit(**dataset_concat['train'])
preds = rf.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

0.9638116184726355

In [256]:
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(**dataset_concat['train'])
preds = gb.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

0.9633086367457507

In [None]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(
    iterations=500,
    learning_rate=1,
    # depth=10,
    random_seed=1,
    verbose=False
)
model.fit(**dataset_concat['train'])
preds = model.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

In [186]:
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.6443026399804246

In [15]:
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

In [16]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=10)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.771540088080925

In [17]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=20)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.792783539714341

In [23]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=100)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.7834449463857383

# neural

In [84]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split

In [85]:
class StackDataset(Dataset):
    def __init__(self, *args, y=None):
        super().__init__()
        self.dfs = args
        self.y = y
        
    def __len__(self):
        return self.dfs[0].shape[0]
    
    def __getitem__(self, idx):
        if self.y is not None:
            return [df.iloc[idx,:].values for df in self.dfs], self.y[idx]
        else:
            return [df.iloc[idx,:].values for df in self.dfs]

In [86]:
import pytorch_lightning as pl

In [87]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)

class StackDataModule(pl.LightningDataModule):
    def __init__(
        self, train_dataset, val_dataset, test_dataset=None, batch_size=64, num_workers=0):
    
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.train_dataset = train_dataset 
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.g = torch.Generator()
        self.g.manual_seed(1)
        
    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset, shuffle=True,
                          batch_size=self.batch_size, num_workers=self.num_workers,
                          worker_init_fn=seed_worker, generator=self.g)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.val_dataset, shuffle=False,
                          batch_size=self.batch_size, num_workers=self.num_workers,
                          worker_init_fn=seed_worker, generator=self.g)
    
    def test_dataloader(self) -> DataLoader:
        return DataLoader(self.test_dataset, shuffle=False,
                          batch_size=self.batch_size, num_workers=self.num_workers,
                          worker_init_fn=seed_worker, generator=self.g)

In [96]:
import torchmetrics

class StackRNN(pl.LightningModule):
    def __init__(self, hidden_size, layers=1, bidirectional=True, dropout=0, **hparams):
        super().__init__()
        self.save_hyperparameters()

        self.hidden_size = hidden_size
        self.rnn1 = nn.GRU(
            input_size=1, hidden_size=hidden_size, num_layers=layers, bidirectional=bidirectional,
            batch_first=True, dropout=dropout)
        self.rnn2 = nn.GRU(
            input_size=1, hidden_size=hidden_size, num_layers=layers, bidirectional=bidirectional,
            batch_first=True, dropout=dropout)
        self.rnn3 = nn.GRU(
            input_size=1, hidden_size=hidden_size, num_layers=layers, bidirectional=bidirectional,
            batch_first=True, dropout=dropout)
        # self.fc = nn.Linear(hidden_size*(bidirectional+1), out_features=7)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size*(bidirectional+1)*layers, hidden_size*(bidirectional+1)*layers),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hidden_size*(bidirectional+1)*layers, out_features=7))
        self.act = nn.ReLU()
        
        self.criterion = nn.CrossEntropyLoss()
        self.train_recall = torchmetrics.Recall()
        self.valid_recall = torchmetrics.Recall()
        
    def forward(self, x1, x2, x3):
        bs = x1.shape[0]
        _, h1 = self.rnn1(x1)
        _, h2 = self.rnn2(x2)
        _, h3 = self.rnn3(x3)

        h1 = h1.permute(1,0,2).reshape(bs, -1)
        h2 = h2.permute(1,0,2).reshape(bs, -1)
        h3 = h3.permute(1,0,2).reshape(bs, -1)

        h = torch.stack((h1,h2,h3), axis=-1)
        h = torch.amax(h, axis=-1)
        # h = h1
        return self.fc(self.act(h))
    
    def _prepare_batch(self, batch, train=True):
        if train:
            (x1, x2, x3), y = batch
            x1 = x1.unsqueeze(-1).float()
            x2 = x2.unsqueeze(-1).float()
            x3 = x3.unsqueeze(-1).float()
            y = y.long()
            return (x1, x2, x3), y
        else:
            (x1, x2, x3) = batch
            x1 = x1.unsqueeze(-1).float()
            x2 = x2.unsqueeze(-1).float()
            x3 = x3.unsqueeze(-1).float()
            return (x1, x2, x3)

    def training_step(self, batch, batch_idx):
        (x1, x2, x3), y = self._prepare_batch(batch)
        output = self.forward(x1, x2, x3)
        loss = self.criterion(output, y)
        self.train_recall(output, y)
        self.log('train_loss', loss.item(), on_step=True, on_epoch=True)
        self.log('train_recall', self.train_recall, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        (x1, x2, x3), y = self._prepare_batch(batch)
        output = self.forward(x1, x2, x3)
        loss = self.criterion(output, y)
        self.valid_recall(output, y)
        self.log('valid_loss', loss.item(), on_step=True, on_epoch=True)
        self.log('valid_recall', self.valid_recall, on_step=True, on_epoch=True)
        return loss

    def predict_step(self, batch, batch_idx):
        (x1, x2, x3) = self._prepare_batch(batch, train=False)
        output = self.forward(x1, x2, x3)
        return output

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.wd)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, self.hparams.gamma)
        return [optimizer], [scheduler]

In [97]:
token = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIwNWJjZTk5MS01OTA3LTQyOTYtYWEzYy0zMjZkNjAxNjI1ZDIifQ=="

In [98]:
from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger, NeptuneLogger

In [104]:
g = torch.Generator()
g.manual_seed(1)
        
dataset = StackDataset(
    dataset_orig['full']['X'], dataset_modis['full']['X'], dataset_landsat['full']['X'], 
    y=dataset_orig['full']['y'])
train_dataset, val_dataset = random_split(dataset, lengths=[int(len(dataset)*0.8), int(len(dataset)*0.2)], generator=g)
test_dataset = StackDataset(dataset_orig['test']['X'], dataset_modis['test']['X'], dataset_landsat['test']['X'], y=None)

In [105]:
best_checkpointer = ModelCheckpoint(save_top_k=1, save_last=True, monitor='valid_loss', filename='best')
csv_logger = CSVLogger(save_dir='')
neptune_logger = NeptuneLogger(
    api_key=token,
    project='fant0md/aihacks-2022-fields')
lr_monitor = LearningRateMonitor(logging_interval ='epoch')

In [113]:
trainer = pl.Trainer(
    log_every_n_steps=1, 
    logger=neptune_logger, 
    callbacks=[best_checkpointer, lr_monitor], 
    max_epochs=50, 
    accelerator='auto',
    devices=1)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [107]:
pl_model = StackRNN(hidden_size=64, layers=2, dropout=0.5, lr=0.01, wd=0, gamma=0.9)
pl_data = StackDataModule(train_dataset, val_dataset, test_dataset, batch_size=64)

In [108]:
trainer.fit(pl_model, pl_data)


  | Name         | Type             | Params
--------------------------------------------------
0 | rnn1         | GRU              | 100 K 
1 | rnn2         | GRU              | 100 K 
2 | rnn3         | GRU              | 100 K 
3 | fc           | Sequential       | 67.6 K
4 | act          | ReLU             | 0     
5 | criterion    | CrossEntropyLoss | 0     
6 | train_recall | Recall           | 0     
7 | valid_recall | Recall           | 0     
--------------------------------------------------
368 K     Trainable params
0         Non-trainable params
368 K     Total params
1.473     Total estimated model params size (MB)


https://app.neptune.ai/fant0md/aihacks-2022-fields/e/AIH-50
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
from tsai.all import *

In [220]:
import torchmetrics

class StackTSAITransformer(pl.LightningModule):
    def __init__(self, d_model=64, n_head=1, d_ffn=128, d_head=128, dropout=0.1, activation="relu", n_layers=1, **hparams):
        super().__init__()
        self.save_hyperparameters()
        self.model1 = TST(
            c_in=1, c_out=d_head, seq_len=70,
            d_model=d_model, n_heads=n_head, d_ff=d_ffn, dropout=dropout, act=activation, n_layers=n_layers)
        self.model2 = TST(
            c_in=1, c_out=d_head, seq_len=139,
            d_model=d_model, n_heads=n_head, d_ff=d_ffn, dropout=dropout, act=activation, n_layers=n_layers)
        self.model3 = TST(
            c_in=1, c_out=d_head, seq_len=18,
            d_model=d_model, n_heads=n_head, d_ff=d_ffn, dropout=dropout, act=activation, n_layers=n_layers)
        
        self.fc = nn.Sequential(
            nn.ReLU(),
            nn.Linear(d_head, d_head),
            # nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(d_head, out_features=7))
        
        self.criterion = nn.CrossEntropyLoss()
        self.train_recall = torchmetrics.Recall()
        self.valid_recall = torchmetrics.Recall()
        
    def forward(self, x1, x2, x3):
        h1 = self.model1.forward(x1)
        h2 = self.model2.forward(x2)
        h3 = self.model3.forward(x3)

        h = torch.stack((h1,h2,h3), axis=-1)
        h = torch.amax(h, axis=-1)
        # h = h1
        return self.fc(h)
    
    def _prepare_batch(self, batch, train=True):
        if train:
            (x1, x2, x3), y = batch
            x1 = x1.unsqueeze(1).float()
            x2 = x2.unsqueeze(1).float()
            x3 = x3.unsqueeze(1).float()
            y = y.long()
            return (x1, x2, x3), y
        else:
            (x1, x2, x3) = batch
            x1 = x1.unsqueeze(1).float()
            x2 = x2.unsqueeze(1).float()
            x3 = x3.unsqueeze(1).float()
            return (x1, x2, x3)
    
    def training_step(self, batch, batch_idx):
        (x1, x2, x3), y = self._prepare_batch(batch)
        output = self.forward(x1, x2, x3)
        loss = self.criterion(output, y)
        self.train_recall(torch.tensor(output), y)
        self.log('train_loss', loss.item(), on_step=True, on_epoch=True)
        self.log('train_recall', self.train_recall, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        (x1, x2, x3), y = self._prepare_batch(batch)
        output = self.forward(x1, x2, x3)
        loss = self.criterion(output, y)
        self.valid_recall(torch.tensor(output), y)
        self.log('valid_loss', loss.item(), on_step=True, on_epoch=True)
        self.log('valid_recall', self.valid_recall, on_step=True, on_epoch=True)
        return loss

    def predict_step(self, batch, batch_idx):
        (x1, x2, x3) = self._prepare_batch(batch, train=False)
        output = self.forward(x1, x2, x3)
        return output

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.wd)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, self.hparams.gamma)
        return [optimizer], [scheduler]

In [221]:
neptune_logger = NeptuneLogger(
    api_key=token,
    project='fant0md/aihacks-2022-fields')
trainer = pl.Trainer(
    log_every_n_steps=1, 
    logger=neptune_logger, 
    callbacks=[lr_monitor], 
    max_epochs=50, 
    accelerator='auto',
    devices=1)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [222]:
pl_model = StackTSAITransformer(
    d_model=64, n_head=1, d_ffn=64, d_head=128, dropout=0, activation="relu", n_layers=2,
    lr=0.0001, wd=0, gamma=0.99)
pl_data = StackDataModule(train_dataset, val_dataset, test_dataset, batch_size=64)

In [223]:
trainer.fit(pl_model, pl_data)


  | Name         | Type             | Params
--------------------------------------------------
0 | model1       | TST              | 628 K 
1 | model2       | TST              | 1.2 M 
2 | model3       | TST              | 198 K 
3 | fc           | Sequential       | 17.4 K
4 | criterion    | CrossEntropyLoss | 0     
5 | train_recall | Recall           | 0     
6 | valid_recall | Recall           | 0     
--------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
8.168     Total estimated model params size (MB)


https://app.neptune.ai/fant0md/aihacks-2022-fields/e/AIH-66
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
from torchvision.ops import MLP

In [None]:
MLP