# Run in Google Colab

In [None]:
!pip install tsai
!pip install geopandas
!pip install geojson
!pip install pytorch_lightning
!pip install neptune-client

Restart kernel after installation

In [None]:
!git clone https://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@github.com/yuasosnin/aihacks-2022-fields

# Imports, data and setup

In [None]:
%cd aihacks-2022-fields

In [None]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import geopandas as gpd

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import NeptuneLogger

from src import EnsembleVotingModel, StackKFoldDataModule, StackTransformer
from src.torch_utils import KFoldLoop, PrintMetricsCallback

In [None]:
# api_key for neptune.ai logger
with open('api_key') as f:
    API_KEY = f.read()

In [None]:
bad_id = 2932

In [None]:
data = {'ts': OrderedDict(), 'id': {}}
data['id'] = {
    'full': gpd.read_file('data/train_dataset_id.geojson').fillna(0).drop(bad_id).reset_index(drop=True), 
    'pred': gpd.read_file('data/test_dataset_id.geojson')}

In [None]:
for name in ['base', 'modis', 'modis_evi', 'landsat', 'landsat_evi', 'sentinel', 'sentinel_evi']:
    data['ts'][name] = {}
    data['ts'][name]['full'] = pd.read_csv(f'data/train_dataset_{name}.csv') \
        .fillna(0.0).drop(bad_id).reset_index(drop=True)
    data['ts'][name]['pred'] = pd.read_csv(f'data/test_dataset_{name}.csv') \
        .fillna(0.0).reset_index(drop=True)

In [None]:
def tensor_stack(*dfs):
    return torch.tensor(np.concatenate([df[:,None,:] for df in dfs], axis=1), dtype=torch.float32)

In [None]:
keys = ['modis', 'landsat', 'sentinel']
tensors_train = [tensor_stack(data['ts']['base']['full'].values)] + [
    tensor_stack(data['ts'][df]['full'].values, data['ts'][f'{df}_evi']['full'].values) for df in keys]
tensors_pred = [tensor_stack(data['ts']['base']['pred'].values)] + [
    tensor_stack(data['ts'][df]['pred'].values, data['ts'][f'{df}_evi']['pred'].values) for df in keys]

In [None]:
CONST = True

In [None]:
tensor_const_train = torch.tensor(data['id']['full'][['area', 'lat', 'lon', 'alt', 'soil_carbon']].values, dtype=torch.float32)
tensor_const_pred = torch.tensor(data['id']['pred'][['area', 'lat', 'lon', 'alt', 'soil_carbon']].values, dtype=torch.float32)

if CONST:
    tensors_train.append(tensor_const_train)
    tensors_pred.append(tensor_const_pred)

In [None]:
tensor_y = torch.tensor(data['id']['full']['crop'].values.astype(np.int), dtype=torch.long)
tensors_train.append(tensor_y)

In [None]:
train_dataset = TensorDataset(*tensors_train)
pred_dataset = TensorDataset(*tensors_pred)

In [None]:
if CONST:
    c_ins = [t.shape[1] for t in pred_dataset.tensors[:-1]]
    c_in_const = pred_dataset.tensors[-1].shape[1]
    seq_lens = [t.shape[2] for t in pred_dataset.tensors[:-1]]
    print(c_ins, seq_lens)
else:
    c_ins = [t.shape[1] for t in pred_dataset.tensors]
    seq_lens = [t.shape[2] for t in pred_dataset.tensors]
    print(c_ins, seq_lens)

# CV training

In [None]:
pl.seed_everything(5)
pl_model = StackTransformer(
    c_ins=c_ins,
    seq_lens=seq_lens,
    d_model=64,
    nhead=8,
    dim_feedforward=64,
    d_head=64,
    num_layers=4,
    num_head_layers=0,
    dropout=0.2,
    fc_dropout=0.5,
    activation=nn.GELU,
    const_features=CONST,
    c_in_const=c_in_const,
    num_const_leayers=1,
    lr=0.0001,
    wd=0.00001,
    gamma=0.99)

In [None]:
pl_data = StackKFoldDataModule(
    train_dataset=train_dataset, 
    pred_dataset=pred_dataset,
    const=CONST,
    batch_size=64,
    seed=1)

In [None]:
best_checkpointer = ModelCheckpoint(
    save_top_k=1, save_last=True, monitor='valid_recall', mode='max', filename='best')
neptune_logger = NeptuneLogger(
    api_key=API_KEY, project='fant0md/aihacks-2022-fields', log_model_checkpoints=False)
lr_monitor = LearningRateMonitor(logging_interval='epoch')
printer = PrintMetricsCallback(
    metrics=['valid_recall', 'train_recall', 'valid_loss', 'train_loss'])

trainer = pl.Trainer(
    log_every_n_steps=1,
    logger=neptune_logger,
    callbacks=[best_checkpointer, lr_monitor, printer],
    max_epochs=100,
    accelerator='auto',
    devices=1)

In [None]:
internal_fit_loop = trainer.fit_loop
trainer.fit_loop = KFoldLoop(
    ensemble_model=EnsembleVotingModel, num_folds=8, checkpoint_type='last')
trainer.fit_loop.connect(internal_fit_loop)

In [None]:
trainer.fit(pl_model, pl_data)

# Submission

In [None]:
ckpt_paths = trainer.fit_loop.checkpoint_paths
# ckpt_paths = [x.replace('last', 'best') for x in ckpt_paths]
infer_model = EnsembleVotingModel(StackTransformer, ckpt_paths, mode='mean')
trainer.test(infer_model, pl_data.test_dataloader())

In [None]:
preds = trainer.predict(infer_model, pl_data.predict_dataloader())

In [None]:
submission = pd.read_csv('sample_solution.csv')
submission['crop'] = torch.cat(preds).argmax(1)

In [None]:
version = 'v369'

In [None]:
submission.to_csv(f'submission_{version}.csv', index=False)
neptune_logger.experiment['submission'].upload(f'submission_{version}.csv')

In [None]:
dir = '/content/aihacks-2022-fields/.neptune/None/version_None/checkpoints'
for ckpt in os.listdir(dir):
    neptune_logger.experiment[f'training/model/checkpoints/{ckpt}'].upload(f'{dir}/{ckpt}')
neptune_logger.log_model_summary(model=infer_model.models[0], max_depth=2)