In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, datetime
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import geopandas as gpd
import geojson
import shapely
import shapely.geometry
from sklearn.metrics import recall_score

In [3]:
import matplotlib.pyplot as plt
import contextily

In [4]:
from src import *

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
data = read_data('data/train_dataset_train_2.csv')
data_test = read_data('data/test_dataset_test_2.csv')
data_ts, data_id = process_data(data)
data_ts_test, data_id_test = process_data(data_test)

In [7]:
data_ts_modis = pd.read_csv('data/train_dataset_modis.csv').fillna(0.0)
data_ts_modis_test = pd.read_csv('data/test_dataset_modis.csv').fillna(0.0)
data_ts_modis_2020 = pd.read_csv('data/train_dataset_modis_2020.csv').fillna(0.0)
data_ts_modis_test_2020 = pd.read_csv('data/test_dataset_modis_2020.csv').fillna(0.0)

In [8]:
data_ts_landsat = pd.read_csv('data/train_dataset_landsat.csv').fillna(0)
data_ts_landsat_test = pd.read_csv('data/test_dataset_landsat.csv').fillna(0)
data_ts_landsat_2020 = pd.read_csv('data/train_dataset_landsat_2020.csv').fillna(0)
data_ts_landsat_test_2020 = pd.read_csv('data/test_dataset_landsat_2020.csv').fillna(0)

In [9]:
data_ts_sentinel = pd.read_csv('data/train_dataset_sentinel.csv').fillna(0)
data_ts_sentinel_test = pd.read_csv('data/test_dataset_sentinel.csv').fillna(0)
data_ts_sentinel_2020 = pd.read_csv('data/train_dataset_sentinel_2020.csv').fillna(0)
data_ts_sentinel_test_2020 = pd.read_csv('data/test_dataset_sentinel_2020.csv').fillna(0)

In [27]:
SEED = 54789

dataset_orig = get_dataset(data_ts, data_id, data_ts_test, data_id_test, random_state=SEED)

dataset_modis = get_dataset(data_ts_modis, data_id, data_ts_modis_test, data_id_test, random_state=SEED)
dataset_modis_2020 = get_dataset(data_ts_modis_2020, data_id, data_ts_modis_test_2020, data_id_test, random_state=SEED)

dataset_landsat = get_dataset(data_ts_landsat, data_id, data_ts_landsat_test, data_id_test, random_state=SEED)
dataset_landsat_2020 = get_dataset(data_ts_landsat_2020, data_id, data_ts_landsat_test_2020, data_id_test, random_state=SEED)

dataset_sentinel = get_dataset(data_ts_sentinel, data_id, data_ts_sentinel_test, data_id_test, random_state=SEED)
dataset_sentinel_2020 = get_dataset(data_ts_sentinel_2020, data_id, data_ts_sentinel_test_2020, data_id_test, random_state=SEED)

# sklearn

In [245]:
SEED = 1

In [246]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [247]:
rf = RandomForestClassifier(n_estimators=200, random_state=SEED, verbose=False)
rf.fit(**dataset_orig['train'])
preds = rf.predict(dataset_orig['val']['X'])
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.9603060360196845

In [248]:
rf_ = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf_.fit(**dataset_modis['train'])
preds = rf_.predict(dataset_modis['val']['X'])
recall_score(preds, dataset_modis['val']['y'], average='macro')

0.5986918472170635

In [249]:
rf_ls = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf_ls.fit(**dataset_landsat['train'])
preds = rf_ls.predict(dataset_landsat['val']['X'])
recall_score(preds, dataset_landsat['val']['y'], average='macro')

0.6953205149240029

In [250]:
preds = (
    0.8*rf.predict_proba(dataset_orig['val']['X']) + 
    0.1*rf_.predict_proba(dataset_modis['val']['X']) +
    0.1*rf_ls.predict_proba(dataset_landsat['val']['X'])
).argmax(axis=1)

In [251]:
recall_score(preds, dataset_modis['val']['y'], average='macro')

0.9645251067787923

In [255]:
rf = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf.fit(**dataset_concat['train'])
preds = rf.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

0.9638116184726355

In [256]:
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(**dataset_concat['train'])
preds = gb.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

0.9633086367457507

In [None]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(
    iterations=500,
    learning_rate=1,
    # depth=10,
    random_seed=1,
    verbose=False
)
model.fit(**dataset_concat['train'])
preds = model.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

In [186]:
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.6443026399804246

In [15]:
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

In [16]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=10)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.771540088080925

In [17]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=20)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.792783539714341

In [23]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=100)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.7834449463857383

# neural

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import NeptuneLogger
from src.printer import PrintMetricsCallback

In [29]:
with open('.api_key') as f:
    API_KEY = f.read()

In [30]:
from src import StackRNN, StackTransformer, StackInception, StackDataset, StackDataModule

In [31]:
train_dataset = StackDataset(
    dataset_orig['train']['X'], 
    dataset_modis['train']['X'], 
    dataset_modis_2020['train']['X'], 
    dataset_landsat['train']['X'], 
    dataset_landsat_2020['train']['X'], 
    dataset_sentinel['train']['X'], 
    dataset_sentinel_2020['train']['X'], 
    y=dataset_orig['train']['y'])

In [32]:
val_dataset = StackDataset(
    dataset_orig['val']['X'], 
    dataset_modis['val']['X'], 
    dataset_modis_2020['val']['X'], 
    dataset_landsat['val']['X'],
    dataset_landsat_2020['val']['X'],
    dataset_sentinel['val']['X'], 
    dataset_sentinel_2020['train']['X'], 
    y=dataset_orig['val']['y'])

In [33]:
test_dataset = StackDataset(
    dataset_orig['test']['X'], 
    dataset_modis['test']['X'], 
    dataset_modis_2020['test']['X'], 
    dataset_landsat['test']['X'],
    dataset_landsat_2020['test']['X'],
    dataset_sentinel['test']['X'], 
    dataset_sentinel_2020['test']['X'], 
    y=dataset_orig['test']['y'])

In [34]:
pred_dataset = StackDataset(
    dataset_orig['pred']['X'], 
    dataset_modis['pred']['X'], 
    dataset_modis_2020['pred']['X'], 
    dataset_landsat['pred']['X'], 
    dataset_landsat_2020['pred']['X'], 
    dataset_sentinel['pred']['X'],
    dataset_sentinel_2020['pred']['X'])

### RNN

In [105]:
best_checkpointer = ModelCheckpoint(save_top_k=1, save_last=True, monitor='valid_loss', filename='best')
neptune_logger = NeptuneLogger(
    api_key=API_KEY,
    project='fant0md/aihacks-2022-fields')
lr_monitor = LearningRateMonitor(logging_interval ='epoch')

In [113]:
trainer = pl.Trainer(
    log_every_n_steps=1, 
    logger=neptune_logger, 
    callbacks=[best_checkpointer, lr_monitor], 
    max_epochs=50, 
    accelerator='auto',
    devices=1)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [107]:
pl_model = StackRNN(hidden_size=64, layers=2, dropout=0.5, lr=0.01, wd=0, gamma=0.9)
pl_data = StackDataModule(train_dataset, val_dataset, test_dataset, batch_size=64)

In [108]:
trainer.fit(pl_model, pl_data)


  | Name         | Type             | Params
--------------------------------------------------
0 | rnn1         | GRU              | 100 K 
1 | rnn2         | GRU              | 100 K 
2 | rnn3         | GRU              | 100 K 
3 | fc           | Sequential       | 67.6 K
4 | act          | ReLU             | 0     
5 | criterion    | CrossEntropyLoss | 0     
6 | train_recall | Recall           | 0     
7 | valid_recall | Recall           | 0     
--------------------------------------------------
368 K     Trainable params
0         Non-trainable params
368 K     Total params
1.473     Total estimated model params size (MB)


https://app.neptune.ai/fant0md/aihacks-2022-fields/e/AIH-50
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Transformer

In [71]:
best_checkpointer = ModelCheckpoint(save_top_k=1, save_last=True, monitor='valid_recall', mode='max', filename='best')
neptune_logger = NeptuneLogger(
    api_key=API_KEY,
    project='fant0md/aihacks-2022-fields')
lr_monitor = LearningRateMonitor(logging_interval='epoch')
printer = PrintMetricsCallback(metrics=['valid_recall'])

trainer = pl.Trainer(
    log_every_n_steps=1, 
    logger=neptune_logger, 
    callbacks=[best_checkpointer, lr_monitor, printer], 
    max_epochs=50, 
    accelerator='auto',
    devices=1)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [72]:
pl_model = StackTransformer(
    seq_lens=[70, 139, 139, 18, 17, 55, 55],
    d_model=64, 
    nhead=2, 
    dim_feedforward=64, 
    d_head=64, 
    num_layers=2, 
    num_head_layers=4, 
    dropout=0.2, 
    fc_dropout=0,
    activation='relu', 
    reduction='avg', 
    lr=0.0001, wd=0, 
    # T_0=5, T_mult=1,
    gamma=0.99
)
pl_data = StackDataModule(train_dataset, val_dataset, test_dataset, pred_dataset, batch_size=64)

In [None]:
trainer.fit(pl_model, pl_data)


  | Name         | Type             | Params
--------------------------------------------------
0 | models       | ModuleList       | 2.4 M 
1 | pool         | AvgReduce        | 0     
2 | act          | ReLU             | 0     
3 | head         | MLP              | 17.1 K
4 | criterion    | CrossEntropyLoss | 0     
5 | train_recall | Recall           | 0     
6 | valid_recall | Recall           | 0     
--------------------------------------------------
2.4 M     Trainable params
0         Non-trainable params
2.4 M     Total params
9.675     Total estimated model params size (MB)


https://app.neptune.ai/fant0md/aihacks-2022-fields/e/AIH-140
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Sanity Checking: 0it [00:00, ?it/s]

epoch: -1
valid_recall: 0.1640625
--------------------------------------------------------------------------------


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

epoch: 0
valid_recall: 0.2256728708744049
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 1
valid_recall: 0.5383023023605347
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 2
valid_recall: 0.7060041427612305
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 3
valid_recall: 0.7660455703735352
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 4
valid_recall: 0.8260869383811951
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 5
valid_recall: 0.8840579986572266
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 6
valid_recall: 0.9047619104385376
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 7
valid_recall: 0.9130434989929199
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 8
valid_recall: 0.9316770434379578
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 9
valid_recall: 0.9420289993286133
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 10
valid_recall: 0.9440993666648865
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 11
valid_recall: 0.9440993666648865
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 12
valid_recall: 0.954451322555542
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 13
valid_recall: 0.9523809552192688
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 14
valid_recall: 0.9585921168327332
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 15
valid_recall: 0.954451322555542
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 16
valid_recall: 0.9627329111099243
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 17
valid_recall: 0.95652174949646
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 18
valid_recall: 0.95652174949646
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 19
valid_recall: 0.9523809552192688
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 20
valid_recall: 0.9606625437736511
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 21
valid_recall: 0.95652174949646
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 22
valid_recall: 0.9585921168327332
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 23
valid_recall: 0.9523809552192688
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 24
valid_recall: 0.9606625437736511
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 25
valid_recall: 0.9606625437736511
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 26
valid_recall: 0.9606625437736511
--------------------------------------------------------------------------------


In [67]:
preds = torch.cat(trainer.predict(pl_model, pl_data.test_dataloader())).argmax(1)
print(recall_score(preds, dataset_orig['test']['y'], average='macro'))

Predicting: 61it [00:00, ?it/s]

0.9739808798694793


### inception

In [95]:
best_checkpointer = ModelCheckpoint(save_top_k=1, save_last=True, monitor='valid_recall', mode='max', filename='best')
neptune_logger = NeptuneLogger(
    api_key=API_KEY,
    project='fant0md/aihacks-2022-fields')
lr_monitor = LearningRateMonitor(logging_interval='epoch')
printer = PrintMetricsCallback(metrics=['valid_recall'])

trainer = pl.Trainer(
    log_every_n_steps=1, 
    logger=neptune_logger, 
    callbacks=[best_checkpointer, lr_monitor, printer], 
    max_epochs=25, 
    accelerator='auto',
    devices=1)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [96]:
pl_model = StackInception(
    d_model=32, 
    d_head=128, 
    num_head_layers=1,
    dropout=0, 
    activation='relu', 
    lr=0.0001, wd=0, gamma=0.99)
pl_data = StackDataModule(train_dataset, val_dataset, test_dataset, batch_size=64)

In [97]:
trainer.fit(pl_model, pl_data)


  | Name         | Type             | Params
--------------------------------------------------
0 | models       | ModuleList       | 1.6 M 
1 | act          | ReLU             | 0     
2 | head         | MLP              | 17.4 K
3 | criterion    | CrossEntropyLoss | 0     
4 | train_recall | Recall           | 0     
5 | valid_recall | Recall           | 0     
--------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.283     Total estimated model params size (MB)


https://app.neptune.ai/fant0md/aihacks-2022-fields/e/AIH-92
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Sanity Checking: 0it [00:00, ?it/s]

epoch: -1
valid_recall: 0.1484375
--------------------------------------------------------------------------------


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

epoch: 0
valid_recall: 0.6459627151489258
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 1
valid_recall: 0.6045548915863037
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 2
valid_recall: 0.7867494821548462
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 3
valid_recall: 0.4875776469707489
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 4
valid_recall: 0.5879917144775391
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 5
valid_recall: 0.784679114818573
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 6
valid_recall: 0.8840579986572266
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 7
valid_recall: 0.8995859026908875
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 8
valid_recall: 0.8312629461288452
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 9
valid_recall: 0.8840579986572266
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 10
valid_recall: 0.8157349824905396
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 11
valid_recall: 0.9016563296318054
--------------------------------------------------------------------------------


Validation: 0it [00:00, ?it/s]

epoch: 12
valid_recall: 0.8726708292961121
--------------------------------------------------------------------------------


## submission

In [28]:
# TODO
# initizlize paramters
# run maaaany epochs with dropout
# flatten instead of pooling
# cosine lr scheduler

# try early stopping
# try to include geographical information
# im not gonna lose!

In [50]:
pl_model = StackTransformer.load_from_checkpoint('.neptune/None/version_None/checkpoints/best-v7.ckpt')

In [68]:
preds = trainer.predict(pl_model, pl_data)

Predicting: 61it [00:00, ?it/s]

In [69]:
submission = pd.read_csv('sample_solution.csv')
submission['crop'] = torch.cat(preds).argmax(1)

In [70]:
submission.to_csv('submission_v97654.csv', index=False)

In [None]:
# valid: 0.982401
# lb: 0.980064

In [None]:
# valid: 0.983436
# lb: 0.978759