In [1]:
import os, datetime
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import geopandas as gpd
import geojson
import shapely
import shapely.geometry

import xarray as xr
import rioxarray as rxr

In [2]:
import matplotlib.pyplot as plt
import contextily

In [3]:
from src import *

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
data = read_data('data/train_dataset_train_2.csv')
data_test = read_data('data/test_dataset_test_2.csv')
data_ts, data_id = process_data(data)
data_ts_test, data_id_test = process_data(data_test)

In [6]:
def add_diff(data_ts):
    diff = data_ts.T.diff()[1:].T
    diff.columns = [f'{x}_' for x in diff.columns]
    return pd.concat((data_ts, diff), axis=1)

In [7]:
# data_ts = add_diff(data_ts)
# data_ts_test = add_diff(data_ts_test)

In [8]:
# dates = pd.date_range(start=None, end='2021/08/27', periods=135)
# data_ts_modis = pd.DataFrame(np.load('data_modis.npz')['arr_0'].squeeze(-1), columns=dates)
# data_ts_modis_test = pd.DataFrame(np.load('data_modis_test.npz')['arr_0'].squeeze(-1), columns=dates)

In [9]:
data_ts_modis = pd.read_csv('data/train_dataset_modis.csv').fillna(0.0)
data_ts_modis_test = pd.read_csv('data/test_dataset_modis.csv').fillna(0.0)

In [10]:
# data_ts_modis = add_diff(data_ts_modis)
# data_ts_modis_test = add_diff(data_ts_modis_test)

In [11]:
data_ts_landsat = pd.read_csv('data/train_dataset_landsat.csv').fillna(0)
data_ts_landsat_test = pd.read_csv('data/test_dataset_landsat.csv').fillna(0)

In [12]:
# data_ts_landsat = add_diff(data_ts_landsat)
# data_ts_landsat_test = add_diff(data_ts_landsat_test)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [44]:
def get_dataset(data_ts, data_id, data_test_ts, data_test_id):
    data_ts_train, data_ts_val, data_id_train, data_id_val = train_test_split(data_ts, data_id, test_size=0.1, random_state=1)
    # data_train = pd.concat((data_ts_train, data_id_train[['area', 'lat', 'lon']]), axis=1)
    # data_val = pd.concat((data_ts_val, data_id_val[['area', 'lat', 'lon']]), axis=1)
    # data_test = pd.concat((data_ts_test, data_id_test[['area', 'lat', 'lon']]), axis=1)
    data_train = data_ts_train
    data_val = data_ts_val
    data_test = data_test_ts

    dataset = dict()
    dataset['full'] = {'X': data_ts, 'y': data_id['crop']}
    dataset['train'] = {'X': data_train, 'y': data_id_train['crop']}
    dataset['val'] = {'X': data_val, 'y': data_id_val['crop']}
    dataset['test'] = {'X': data_test}
    return dataset

In [45]:
dataset_orig = get_dataset(data_ts, data_id, data_ts_test, data_id_test)
dataset_modis = get_dataset(data_ts_modis, data_id, data_ts_modis_test, data_id_test)
dataset_landsat = get_dataset(data_ts_landsat, data_id, data_ts_landsat_test, data_id_test)
dataset_concat = get_dataset(
    pd.concat((data_ts, data_ts_modis, data_ts_landsat), axis=1), data_id, 
    pd.concat((data_ts_test, data_ts_modis_test, data_ts_landsat_test), axis=1), data_id_test)

# sklearn

In [245]:
SEED = 1

In [246]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [247]:
rf = RandomForestClassifier(n_estimators=200, random_state=SEED, verbose=False)
rf.fit(**dataset_orig['train'])
preds = rf.predict(dataset_orig['val']['X'])
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.9603060360196845

In [248]:
rf_ = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf_.fit(**dataset_modis['train'])
preds = rf_.predict(dataset_modis['val']['X'])
recall_score(preds, dataset_modis['val']['y'], average='macro')

0.5986918472170635

In [249]:
rf_ls = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf_ls.fit(**dataset_landsat['train'])
preds = rf_ls.predict(dataset_landsat['val']['X'])
recall_score(preds, dataset_landsat['val']['y'], average='macro')

0.6953205149240029

In [250]:
preds = (
    0.8*rf.predict_proba(dataset_orig['val']['X']) + 
    0.1*rf_.predict_proba(dataset_modis['val']['X']) +
    0.1*rf_ls.predict_proba(dataset_landsat['val']['X'])
).argmax(axis=1)

In [251]:
recall_score(preds, dataset_modis['val']['y'], average='macro')

0.9645251067787923

In [255]:
rf = RandomForestClassifier(
    n_estimators=200, random_state=SEED, verbose=False,
    min_samples_leaf=1)
rf.fit(**dataset_concat['train'])
preds = rf.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

0.9638116184726355

In [256]:
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(**dataset_concat['train'])
preds = gb.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

0.9633086367457507

In [None]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(
    iterations=500,
    learning_rate=1,
    # depth=10,
    random_seed=1,
    verbose=False
)
model.fit(**dataset_concat['train'])
preds = model.predict(dataset_concat['val']['X'])
recall_score(preds, dataset_concat['val']['y'], average='macro')

In [186]:
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.6443026399804246

In [15]:
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

In [16]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=10)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.771540088080925

In [17]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=20)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.792783539714341

In [23]:
model = KNeighborsTimeSeriesClassifier(n_neighbors=100)
model.fit(dataset_orig['train']['X'].values, dataset_orig['train']['y'])
preds = model.predict(dataset_orig['val']['X'].values)
recall_score(preds, dataset_orig['val']['y'], average='macro')

0.7834449463857383

# neural

In [16]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import NeptuneLogger

In [46]:
with open('.api_key') as f:
    API_KEY = f.read()

In [47]:
from src import StackRNN, StackTransformer, StackDataset, StackDataModule

In [48]:
g = torch.Generator()
g.manual_seed(1)
        
dataset = StackDataset(
    dataset_orig['full']['X'], dataset_modis['full']['X'], dataset_landsat['full']['X'], 
    y=dataset_orig['full']['y'])
train_dataset, val_dataset = random_split(dataset, lengths=[int(len(dataset)*0.8), int(len(dataset)*0.2)], generator=g)
test_dataset = StackDataset(dataset_orig['test']['X'], dataset_modis['test']['X'], dataset_landsat['test']['X'], y=None)

### RNN

In [105]:
best_checkpointer = ModelCheckpoint(save_top_k=1, save_last=True, monitor='valid_loss', filename='best')
neptune_logger = NeptuneLogger(
    api_key=API_KEY,
    project='fant0md/aihacks-2022-fields')
lr_monitor = LearningRateMonitor(logging_interval ='epoch')

In [113]:
trainer = pl.Trainer(
    log_every_n_steps=1, 
    logger=neptune_logger, 
    callbacks=[best_checkpointer, lr_monitor], 
    max_epochs=50, 
    accelerator='auto',
    devices=1)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [107]:
pl_model = StackRNN(hidden_size=64, layers=2, dropout=0.5, lr=0.01, wd=0, gamma=0.9)
pl_data = StackDataModule(train_dataset, val_dataset, test_dataset, batch_size=64)

In [108]:
trainer.fit(pl_model, pl_data)


  | Name         | Type             | Params
--------------------------------------------------
0 | rnn1         | GRU              | 100 K 
1 | rnn2         | GRU              | 100 K 
2 | rnn3         | GRU              | 100 K 
3 | fc           | Sequential       | 67.6 K
4 | act          | ReLU             | 0     
5 | criterion    | CrossEntropyLoss | 0     
6 | train_recall | Recall           | 0     
7 | valid_recall | Recall           | 0     
--------------------------------------------------
368 K     Trainable params
0         Non-trainable params
368 K     Total params
1.473     Total estimated model params size (MB)


https://app.neptune.ai/fant0md/aihacks-2022-fields/e/AIH-50
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Transformer

In [21]:
best_checkpointer = ModelCheckpoint(save_top_k=1, save_last=True, monitor='valid_loss', filename='best')
neptune_logger = NeptuneLogger(
    api_key=API_KEY,
    project='fant0md/aihacks-2022-fields')
lr_monitor = LearningRateMonitor(logging_interval ='epoch')

trainer = pl.Trainer(
    log_every_n_steps=1, 
    logger=neptune_logger, 
    callbacks=[best_checkpointer, lr_monitor], 
    max_epochs=50, 
    accelerator='auto',
    devices=1)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
pl_model = StackTransformer(
    d_model=64, 
    nhead=1, 
    dim_feedforward=64, 
    d_head=128, 
    num_layers=2, 
    num_head_layers=1, 
    dropout=0, 
    activation="relu",
    lr=0.0001, wd=0, gamma=0.99)
pl_data = StackDataModule(train_dataset, val_dataset, test_dataset, batch_size=64)

In [23]:
trainer.fit(pl_model, pl_data)


  | Name         | Type             | Params
--------------------------------------------------
0 | model1       | TST              | 628 K 
1 | model2       | TST              | 1.2 M 
2 | model3       | TST              | 198 K 
3 | act          | ReLU             | 0     
4 | head         | MLP              | 17.4 K
5 | criterion    | CrossEntropyLoss | 0     
6 | train_recall | Recall           | 0     
7 | valid_recall | Recall           | 0     
--------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
8.168     Total estimated model params size (MB)


https://app.neptune.ai/fant0md/aihacks-2022-fields/e/AIH-67


Info (NVML): NVML Shared Library Not Found. GPU usage metrics may not be reported. For more information, see https://docs.neptune.ai/you-should-know/what-can-you-log-and-display#hardware-consumption


Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

## submission

In [64]:
preds = trainer.predict(pl_model, pl_data)

Predicting: 61it [00:00, ?it/s]

In [66]:
submission = pd.read_csv('sample_solution.csv')
submission['crop'] = torch.cat(preds).argmax(1)

In [68]:
submission.to_csv('submssion.csv', index=False)