In [1]:
! pip install einops



In [22]:
import os
import torch
import tqdm
import numpy as np
import pandas as pd
import torch as pt
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torcheval.metrics.functional import binary_f1_score
from PIL import Image
import torch.nn.functional as F
from einops.layers.torch import Rearrange
from einops import rearrange, repeat
import time
import logging
from pathlib import Path

In [3]:
# Every run writes into its own timestamped directory: output/<YYYY-MM-DD_HHMM>/
run_stamp = time.strftime('%Y-%m-%d_%H%M')
path = Path("output", run_stamp)
path.mkdir(parents=True, exist_ok=True)

# Root logger at INFO level, appending to log.txt inside the run directory.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fhandler = logging.FileHandler(filename=path / 'log.txt', mode='a')
logger.addHandler(fhandler)

In [4]:
# Run configuration shared by every cell below.
hp = {
    "learning_rate": 0.00025,        # passed to the AdamW optimizer
    "num_epochs": 30,                # number of passes of the training loop
    "positive_weight_factor": 1.0,   # weight of positive labels in the BCE loss
    "INITIAL_SEED": 113,             # seed value for reproducibility
    "test_batch_size": 1,            # also used for the validation loader
    "train_batch_size": 64,
    "num_classes": 11255,            # max 11255
    "vit_path": None,                # "models/multimodal-vit/Model.pth",
    "data_root": "data",             # "/kaggle/input"
    "train_val_split": 1.0,          # fraction of train data used for training; 1.0 = no validation split
}

In [5]:
# Record the full run configuration through the logger (which writes to the run's log file).
logger.info(f"Hyperparameters: {hp}")

When the data is read, each type of input is fused and normalized in its own way (per-column normalization for some tables, whole-table normalization for others).

In [6]:
class CustomDataset(Dataset):
    """Multi-modal dataset for one subset (e.g. "train" or "test") of the data.

    On construction every tabular source is read from ``hp['data_root']``,
    cleaned, normalized and kept in memory as float tensors. Images are read
    lazily in ``__getitem__``.

    Each item is ``(sample, survey_id)`` where ``sample`` is the list
    ``[tabular features, 4-channel image, landsat features, climate features]``.

    NOTE(review): all feature tensors are aligned purely by row position. This
    assumes every CSV lists the surveys in the same order as the de-duplicated
    metadata -- TODO confirm against the data files.
    """

    def __init__(self, subset):
        self.subset = subset
        self.metadata_path = f"{hp['data_root']}/geolifeclef-2024/GLC24_PA_metadata_{self.subset}.csv"
        self.metadata = pd.read_csv(self.metadata_path)
        self.transform = transforms.Compose([transforms.Resize((128,128)), transforms.ToTensor()])
        self.merge_key = 'surveyId'
        # One id per feature row. The raw metadata may hold several rows per
        # survey, so it must not be indexed with a feature-row index directly.
        self.survey_ids = self._unique_survey_ids()

        self.metadata_data = self.Norm(self.process_metadata())
        self.climate_data = self.Norm_all(self.process_climate())
        self.landsat_data = self.process_landsat()
        self.elevation_data = self.Norm(self.process_elevation())
        self.human_footprint_data = self.Norm(self.process_human_footprint())
        self.landcover_data = self.Norm(self.process_landcover())
        self.soilgrids_data = self.Norm(self.process_soilgrids())
        # Fuse all per-survey scalar features into a single tabular tensor.
        self.metadata_data = torch.cat((self.metadata_data, self.elevation_data, self.human_footprint_data, self.landcover_data, self.soilgrids_data), dim=1)

    def _unique_survey_ids(self):
        """Return the survey ids in order of first appearance, one per survey.

        This is the same row order ``process_metadata`` produces, so position
        ``i`` here identifies row ``i`` of ``metadata_data``.
        """
        return self.metadata[self.merge_key].drop_duplicates().to_numpy()

    def process_metadata(self):
        """Return the first five metadata columns, one row per survey.

        Missing values and ``-inf`` are replaced by 0.
        """
        metadata = self.metadata.drop_duplicates(subset=self.merge_key).reset_index(drop=True)
        metadata = metadata.fillna(0).replace({float('-inf'): 0})
        return metadata.iloc[:,:5]

    def process_climate(self):
        """Merge the average and monthly climate tables on the survey id; NaNs become column means."""
        climate_average = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/Climate/Average 1981-2010/GLC24-PA-{self.subset}-bioclimatic.csv")
        climate_monthly = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/Climate/Monthly/GLC24-PA-{self.subset}-bioclimatic_monthly.csv")
        climate = pd.merge(climate_average, climate_monthly, on=self.merge_key)
        return climate.fillna(climate.mean())

    def process_landsat(self):
        """Load the six landsat band tables, normalize each one as a whole and concatenate them column-wise."""
        landsat_types = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2']
        normalized_bands = []
        for landsat_type in landsat_types:
            landsat = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/PA-{self.subset}-landsat_time_series/GLC24-PA-{self.subset}-landsat_time_series-{landsat_type}.csv")
            landsat = landsat.fillna(landsat.mean())
            normalized_bands.append(self.Norm_all(landsat))
        return torch.cat(normalized_bands, dim=1)

    def _load_clipped_raster(self, folder, stem):
        """Read one environmental raster table, clip negatives to 0, then fill NaNs with column means."""
        table = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/{folder}/GLC24-PA-{self.subset}-{stem}.csv")
        # Clip first: NaN < 0 is False, so missing values survive and are
        # filled afterwards with the mean of the clipped column.
        table[table < 0] = 0
        return table.fillna(table.mean())

    def process_elevation(self):
        """Load the cleaned elevation table."""
        return self._load_clipped_raster("Elevation", "elevation")

    def process_human_footprint(self):
        """Load the cleaned human-footprint table."""
        return self._load_clipped_raster("Human Footprint", "human_footprint")

    def process_landcover(self):
        """Load the cleaned land-cover table."""
        return self._load_clipped_raster("LandCover", "landcover")

    def process_soilgrids(self):
        """Load the cleaned soil-grids table."""
        return self._load_clipped_raster("SoilGrids", "soilgrids")

    @staticmethod
    def _safe_std(std):
        """Replace non-positive (or NaN) standard deviations by 1 so normalization never divides by zero."""
        return torch.where(std > 0, std, torch.ones_like(std))

    def Norm(self,df):
        """Standardize every column independently, dropping the first column.

        NOTE(review): the first column is presumably the survey id for the
        raster tables; for the metadata frame it is whatever column comes
        first in the CSV -- confirm that dropping it is intended.
        Constant columns are mapped to 0 instead of NaN.
        """
        output=torch.from_numpy(df.iloc[:,1:].values).float()
        return (output-output.mean(dim=0))/self._safe_std(output.std(dim=0))

    def Norm_all(self,df):
        """Standardize the whole table with one global mean/std, dropping the first column."""
        output=torch.from_numpy(df.iloc[:,1:].values).float()
        return (output-output.mean())/self._safe_std(output.std())

    def _patch_path(self, survey_id, band):
        """Build the path of a satellite patch: <root>/<last two digits>/<two digits before those>/<id>.jpeg."""
        sid = str(survey_id)
        root = f"{hp['data_root']}/geolifeclef-2024/PA_{self.subset.title()}_SatellitePatches_{band.upper()}/pa_{self.subset}_patches_{band.lower()}"
        return os.path.join(root, sid[-2:], sid[-4:-2], f"{survey_id}.jpeg")

    def patch_rgb_path(self,survey_id):
        """Return the path of the RGB patch of ``survey_id``."""
        return self._patch_path(survey_id, "rgb")

    def patch_nir_path(self,survey_id):
        """Return the path of the NIR patch of ``survey_id``."""
        return self._patch_path(survey_id, "nir")

    def _load_patch(self, image_path, mode):
        """Open an image, convert it to ``mode`` and apply the resize/to-tensor transform."""
        # The context manager closes the file handle as soon as the pixels are decoded.
        with Image.open(image_path) as patch:
            return self.transform(patch.convert(mode))

    def __len__(self):
        return len(self.metadata_data)

    def __getitem__(self, idx):
        """Return ``([tabular, image, landsat, climate], survey_id)`` for feature row ``idx``."""
        # Look the id up in the de-duplicated list so that the image belongs
        # to the same survey as the feature rows indexed below.
        survey_id = self.survey_ids[idx]
        image = self._load_patch(self.patch_rgb_path(survey_id), "RGB")
        nir_image = self._load_patch(self.patch_nir_path(survey_id), "L")
        # Stack the 3 RGB channels and the single NIR channel along the channel axis.
        image_data = torch.cat([image, nir_image], dim=0)
        sample=[self.metadata_data[idx,:],image_data,self.landsat_data[idx,:],self.climate_data[idx,:]]
        return sample, survey_id

In [7]:
class TestDataset(CustomDataset):
    """``CustomDataset`` bound to the "test" subset; items are ``(sample, survey_id)``."""

    def __init__(self):
        super().__init__(subset="test")

    def __getitem__(self, idx):
        # Nothing to add for the test subset: hand back the parent's pair as is.
        return super().__getitem__(idx)

In [8]:
class TrainDataset(CustomDataset):
    """``CustomDataset`` bound to the "train" subset, with multi-hot species labels.

    Items are ``(sample, survey_id, label, count)`` where ``label`` is a
    ``hp['num_classes']``-long 0/1 vector and ``count`` is the number of
    species rows recorded for the survey.
    """

    def __init__(self):
        super().__init__(subset="train")
        # Map every survey id to the list of species ids observed in it.
        pairs = self.metadata[['surveyId' ,'speciesId']].astype(int)
        self.label_dict = pairs.groupby('surveyId')['speciesId'].apply(list).to_dict()

    def __getitem__(self, idx):
        sample, survey_id = super().__getitem__(idx)
        present_species = self.label_dict[survey_id]
        # Multi-hot target: the species id is used directly as the class index.
        label = torch.zeros(hp['num_classes'])
        label[torch.tensor(present_species, dtype=torch.long)] = 1
        return sample, survey_id, label, len(present_species)

In [9]:
# Build the datasets and their loaders.
# With train_val_split == 1.0 the whole train subset is used for training and
# no 'val' loader exists; otherwise the train subset is split randomly.
if hp['train_val_split'] == 1.0:
    train_dataset = TrainDataset()
    val_dataset = None
else:
    # Seed the split so the train/validation partition is the same on every run.
    split_generator = torch.Generator().manual_seed(hp['INITIAL_SEED'])
    train_dataset, val_dataset = torch.utils.data.random_split(
        TrainDataset(),
        [hp['train_val_split'], 1-hp['train_val_split']],
        generator=split_generator,
    )
test_dataset = TestDataset()

dataloaders = {
    # Training batches are reshuffled every epoch; evaluation loaders keep
    # dataset order so predictions stay aligned with the survey ids.
    'train': DataLoader(train_dataset, batch_size=hp['train_batch_size'], shuffle=True, num_workers=1),
    'test': DataLoader(test_dataset, batch_size=hp['test_batch_size'], shuffle=False, num_workers=1),
}
if val_dataset is not None:
    dataloaders['val'] = DataLoader(val_dataset, batch_size=hp['test_batch_size'], shuffle=False, num_workers=1)

This is a small MLP used to embed inputs whose features are largely independent of one another.

In [10]:
class Embedding(nn.Module):
    """Two-layer MLP: ``dim -> 5*out_dim`` (tanh, LayerNorm) ``-> out_dim``."""

    def __init__(self, dim, out_dim):
        super().__init__()
        hidden_dim = out_dim * 5
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.norm = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        # Expand, squash, normalize the hidden activations, then project down.
        hidden = self.norm(torch.tanh(self.fc1(x)))
        return self.fc2(hidden)

The following cells implement the building blocks of the ViT, which is essentially the encoder part of a **Transformer**.

In [11]:
class Multihead_self_attention(nn.Module):
    """Pre-norm multi-head self-attention.

    The input is layer-normalized, projected to queries/keys/values for
    ``heads`` heads of width ``head_dim``, mixed with scaled dot-product
    attention and projected back to ``dim``. The residual connection is left
    to the caller.
    """

    def __init__(self, heads, head_dim, dim):
        super().__init__()
        self.head_dim = head_dim
        self.heads = heads
        self.inner_dim = self.heads*self.head_dim
        self.scale = self.head_dim**-0.5
        # A single projection produces Q, K and V side by side.
        self.to_qkv = nn.Linear(dim, self.inner_dim*3)
        self.to_output = nn.Linear(self.inner_dim, dim)
        self.norm = nn.LayerNorm(dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        normed = self.norm(x)
        # Split the joint projection into three parts and move heads to their own axis.
        queries, keys, values = (
            rearrange(part, 'b l (h dim) -> b h l dim', dim=self.head_dim)
            for part in self.to_qkv(normed).chunk(3, dim=-1)
        )
        scores = queries @ keys.transpose(-1, -2) * self.scale
        weights = self.softmax(scores)
        # (B,H,L,dim) -> (B,L,H*dim): merge the heads back together.
        merged = rearrange(weights @ values, 'b h l dim -> b l (h dim)')
        return self.to_output(merged)

In [12]:
class FeedForward(nn.Module):
    """Pre-norm position-wise MLP: LayerNorm -> Linear -> GELU -> Linear (``dim -> mlp_dim -> dim``)."""

    def __init__(self, dim, mlp_dim):
        super().__init__()
        self.fc1 = nn.Linear(dim, mlp_dim)
        self.fc2 = nn.Linear(mlp_dim, dim)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        hidden = F.gelu(self.fc1(self.norm(x)))
        return self.fc2(hidden)

In [13]:
class Transformer_block(nn.Module):
    """One encoder block: self-attention and feed-forward, each wrapped in a residual connection."""

    def __init__(self, dim, heads, head_dim, mlp_dim):
        super().__init__()
        self.MHA = Multihead_self_attention(heads=heads, head_dim=head_dim, dim=dim)
        self.FeedForward = FeedForward(dim=dim, mlp_dim=mlp_dim)

    def forward(self, x):
        attended = self.MHA(x) + x
        return self.FeedForward(attended) + attended

In [14]:
class ViT(nn.Module):
    """A single transformer block followed by a classification head.

    The head (LayerNorm + Linear) is applied to the first token of the
    sequence only and returns raw logits; the ``sigmoid`` submodule is
    created but not applied in ``forward``.
    """

    def __init__(self, dim, heads, head_dim, mlp_dim, num_class):
        super().__init__()
        self.transformer = Transformer_block(dim=dim, heads=heads, head_dim=head_dim, mlp_dim=mlp_dim)

        self.MLP_head = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_class))
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        encoded = self.transformer(x)
        # Token 0 acts as the summary (CLS) token of the sequence.
        return self.MLP_head(encoded[:, 0, :])

The following is the **CNN** used to extract features from the image patches.

In [15]:
class ResNet18(nn.Module):
    """Randomly initialized torchvision ResNet-18 adapted to 4-channel input.

    The stem is replaced by a 3x3, stride-1 convolution taking 4 channels and
    the max-pool is disabled. The backbone's 1000-wide output is
    layer-normalized and projected to ``num_classes`` features.
    """

    def __init__(self, num_classes):
        super().__init__()

        backbone = models.resnet18(weights=None)
        backbone.conv1 = nn.Conv2d(4, 64, kernel_size=3, stride=1, padding=1, bias=False)
        backbone.maxpool = nn.Identity()
        self.resnet18 = backbone
        self.ln = nn.LayerNorm(1000)
        self.fc1 = nn.Linear(1000, num_classes)

    def forward(self, x):
        features = self.ln(self.resnet18(x))
        return self.fc1(features)

This is the **final multi-modal model**. Unlike a standard ViT, its input tokens are not image patches but the feature vectors extracted from each modality.

For the features that carry time information (the Landsat and climate inputs), I add a learnable positional vector before they are embedded.

At the same time, I also added a feature that is a fusion of the first four features.

In [16]:
class MultiModal(nn.Module):
    """Fuses four modality embeddings with a small transformer and predicts class logits.

    ``forward`` expects ``x`` to be the list produced by the dataset:
    ``[tabular (B, 31), image (B, 4, H, W), landsat (B, 504), climate (B, 931)]``.
    Every modality is embedded into a 200-d token; a fifth token embeds the
    concatenation of the four. Together with a learnable CLS token the six
    tokens are passed to ``ViT``, whose head returns ``(B, num_classes)`` logits.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.cls = nn.Parameter(torch.randn(1, 1, 200))
        self.meta = Embedding(31,200)
        self.resnet18 = ResNet18(200)
        self.landsat = Embedding(504,200)
        self.position_landsat = nn.Parameter(torch.randn(1, 504))
        self.climate = Embedding(931,200)
        self.position_climate = nn.Parameter(torch.randn(1, 931))
        self.emb = Embedding(800,200)
        self.position_combine = nn.Parameter(torch.randn(1, 800))
        self.vit = ViT(200, 2, 200, 400, num_classes)
        self.position = nn.Parameter(torch.randn(1, 6, 200))

    def forward(self, x):
        batch = x[0].size(0)
        # self.cls is a parameter and therefore already lives on the model's
        # device; no dependency on a module-level `device` variable is needed.
        cls_tokens = self.cls.expand(batch, -1, -1)

        meta = self.meta(x[0])
        img = self.resnet18(x[1])
        landsat = self.landsat(x[2]+self.position_landsat)
        climate = self.climate(x[3]+self.position_climate)

        # Fifth token: embedding of the four modality embeddings side by side.
        combined = torch.cat((meta, img, landsat, climate), dim=1)
        fused = self.emb(combined+self.position_combine)

        # Token order: CLS, metadata, image, landsat, climate, fused.
        modality_tokens = torch.stack((meta, img, landsat, climate, fused), dim=1)
        tokens = torch.cat((cls_tokens, modality_tokens), dim=1)
        return self.vit(tokens+self.position)

In [17]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    logger.info("DEVICE = CUDA")

# Apply the configured seed before any weights are created so that the
# initialization (and everything drawn from torch's RNG afterwards) is
# reproducible between runs.
torch.manual_seed(hp['INITIAL_SEED'])

model = MultiModal(hp['num_classes']).to(device)

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=hp['learning_rate'])
logger.info("Optimizer: AdamW")
# The deprecated `verbose` flag is dropped; the learning rate can be read with
# scheduler.get_last_lr() instead.
# NOTE(review): T_max is 25 while hp['num_epochs'] is 30, so the cosine schedule
# reaches its minimum at epoch 25 and the learning rate rises again afterwards
# -- confirm this is intended.
scheduler = CosineAnnealingLR(optimizer, T_max=25)
logger.info("Scheduler: CosineAnnealingLR")



In [19]:
print(f"Training for {hp['num_epochs']} epochs started.")
start_time = time.time()

# Validation only runs when part of the training data was held out.
phases = ['train', 'val'] if hp['train_val_split'] < 1.0 else ['train']

# Every positive label is weighted by the same factor, so the criterion can be
# built once. (Negative labels do not use pos_weight at all.)
criterion = torch.nn.BCEWithLogitsLoss(
    pos_weight=torch.full((hp['num_classes'],), float(hp['positive_weight_factor']), device=device)
)

for epoch in range(hp['num_epochs']):
    for phase in phases:
        is_train = phase == 'train'
        model.train(is_train)

        if not is_train:
            # One F1 value per validation batch; the val loader uses
            # hp['test_batch_size'], which is 1, i.e. one value per survey.
            f1_scores_k = torch.zeros(len(dataloaders[phase]))
            f1_scores_25 = torch.zeros(len(dataloaders[phase]))

        for batch_idx, (sample, survey_id, labels, count) in enumerate(dataloaders[phase]):
            samples = [tensor.to(device) for tensor in sample]
            labels = labels.to(device)

            with torch.set_grad_enabled(is_train):
                outputs = model(samples)
                loss = criterion(outputs, labels)

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            if not is_train:
                # The scoring below assumes a validation batch of exactly one
                # survey (count.item() raises otherwise).
                scores = outputs.squeeze()
                target = labels.squeeze()

                # F1 when predicting exactly as many species as are truly present.
                topk_count = scores.topk(int(count.item())).indices
                y_pred_k = torch.zeros(hp['num_classes'], device=device)
                y_pred_k[topk_count] = 1
                f1_scores_k[batch_idx] = binary_f1_score(y_pred_k, target)

                # F1 when always predicting the 25 highest-scoring species.
                top25_count = scores.topk(25).indices
                y_pred_25 = torch.zeros(hp['num_classes'], device=device)
                y_pred_25[top25_count] = 1
                f1_scores_25[batch_idx] = binary_f1_score(y_pred_25, target)

        if is_train:
            scheduler.step()
            # %s placeholder: logging formats the message lazily with its args.
            logger.info("Scheduler: %s", scheduler.state_dict())
        else:
            logger.info(f"Epoch {epoch+1}/{hp['num_epochs']}, F1 Score (topk): {f1_scores_k.mean()}, F1 Score (top25): {f1_scores_25.mean()}")

logger.info(f"Training time: {(time.time()-start_time)/60:.0f} minutes")
# Leave the model in eval mode, log its architecture and save the trained weights.
model.eval()
logger.info(model)
torch.save(model.state_dict(), path / "multimodal.pth")

Training for 30 epochs started.


In [20]:
# Predict per-class probabilities for every test survey.
# Set eval mode explicitly instead of relying on the training cell having done it.
model.eval()

surveys = []        # survey ids, in loader order
full_indices = []   # one probability vector (length = number of classes) per survey
with torch.no_grad():
    for data, surveyID in tqdm.tqdm(dataloaders['test'], total=len(dataloaders['test'])):
        data = [tensor.to(device) for tensor in data]

        outputs = model(data)
        predictions = torch.sigmoid(outputs).cpu().numpy()
        # Add one row per survey so this also works for batch sizes above 1.
        full_indices.extend(predictions.reshape(-1, predictions.shape[-1]))
        surveys.extend(surveyID.cpu().numpy())

pd.DataFrame(full_indices).add_prefix("speciesId_").to_pickle(path / "total_output.pkl")

100%|██████████| 4716/4716 [00:28<00:00, 168.27it/s]


In [23]:
# Not used by any cell visible here; kept in case a later cell relies on it.
top_indices = []

# NOTE(review): 'counts.pkl' is read from the working directory and is not
# produced by any cell visible here -- document where it comes from. Unpickling
# executes arbitrary code, so only load this file from a trusted source.
# Presumably it holds one species count per test survey, in loader order -- TODO confirm.
counts = pd.read_pickle('counts.pkl')

# For every survey keep the `count` highest-scoring class indices (at least
# one), sorted ascending and joined by spaces.
data_concatenated = [
    ' '.join(map(str, pt.from_numpy(indices).topk(max(1, int(count))).indices.sort().values.numpy()))
    for indices, count in zip(full_indices, counts)
]

pd.DataFrame(
    {'surveyId': surveys,
     'predictions': data_concatenated,
    }).to_csv(path / "submission.csv", index = False)