# This is an INFERENCE using the model combine the method of MLP, CNN and Transformer

Thanks PICEKL for his baseline, which help me a lot.

The main idea of this method is to obtain multiple features from multiple dimensions. We simply divide it into four types: **image information** with spatial characteristics, **meta information** that is independent of each other, **climate information** and **satellite information** with time characteristics.

In [1]:
import os
import time
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import logging
from pathlib import Path
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import precision_recall_fscore_support
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
from einops.layers.torch import Rearrange
from einops import rearrange, repeat
from models2 import MLP, ViT, ResNet18

  warn(


In [2]:
path = Path("output") / time.strftime('%Y-%m-%d_%H%M', time.localtime())
path.mkdir(parents=True, exist_ok=True)
logger = logging.getLogger()
fhandler = logging.FileHandler(filename=path / 'log.txt', mode='a')
logger.setLevel(logging.INFO)
logger.addHandler(fhandler)

In [3]:
hp = {
    "learning_rate": 0.00025,
    "num_epochs": 5,
    "positive_weight_factor": 1.0,
    "INITIAL_SEED": 113,
    "test_batch_size": 1,
    "train_batch_size": 64,
    "num_classes": 11255, # max 11255
    "data_root": "data", #"/kaggle/input"
    "train_val_split": 0.8, # the fraction of the data used for training e.g. 0.8
    "vit_path": None, # "output/2024-05-21_1613",
    "data_path": "output/2024-05-21_1613",
}
vit_path = None if hp["vit_path"] is None else Path(hp["vit_path"])
data_path = None if hp["data_path"] is None else Path(hp["data_path"])

In [4]:
logger.info(f"Hyperparameters: {hp}")

In [5]:
num_classes = 11255

When reading data, different **fusion** and **normalization** will be performed for different types of data.

In [6]:
class CustomDataset(Dataset):
    def __init__(self, subset):
        self.subset = subset
        self.metadata_path = f"{hp['data_root']}/geolifeclef-2024/GLC24_PA_metadata_{self.subset}.csv"
        self.metadata = pd.read_csv(self.metadata_path)
        self.transform = transforms.Compose([transforms.Resize((128,128)), transforms.ToTensor()])
        self.merge_key = 'surveyId'

        self.metadata_data = self.Norm(self.process_metadata())
        self.climate_data = self.Norm_all(self.process_climate())
        self.landsat_data = self.process_landsat()
        self.elevation_data = self.Norm(self.process_elevation())
        self.human_footprint_data = self.Norm(self.process_human_footprint())
        self.landcover_data = self.Norm(self.process_landcover())
        self.soilgrids_data = self.Norm(self.process_soilgrids())
        self.metadata_data = torch.cat((self.metadata_data, self.elevation_data, self.human_footprint_data, self.landcover_data, self.soilgrids_data), dim=1)

    def process_metadata(self):
        metadata = self.metadata.drop_duplicates(subset="surveyId").reset_index(drop=True).copy()
        metadata.fillna(0,inplace=True)
        metadata.replace({float('-inf'): 0}, inplace=True)
        return metadata.iloc[:,:5]

    def process_climate(self):
        climate_average = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/Climate/Average 1981-2010/GLC24-PA-{self.subset}-bioclimatic.csv")
        climate_monthly = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/Climate/Monthly/GLC24-PA-{self.subset}-bioclimatic_monthly.csv")
        climate = pd.merge(climate_average, climate_monthly, on=self.merge_key)
        climate.fillna(climate.mean(),inplace=True)
        return climate

    def process_landsat(self):
        landsat_types = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2']
        landsat_dfs = []
        for landsat_type in landsat_types:
            landsat = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/PA-{self.subset}-landsat_time_series/GLC24-PA-{self.subset}-landsat_time_series-{landsat_type}.csv")
            landsat.fillna(landsat.mean(),inplace=True)
            landsat_dfs.append(landsat)
        return torch.cat([self.Norm_all(landsat_df) for landsat_df in landsat_dfs],axis=1)

    def process_elevation(self):
        elevation = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/Elevation/GLC24-PA-{self.subset}-elevation.csv")
        elevation[elevation<0]=0
        elevation.fillna(elevation.mean(),inplace=True)
        return elevation

    def process_human_footprint(self):
        human_footprint = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/Human Footprint/GLC24-PA-{self.subset}-human_footprint.csv")
        human_footprint[human_footprint<0]=0
        human_footprint.fillna(human_footprint.mean(),inplace=True)
        return human_footprint

    def process_landcover(self):
        landcover = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/LandCover/GLC24-PA-{self.subset}-landcover.csv")
        landcover[landcover<0]=0
        landcover.fillna(landcover.mean(),inplace=True)
        return landcover

    def process_soilgrids(self):
        soilgrids = pd.read_csv(f"{hp['data_root']}/geolifeclef-2024/EnvironmentalRasters/EnvironmentalRasters/SoilGrids/GLC24-PA-{self.subset}-soilgrids.csv")
        soilgrids[soilgrids<0]=0
        soilgrids.fillna(soilgrids.mean(),inplace=True)
        return soilgrids

    def Norm(self,df):
        output=torch.from_numpy(df.iloc[:,1:].values).float()
        return (output-output.mean(dim=0))/output.std(dim=0)

    def Norm_all(self,df):
        output=torch.from_numpy(df.iloc[:,1:].values).float()
        return (output-output.mean())/output.std()

    def patch_rgb_path(self,survey_id):
        path = f"{hp['data_root']}/geolifeclef-2024/PA_{self.subset.title()}_SatellitePatches_RGB/pa_{self.subset}_patches_rgb"
        for d in (str(survey_id)[-2:], str(survey_id)[-4:-2]):
            path = os.path.join(path, d)
        path = os.path.join(path, f"{survey_id}.jpeg")
        return path

    def patch_nir_path(self,survey_id):
        path = f"{hp['data_root']}/geolifeclef-2024/PA_{self.subset.title()}_SatellitePatches_NIR/pa_{self.subset}_patches_nir"
        for d in (str(survey_id)[-2:], str(survey_id)[-4:-2]):
            path = os.path.join(path, d)
        path = os.path.join(path, f"{survey_id}.jpeg")
        return path

    def __len__(self):
        return len(self.metadata_data)

    def __getitem__(self, idx):
        survey_id = self.metadata.surveyId[idx]
        image_path = self.patch_rgb_path(survey_id)
        image = Image.open(image_path).convert("RGB")
        image = self.transform(image)
        image = image.unsqueeze(0)
        image_nir_path = self.patch_nir_path(survey_id)
        nir_image = Image.open(image_nir_path).convert("L")
        nir_image = self.transform(nir_image)
        nir_image = nir_image.unsqueeze(0)
        image_data = torch.cat([image,nir_image],dim=1)
        image_data = torch.squeeze(image_data)
        sample=[self.metadata_data[idx,:],image_data,self.landsat_data[idx,:],self.climate_data[idx,:]]
        return sample, survey_id

In [7]:
class TestDataset(CustomDataset):
    def __init__(self):
        super().__init__(subset="test")

    def __getitem__(self, idx):
        sample, survey_id = super().__getitem__(idx)
        return sample, survey_id

In [8]:
class TrainDataset(CustomDataset):
    def __init__(self):
        super().__init__(subset="train")
        labels = self.metadata[['surveyId' ,'speciesId']].astype(int).copy()
        self.label_dict = labels.groupby('surveyId')['speciesId'].apply(list).to_dict()


    def __getitem__(self, idx):
        sample, survey_id = super().__getitem__(idx)
        species_ids = self.label_dict[survey_id]  # Get list of species IDs for the survey ID
        label = torch.zeros(hp['num_classes'])  # Initialize label tensor
        for species_id in species_ids:
            label_id = species_id
            label[label_id] = 1  # Set the corresponding class index to 1 for each species ID
        count = len(species_ids)
        return sample, survey_id, label, count

In [9]:
# Dataset and DataLoader
batch_size = 1
# transform = transforms.Compose([
#     transforms.Resize((128,128)),
#     transforms.ToTensor()
# ])

# Load Training metadata
test_dataset = TestDataset()
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=1)

train_dataset = TrainDataset()
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=1)

This is an MLP used to extract features from generally independent information.

In [10]:
class Embedding(nn.Module):
    def __init__(self, dim, out_dim):
        super().__init__()
        self.fc1 = nn.Linear(dim, out_dim*5)
        self.fc2 = nn.Linear(out_dim*5, out_dim)
        self.norm = nn.LayerNorm(out_dim*5)

    def forward(self, x):
        x = F.tanh(self.fc1(x))
        x = self.norm(x)
        x = self.fc2(x)
        return x

This is the **final multi-modal model**. Unlike ViT, its input is features extracted from each dimension. 

For features with time information, I added additional position information. 

At the same time, I also added a feature that is a fusion of the first four features.

In [11]:
class MutiModal(nn.Module):
    def __init__(self, num_classes):
        super(MutiModal, self).__init__()
        self.cls = nn.Parameter(torch.randn(1, 1, 200))
        self.meta = Embedding(31,200)
        self.resnet18 = ResNet18(200)
        self.landsat = Embedding(504,200)
        self.position_landsat = nn.Parameter(torch.randn(1, 504))
        self.climate = Embedding(931,200)
        self.position_climate = nn.Parameter(torch.randn(1, 931))
        self.emb = Embedding(800,200)
        self.position_combine = nn.Parameter(torch.randn(1, 800))
        self.vit = ViT(200, 2, 200, 400, num_classes)
        self.position = nn.Parameter(torch.randn(1, 6, 200))

    def forward(self, x):
        batch = x[0].size(0)
        CLS = repeat(self.cls, '1 1 d -> b 1 d', b=batch).to(device)
        META = self.meta(x[0])
        IMG = self.resnet18(x[1])
        LANDSAT = self.landsat(x[2]+self.position_landsat)
        CLIMATE = self.climate(x[3]+self.position_climate)
        combine = torch.cat((META, IMG, LANDSAT, CLIMATE), dim=1)
        COMBINE = self.emb(combine+self.position_combine)
        token = torch.concat((CLS, META.unsqueeze(1)), dim=1)
        token = torch.concat((token, IMG.unsqueeze(1)), dim=1)
        token = torch.concat((token, LANDSAT.unsqueeze(1)), dim=1)
        token = torch.concat((token, CLIMATE.unsqueeze(1)), dim=1)
        token = torch.concat((token, COMBINE.unsqueeze(1)), dim=1)
        out = self.vit(token+self.position)
        return out

In [12]:
# Check if cuda is available
device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("DEVICE = CUDA")

model = MutiModal(num_classes).to(device)
# model.load_state_dict(torch.load("models/vit/Model.pth", map_location=device))

DEVICE = CUDA


In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=hp['learning_rate'])
logger.info("Optimizer: AdamW")
scheduler = CosineAnnealingLR(optimizer, T_max=25, verbose=True)
logger.info("Scheduler: CosineAnnealingLR")



In [14]:
if hp['vit_path'] is not None:
    model.load_state_dict(torch.load(vit_path / "multimodal.pth"))
    logger.info(f"Model loaded: {vit_path / 'multimodal.pth'}")
else:
    print(f"Training for {hp['num_epochs']} epochs started.")
    start_time = time.time()

    for epoch in tqdm(range(hp['num_epochs'])):
        model.train()

        for batch_idx, (sample, survey_id, labels, count) in enumerate(train_loader):
            samples = [tensor.to(device) for tensor in sample]
            survey_id = survey_id.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                outputs = model(samples)

                pos_weight = labels*hp['positive_weight_factor']  # All positive weights are equal to 10
                criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            if batch_idx % (len(train_loader)//10) == 0:
                print(f"Epoch {epoch+1}/{hp['num_epochs']}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item()}")

        scheduler.step()
        logger.info("Scheduler:",scheduler.state_dict())

    logger.info(f"Training time: {(time.time()-start_time)/60:.0f} minutes")
    # Save the trained model
    logger.info(model.eval())
    torch.save(model.state_dict(), path / "multimodal.pth")

Training for 5 epochs started.


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5, Batch 0/1391, Loss: 0.7319745421409607
Epoch 1/5, Batch 139/1391, Loss: 0.03292905539274216
Epoch 1/5, Batch 278/1391, Loss: 0.016077920794487
Epoch 1/5, Batch 417/1391, Loss: 0.011257714591920376
Epoch 1/5, Batch 556/1391, Loss: 0.011127620935440063
Epoch 1/5, Batch 695/1391, Loss: 0.00969545729458332
Epoch 1/5, Batch 834/1391, Loss: 0.009522094391286373
Epoch 1/5, Batch 973/1391, Loss: 0.008371397852897644
Epoch 1/5, Batch 1112/1391, Loss: 0.009551092982292175
Epoch 1/5, Batch 1251/1391, Loss: 0.009219029918313026


 20%|██        | 1/5 [13:00<52:02, 780.68s/it]

Epoch 1/5, Batch 1390/1391, Loss: 0.009186108596622944
Epoch 2/5, Batch 0/1391, Loss: 0.008382325060665607
Epoch 2/5, Batch 139/1391, Loss: 0.008062752895057201
Epoch 2/5, Batch 278/1391, Loss: 0.007518880534917116
Epoch 2/5, Batch 417/1391, Loss: 0.007663054391741753
Epoch 2/5, Batch 556/1391, Loss: 0.009462274610996246
Epoch 2/5, Batch 695/1391, Loss: 0.007674806285649538
Epoch 2/5, Batch 834/1391, Loss: 0.008438208140432835
Epoch 2/5, Batch 973/1391, Loss: 0.007647836115211248
Epoch 2/5, Batch 1112/1391, Loss: 0.007479742169380188
Epoch 2/5, Batch 1251/1391, Loss: 0.0071536884643137455


 40%|████      | 2/5 [26:02<39:03, 781.32s/it]

Epoch 2/5, Batch 1390/1391, Loss: 0.006751437671482563
Epoch 3/5, Batch 0/1391, Loss: 0.007217148318886757
Epoch 3/5, Batch 139/1391, Loss: 0.006697237957268953
Epoch 3/5, Batch 278/1391, Loss: 0.0071390969678759575
Epoch 3/5, Batch 417/1391, Loss: 0.006971823051571846
Epoch 3/5, Batch 556/1391, Loss: 0.006098734214901924
Epoch 3/5, Batch 695/1391, Loss: 0.006469473708420992
Epoch 3/5, Batch 834/1391, Loss: 0.006787928286939859
Epoch 3/5, Batch 973/1391, Loss: 0.006482400000095367
Epoch 3/5, Batch 1112/1391, Loss: 0.005802351515740156
Epoch 3/5, Batch 1251/1391, Loss: 0.005611401051282883


 60%|██████    | 3/5 [39:05<26:03, 781.95s/it]

Epoch 3/5, Batch 1390/1391, Loss: 0.005501255393028259
Epoch 4/5, Batch 0/1391, Loss: 0.006081508006900549
Epoch 4/5, Batch 139/1391, Loss: 0.005711866542696953
Epoch 4/5, Batch 278/1391, Loss: 0.005280612502247095
Epoch 4/5, Batch 417/1391, Loss: 0.0051314434967935085
Epoch 4/5, Batch 556/1391, Loss: 0.00486820749938488
Epoch 4/5, Batch 695/1391, Loss: 0.004583961330354214
Epoch 4/5, Batch 834/1391, Loss: 0.0049757882952690125
Epoch 4/5, Batch 973/1391, Loss: 0.0051056803204119205
Epoch 4/5, Batch 1112/1391, Loss: 0.004967124667018652
Epoch 4/5, Batch 1251/1391, Loss: 0.004101336933672428


 80%|████████  | 4/5 [52:08<13:02, 782.65s/it]

Epoch 4/5, Batch 1390/1391, Loss: 0.00511839147657156
Epoch 5/5, Batch 0/1391, Loss: 0.004988267086446285
Epoch 5/5, Batch 139/1391, Loss: 0.004751879256218672
Epoch 5/5, Batch 278/1391, Loss: 0.0050101024098694324
Epoch 5/5, Batch 417/1391, Loss: 0.004253692924976349
Epoch 5/5, Batch 556/1391, Loss: 0.004044508561491966
Epoch 5/5, Batch 695/1391, Loss: 0.004495963454246521
Epoch 5/5, Batch 834/1391, Loss: 0.004505382850766182
Epoch 5/5, Batch 973/1391, Loss: 0.004305401351302862
Epoch 5/5, Batch 1112/1391, Loss: 0.003823834005743265
Epoch 5/5, Batch 1251/1391, Loss: 0.0038854144513607025


100%|██████████| 5/5 [1:05:12<00:00, 782.54s/it]

Epoch 5/5, Batch 1390/1391, Loss: 0.004272107966244221





In [15]:
with torch.no_grad():
    surveys = []
    top_indices = []
    for data, surveyID in tqdm(test_loader, total=len(test_loader)):

        data = [tensor.to(device) for tensor in data]

        outputs = model(data)
        predictions = torch.sigmoid(outputs).cpu().numpy()
        predictions = np.squeeze(predictions)
        prediction = np.argwhere(predictions>=0.95).flatten()
        top_indices.append(prediction)
        surveys.extend(surveyID.cpu().numpy())

100%|██████████| 4716/4716 [00:26<00:00, 180.80it/s]


In [16]:
data_concatenated = [' '.join(map(str, row)) for row in top_indices]

pd.DataFrame(
    {'surveyId': surveys,
     'predictions': data_concatenated,
    }).to_csv(f"submission{time.strftime('%Y-%m-%d_%H%M', time.localtime())}.csv", index = False)