## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import ViTModel, ViTFeatureExtractor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt



  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/jaimil.d/miniconda3/envs/econ424/lib/python3.11/site-packages/torchvision/image.so
  warn(


## Data Loading and Project Setup

In [2]:
# Configuration
class CFG:
    seed = 480  # Random seed
    batch_size = 32  # Batch size
    num_classes = 6  # Number of classes in the dataset
    image_size = [224, 224]  # Input image size
    epochs = 10  # Training epochs
    class_names = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']
    num_classes = len(class_names)

In [3]:
torch.manual_seed(CFG.seed)


<torch._C.Generator at 0x15afb1210>

In [4]:

# Load CSV files
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')
target_names = pd.read_csv('./data/target_name_meta.tsv', sep='\t')
BASE_PATH = "./data"
train_df['image_path'] = f'{BASE_PATH}/train_images/'+train_df['id'].astype(str)+'.jpeg'
test_df['image_path'] = f'{BASE_PATH}/test_images/'+test_df['id'].astype(str)+'.jpeg'

train_df['image_path']

0        ./data/train_images/101801795.jpeg
1        ./data/train_images/115813315.jpeg
2        ./data/train_images/173551949.jpeg
3        ./data/train_images/148811120.jpeg
4        ./data/train_images/195108876.jpeg
                        ...                
43358    ./data/train_images/172502909.jpeg
43359    ./data/train_images/183294324.jpeg
43360    ./data/train_images/108577580.jpeg
43361    ./data/train_images/139067673.jpeg
43362    ./data/train_images/195383621.jpeg
Name: image_path, Length: 43363, dtype: object

## Data Normalization

In [11]:
# Normalize ancillary data
ancillary_columns = [col for col in train_df.columns if col.startswith(('WORLDCLIM_BIO', 'SOIL', 'MODIS', 'VOD'))]
train_ancillary = train_df[ancillary_columns]
test_ancillary = test_df[ancillary_columns]

# Extract and transform targets
target_columns = [col for col in train_df.columns if col.endswith('_mean')]
train_targets = np.log1p(train_df[target_columns])



In [10]:
train_targets.shape

(43363, 6)

In [12]:

# Min-max normalize the transformed target data
min_train = np.min(train_targets, axis=0)
max_train = np.max(train_targets, axis=0)

min_train_anc = np.min(train_ancillary, axis=0)
max_train_anc = np.max(train_ancillary, axis=0)

min_train_anc.shape



(163,)

In [8]:
train_ancillary.shape

(43363, 163)

In [9]:
# Train on same target vars, not log (maybe remove log and then normalize)
train_targets_norm = (train_targets - min_train) / (max_train - min_train)
train_ancillary_norm = (train_ancillary - min_train_anc) / (max_train_anc - min_train_anc)
# train_targets_norm = np.expm1(train_targets_norm)
train_targets_norm.shape
train_ancillary_norm.shape

test_ancillary_norm = (test_ancillary - min_train_anc) / (max_train_anc - min_train_anc)

In [13]:

# Split the data for training and validation for ancillary model
X_train_img, X_val_img, X_train_anc, X_val_anc, y_train, y_val = train_test_split(
    pd.DataFrame(train_df['image_path'].values), train_ancillary_norm, train_targets_norm, test_size=0.2, random_state=42
)

In [14]:
X_train_anc

Unnamed: 0,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,SOIL_bdod_30.60cm_mean_0.01_deg,...,VOD_X_1997_2018_multiyear_mean_m03,VOD_X_1997_2018_multiyear_mean_m04,VOD_X_1997_2018_multiyear_mean_m05,VOD_X_1997_2018_multiyear_mean_m06,VOD_X_1997_2018_multiyear_mean_m07,VOD_X_1997_2018_multiyear_mean_m08,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12
16051,0.718492,0.148879,0.021161,0.091475,0.263473,0.311485,0.748148,0.736111,0.740741,0.760870,...,0.537743,0.498417,0.488927,0.466854,0.422058,0.434694,0.458408,0.467584,0.475004,0.484953
10646,0.315960,0.098020,0.031566,0.247534,0.358744,0.314560,0.585185,0.680556,0.577778,0.623188,...,0.701135,0.761288,0.827909,0.819837,0.754272,0.773536,0.802987,0.806616,0.719488,0.693840
24766,0.432484,0.050598,0.017588,0.276269,0.366422,0.342553,0.570370,0.680556,0.600000,0.659420,...,0.522307,0.506097,0.546718,0.578595,0.551355,0.569474,0.566679,0.539655,0.511482,0.514989
20283,0.854769,0.267433,0.132266,0.413720,0.035448,0.099332,0.629630,0.520833,0.555556,0.536232,...,0.557999,0.534780,0.541611,0.533002,0.502840,0.510659,0.519714,0.516524,0.532241,0.554568
8497,0.771299,0.076534,0.058094,0.543314,0.072555,0.238038,0.696296,0.569444,0.629630,0.594203,...,0.628553,0.600274,0.613608,0.627976,0.618468,0.645622,0.670352,0.676652,0.671136,0.666929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.662105,0.094222,0.039694,0.305918,0.161570,0.178936,0.585185,0.645833,0.585185,0.623188,...,0.361392,0.326098,0.346955,0.325294,0.315932,0.319932,0.329771,0.340788,0.342192,0.345865
11284,0.903892,0.312920,0.145513,0.282260,0.024587,0.048891,0.548148,0.604167,0.525926,0.550725,...,0.454581,0.436537,0.446711,0.428182,0.399871,0.411214,0.422085,0.421952,0.435128,0.456732
38158,0.662223,0.088154,0.036556,0.305925,0.169441,0.193052,0.622222,0.652778,0.622222,0.644928,...,0.367083,0.351960,0.353629,0.331461,0.318642,0.337477,0.354532,0.365566,0.368428,0.368840
860,0.590818,0.123774,0.036316,0.195927,0.538892,0.569089,0.785185,0.868056,0.837037,0.869565,...,0.479448,0.504887,0.509196,0.552310,0.582199,0.611363,0.606360,0.558969,0.520983,0.526403


In [15]:
X_train_img.values

array([['./data/train_images/75367331.jpeg'],
       ['./data/train_images/192668447.jpeg'],
       ['./data/train_images/195261292.jpeg'],
       ...,
       ['./data/train_images/184470019.jpeg'],
       ['./data/train_images/196704567.jpeg'],
       ['./data/train_images/194677586.jpeg']], dtype=object)

In [16]:
X_train_anc.iloc[16051]

WORLDCLIM_BIO1_annual_mean_temperature                                   0.495795
WORLDCLIM_BIO12_annual_precipitation                                     0.234190
WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month    0.119638
WORLDCLIM_BIO15_precipitation_seasonality                                0.364546
WORLDCLIM_BIO4_temperature_seasonality                                   0.204074
                                                                           ...   
VOD_X_1997_2018_multiyear_mean_m08                                       0.469550
VOD_X_1997_2018_multiyear_mean_m09                                       0.501000
VOD_X_1997_2018_multiyear_mean_m10                                       0.530023
VOD_X_1997_2018_multiyear_mean_m11                                       0.570208
VOD_X_1997_2018_multiyear_mean_m12                                       0.618869
Name: 41795, Length: 163, dtype: float64

In [80]:
X_train_anc.columns

Index(['WORLDCLIM_BIO1_annual_mean_temperature',
       'WORLDCLIM_BIO12_annual_precipitation',
       'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month',
       'WORLDCLIM_BIO15_precipitation_seasonality',
       'WORLDCLIM_BIO4_temperature_seasonality',
       'WORLDCLIM_BIO7_temperature_annual_range',
       'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg',
       'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg',
       ...
       'VOD_X_1997_2018_multiyear_mean_m03',
       'VOD_X_1997_2018_multiyear_mean_m04',
       'VOD_X_1997_2018_multiyear_mean_m05',
       'VOD_X_1997_2018_multiyear_mean_m06',
       'VOD_X_1997_2018_multiyear_mean_m07',
       'VOD_X_1997_2018_multiyear_mean_m08',
       'VOD_X_1997_2018_multiyear_mean_m09',
       'VOD_X_1997_2018_multiyear_mean_m10',
       'VOD_X_1997_2018_multiyear_mean_m11',
       'VOD_X_1997_2018_multiyear_mean_m12'],
      dtype='object', length=163)

In [81]:
train_targets.size

260178

In [82]:
min_train

X4_mean        0.256900
X11_mean       4.904341
X18_mean       9.888375
X26_mean       8.149139
X50_mean       2.681370
X3112_mean    12.893096
dtype: float64

In [83]:
train_targets_norm

Unnamed: 0,X4_mean,X11_mean,X18_mean,X26_mean,X50_mean,X3112_mean
0,0.746281,0.237248,0.038194,0.008633,0.652971,0.118959
1,0.701309,0.524265,0.029574,0.005407,0.259734,0.091455
2,0.998969,0.087936,0.138284,0.000112,0.650511,0.003729
3,0.535422,0.723667,0.144579,0.031804,0.326607,0.315008
4,0.721260,0.541546,0.090828,0.043050,0.424622,0.443002
...,...,...,...,...,...,...
43358,0.743541,0.378794,0.013057,0.000019,0.205110,0.045957
43359,0.916569,0.255141,0.734849,0.135033,0.458783,0.311850
43360,0.631141,0.513172,0.009935,0.000136,0.389111,0.018660
43361,0.661591,0.511980,0.008155,0.000063,0.369059,0.006932


In [84]:
# regression model want to use these features as input, but you need to normalize for input data
# normalize to -1 and 1 by standard scaling it
# drop log? and just use original values but normalized, drop 99.5% and less than 0.5% 

In [17]:
X_train_img

Unnamed: 0,0
16051,./data/train_images/75367331.jpeg
10646,./data/train_images/192668447.jpeg
24766,./data/train_images/195261292.jpeg
20283,./data/train_images/192546297.jpeg
8497,./data/train_images/133774591.jpeg
...,...
6265,./data/train_images/192047767.jpeg
11284,./data/train_images/140018028.jpeg
38158,./data/train_images/184470019.jpeg
860,./data/train_images/196704567.jpeg


## Data Loader for Efficiency

In [18]:
# Feature extractor for ViT
feature_extractor = ViTFeatureExtractor(do_resize=True, size=224).from_pretrained("google/vit-base-patch16-224-in21k")

# Custom dataset class
class PlantDataset(Dataset):
    def __init__(self, image_paths, ancillary_data, targets=None, transform=None):
        self.image_paths = image_paths
        self.ancillary_data = ancillary_data
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # print(f"trying to retrieve id: {idx}")
        # print(f"can't find it: {self.image_paths[idx]}")
        image = plt.imread(str(self.image_paths[idx]))
        if self.transform:
            image = self.transform(image)
        # print(f"gets to anc data of length: {len(self.ancillary_data)}")
        # print(f"{self.ancillary_data.iloc[idx]}")
        ancillary_data = torch.tensor(self.ancillary_data.iloc[idx], dtype=torch.float)
        # print('gets to targets')
        if self.targets is not None:
            target = torch.tensor(self.targets.iloc[idx], dtype=torch.float)
            return image, ancillary_data, target
        return image, ancillary_data

# Define transformations
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(CFG.image_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
])


# Create datasets
dataset = PlantDataset(train_df['image_path'], train_ancillary_norm, train_targets_norm, transform=transform)
test_dataset = PlantDataset(test_df['image_path'], test_ancillary_norm, transform=transform)
trainset, testset = train_test_split(dataset, test_size= 0.2)

# Create dataloaders
train_loader = DataLoader(trainset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(testset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)


  ancillary_data = torch.tensor(self.ancillary_data.iloc[idx], dtype=torch.float)
  target = torch.tensor(self.targets.iloc[idx], dtype=torch.float)


## Model Definition

In [19]:
class CombinedModel(nn.Module):
    def __init__(self, num_classes, ancillary_dim):
        super(CombinedModel, self).__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.ancillary_net = nn.Sequential(
            nn.Linear(ancillary_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.Linear(768 + 64, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, images, ancillary):
        vit_features = self.vit(pixel_values=images).pooler_output
        ancillary_features = self.ancillary_net(ancillary)
        combined = torch.cat((vit_features, ancillary_features), dim=1)
        output = self.fc(combined)
        return output

# Initialize model, loss function, and optimizer
model = CombinedModel(num_classes=CFG.num_classes, ancillary_dim=X_train_anc.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

device = torch.device("mps")
print(device)
model.to(device)


mps


CombinedModel(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, 

In [20]:
# XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)

# Train XGBoost model on ancillary data
xgb_model.fit(train_ancillary_norm, train_targets_norm)


In [94]:
print(train_df['image_path'])

0        ./data/train_images/101801795.jpeg
1        ./data/train_images/115813315.jpeg
2        ./data/train_images/173551949.jpeg
3        ./data/train_images/148811120.jpeg
4        ./data/train_images/195108876.jpeg
                        ...                
43358    ./data/train_images/172502909.jpeg
43359    ./data/train_images/183294324.jpeg
43360    ./data/train_images/108577580.jpeg
43361    ./data/train_images/139067673.jpeg
43362    ./data/train_images/195383621.jpeg
Name: image_path, Length: 43363, dtype: object


## Model Training

In [24]:
# Training loop
for epoch in range(CFG.epochs):
    print(f"Started epoch {epoch}")
    model.train()
    train_loss = 0.0
    count = 1
    for images, ancillary, targets in train_loader:
        images, ancillary, targets = images.to(device), ancillary.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(images, ancillary)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if epoch < 1 and count % 10 == 0:
            print(f"done {count} batches with loss: {train_loss}")    
        count += 1
    print("finished training...testing now.....")
    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for images, ancillary, targets in val_loader:
            images, ancillary, targets = images.to(device), ancillary.to(device), targets.to(device)
            outputs = model(images, ancillary)
            loss = criterion(outputs, targets)
            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{CFG.epochs}, Train Loss: {train_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}")

# Save the best model
torch.save(model.state_dict(), 'best_model.pth')


Started epoch 0
done 10 batches with loss: 0.24581088311970234
done 20 batches with loss: 0.4786947909742594
done 30 batches with loss: 0.716229947283864
done 40 batches with loss: 0.9792259819805622
done 50 batches with loss: 1.2260818034410477
done 60 batches with loss: 1.4803067073225975
done 70 batches with loss: 1.7240476571023464
done 80 batches with loss: 1.9581753592938185
done 90 batches with loss: 2.1982966791838408
done 100 batches with loss: 2.4467715844511986
done 110 batches with loss: 2.670454306527972
done 120 batches with loss: 2.906502839177847
done 130 batches with loss: 3.1484453342854977
done 140 batches with loss: 3.3765039686113596
done 150 batches with loss: 3.6160497032105923
done 160 batches with loss: 3.838909083046019
done 170 batches with loss: 4.037545613013208
done 180 batches with loss: 4.265241087414324
done 190 batches with loss: 4.497844177298248
done 200 batches with loss: 4.727375137619674
done 210 batches with loss: 4.937304326333106
done 220 batch

## Predictions

In [26]:
# Load the best model
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

# Evaluate transformer model
transformer_preds = []
with torch.no_grad():
    for images, ancillary in test_loader:
        images, ancillary = images.to(device), ancillary.to(device)
        outputs = model(images, ancillary)
        transformer_preds.append(outputs.cpu().numpy())
transformer_preds = np.concatenate(transformer_preds, axis=0)

xgb_preds = xgb_model.predict(test_ancillary_norm)

# Combine predictions
combined_preds = (transformer_preds + xgb_preds) / 2


  ancillary_data = torch.tensor(self.ancillary_data.iloc[idx], dtype=torch.float)


In [54]:
test_predictions_transformers = [pred * (max_train - min_train) + min_train for pred in transformer_preds]
test_predictions_xgb = [pred * (max_train - min_train) + min_train for pred in xgb_preds]
test_predictions_xgb = np.expm1(test_predictions_xgb)
# test_predictions_1 = (test_predictions_xgb + test_predictions_xgb)/2
test_predictions = [pred * (max_train - min_train) + min_train for pred in combined_preds]
test_predictions = np.expm1(test_predictions)

In [47]:
test_predictions

array([[1.06485270e+00, 1.47848884e+02, 1.97073509e+04, 3.54538129e+03,
        1.50895952e+01, 4.02236324e+05],
       [1.05089537e+00, 1.48436786e+02, 1.97021496e+04, 3.48798406e+03,
        1.52220723e+01, 3.99093673e+05],
       [9.33579240e-01, 1.49649953e+02, 1.96993165e+04, 3.46116354e+03,
        1.49233403e+01, 3.98142253e+05],
       ...,
       [1.13527719e+00, 1.44030916e+02, 1.97077393e+04, 3.52365345e+03,
        1.52885503e+01, 4.00363069e+05],
       [1.11396390e+00, 1.45240748e+02, 1.97066393e+04, 3.53477351e+03,
        1.50473956e+01, 4.01071167e+05],
       [9.80018360e-01, 1.48824433e+02, 1.96996062e+04, 3.46383393e+03,
        1.50005250e+01, 3.98280493e+05]])

In [55]:
# Prepare submission
submission_df = pd.DataFrame(test_predictions_xgb, columns=target_names['trait_ID'])
submission_df.insert(0, 'id', test_df['id'])
submission_df.to_csv('submission.csv', index=False)