## Imports

In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


## Data Loading and Normalization

In [15]:

# Load CSV files
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')
target_names = pd.read_csv('./data/target_name_meta.tsv', sep='\t')


In [16]:


# Normalize ancillary data
ancillary_columns = [col for col in train_df.columns if col.startswith(('WORLDCLIM_BIO', 'SOIL', 'MODIS', 'VOD'))]
scaler = StandardScaler()
train_ancillary = scaler.fit_transform(train_df[ancillary_columns])
test_ancillary = scaler.transform(test_df[ancillary_columns])

# Extract and transform targets
target_columns = [col for col in train_df.columns if col.endswith('_mean')]
train_targets = np.log1p(train_df[target_columns].values)



In [17]:
# Remove outliers exceeding three standard deviations above or below the mean
mean = np.mean(train_targets, axis=0)
std_dev = np.std(train_targets, axis=0)
cut_off = 3 * std_dev
lower, upper = mean - cut_off, mean + cut_off
non_outlier_mask = np.all((train_targets >= lower) & (train_targets <= upper), axis=1)
train_targets = train_targets[non_outlier_mask]
train_ancillary = train_ancillary[non_outlier_mask]
train_df = train_df[non_outlier_mask]

# Min-max normalize the transformed target data
min_train = np.min(train_targets, axis=0)
max_train = np.max(train_targets, axis=0)

train_targets_norm = (train_targets - min_train) / (max_train - min_train)
# Split the data for training and validation
X_train_img, X_val_img, X_train_anc, X_val_anc, y_train, y_val = train_test_split(
    train_df['id'].values, train_ancillary, train_targets_norm, test_size=0.2, random_state=42
)

In [None]:
min_train

In [18]:
train_targets

array([[ 0.7108186 ,  4.96648147,  9.8884208 ,  8.15077225,  2.82388774,
        12.89768237],
       [ 0.68346469,  5.04165741,  9.88841052,  8.15016208,  2.73805972,
        12.89662201],
       [ 0.86451366,  4.92737347,  9.88854021,  8.14916064,  2.82335081,
        12.89323988],
       ...,
       [ 0.64078592,  5.03875191,  9.88838709,  8.14916527,  2.76629749,
        12.89381552],
       [ 0.65930674,  5.03843968,  9.88838496,  8.14915135,  2.761921  ,
        12.89336335],
       [ 0.61358328,  5.0130297 ,  9.88840106,  8.15030064,  2.75938686,
        12.89477162]])

## Dataset Creation + Image Augmentation

In [14]:
def augment_image(image):
    # Apply horizontal and vertical flipping
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    # Adjust contrast, saturation, and brightness
    image = tf.image.random_contrast(image, lower=0.9, upper=1.1)
    image = tf.image.random_saturation(image, lower=0.9, upper=1.1)
    image = tf.image.random_brightness(image, max_delta=0.1)
    # Clip pixel values to range [0, 1]
    image = tf.clip_by_value(image, 0.0, 1.0)
    return image


In [24]:
def load_and_preprocess_image(image_path, img_size=(128, 128)):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, img_size)
    img = img / 255.0  # Normalize to [0, 1] range
    return img

def load_and_preprocess_data(image_id, ancillary_data, target, img_dir, augment=False):
    image_id = tf.strings.as_string(image_id)  # Convert image_id to string
    image_path = tf.strings.join([img_dir, image_id, '.jpeg'])
    image = load_and_preprocess_image(image_path)
    if augment:
        image = augment_image(image)
    return (image, ancillary_data), target

def create_dataset(image_ids, ancillary_data, targets, img_dir, batch_size=32, shuffle=True, augment=False):
    dataset = tf.data.Dataset.from_tensor_slices((image_ids, ancillary_data, targets))
    
    dataset = dataset.map(lambda img_id, anc, tgt: load_and_preprocess_data(img_id, anc, tgt, img_dir, augment=augment),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(image_ids))
        
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

def load_and_preprocess_augmented_data(image_id, ancillary_data, target, augment_flag, img_dir):
    image_id = tf.strings.as_string(image_id)  # Convert image_id to string
    image_path = tf.strings.join([img_dir, image_id, '.jpeg'])
    image = load_and_preprocess_image(image_path)
    if augment_flag == 1:
        image = augment_image(image)
    return (image, ancillary_data), target


# Duplicate the training dataset for augmentation
# augmented_img_ids = np.tile(X_train_img, 2)
# augmented_ancillary = np.tile(X_train_anc, (2, 1))
# augmented_targets = np.tile(y_train, (2, 1))

# don't duplicate for 0 augmentation
augmented_img_ids = X_train_img
augmented_ancillary = X_train_anc
augmented_targets = y_train

# Set augment flag for the second half of the dataset
augment_flags = np.ones(len(X_train_img))




# Create training and validation datasets with augmentation
# Create the combined original and augmented training dataset
train_dataset = tf.data.Dataset.from_tensor_slices((augmented_img_ids, augmented_ancillary, augmented_targets, augment_flags))

train_dataset = train_dataset.map(lambda img_id, anc, tgt, aug: load_and_preprocess_augmented_data(img_id, anc, tgt, aug, './data/train_images/'),
                                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_dataset = train_dataset.shuffle(buffer_size=len(augmented_img_ids))
train_dataset = train_dataset.batch(16)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

val_dataset = create_dataset(X_val_img, X_val_anc, y_val, './data/train_images/', batch_size=16, shuffle=False, augment=True)



In [20]:
train_dataset

<_PrefetchDataset element_spec=((TensorSpec(shape=(None, 128, 128, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 163), dtype=tf.float64, name=None)), TensorSpec(shape=(None, 6), dtype=tf.float64, name=None))>

## Model Definition

In [21]:
# Model Definition
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, Dropout, concatenate, GlobalAveragePooling2D
from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [22]:

def create_combined_model(image_shape, ancillary_shape):
    # Image feature extractor
    base_model = InceptionResNetV2(include_top=False, weights='imagenet', input_shape=image_shape)
    base_model.trainable = False  # Freeze the base model
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    
    # Ancillary data input
    inputs_anc = Input(shape=(ancillary_shape,))
    y = Dense(128, activation='relu')(inputs_anc)
    y = Dropout(0.5)(y)
    
    # Combine image features with ancillary data
    combined = concatenate([x, y])
    z = Dense(256, activation='relu')(combined)
    z = Dropout(0.5)(z)
    z = Dense(6, activation='linear')(z)
    
    model = Model(inputs=[base_model.input, inputs_anc], outputs=z)
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error', metrics=['mae', 'mse'])
    return model

image_shape = (128, 128, 3)
ancillary_shape = X_train_anc.shape[1]

model = create_combined_model(image_shape, ancillary_shape)
model.summary()


## Training

In [25]:
# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

# Training
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=50,
    callbacks=[early_stopping, model_checkpoint]
)

# Verify GPU usage
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Epoch 1/50
[1m1978/1978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 130ms/step - loss: 16768.3965 - mae: 70.7963 - mse: 16768.3965 - val_loss: 32.8303 - val_mae: 1.4666 - val_mse: 32.8303
Epoch 2/50
[1m1978/1978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1324s[0m 668ms/step - loss: 247.4746 - mae: 3.4109 - mse: 247.4746 - val_loss: 7.5060 - val_mae: 0.4943 - val_mse: 7.5060
Epoch 3/50
[1m1978/1978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 105ms/step - loss: 49.5888 - mae: 0.9657 - mse: 49.5888 - val_loss: 3.4258 - val_mae: 0.3467 - val_mse: 3.4258
Epoch 4/50
[1m1978/1978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 107ms/step - loss: 9.5031 - mae: 0.4816 - mse: 9.5031 - val_loss: 1.5336 - val_mae: 0.2914 - val_mse: 1.5336
Epoch 5/50
[1m1978/1978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 107ms/step - loss: 3.7460 - mae: 0.3366 - mse: 3.7460 - val_loss: 0.9966 - val_mae: 0.2449 - val_mse: 0.9966
Epoch 6/50
[1m1978/1978[0m [3

KeyboardInterrupt: 

In [None]:
target_columns

['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']

## Predictions

In [26]:
# Load images
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def load_images(image_dir, image_ids, img_size=(128, 128)):
    images = []
    for image_id in image_ids:
        img_path = os.path.join(image_dir, f'{image_id}.jpeg')
        img = load_img(img_path, target_size=img_size)
        img_array = img_to_array(img)
        images.append(img_array)
    return np.array(images)

In [27]:
test_images = load_images('./data/test_images', test_df['id'])


In [28]:
import numpy as np


# Load the best model
model.load_weights('best_model.keras')


In [37]:

# Make predictions for the test dataset
test_predictions_norm = model.predict([test_images, test_ancillary])
test_predictions_norm



[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 92ms/step


array([[0.54428643, 0.4242177 , 0.12447885, 0.0570288 , 0.46731362,
        0.14728291],
       [0.54428643, 0.4242177 , 0.12447885, 0.0570288 , 0.46731362,
        0.14728291],
       [0.54428643, 0.4242177 , 0.12447885, 0.0570288 , 0.46731362,
        0.14728291],
       ...,
       [0.54428643, 0.4242177 , 0.12447885, 0.0570288 , 0.46731362,
        0.14728291],
       [0.54428643, 0.4242177 , 0.12447885, 0.0570288 , 0.46731362,
        0.14728291],
       [0.54428643, 0.4242177 , 0.12447885, 0.0570288 , 0.46731362,
        0.14728291]], dtype=float32)

In [35]:
# Reverse the normalization
test_predictions = test_predictions_norm * (max_train - min_train) + min_train

# Exponentiate the predictions to reverse the log1p transformation
test_predictions = np.expm1(test_predictions)

In [36]:
test_predictions

array([[0.7233782 , 0.5283943 , 0.13255808, 0.0586863 , 0.59570175,
        0.15868172],
       [0.7233782 , 0.5283943 , 0.13255808, 0.0586863 , 0.59570175,
        0.15868172],
       [0.7233782 , 0.5283943 , 0.13255808, 0.0586863 , 0.59570175,
        0.15868172],
       ...,
       [0.7233782 , 0.5283943 , 0.13255808, 0.0586863 , 0.59570175,
        0.15868172],
       [0.7233782 , 0.5283943 , 0.13255808, 0.0586863 , 0.59570175,
        0.15868172],
       [0.7233782 , 0.5283943 , 0.13255808, 0.0586863 , 0.59570175,
        0.15868172]], dtype=float32)

In [34]:
# Prepare submission
submission_df = pd.DataFrame(test_predictions, columns=target_names['trait_ID'])
submission_df.insert(0, 'id', test_df['id'])
submission_df.to_csv('submission.csv', index=False)