In [27]:
import numpy as np
import pandas as pd
import os

import albumentations
import cv2
import timm
import torch.nn as nn
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import pytorch_lightning as pl
from pytorch_lightning import callbacks
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import LightningDataModule

import glob
import matplotlib.pyplot as plt

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split

import clip
from PIL import Image
from joblib import dump, load
from timm.data.transforms_factory import create_transform
from timm.data import resolve_data_config

import time
import gc

In [2]:
SEED = 2023

DATA_DIR = 'data'

TRAIN_IMAGES_DIR = os.path.join(DATA_DIR, 'train')
TEST_IMAGES_DIR = os.path.join(DATA_DIR, 'test')

OUTPUT_DIR = "output"

In [3]:
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

df_train['path'] = df_train['Id'].map(lambda x: os.path.join(TRAIN_IMAGES_DIR, f'{x}.jpg'))
df_test['path'] = df_test['Id'].map(lambda x: os.path.join(TEST_IMAGES_DIR, f'{x}.jpg'))

In [4]:
def create_folds(data: pd.DataFrame, num_splits = 5):
    data["fold"] = -1
    num_bins = int(np.floor(1 + np.log2(len(data)))) # sturge's rule

    data.loc[:, "bins"] = pd.cut(data["Pawpularity"], bins=num_bins, labels=False)
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)

    for fold_index, (train_idx, val_idx) in enumerate(skf.split(X=data, y=data.bins.values)):
        data.loc[val_idx, 'fold'] = fold_index

    data = data.drop('bins', axis=1)

    return data

In [5]:
df_train_5 = create_folds(df_train)

# Extract Image Embeddings from Each Architecture

In [6]:
names = [
    'deit_base_distilled_patch16_384',
    'ig_resnext101_32x48d',
    'repvgg_b0',
    'resnetv2_152x4_bitm',
    'swsl_resnext101_32x8d',
    'tf_efficientnet_l2_ns_475',
    'vit_base_patch16_384',
    'vit_large_r50_s32_384',
]

In [7]:
class PawpularDataset:
    def __init__(self, image_paths, modelcfg=None):
        self.image_paths = image_paths
        self.transform = create_transform(**modelcfg)
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, item):
        img = Image.open(self.image_paths[item]).convert('RGB')
        img = self.transform(img)
        return img

In [12]:
def create_embeddings(df: pd.DataFrame, model_names: list[str]):
    embeddings = {}
    for model_name in model_names:
        starttime = time.time()
        model = timm.create_model(model_name=model_name, pretrained=True).to('cuda')
        model.eval()

        train_dataset = PawpularDataset(image_paths=df['path'].values, modelcfg=resolve_data_config({}, model=model))
        train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False)

        print(model_name, resolve_data_config({}, model=model_name))

        with torch.no_grad():
            result = [model(img.to('cuda')).cpu().numpy() for img in train_dataloader]
        result = np.concatenate(result, 0)

        embeddings[model_name] = result
        print(f"{model_name}: {int(time.time() - starttime)}s")

    return embeddings

In [13]:
train_embeddings = create_embeddings(df_train, names)

deit_base_distilled_patch16_384 {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
deit_base_distilled_patch16_384: 1193s


  model = create_fn(


Downloading model.safetensors:   0%|          | 0.00/356M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ig_resnext101_32x48d {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
ig_resnext101_32x48d: 406s


Downloading model.safetensors:   0%|          | 0.00/63.5M [00:00<?, ?B/s]

repvgg_b0 {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
repvgg_b0: 133s


  model = create_fn(


Downloading model.safetensors:   0%|          | 0.00/3.75G [00:00<?, ?B/s]

resnetv2_152x4_bitm {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
resnetv2_152x4_bitm: 3774s


  model = create_fn(


Downloading model.safetensors:   0%|          | 0.00/356M [00:00<?, ?B/s]

swsl_resnext101_32x8d {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
swsl_resnext101_32x8d: 314s


  model = create_fn(


tf_efficientnet_l2_ns_475 {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
tf_efficientnet_l2_ns_475: 2398s


Downloading model.safetensors:   0%|          | 0.00/347M [00:00<?, ?B/s]

vit_base_patch16_384 {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
vit_base_patch16_384: 705s


Downloading model.safetensors:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

vit_large_r50_s32_384 {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
vit_large_r50_s32_384: 790s


In [17]:
dump(train_embeddings, os.path.join(OUTPUT_DIR, 'train_embeddings.joblib'))

['output\\train_embeddings.joblib']

In [21]:
train_embeddings = load(os.path.join(OUTPUT_DIR, 'train_embeddings.joblib'))

In [22]:
for m in train_embeddings.keys():
    print(f"{m}: {train_embeddings[m].shape}")

deit_base_distilled_patch16_384: (9912, 1000)
ig_resnext101_32x48d: (9912, 1000)
repvgg_b0: (9912, 1000)
resnetv2_152x4_bitm: (9912, 1000)
swsl_resnext101_32x8d: (9912, 1000)
tf_efficientnet_l2_ns_475: (9912, 1000)
vit_base_patch16_384: (9912, 1000)
vit_large_r50_s32_384: (9912, 1000)


In [54]:
for model_name in train_embeddings.keys():
    model = timm.create_model(model_name=model_name, pretrained=True)
    torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'base_models', f'{model_name}.ckpt'))

  model = create_fn(
  model = create_fn(
  model = create_fn(
  model = create_fn(


In [50]:
def fit_svr(df_train: pd.DataFrame, embeddings_train: np.array, df_test: pd.DataFrame, embeddings_test: np.array):
    train_predictions = np.zeros(df_train.shape[0])
    test_predictions = np.zeros(df_test.shape[0])

    n_folds = df_train['fold'].max() + 1
    for fold in tqdm(range(n_folds)):
        train_idx = df_train['fold'] != fold
        val_idx = df_train['fold'] == fold

        model = make_pipeline(StandardScaler(), SVR(C=16.0, kernel='rbf', degree=3, max_iter=10000))
        model.fit(train_embeddings[train_idx], df_train['Pawpularity'][train_idx])

        train_predictions[val_idx] = np.clip(model.predict(train_embeddings[val_idx]), 1, 100)
        test_predictions += np.clip(model.predict(embeddings_test), 1, 100)

        del model
        gc.collect()

    test_predictions /= n_folds

    return train_predictions, test_predictions

In [51]:
for col in names:
    predictions = fit_svr(df_train, train_embeddings[col].copy())
    print(f"{col}: {mean_squared_error(df_train['Pawpularity'], predictions, squared=False)}")

100%|██████████| 5/5 [02:23<00:00, 28.73s/it]


deit_base_distilled_patch16_384: 17.746481730444195


100%|██████████| 5/5 [02:23<00:00, 28.68s/it]


ig_resnext101_32x48d: 17.838338085632664


100%|██████████| 5/5 [02:24<00:00, 28.95s/it]


repvgg_b0: 18.29408309587672


100%|██████████| 5/5 [02:24<00:00, 28.81s/it]


resnetv2_152x4_bitm: 18.115660189738307


100%|██████████| 5/5 [02:17<00:00, 27.58s/it]


swsl_resnext101_32x8d: 17.963666051415686


100%|██████████| 5/5 [02:17<00:00, 27.51s/it]


tf_efficientnet_l2_ns_475: 17.621301474475345


100%|██████████| 5/5 [02:17<00:00, 27.46s/it]


vit_base_patch16_384: 17.912905349526373


100%|██████████| 5/5 [02:20<00:00, 28.08s/it]

vit_large_r50_s32_384: 18.010305929024177





In [52]:
concatenated_embeddings = np.concatenate([train_embeddings[k] for k in names], 1)
predictions = fit_svr(df_train, concatenated_embeddings.copy())

100%|██████████| 5/5 [18:15<00:00, 219.03s/it]

Overall MSE: 17.315638205844333





In [53]:
print(f"Overall MSE: {mean_squared_error(df_train['Pawpularity'], predictions, squared=False)}")

Overall MSE: 17.315638205844333


In [55]:
pd.__version__

'2.1.1'