In [1]:
import numpy as np
import pandas as pd
import os
import pickle

import albumentations
import cv2
import timm
import torch.nn as nn
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

import pytorch_lightning as pl
from pytorch_lightning import callbacks
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import LightningDataModule

import glob
import matplotlib.pyplot as plt

import clip
from PIL import Image
from joblib import dump, load

In [2]:
# Seed passed to the randomized steps in this notebook (fold assignment and the
# train/validation split).
SEED = 2023

# Dataset root, given relative to the working directory; train.csv is read from here.
DATA_DIR = 'data'

# Image folders for the train and test splits.
TRAIN_IMAGES_DIR = os.path.join(DATA_DIR, 'train')
TEST_IMAGES_DIR = os.path.join(DATA_DIR, 'test')

# Folder that receives the pickled CLIP model and the fitted SVR pipeline.
OUTPUT_DIR = "output"

In [3]:
def create_folds(data: pd.DataFrame, num_splits = 5):
    """Assign every row a stratified cross-validation fold number.

    ``Pawpularity`` is discretised into equal-width bins (bin count chosen with
    Sturges' rule) and those bins are used as the stratification target of a
    shuffled ``StratifiedKFold`` seeded with ``SEED``.

    Args:
        data: Frame with a ``Pawpularity`` column. Its index may be anything;
            rows are addressed by position.
        num_splits: Number of folds.

    Returns:
        A new DataFrame equal to ``data`` plus an integer ``fold`` column whose
        value is the fold in which the row is part of the validation split.

    Side effects:
        As before, the ``fold`` column is also written into ``data`` itself.
        The temporary bin labels are no longer written into the caller's frame.
    """
    data["fold"] = -1
    num_bins = int(np.floor(1 + np.log2(len(data))))  # Sturges' rule

    # Keep the bin labels in a local Series instead of a throw-away column.
    bins = pd.cut(data["Pawpularity"], bins=num_bins, labels=False)
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)

    # skf.split yields *positional* indices, so assign with iloc; using loc
    # would treat them as index labels and break on a non-default index.
    fold_column = data.columns.get_loc("fold")
    for fold_index, (_, val_idx) in enumerate(skf.split(X=data, y=bins.values)):
        data.iloc[val_idx, fold_column] = fold_index

    return data.copy()

In [4]:
# Names of the binary metadata columns that accompany every image in the CSV.
dense_features = [
    'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
    'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'
]

class PetFinderClipDataset(Dataset):
    """Dataset yielding ``(image_id, dense_features, image, target)`` per row.

    Args:
        df: Frame with an ``Id`` column and every column listed in
            ``dense_features``. ``Pawpularity`` is optional; when it is absent
            every target is ``-1``.
        dir: Folder containing ``<Id>.jpg`` for each row. (The name shadows the
            builtin but is kept so keyword callers keep working.)
        augmentations: Callable invoked as ``augmentations(pil_image)``, or
            ``None`` to return the PIL image unchanged. The notebook passes the
            ``preprocess`` transform returned by ``clip.load``.
    """

    def __init__(self, df: pd.DataFrame, dir: str, augmentations):
        self.ids = df["Id"].values
        if "Pawpularity" in df.columns:
            self.targets = df["Pawpularity"].values
        else:
            self.targets = [-1] * len(df)
        self.dense_features = df[dense_features].values

        self.image_paths = [os.path.join(dir, f"{image_id}.jpg") for image_id in self.ids]

        self.augmentations = augmentations

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image_id = self.ids[item]

        # Image.open is lazy: decode the pixels while the file is open so the
        # handle can be released deterministically instead of waiting for GC.
        with Image.open(self.image_paths[item]) as image:
            image.load()
            if self.augmentations is not None:
                image = self.augmentations(image)

        features = self.dense_features[item, :]
        targets = self.targets[item]

        return image_id, torch.tensor(features, dtype=torch.float), image, torch.tensor(targets, dtype=torch.float)
    
class PetFinderClipDataModule(LightningDataModule):
    """Bundles the train / val / test frames with their image folders and transforms.

    Each ``*_dataloader`` method wraps the matching frame in a
    ``PetFinderClipDataset`` and returns a ``DataLoader`` over it. Only the
    training loader shuffles.
    """

    def __init__(self, 
                 df_train=None, df_val=None, df_test=None, 
                 train_images_dir=None, val_images_dir=None, test_images_dir=None, 
                 train_augmentations=None, val_augmentations=None, test_augmentations=None, 
                 batch_size=64
                ):
        super().__init__()
        self.df_train, self.df_val, self.df_test = df_train, df_val, df_test
        self.train_images_dir, self.val_images_dir, self.test_images_dir = (
            train_images_dir, val_images_dir, test_images_dir
        )
        self.train_augmentations, self.val_augmentations, self.test_augmentations = (
            train_augmentations, val_augmentations, test_augmentations
        )
        self.batch_size = batch_size

    def _build_loader(self, frame, images_dir, augmentations, shuffle):
        # Shared construction path for the three public loader methods.
        dataset = PetFinderClipDataset(frame, images_dir, augmentations)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        return self._build_loader(self.df_train, self.train_images_dir, self.train_augmentations, shuffle=True)

    def val_dataloader(self):
        return self._build_loader(self.df_val, self.val_images_dir, self.val_augmentations, shuffle=False)

    def test_dataloader(self):
        return self._build_loader(self.df_test, self.test_images_dir, self.test_augmentations, shuffle=False)

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Cache the (model, preprocess) pair returned by clip.load as a pickle.
# The pair is loaded *before* the file is opened so a failed load cannot leave a
# truncated pickle behind, and the target folder is created if it is missing.
clip_bundle = clip.load("RN50x16", device=device)
base_models_dir = os.path.join(OUTPUT_DIR, 'base_models')
os.makedirs(base_models_dir, exist_ok=True)
with open(os.path.join(base_models_dir, 'RN50x16.pkl'), 'wb') as f:
    pickle.dump(clip_bundle, f)

In [15]:
# Restore the cached (model, preprocess) pair. pickle.load executes arbitrary
# code, so only point this at a file written by this notebook.
clip_pickle_path = os.path.join(OUTPUT_DIR, 'base_models', 'RN50x16.pkl')
with open(clip_pickle_path, 'rb') as pickle_file:
    model, preprocess = pickle.load(pickle_file)

model, preprocess

(CLIP(
   (visual): ModifiedResNet(
     (conv1): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
     (bn1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (relu1): ReLU(inplace=True)
     (conv2): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     (bn2): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (relu2): ReLU(inplace=True)
     (conv3): Conv2d(48, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     (bn3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (relu3): ReLU(inplace=True)
     (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
     (layer1): Sequential(
       (0): Bottleneck(
         (conv1): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
         (bn1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
         (relu1): ReLU(in

In [4]:
# Load the RN50x16 CLIP checkpoint with clip.load and bind the returned pair to
# `model` / `preprocess`, targeting the GPU when torch reports one, else the CPU.
# NOTE(review): read top to bottom, this rebinds the `model` / `preprocess` that
# the preceding cell restored from the pickle, and the execution count suggests
# it ran in a different order — confirm which of the two loaders should remain.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("RN50x16", device=device)

100%|███████████████████████████████████████| 338M/338M [00:37<00:00, 9.43MiB/s]


In [16]:
train_csv_path = os.path.join(DATA_DIR, 'train.csv')
df_all = pd.read_csv(train_csv_path)

# 80/20 split stratified on the raw Pawpularity score; both parts get a fresh
# 0..n-1 index.
df_train, df_val = (
    split.reset_index(drop=True)
    for split in train_test_split(
        df_all,
        test_size=0.2,
        random_state=SEED,
        shuffle=True,
        stratify=df_all['Pawpularity'],
    )
)

In [17]:
def get_features(dataloader):
    """Encode every image yielded by ``dataloader`` with the notebook's CLIP ``model``.

    Relies on the notebook-level ``model`` and ``device``.

    Args:
        dataloader: Iterable of ``(image_ids, dense_features, images, labels)``
            batches, as produced by ``PetFinderClipDataset``. Only ``images``
            and ``labels`` are used.

    Returns:
        ``(features, labels)`` as NumPy arrays. Rows follow the order in which
        the loader yields samples, so a shuffling loader gives shuffled rows
        (features and labels stay aligned with each other).
    """
    feature_batches = []
    label_batches = []

    with torch.no_grad():
        for _image_ids, _dense, images, labels in tqdm(dataloader):
            embeddings = model.encode_image(images.to(device))

            # Move each batch off the accelerator straight away so device memory
            # holds one batch at a time rather than the whole dataset.
            feature_batches.append(embeddings.cpu())
            label_batches.append(labels.cpu())

    return torch.cat(feature_batches).numpy(), torch.cat(label_batches).numpy()

In [18]:
# Train and validation rows both live in the train image folder and both go
# through CLIP's own preprocessing transform.
datamodule = PetFinderClipDataModule(
    df_train=df_train,
    df_val=df_val,
    train_images_dir=TRAIN_IMAGES_DIR,
    val_images_dir=TRAIN_IMAGES_DIR,
    train_augmentations=preprocess,
    val_augmentations=preprocess,
    batch_size=64,
)

In [19]:
# Embed the training images. train_dataloader() is built with shuffle=True, so
# the rows of train_features follow the shuffled order, not df_train's row order;
# train_labels comes from the same batches and therefore stays aligned with them.
train_features, train_labels = get_features(datamodule.train_dataloader())

100%|██████████| 124/124 [04:35<00:00,  2.22s/it]


In [20]:
# Sanity check: one embedding row and one label per training sample.
train_features.shape, train_labels.shape

((7929, 768), (7929,))

In [50]:
# Standardise the CLIP embeddings, then regress Pawpularity with an RBF-kernel SVR.
# NOTE(review): training targets are clamped to [1, 85] with no stated rationale —
# confirm. `degree` is documented by scikit-learn as used only by the 'poly' kernel.
svr_steps = (StandardScaler(), SVR(C=16.0, kernel='rbf', degree=3))
clf = make_pipeline(*svr_steps)
clipped_train_labels = train_labels.clip(1, 85)
clf.fit(train_features, clipped_train_labels)

In [59]:
# Persist the fitted pipeline, then immediately read it back so `clf` is the
# deserialised copy from here on.
svm_clip_path = os.path.join(OUTPUT_DIR, 'svm_clip.joblib')
dump(clf, svm_clip_path)
clf = load(svm_clip_path)

In [23]:
# Embed the validation images. val_dataloader() is built with shuffle=False, so
# row i of val_features corresponds to row i of df_val.
val_features, val_labels = get_features(datamodule.val_dataloader())

100%|██████████| 31/31 [00:57<00:00,  1.85s/it]


In [63]:
def get_predictions_with_clip_from_dataframe(df: pd.DataFrame, clf, image_dir: str = TRAIN_IMAGES_DIR, val_features=None, val_labels=None):
    """Add the regressor's predictions to ``df`` as ``Pawpularity_prediction``.

    Args:
        df: Frame describing the images to score (same schema that
            ``PetFinderClipDataset`` expects).
        clf: Fitted estimator exposing ``predict(features)``.
        image_dir: Folder holding the ``<Id>.jpg`` files for ``df``.
        val_features: Optional precomputed CLIP features, one row per row of
            ``df`` and in the same order. When omitted they are extracted with
            ``get_features`` through an unshuffled loader.
        val_labels: Accepted for backward compatibility; not used to predict.
            Cached ``val_features`` are now honoured even when this is omitted.

    Returns:
        ``df`` itself, with the new column written in place.
    """
    if val_features is None:
        # Local name chosen so the notebook-level `datamodule` is not shadowed.
        scoring_module = PetFinderClipDataModule(df_val=df, val_images_dir=image_dir, val_augmentations=preprocess)
        val_features, _ = get_features(scoring_module.val_dataloader())

    df["Pawpularity_prediction"] = clf.predict(val_features)

    return df

In [64]:
# Score the training rows. The features are re-extracted here (through the
# unshuffled loader inside the helper) because train_features was produced by a
# shuffling loader and does not line up with df_train's row order.
df_train_with_predictions = get_predictions_with_clip_from_dataframe(df_train, clf)

# RMSE via sqrt(MSE): the `squared=False` keyword is deprecated in recent
# scikit-learn releases, while this form works on every version.
# NOTE(review): 1.032 is an unexplained rescaling of the predictions — confirm its origin.
np.sqrt(mean_squared_error(df_train_with_predictions['Pawpularity'], df_train_with_predictions['Pawpularity_prediction'] * 1.032))

  0%|          | 0/124 [00:00<?, ?it/s]

100%|██████████| 124/124 [01:40<00:00,  1.24it/s]


12.622436167156513

In [55]:
# Reuse the validation features extracted earlier instead of re-encoding the
# images; they were produced by the unshuffled loader over df_val.
df_val_with_predictions = get_predictions_with_clip_from_dataframe(df_val, clf, val_features=val_features, val_labels=val_labels)

In [56]:
# Eyeball true Pawpularity next to the predicted value for the validation rows.
df_val_with_predictions

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,Pawpularity_prediction
0,763c0fbfd75593c2911a33603c28dc45,0,0,0,0,0,0,1,0,0,0,0,0,18,38.882134
1,ba6faa385160bedd384aed93329779fe,0,1,1,0,0,0,1,0,0,0,1,0,52,40.015513
2,dcae448678ce2779659104d1f5970e92,0,1,1,1,0,0,0,0,0,0,0,0,32,28.606759
3,eab6c63493b68e1ab3817a094622be6f,0,0,1,0,0,0,0,1,0,0,1,1,35,39.530952
4,291320168a3fb0d981b4e5ef5c26a307,0,1,1,1,0,0,0,0,0,0,0,0,23,37.226147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978,5130a71b585694927eda397d93a41e4d,0,1,1,1,0,0,0,0,0,0,0,0,34,38.440810
1979,d77cbb32318b41770954ed1a73e35dcb,0,1,1,1,0,1,0,0,0,0,0,0,48,37.002253
1980,feefa7d5530149a0538313c9815a90ad,0,1,1,1,0,0,0,0,0,0,0,0,20,39.617699
1981,60e31d807e3743cdd09130e2cf7c0d50,0,0,1,1,0,0,0,0,0,0,0,1,30,29.235151


In [61]:
# Validation RMSE, computed as sqrt(MSE) because the `squared=False` keyword is
# deprecated in recent scikit-learn releases.
# NOTE(review): 1.032 is an unexplained rescaling of the predictions — confirm its origin.
np.sqrt(mean_squared_error(df_val_with_predictions['Pawpularity'], df_val_with_predictions['Pawpularity_prediction'] * 1.032))

18.103693287927232