In [1]:
import torch
import numpy as np

def set_seed(seed):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU generator plus all CUDA devices), then prints a confirmation.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import keeps this cell self-contained

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every CUDA device, the current one included, so a
    # separate torch.cuda.manual_seed(seed) call adds nothing.
    torch.cuda.manual_seed_all(seed)
    print(f'Random seed {seed} has been set.')


set_seed(2021)

In [2]:
# Inference-time configuration shared by the cells below.
cfg = {
    # data
    'Kfold': 5,                    # number of folds / per-fold checkpoints
    'batch_size' : 64,             # DataLoader batch size
    'image_size': 224,             # side length passed to the eval transform and the random crop
    'crop_pct': 0.875,             # forwarded to timm's transforms_imagenet_eval
    'interpolation': 'bicubic',    # forwarded to timm's transforms_imagenet_eval
    # model
    'name': 'resnetv2_50x1_bit_distilled',  # timm architecture name
    'precision': 32,               # NOTE(review): not read anywhere in the visible cells
    'drop_path_rate': 0.0,
    'drop_rate': 0.0
}

# The standard library provides the CPU count directly; this replaces
# `from psutil import *`, which dumped every public psutil name into the
# notebook namespace just to reach cpu_count().
import os

# os.cpu_count() returns None when the count cannot be determined; fall back to
# 0 so the value is always a valid `num_workers` argument for DataLoader.
num_workers = os.cpu_count() or 0
num_workers

In [3]:
import cv2
from PIL import Image

class TestPawpularDataset:
    """Unlabeled image dataset: yields one (optionally transformed) image per path.

    Args:
        image_paths: Sequence of image file paths; its length is the dataset size.
        augmentations: Callable applied to the opened PIL image, or None to
            return the PIL image untouched.
    """

    def __init__(self, image_paths, augmentations):
        self.image_paths, self.augmentations = image_paths, augmentations

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Open the image at position ``idx`` and run it through the augmentations, if any."""
        picture = Image.open(self.image_paths[idx])
        if self.augmentations is None:
            return picture
        return self.augmentations(picture)
    
class PawpularDataset:
    """Labeled image dataset: yields ``(image, target)`` pairs.

    Args:
        image_paths: Sequence of image file paths; its length is the dataset size.
        targets: Indexable collection of labels aligned with ``image_paths``.
            Each element must support ``reshape``/``astype`` (e.g. NumPy scalars).
        augmentations: Callable applied to the opened PIL image, or None to
            return the PIL image untouched.
    """

    def __init__(self, image_paths, targets, augmentations):
        self.image_paths = image_paths
        self.targets = targets
        self.augmentations = augmentations

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image and its label as a flat float32 array."""
        picture = Image.open(self.image_paths[idx])
        if self.augmentations is not None:
            picture = self.augmentations(picture)

        raw_label = self.targets[idx]
        # Flatten first, then cast, so the label comes back as a 1-D float32 array.
        label = raw_label.reshape(-1).astype(np.float32)
        return picture, label

In [4]:
import sys
sys.path.append("../input/d/chemicalbrainx/timmasters/")
import timm
import torchmetrics
import torch.nn as nn
import pytorch_lightning as pl

class TestPawpularModel(pl.LightningModule):
    """Inference-only model: a timm backbone feeding two sigmoid heads.

    The backbone is created without pretrained weights and without a
    classifier (``num_classes=0``); weights are expected to come from a
    checkpoint. Attribute names (``backbone``, ``fc_mean``, ``fc_var``) are part
    of the checkpoint key layout and must not be renamed.

    Args:
        cfg: Mapping providing ``'name'``, ``'drop_path_rate'`` and ``'drop_rate'``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        backbone_options = dict(
            pretrained=False,
            num_classes=0,
            in_chans=3,
            drop_path_rate=self.cfg['drop_path_rate'],
            drop_rate=self.cfg['drop_rate'],
        )
        self.backbone = timm.create_model(self.cfg['name'], **backbone_options)
        feature_dim = self.backbone.num_features

        # Head estimating the mean; sigmoid keeps the output in (0, 1).
        self.fc_mean = nn.Sequential(nn.Linear(feature_dim, 1), nn.Sigmoid())

        # Head estimating the variance; same layout as the mean head.
        self.fc_var = nn.Sequential(nn.Linear(feature_dim, 1), nn.Sigmoid())

    def forward(self, images):
        """Return ``(mean, var)``, the two head outputs for a batch of images."""
        features = self.backbone(images)
        return self.fc_mean(features), self.fc_var(features)

In [5]:
from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from timm.data.transforms_factory import transforms_imagenet_eval
from torchvision.transforms import transforms

def _to_three_channels(tensor_img):
    """Repeat the channel axis so 1-channel tensors become 3-channel.

    A 1-channel input is repeated 3 times and a 3-channel input is left as is
    (repeat factor 1). Any other channel count gets a repeat factor of 0.
    NOTE(review): a factor of 0 should produce an empty tensor - confirm that
    no 2- or 4-channel images reach this transform.
    """
    repeat_factor = {1: 3, 3: 1}.get(len(tensor_img), 0)
    return tensor_img.repeat(repeat_factor, 1, 1)


# Start from timm's evaluation pipeline, then swap two of its stages in place.
test_aug = transforms_imagenet_eval(
    img_size=cfg['image_size'],
    crop_pct=cfg['crop_pct'],
    interpolation=cfg['interpolation'],
    use_prefetcher=False,
    mean=IMAGENET_INCEPTION_MEAN,
    std=IMAGENET_INCEPTION_STD,
)

# Stage 1 becomes a random square crop plus a random horizontal flip, so every
# pass over the data sees a different view of each image.
# NOTE(review): presumably this replaces timm's deterministic crop - confirm
# the stage order against the bundled timm version.
test_aug.transforms[1] = transforms.Compose([
    transforms.RandomCrop(size=(cfg['image_size'], cfg['image_size'])),
    transforms.RandomHorizontalFlip(p=0.5),
])

# Stage 2 becomes tensor conversion followed by the channel fix-up above.
test_aug.transforms[2] = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(_to_three_channels),
])
test_aug

In [6]:
from torch.utils.data import DataLoader
import pandas as pd
def get_test_dataset():
    """Build the unlabeled dataset over the competition's test images.

    Reads the ids from ``test.csv`` and maps each one to its ``.jpg`` path.

    Returns:
        A ``TestPawpularDataset`` that applies ``test_aug`` to every image.
    """
    test_frame = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")
    paths = []
    for image_id in test_frame["Id"].values:
        paths.append(f"../input/petfinder-pawpularity-score/test/{image_id}.jpg")
    return TestPawpularDataset(image_paths=paths, augmentations=test_aug)

def get_kfold_dataset(fold):
    """Build labeled and unlabeled datasets over one fold's validation images.

    Args:
        fold: Value of the ``kfold`` column selecting the validation rows.

    Returns:
        ``(valid_dataset, test_dataset)``: the same validation images, first as
        a ``PawpularDataset`` with ``Pawpularity`` targets, then as a
        ``TestPawpularDataset`` without targets. Both use ``test_aug``.
    """
    df = pd.read_csv("../input/pawpularitykfold/train_%dfolds.csv"%cfg['Kfold'])
    # Only the validation rows are needed here; the training split that used to
    # be computed alongside was never used.
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    valid_img_paths = [f"../input/petfinder-pawpularity-score/train/{x}.jpg" for x in df_valid["Id"].values]

    valid_dataset = PawpularDataset(
        image_paths=valid_img_paths,
        targets=df_valid.Pawpularity.values,
        augmentations=test_aug,
    )

    test_dataset = TestPawpularDataset(
        image_paths=valid_img_paths,
        augmentations=test_aug,
    )
    return valid_dataset, test_dataset


def get_test_dataloader(test_dataset):
    """Wrap a dataset in a non-shuffling DataLoader.

    Batch size comes from ``cfg['batch_size']`` and the worker count from the
    module-level ``num_workers``.
    """
    loader_options = dict(
        batch_size=cfg['batch_size'],
        shuffle=False,
        num_workers=num_workers,
    )
    return DataLoader(test_dataset, **loader_options)

In [7]:
import matplotlib.pyplot as plt

def avg_preds(preds, vars):
    """Average repeated predictions and variances element-wise across passes.

    Args:
        preds: Non-empty sequence of equally sized prediction arrays, one per
            inference pass.
        vars: Non-empty sequence of equally sized variance arrays, one per
            inference pass. (The name shadows the ``vars`` builtin; it is kept
            so existing callers continue to work.)

    Returns:
        ``(mean_preds, mean_vars)``: two NumPy arrays holding the mean over the
        pass axis (axis 0).

    Raises:
        ValueError: If either sequence is empty; averaging nothing would
            otherwise produce NaN silently.
    """
    if len(preds) == 0 or len(vars) == 0:
        raise ValueError("avg_preds needs at least one set of predictions and variances")

    # Stack with NumPy and average directly. Building a torch tensor from a
    # list of ndarrays takes a slow element-by-element path and emits a warning.
    peak_average = np.mean(np.stack(preds), axis=0)
    vars_average = np.mean(np.stack(vars), axis=0)
    return peak_average, vars_average

def get_preds(model, test_loader):
    """Run one prediction pass and collect flattened means and variances.

    A fresh single-GPU ``pl.Trainer`` is created for each call. Each batch
    output is unpacked as ``(means, vars)``; the means are multiplied by 100
    (presumably to map the sigmoid output back to the 0-100 score scale -
    confirm), while the variances are kept as returned.

    Args:
        model: Lightning module handed to ``trainer.predict``.
        test_loader: DataLoader to predict over.

    Returns:
        ``(preds, preds_var)``: two 1-D NumPy arrays in loader order.
    """
    trainer = pl.Trainer(gpus=1, deterministic=False)
    batch_outputs = trainer.predict(model, test_loader)

    mean_values = []
    var_values = []
    for batch_mean, batch_var in batch_outputs:
        scaled_mean = batch_mean * 100
        mean_values.extend(scaled_mean.detach().cpu().numpy().flatten().tolist())
        var_values.extend(batch_var.detach().cpu().numpy().flatten().tolist())

    return np.array(mean_values), np.array(var_values)

In [8]:
import os
# Number of stochastic inference passes per checkpoint. test_aug crops and
# flips at random, so each pass sees a different view of every image.
instance_repeats = 20
test_datadoader = get_test_dataloader(get_test_dataset())

checkpoint_path = '../input/firstone/'
super_final_preds = []
super_final_preds_var = []
# os.listdir returns entries in arbitrary order. Sorting fixes the order in
# which checkpoints consume random draws, so a seeded run is reproducible.
for checkpoint in sorted(os.listdir(checkpoint_path)):
    model = TestPawpularModel.load_from_checkpoint(
            cfg=cfg,
            checkpoint_path=os.path.join(checkpoint_path, checkpoint)
    )
    for _ in range(instance_repeats):
        final_preds, final_preds_var = get_preds(model, test_datadoader)
        super_final_preds.append(final_preds)
        super_final_preds_var.append(final_preds_var)

# The final score per image is the plain mean over every checkpoint and every
# repeated pass; the averaged variances are not used for the submission.
good_preds,_ = avg_preds(super_final_preds, super_final_preds_var)

df_test = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")
df_test["Pawpularity"] = good_preds
df_test = df_test[["Id", "Pawpularity"]]
df_test.to_csv("submission.csv", index=False)
df_test.tail()

In [9]:
# sample = 6
# import matplotlib.pyplot as plt
# plt.hist(np.array(super_final_preds)[:, sample], bins=20)
# plt.xlabel('Pawpularity scores of random crops for sample=%d'%sample)
# plt.show()
# print(good_preds[sample])
# plt.hist(scores[:, sample], bins=20)
# plt.xlabel('score of sample=%d'%sample)
# plt.show()

In [10]:
# Checkpoint identifiers, one per fold: index i holds the checkpoint for fold i + 1.
checkpoints = [
    '8a8371422d81dcf484aacc5a25084e3f',
    'dabb4442e85f1474442fb872f7b11049',
    'c94895f37dd8075139544b46a84567ca',
    '408d0061329c33528e4219ca0ec855fd',
    '5ba4c2a62f3f9adaf2d17949e617fc6d',
]

In [11]:
# fold = 5
# checkpoint = checkpoints[fold-1]
# valid_dataset, test_dataset = get_kfold_dataset(fold-1)
# test_loader = get_test_dataloader(test_dataset)

# hyperparameters = [10, 15, 20, 25] # bins
# hyperparameters_rmse = []
# for hyperparameter in hyperparameters:
#     print('hyperparmeter: ',hyperparameter)
#     super_final_preds = []
#     super_final_preds_var = []
#     model = TestPawpularModel.load_from_checkpoint(cfg=cfg,
#             checkpoint_path=checkpoint_path+checkpoint)
#     iters_num = cfg['Kfold']*instance_repeats
#     for i in range(iters_num): # in order to try multiple crops
#         if i % 10 == 0:
#             print('%d / %d'%(i, iters_num))
#         final_preds, final_preds_var = get_preds(model, test_loader)
#         super_final_preds += [final_preds]
#         super_final_preds_var += [final_preds_var]

# #     good_preds, scores = average_good_preds(super_final_preds, super_final_preds_var, hyperparameter)
#     good_preds,_ = avg_preds(super_final_preds, super_final_preds_var, bins=hyperparameter) # Use mode as an indicator of the best prediction, i.e. the prediction that occured the most

#     mse = 0
#     for i, p in enumerate(good_preds):
#         x, pawpularity = valid_dataset[i]
#         mse += (pawpularity - p)**2
#     RMSE = np.sqrt(mse / len(good_preds))
#     print(RMSE)
        
#     hyperparameters_rmse.append(RMSE / len(checkpoints))
    
# plt.plot(hyperparameters, hyperparameters_rmse)
# plt.xlabel('Kfold Threshold')
# plt.ylabel('5Fold RMSE')
# plt.title('Test time hypyerparmeter tunning')
# plt.show()
# best_bin = hyperparameters[np.argmin(hyperparameters_rmse)]
# print('Best bin number:', best_bin)
# plt.hist(np.array(super_final_preds)[:, 20], bins=20)
# plt.xlabel('Pawpularity scores of random crops for sample=%d'%sample)
# plt.show()

In [17]:
class TestAdoptionDataset:
    """Unlabeled dataset that yields one randomly chosen image per unique id.

    Each id owns a contiguous run of filename postfixes inside
    ``test_img_posts``: the run for id ``i`` starts at ``unique_indices[i]``
    and is ``test_img_counts[i]`` entries long. Every access picks one entry of
    that run at random.

    Args:
        test_img_ids: Unique image ids; its length is the dataset size.
        augmentations: Callable applied to the opened PIL image, or None to
            return the PIL image untouched.
        test_img_counts: Number of available images for each id.
        unique_indices: Start offset of each id's run inside ``test_img_posts``.
        test_img_posts: Filename postfixes for all images, grouped by id.
    """

    def __init__(self, test_img_ids, augmentations, test_img_counts, unique_indices, test_img_posts):
        self.augmentations = augmentations
        self.test_img_ids = test_img_ids
        self.test_img_posts = test_img_posts
        self.test_img_counts = test_img_counts
        self.unique_indices = unique_indices
        self.path = '../input/pet-adoption-speed-dataset/new_unlabeled_images/new_unlabeled_images/'

    def __len__(self):
        """Number of unique ids."""
        return len(self.test_img_ids)

    def _random_image_path(self, idx):
        """Return the file path of one randomly chosen image for id ``idx``."""
        # Read the counts from the instance, not from a same-named notebook
        # global, so the dataset works with whatever it was constructed with.
        rand_int = np.random.randint(self.test_img_counts[idx])
        rand_postfix = str(self.test_img_posts[self.unique_indices[idx] + rand_int])
        return self.path + self.test_img_ids[idx] + rand_postfix

    def __getitem__(self, idx):
        """Open a random image for id ``idx`` and apply the augmentations, if any."""
        image = Image.open(self._random_image_path(idx))

        if self.augmentations is not None:
            image = self.augmentations(image)

        return image

In [18]:
# Divisor applied to the number of inference passes below.
# NOTE(review): presumably the average number of images per id in this
# dataset - confirm where 3.85 comes from.
avg_instances = 3.85

df_test = pd.read_csv("../input/pet-adoption-speed-dataset/new_unlabeled_images/new_unlabeled_images.csv")
sorted_ids = sorted(df_test["Id"].values)

# Every entry is split at '-': the part before it is the id, and the part
# after it (with the '-' restored) is the per-image postfix.
test_img_ids = [x.split('-')[0] for x in sorted_ids]
test_img_posts = ['-'+x.split('-')[1] for x in sorted_ids]
# For each unique id: the index of its first entry and how many entries it has.
test_img_ids, unique_indices, test_img_counts = np.unique(test_img_ids,
                                                          return_counts=True,
                                                          return_index=True)
test_dataset = TestAdoptionDataset(
    augmentations=test_aug,
    test_img_ids=test_img_ids,
    test_img_posts = test_img_posts,
    test_img_counts=test_img_counts,
    unique_indices=unique_indices)
test_loader = get_test_dataloader(test_dataset)

# The pass count does not depend on the fold, so compute it once.
iters_num = int(cfg['Kfold']*instance_repeats/avg_instances)

for fold in range(cfg['Kfold']):
    checkpoint = checkpoints[fold]
    super_final_preds = []
    super_final_preds_var = []
    model = TestPawpularModel.load_from_checkpoint(cfg=cfg,
            checkpoint_path=os.path.join(checkpoint_path, checkpoint))
    for i in range(iters_num): # in order to try multiple crops
        if i % 10 == 0:
            print('%d / %d'%(i, iters_num))
        final_preds, final_preds_var = get_preds(model, test_loader)
        super_final_preds.append(final_preds)
        super_final_preds_var.append(final_preds_var)

    avg_good_preds, avg_vars = avg_preds(super_final_preds, super_final_preds_var)

    # One CSV per fold holding the averaged prediction and variance for each id.
    # The former `df_test.tail()` is dropped: a bare expression inside a loop
    # displays nothing.
    df_test = pd.DataFrame({
        "id": test_img_ids,
        "avg_pawpularity": avg_good_preds,
        "avg_var": avg_vars,
    })
    df_test.to_csv("adoption_preds_fold%d.csv"%(fold+1), index=False)
    
# NOTE(review): this loop never exits, so the notebook cannot finish a
# "Run All". Presumably it is here to keep the session alive - confirm it is
# still needed before keeping it.
import time

while True:
    # Sleep instead of spinning so the idle wait does not pin a CPU core.
    time.sleep(60)