In [32]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

I really appreciate the support provided by this notebook:  
https://www.kaggle.com/adityasharma01/fork-of-17-90508-notebook/notebook

In [None]:
# List the contents of the timm source directory that is added to sys.path further down.
!ls ../input/timm-module/Desktop/pytorch-image-models

In [33]:
import sys
# Put the timm source tree from the input directory on the module search path
# so that `timm` can be imported from it.
sys.path.append('../input/timm-module/Desktop/pytorch-image-models')
from timm import create_model
# Star import: presumably the source of the names used unqualified later in the
# notebook (Path, Image, Learner, torch, F, ...) — TODO confirm.
from fastai.vision.all import *

In [34]:
# List the files in the timm-model input directory (the checkpoint copied in the next cell lives here).
!ls ../input/timm-model

In [35]:
import shutil

# Copy the Swin checkpoint into the torch hub checkpoint cache, presumably so that
# create_model(..., pretrained=True) can load it without network access.
checkpoint_name = 'swin_large_patch4_window7_224_22kto1k.pth'
hub_checkpoint_dir = '/root/.cache/torch/hub/checkpoints/'

# exist_ok=True makes this cell safe to re-run and avoids the check-then-create race.
os.makedirs(hub_checkpoint_dir, exist_ok=True)
# shutil.copy raises if the source file is missing, whereas `!cp` only printed an
# error and let the notebook continue without the weights.
shutil.copy(os.path.join('../input/timm-model', checkpoint_name),
            os.path.join(hub_checkpoint_dir, checkpoint_name))

In [36]:
# Root of the competition data; the three CSV files are read from it.
dataset_path = Path('../input/petfinder-pawpularity-score/')
dataset_path.ls()
train, test, sample_submission = (
    pd.read_csv(dataset_path/csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [37]:
# (rows, columns) of the train, test and sample-submission frames, in that order.
tuple(frame.shape for frame in (train, test, sample_submission))

In [38]:
# Preview the first rows of the training frame.
train.head()

In [39]:
# Preview the first rows of the sample submission.
sample_submission.head()

In [40]:
# Derive each image's file path from its Id, then drop the Id column.
train['path'] = train['Id'].map(lambda x:str(dataset_path/'train'/x)+'.jpg')
train = train.drop(columns=['Id'])
# Shuffle the rows with a fixed random_state so the row order, and every fold
# assignment derived from it, is identical on each run. 999 is the same value the
# notebook assigns to `seed` in a later cell; `seed` is not defined yet at this point.
train = train.sample(frac=1, random_state=999).reset_index(drop=True)
train.head()

In [41]:
# Plot the distribution of the target and print its summary statistics.
pawpularity = train['Pawpularity']
pawpularity.hist(figsize=(10, 5))
score_mean = pawpularity.mean()
score_median = pawpularity.median()
score_std = pawpularity.std()
print(f"The mean Pawpularity score is {score_mean}")
print(f"The median Pawpularity score is {score_median}")
print(f"The standard deviation of the Pawpularity score is {score_std}")

In [42]:
# Count the distinct target values (dropna=False keeps the count identical to len(unique())).
n_unique_scores = train['Pawpularity'].nunique(dropna=False)
print(f"There are {n_unique_scores} unique values of Pawpularity score")

Note that the Pawpularity score is an integer, so in addition to being a regression problem, it could also be treated as a 100-class classification problem. Alternatively, it can be treated as a binary classification problem if the Pawpularity Score is normalized between 0 and 1:

In [43]:
# Divide the score by 100 to obtain the regression target used for training.
train['norm_score'] = train['Pawpularity'].div(100)
train['norm_score']

In [44]:
# Open the image of the row labelled 1 and print its width and height.
im = Image.open(train.loc[1, 'path'])
width, height = im.size
print(width, height)

In [45]:
# Seed every random number generator used in the notebook.
seed = 999
set_seed(seed, reproducible=True)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# torch.use_deterministic_algorithms is a function. The previous code assigned True
# to it, which replaced the function with a bool and enabled nothing.
# PyTorch documents that deterministic cuBLAS on CUDA >= 10.2 needs this variable set
# before CUDA is first used; setdefault keeps any value the user already chose.
os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')
try:
    # warn_only=True: ops without a deterministic implementation warn instead of
    # raising, so training is not aborted part-way through.
    torch.use_deterministic_algorithms(True, warn_only=True)
except TypeError:
    # Older PyTorch releases have no `warn_only`. Strict mode could abort training
    # there, so it is deliberately left disabled.
    pass

In [46]:
# Sturges' rule for choosing the number of histogram bins: k = 1 + log2(n).
# The previous expression multiplied log2(n) by 3.3, mixing this form with the
# equivalent base-10 form k = 1 + 3.322*log10(n) and yielding roughly 3.3x too many bins.
num_bins = int(np.floor(1 + np.log2(len(train))))
num_bins

In [47]:
# Cut the normalised score into `num_bins` intervals; labels=False stores the
# integer bin index rather than an interval object.
score_bin_ids = pd.cut(train['norm_score'], bins=num_bins, labels=False)
train['bins'] = score_bin_ids
train['bins'].hist()

In [48]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

N_FOLDS = 5

# Label every row with the fold in which it is held out, stratifying on the
# score bins computed above.
train['fold'] = -1
# Look the column position up explicitly instead of assuming 'fold' is the last
# column, which stops being true as soon as another column is added after it.
fold_col = train.columns.get_loc('fold')

strat_kfold = StratifiedKFold(n_splits=N_FOLDS, random_state=seed, shuffle=True)
# split() yields (train positions, held-out positions); only the held-out
# positions are needed to label the fold.
for i, (_, valid_index) in enumerate(strat_kfold.split(train.index, train['bins'])):
    print(i, valid_index)
    train.iloc[valid_index, fold_col] = i

# Every row must have received a fold number.
assert (train['fold'] >= 0).all(), "some rows were not assigned to a fold"
train['fold'] = train['fold'].astype('int')
train.fold.value_counts().plot.bar()

In [49]:
# Preview the rows assigned to fold 0.
train[train['fold']==0].head()

In [50]:
def petfinder_rmse(input,target):
    """Root-mean-squared error between sigmoid(logits) and the target, scaled by 100.

    Args:
        input: raw model outputs (logits); any shape, flattened before use.
        target: targets on the same 0-1 scale as the sigmoid output, with the same
            number of elements as `input`; any shape, flattened before use.

    Returns:
        Scalar tensor equal to 100 * sqrt(mean((sigmoid(input) - target) ** 2)).
    """
    # torch.sigmoid replaces the deprecated F.sigmoid.
    preds = torch.sigmoid(input.flatten())
    # Flatten the target too: an (N, 1) target would otherwise broadcast against
    # the (N,) predictions into an (N, N) difference and give a wrong error.
    return 100*torch.sqrt(F.mse_loss(preds, target.flatten()))

In [51]:
# Mini-batch size passed as `bs` to the DataLoaders built in this notebook.
BATCH_SIZE = 8

In [52]:
def get_loader(fold):
    """Build the DataLoaders for one cross-validation fold.

    Rows of the global `train` frame whose `fold` column equals `fold` become the
    validation set; all remaining rows are used for training.

    Args:
        fold: number of the fold to hold out for validation.

    Returns:
        The ImageDataLoaders created by `ImageDataLoaders.from_df`.
    """
    # assign() returns a new frame, so the helper column never leaks into `train`.
    train_copy = train.assign(is_valid=train['fold'] == fold)

    # valid_pct and seed are intentionally not passed: from_df only uses them for a
    # random split, which is bypassed whenever valid_col is supplied.
    dls = ImageDataLoaders.from_df(
        train_copy,
        valid_col='is_valid',  # boolean column marking the validation rows
        fn_col='path',  # column holding the image file path
        label_col='norm_score',  # column holding the regression target
        y_block=RegressionBlock,  # treat the label as a continuous value
        bs=BATCH_SIZE,
        num_workers=8,
        item_tfms=Resize(224),
        batch_tfms=setup_aug_tfms([Brightness(), Contrast(), Hue(), Saturation(), Flip()])
    )

    return dls

In [53]:
# Sanity-check the fold-0 split: every row lands in exactly one of the two datasets
# and the validation set is exactly the rows labelled fold 0. The earlier check
# compared batch counts, which only holds when the rounding of the partial batches
# in the two loaders happens to line up.
loader = get_loader(0)
assert len(loader.train_ds) + len(loader.valid_ds) == len(train)
assert len(loader.valid_ds) == (train['fold'] == 0).sum()

In [54]:
# Display up to one batch worth of samples from the fold-0 loader.
loader.show_batch(max_n=BATCH_SIZE)

In [55]:
def get_learner(fold_num):
    """Create a half-precision Learner whose validation set is fold `fold_num`.

    The model is a pretrained 'swin_large_patch4_window7_224' from timm with its
    output size set to the loaders' `c`; it is trained with BCEWithLogitsLossFlat
    and reports `petfinder_rmse` as its metric.
    """
    fold_dls = get_loader(fold_num)
    backbone = create_model(
        'swin_large_patch4_window7_224',
        pretrained=True,
        num_classes=fold_dls.c,
    )
    return Learner(
        fold_dls,
        backbone,
        loss_func=BCEWithLogitsLossFlat(),
        metrics=petfinder_rmse,
    ).to_fp16()

In [56]:
# Display the test frame.
test

In [57]:
# Add a placeholder score and the image path to the test frame, then drop Id.
test['Pawpularity'] = 1
test['path'] = test['Id'].map(lambda image_id: f"{dataset_path/'test'/image_id}.jpg")
test = test.drop(columns=['Id'])
# Recompute the normalised training target.
train['norm_score'] = train['Pawpularity'].div(100)

In [58]:
# Run the learning-rate finder on a fresh fold-0 learner, with end_lr set to 3e-2.
get_learner(fold_num=0).lr_find(end_lr=3e-2)

In [60]:
# Run the learning-rate finder again on another fresh fold-0 learner, with default arguments.
get_learner(fold_num=0).lr_find()

In [62]:
import gc

In [63]:
# (rows, columns) of the training frame after the added columns.
train.shape

In [64]:
# Train one model per fold and collect its test-time-augmented predictions on the test set.
all_preds = []

for i in range(N_FOLDS):
    print(f'Fold {i} results')
    learn = get_learner(fold_num=i)
    # NOTE(review): SaveModelCallback() uses its default monitor while early
    # stopping tracks petfinder_rmse — confirm the two criteria are meant to differ.
    learn.fit_one_cycle(5, 2e-5, cbs=[SaveModelCallback(), EarlyStoppingCallback(monitor='petfinder_rmse', comp=np.less, patience=2)])
    learn.recorder.plot_loss()

    # Build the test loader from the learner's own DataLoaders. They already use the
    # same path column, resize and augmentations, so constructing a second,
    # randomly split DataLoaders from `train` on every fold was redundant work.
    test_dl = learn.dls.test_dl(test)
    preds, _ = learn.tta(dl=test_dl, n=8, beta=0)
    all_preds.append(preds)

    # Drop the references and collect garbage first, so that the memory they held
    # can actually be returned when the CUDA cache is emptied.
    del learn, test_dl
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# train 911, val 248

In [70]:
# Display the per-fold prediction tensors collected by the training loop.
all_preds

In [71]:
# Mean prediction over all folds and test images, scaled by 100.
# `all_preds` is a Python list, so the earlier `all_preds*100` repeated the list
# 100 times instead of scaling the values; the scaling must follow the reduction.
np.mean(np.stack(all_preds)) * 100

In [67]:
# Average the fold predictions per test image, scale by 100 and write the submission file.
sample_df = pd.read_csv(dataset_path/'sample_submission.csv')
fold_stack = np.stack(all_preds)
preds = fold_stack.mean(axis=0)
sample_df['Pawpularity'] = preds*100
sample_df.to_csv('submission.csv', index=False)

In [72]:
# Read the written submission file back for a visual check.
pd.read_csv('submission.csv')