In [2]:

# libraries
import os
import time
import subprocess
import numpy as np
import pandas as pd
import ast
import cv2
import PIL.Image
import matplotlib.pyplot as plt
import timm
%matplotlib inline
import seaborn as sns
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
# from warmup_scheduler import GradualWarmupScheduler
import albumentations
import torch.cuda.amp as amp
# import segmentation_models_pytorch as smp
from tqdm import tqdm

scaler = amp.GradScaler()
# device = torch.device('cuda')
from torchvision.io import read_image

TRAIN_DIR = 'dataset/train/train/'

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')




In [None]:
# create pandas dataframe to store the results
df = None
df = pd.DataFrame(columns=['filepath', 'id', 'labels'])

DOGS_TRAIN = os.path.join(TRAIN_DIR, 'dogs')
CATS_TRAIN = os.path.join(TRAIN_DIR, 'cats')


In [None]:

for image in os.listdir(DOGS_TRAIN):
    # print(os.path.join(DOGS_TRAIN, image))
    file_path = os.path.join(DOGS_TRAIN, image)
    df = df.append({
        'filepath': file_path,
        'id': image, 'labels': 1
    }, ignore_index=True)

for image in os.listdir(CATS_TRAIN):
    # print(os.path.join(CATS_TRAIN, image))
    file_path = os.path.join(CATS_TRAIN, image)
    df = df.append({
        'filepath': file_path,
        'id': image, 'labels': 0
    }, ignore_index=True)
# save the dataframe to csv
# df.to_csv('train_labels.csv', index=False)

In [3]:

df = pd.read_csv('train_labels.csv')
df['labels'].value_counts()

1    12500
0    12500
Name: labels, dtype: int64

In [5]:
df

Unnamed: 0,filepath,id,labels
0,dataset/train/train/dogs/dog.9265.jpg,dog.9265.jpg,1
1,dataset/train/train/dogs/dog.4445.jpg,dog.4445.jpg,1
2,dataset/train/train/dogs/dog.3578.jpg,dog.3578.jpg,1
3,dataset/train/train/dogs/dog.7244.jpg,dog.7244.jpg,1
4,dataset/train/train/dogs/dog.7907.jpg,dog.7907.jpg,1
...,...,...,...
24995,dataset/train/train/cats/cat.9419.jpg,cat.9419.jpg,0
24996,dataset/train/train/cats/cat.3532.jpg,cat.3532.jpg,0
24997,dataset/train/train/cats/cat.2772.jpg,cat.2772.jpg,0
24998,dataset/train/train/cats/cat.10915.jpg,cat.10915.jpg,0


In [10]:
df.sample(frac=.2, random_state=42)

Unnamed: 0,filepath,id,labels
6868,dataset/train/train/dogs/dog.11971.jpg,dog.11971.jpg,1
24016,dataset/train/train/cats/cat.736.jpg,cat.736.jpg,0
9668,dataset/train/train/dogs/dog.3927.jpg,dog.3927.jpg,1
13640,dataset/train/train/cats/cat.3634.jpg,cat.3634.jpg,0
14018,dataset/train/train/cats/cat.6045.jpg,cat.6045.jpg,0
...,...,...,...
8670,dataset/train/train/dogs/dog.10821.jpg,dog.10821.jpg,1
11839,dataset/train/train/dogs/dog.5027.jpg,dog.5027.jpg,1
4013,dataset/train/train/dogs/dog.940.jpg,dog.940.jpg,1
21147,dataset/train/train/cats/cat.7233.jpg,cat.7233.jpg,0


In [45]:

# split df to 5 folds
from sklearn.model_selection import KFold
# new_df = df.copy(deep=True)
new_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [46]:
new_df

Unnamed: 0,filepath,id,labels
0,dataset/train/train/dogs/dog.11971.jpg,dog.11971.jpg,1
1,dataset/train/train/cats/cat.736.jpg,cat.736.jpg,0
2,dataset/train/train/dogs/dog.3927.jpg,dog.3927.jpg,1
3,dataset/train/train/cats/cat.3634.jpg,cat.3634.jpg,0
4,dataset/train/train/cats/cat.6045.jpg,cat.6045.jpg,0
...,...,...,...
24995,dataset/train/train/cats/cat.635.jpg,cat.635.jpg,0
24996,dataset/train/train/dogs/dog.5394.jpg,dog.5394.jpg,1
24997,dataset/train/train/dogs/dog.1348.jpg,dog.1348.jpg,1
24998,dataset/train/train/cats/cat.3894.jpg,cat.3894.jpg,0


In [52]:
new_df['fold'] = -1

In [54]:
for fold, (train_index, val_index) in enumerate(kf.split(new_df)):
    new_df.loc[val_index, 'fold'] = fold


In [56]:
new_df['fold'].value_counts()

1    5000
3    5000
2    5000
4    5000
0    5000
Name: fold, dtype: int64

In [58]:
# save the dataframe to csv
new_df.to_csv('train_labels_5folds.csv', index=False)

In [41]:
new_df.iloc[test]

Unnamed: 0,filepath,id,labels
9,dataset/train/train/dogs/dog.1542.jpg,dog.1542.jpg,1
11,dataset/train/train/dogs/dog.6650.jpg,dog.6650.jpg,1
13,dataset/train/train/dogs/dog.3150.jpg,dog.3150.jpg,1
16,dataset/train/train/dogs/dog.4238.jpg,dog.4238.jpg,1
24,dataset/train/train/dogs/dog.3405.jpg,dog.3405.jpg,1
...,...,...,...
24981,dataset/train/train/cats/cat.3043.jpg,cat.3043.jpg,0
24982,dataset/train/train/cats/cat.12136.jpg,cat.12136.jpg,0
24989,dataset/train/train/cats/cat.7931.jpg,cat.7931.jpg,0
24996,dataset/train/train/cats/cat.3532.jpg,cat.3532.jpg,0


In [32]:
new_df

Unnamed: 0,filepath,id,labels
0,1,1,1
1,3,3,3
2,3,3,3
3,1,1,1
4,1,1,1
...,...,...,...
24995,2,2,2
24996,4,4,4
24997,1,1,1
24998,4,4,4


In [24]:
train_df = new_df.iloc[train]

In [26]:
train_df['labels'].value_counts()

0    10016
1     9984
Name: labels, dtype: int64

array([   17,    29,    30, ..., 24986, 24988, 24999])

In [27]:
test_df = new_df.iloc[test]
test_df['labels'].value_counts()

1    2516
0    2484
Name: labels, dtype: int64

In [None]:
# disable warnings
import warnings
warnings.filterwarnings('ignore')
# plot the distribution of labels
sns.countplot(train_df['labels'])
# as pie
plt.show()
plt.pie(train_df['labels'].value_counts(), labels=train_df['labels'].value_counts().index, autopct='%1.1f%%')
plt.show()
plt.pie(val_df['labels'].value_counts(), labels=val_df['labels'].value_counts().index, autopct='%1.1f%%', shadow=True)
plt.show()


In [None]:
# plot the first batch of images
fig, ax = plt.subplots(1, len(train_features), figsize=(20, 4))
for i in range(len(train_features)):
    ax[i].imshow(train_features[i].permute(1, 2, 0).numpy())
    ax[i].set_title(train_labels[i].item())
    ax[i].axis('off')
