In [6]:
! pip install -q kaggle

from google.colab import files

# import your kaggle credentials in kaggle.json file sa explained in:
# https://www.kaggle.com/general/74235
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"micharosa","key":"c34f6df8dda23f417041431ae70e46c8"}'}

In [7]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

# check if kaggle API works
! kaggle datasets list

ref                                                            title                                                size  lastUpdated          downloadCount  
-------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  
utkarshxy/who-worldhealth-statistics-2020-complete             World Health 2020 🌏 | For Geospatial Analysis         1MB  2021-01-06 16:22:50            859  
gpreda/pfizer-vaccine-tweets                                   Pfizer Vaccine Tweets                               403KB  2021-01-06 15:11:07            607  
google/android-smartphones-high-accuracy-datasets              Android smartphones high accuracy GNSS datasets       1GB  2020-12-23 01:51:11            127  
ashkhagan/women-representation-in-city-property-sanfrancisco   Women Representation in City Property SanFrancisco    3KB  2020-12-13 05:18:14            131  
arashnic/covid19-case-surveillance-public-use-

In [8]:
! kaggle datasets download jerzydziewierz/bee-vs-wasp --unzip

Downloading bee-vs-wasp.zip to /content
 98% 546M/559M [00:03<00:00, 174MB/s]
100% 559M/559M [00:03<00:00, 159MB/s]


In [9]:
import os
import numpy as np
import pandas as pd
import glob
import imgaug as ia
import imgaug.augmenters as iaa
import imgaug.parameters as iap
from imgaug.augmenters import Sequential
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms.functional as tf

## Wczytanie danych

In [10]:
data = pd.read_csv("/content/kaggle_bee_vs_wasp/labels.csv")
for i in data.index:
    data["path"].iloc[i] = data["path"].iloc[i].replace("\\", "/")
le = LabelEncoder()
le.fit(data["label"])
data["label"] = le.transform(data["label"])
data.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11421 entries, 0 to 11420
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   11421 non-null  int64 
 1   path                 11421 non-null  object
 2   is_bee               11421 non-null  int64 
 3   is_wasp              11421 non-null  int64 
 4   is_otherinsect       11421 non-null  int64 
 5   is_other             11421 non-null  int64 
 6   photo_quality        11421 non-null  int64 
 7   is_validation        11421 non-null  int64 
 8   is_final_validation  11421 non-null  int64 
 9   label                11421 non-null  int64 
dtypes: int64(9), object(1)
memory usage: 892.4+ KB


## Podział danych 

In [11]:
def split_data(dataset):
    index = list()
    validation = pd.DataFrame()
    final_validation = pd.DataFrame()
    for i in data.index:
        if dataset["is_validation"].iloc[i] == 1:
            validation = validation.append(dataset.iloc[i])
            index.append(i)
        if dataset["is_final_validation"].iloc[i] == 1:    
            final_validation = final_validation.append(dataset.iloc[i])
            index.append(i)

    dataset = dataset.drop(dataset.index[index])
    dataset = dataset.reset_index()
    validation = validation.reset_index()
    final_validation = final_validation.reset_index()
    return dataset, validation, final_validation 

train_df, val_df, test_df = split_data(data)

# sanity check
print("Length of train dataset: ", len(train_df))
print("Length of validation dataset: " ,len(val_df))
print("Length of test dataset: ", len(test_df))

Length of train dataset:  7939
Length of validation dataset:  1719
Length of test dataset:  1763


## Augmentacja

In [19]:
class Transforms():
    def __init__(self, train: bool = False):
      self.train = train
    
    def rotate(self, image, angle):
      angle = random.uniform(-angle, +angle)

      transformation_matrix = torch.tensor([
          [+cos(radians(angle)), -sin(radians(angle))], 
          [+sin(radians(angle)), +cos(radians(angle))]
      ])

      image = imutils.rotate(np.array(image), angle)

      return Image.fromarray(image)

    def resize(self, image, img_size):
      image = tf.resize(image, img_size)
      return image

    def color_jitter(self, image, landmarks):
      color_jitter = transforms.ColorJitter(brightness=0.3, 
                                            contrast=0.3,
                                            saturation=0.3, 
                                            hue=0.1)
      image = color_jitter(image)
      return image

    def crop(self, image, crops):
      left = int(crops['left'])
      top = int(crops['top'])
      width = int(crops['width'])
      height = int(crops['height'])

      image = tf.crop(image, top, left, height, width)

      img_shape = np.array(image).shape
      return image

    def __call__(self, image, crops):
      image = Image.fromarray(image)
      
      image = self.resize(image,  (224, 224))
      if self.train:
        image = self.crop(image, crops)
        image = self.resize(image,  (224, 224))
        image = self.color_jitter(image)
        image = self.rotate(image, angle=10)

      image = tf.to_tensor(image)
      image = tf.normalize(image, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
      return image

## Dataset

In [30]:
class BeeWaspDataset(Dataset):
  def __init__(self, image_dir: str = None, dataframe: pd.DataFrame = None, train: bool = False,
               transforms: Transforms = None):
    self.image_dir = image_dir
    self.dataframe = dataframe
    self.train = train
    self.transforms = transforms

  def __getitem__(self, index):
    image_path = os.path.join(self.image_dir, self.dataframe.iloc[index]["path"])
    image = cv2.imread(image_path)
    
    if self.transform:
      image = self.transform(image)
    
    if self.train:
      label = self.dataframe.iloc[index]["label"]
      return image, label
    else: 
      return image

  def __len__(self):
      return len(self.dataframe)

## Przygotowanie datasetów do treningu

In [31]:
train_data = BeeWaspDataset(dataframe=train_df,
                            image_dir="/content/kaggle_bee_vs_wasp/",
                            train=True,
                            transforms=Transforms(train=True))

val_data = BeeWaspDataset(dataframe=val_df,
                          image_dir="/content/kaggle_bee_vs_wasp/",
                          train=True,
                          transforms=Transforms(train=False))

test_data = BeeWaspDataset(dataframe=test_df,
                          image_dir="/content/kaggle_bee_vs_wasp/",
                          train=True,
                          transforms=Transforms(train=False))

train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=32, num_workers=4)
val_loader = DataLoader(dataset=val_data, shuffle=True, batch_size=32, num_workers=4)
test_loader = DataLoader(dataset=test_data, shuffle=True, batch_size=32, num_workers=4)