<a href="https://colab.research.google.com/github/urmzd/rabbit-classifier/blob/main/src/rc_e1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# Install loguru into the kernel's environment; a later cell imports its `logger`.
%pip install loguru

You should consider upgrading via the '/home/urmzd/.pyenv/versions/3.8-dev/envs/rabbit-classifier/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


# Data Exploration

In [30]:
%mkdir -p /tmp
%curl https://raw.githubusercontent.com/urmzd/rabbit-classifier/main/resources/data.csv --output /tmp/data.csv

UsageError: Line magic function `%curl` not found.


In [31]:
import pandas as pd
import requests
import re
import pathlib
from loguru import logger

# Local location of the dataset CSV read by get_data().
DATA_PATH = "/tmp/data.csv"
# Directory in which download_image() stores the fetched image files.
IMAGE_PATH = "/tmp/images"

def get_data():
  """Read the first two columns of the CSV at DATA_PATH into a DataFrame.

  Returns:
    A pandas DataFrame holding only the first two columns of the file.
  """
  first_two_columns = [0, 1]
  return pd.read_csv(DATA_PATH, usecols=first_two_columns)

def _file_name_from_link(link: str) -> str:
  """Derive a local file name from the last path segment of an image URL.

  The query string and fragment are discarded, and ".png" is appended when the
  segment does not already end in .jpeg/.jpg/.png (case-insensitive).

  Raises:
    Exception: If the link has no "/" or its last path segment is empty.
  """
  known_suffixes = (".jpeg", ".jpg", ".png")

  # Strip "?query" / "#fragment" first so they never become part of the name.
  path_part = re.split(r"[?#]", link, maxsplit=1)[0]
  _, separator, file_name = path_part.rpartition("/")

  # An empty name would make the target path equal to the image directory itself.
  if not separator or not file_name:
    raise Exception(f"Failed to match file_name for link {link}")

  if pathlib.PurePosixPath(file_name).suffix.lower() not in known_suffixes:
    file_name += ".png"

  return file_name


def download_image(link: str) -> pathlib.Path:
  """Download the image at `link` into IMAGE_PATH unless it is already there.

  Args:
    link: URL of the image.

  Returns:
    Path of the local file (freshly written, or the previously cached one).

  Raises:
    Exception: If no file name can be derived from the link.
    requests.HTTPError: If the server answers with an error status.
  """
  file_name = _file_name_from_link(link)

  content_path = pathlib.Path(IMAGE_PATH)
  content_path.mkdir(parents=True, exist_ok=True)

  file_path = content_path / file_name

  # Files already on disk are reused instead of being fetched again.
  if file_path.exists():
    return file_path

  image_request_headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
  }

  # A timeout keeps one unresponsive host from hanging the whole cell.
  image = requests.get(link, headers=image_request_headers, timeout=30)
  # Fail here rather than caching an HTTP error page as if it were an image.
  image.raise_for_status()

  with open(file_path, "wb") as handle:
    handle.write(image.content)

  return file_path

# Load the dataset and preview its first rows.
df = get_data()
df.head()

Unnamed: 0,label,link
0,rabbit,https://upload.wikimedia.org/wikipedia/commons...
1,rabbit,https://upload.wikimedia.org/wikipedia/commons...
2,rabbit,https://www.massaudubon.org/var/ezdemo_site/st...
3,rabbit,https://www.welcomewildlife.com/wp-content/upl...
4,rabbit,https://i.natgeofe.com/k/58df97a7-5c47-44b8-97...


In [32]:
import cv2
from typing import NewType, Tuple, List
import numpy as np
# from google.colab.patches import cv2_imshow

# Distinct static types for the values passed around in this notebook.
# The supertype must be a class: `np.array` is a factory function, whereas
# `np.ndarray` is the actual array type.
Image = NewType('Image', np.ndarray)
Label = NewType('Label', str)
# An image together with its label.
ImageLabelPair = Tuple[Image, Label]

def get_image(file_path: pathlib.Path, show=False) -> Image:
  """Read an image file with OpenCV.

  Args:
    file_path: Location of the image file.
    show: When true, also render the image with matplotlib.

  Returns:
    The array returned by `cv2.imread`, wrapped as `Image`.

  Raises:
    ValueError: If OpenCV could not read the file.
  """
  image = cv2.imread(str(file_path))

  # cv2.imread reports failure by returning None rather than raising.
  if image is None:
    raise ValueError(f"Failed to read image at {file_path}")

  if show:
    # cv2.imshow requires a window name as its first argument and opens a
    # native GUI window; matplotlib is used instead so the image can be
    # rendered as notebook output.
    import matplotlib.pyplot as plt

    # OpenCV stores colour channels as BGR, matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.show()

  return Image(image)

# RawX: list of images as returned by get_image; RawY: numpy array of labels.
RawX = NewType('RawX', List[Image])
RawY = NewType('RawY', np.ndarray)

def get_x_y(raw_data: pd.DataFrame) -> Tuple[RawX, RawY]:
  """Build the raw image list and label array from a dataset frame.

  Args:
    raw_data: Frame whose first column holds the labels and whose second
      column holds the image links.

  Returns:
    (x, y): `x` is the list of images read from disk after downloading each
    link; `y` is the label column reshaped to a single-column 2-D array.
  """
  # Read from the frame that was passed in, not from the notebook-global `df`.
  y = raw_data.iloc[:, 0].to_numpy()
  logger.info(y)
  # One row per sample, one column.
  y = y.reshape(-1, 1)

  x_links = raw_data.iloc[:, 1].tolist()
  x_paths = [download_image(link) for link in x_links]
  x = [get_image(path) for path in x_paths]

  return RawX(x), RawY(y)

In [33]:
from sklearn.preprocessing import OneHotEncoder
import albumentations as A
import tensorflow as tf
import matplotlib.pyplot as plt
import random

# Seed Python's built-in `random` module with a fixed value.
random.seed(42)


def get_x_y_preprocessors(x: RawX, y: RawY):
  """Create the image transform pipeline and a label encoder fitted on `y`.

  Args:
    x: Raw images; not used when building the pipeline.
    y: Label array used to fit the one-hot encoder.

  Returns:
    (image_pipeline, label_encoder): an albumentations `Compose` and a fitted
    `OneHotEncoder`.
  """
  label_encoder = OneHotEncoder(sparse=False)
  label_encoder.fit(y)

  # Resize so the longest side is 200, pad up to 200x200, then normalize.
  image_steps = [
      A.LongestMaxSize(200),
      A.PadIfNeeded(min_height=200, min_width=200),
      A.Normalize(),
  ]
  image_pipeline = A.Compose(image_steps)

  return image_pipeline, label_encoder

# Load the dataset, then build the raw image list and label array from it.
data = get_data()
x,y = get_x_y(data)

def get_processed_x_y(x: RawX, y: RawY):
  """Run every image through the transform pipeline and one-hot encode the labels.

  The first processed image is logged and plotted as a visual sanity check.

  Args:
    x: Raw images.
    y: Label array, in the layout accepted by the fitted one-hot encoder.

  Returns:
    (x_processed, y_transformed): the first channel of every transformed
    image stacked into one float32 array, and the one-hot encoded labels.
  """
  x_encoder, y_encoder = get_x_y_preprocessors(x, y)

  processed_images = []
  for i, x_sample in enumerate(x):
    transformed_image = x_encoder(image=x_sample)["image"]
    # Keep only the first channel of the transformed image.
    normalized_transformed_image = transformed_image.astype(np.float32)[:, :, 0]
    if i == 0:
      logger.info(normalized_transformed_image.dtype)
      logger.info(normalized_transformed_image.shape)
      logger.info(normalized_transformed_image)
      plt.imshow(normalized_transformed_image)
    processed_images.append(normalized_transformed_image)

  # The encoder returned above is already fitted on `y`; reuse it instead of
  # fitting a second, identical encoder.
  y_transformed: np.ndarray = y_encoder.transform(y)

  # The pipeline resizes and pads every image to 200x200, so they stack cleanly.
  return np.stack(processed_images), y_transformed

# Run the preprocessing on the raw images and labels.
get_processed_x_y(x,y)

2022-03-14 21:30:14.083 | INFO     | __main__:get_x_y:23 - ['rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit'
 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit'
 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit'
 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit'
 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit'
 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'rabbit' 'hare' 'hare'
 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare'
 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare'
 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare'
 'hare' 'hare' 'hare' 'hare' 'hare' 'hare' 'hare']


# Data Processing