# Data Exploration

In [3]:
%mkdir -p /tmp
%curl https://raw.githubusercontent.com/urmzd/rabbit-classifier/main/resources/data.csv --output /tmp/data.csv

SyntaxError: invalid syntax (<ipython-input-3-79f8352f766e>, line 2)

In [None]:
import pandas as pd
import requests
import re
import pathlib
from loguru import logger

# Location of the dataset CSV read by get_data.
DATA_PATH = "/tmp/data.csv"
# Directory in which download_image stores the image files it fetches.
IMAGE_PATH = "/tmp/images"

def get_data():
  """Load the first two columns of the CSV at DATA_PATH into a DataFrame."""
  return pd.read_csv(DATA_PATH, usecols=[0, 1])

def _cache_file_name(link: str) -> str:
  """Build the cache file name for an image link.

  The name is the last segment of the URL path (query string and fragment
  excluded), followed by a short digest of the whole link so that different
  links sharing a path segment do not collide, and ending in the segment's
  own image extension or ".png" when it has no recognised one.

  Raises:
    Exception: if the link's path has no final segment to use as a name.
  """
  import hashlib
  import posixpath
  from urllib.parse import urlsplit

  base_name = posixpath.basename(urlsplit(link).path)
  if not base_name:
    raise Exception(f"Failed to match file_name for link {link}")

  stem, suffix = posixpath.splitext(base_name)
  if suffix.lower() not in (".jpeg", ".jpg", ".png"):
    # Missing or unrecognised extension: keep the whole segment, default to PNG.
    stem, suffix = base_name, ".png"

  link_digest = hashlib.sha256(link.encode("utf-8")).hexdigest()[:12]
  return f"{stem}-{link_digest}{suffix}"


def download_image(link: str) -> pathlib.Path:
  """Download an image into IMAGE_PATH and return the path of the local file.

  A file that already exists under the derived name is returned as-is, with
  no network request.

  Args:
    link: URL of the image.

  Returns:
    Path of the cached image file inside IMAGE_PATH.

  Raises:
    Exception: if no file name can be derived from the link.
    requests.RequestException: if the request fails, times out, or the server
      answers with an error status (nothing is written in that case, so an
      error page is never cached as an image).
  """
  # Derive the name first so a bad link fails before touching the file system.
  file_name = _cache_file_name(link)

  content_path = pathlib.Path(IMAGE_PATH)
  content_path.mkdir(parents=True, exist_ok=True)

  file_path = content_path / file_name

  if file_path.exists():
    return file_path

  # Browser-like User-Agent sent with the image request.
  image_request_headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
  }

  # A timeout keeps one unresponsive host from hanging the whole run.
  image = requests.get(link, headers=image_request_headers, timeout=30)
  image.raise_for_status()

  file_path.write_bytes(image.content)

  return file_path

# Load the dataset and preview its first rows as the cell's output.
df = get_data()
df.head()

In [None]:
import cv2
from typing import NewType, Tuple, List
import numpy as np

# Distinct static types for the pipeline's values. NewType is an identity
# function at runtime, so Image(arr) returns arr itself.
# The supertype must be the array class np.ndarray; np.array is only the
# factory function and is not a type.
Image = NewType('Image', np.ndarray)
Label = NewType('Label', str)
ImageLabelPair = Tuple[Image, Label]

def get_image(file_path: pathlib.Path, show=False) -> Image:
  """Read an image file with OpenCV, optionally displaying it.

  Args:
    file_path: location of the image file on disk.
    show: when true, draw the image with matplotlib.

  Returns:
    The pixel array exactly as cv2.imread returns it (OpenCV's BGR channel
    order).

  Raises:
    ValueError: if OpenCV cannot read or decode the file.
  """
  image = cv2.imread(str(file_path))

  if image is None:
    # cv2.imread reports failure by returning None instead of raising.
    raise ValueError(f"Failed to read image from {file_path}")

  if show:
    # Imported here because pyplot is only imported by a later cell.
    import matplotlib.pyplot as plt

    # imshow expects RGB while OpenCV decodes to BGR; convert for display only.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

  return Image(image)

# Collection-level types: the raw images as a list, the raw labels as an array.
RawX = NewType("RawX", List[Image])
RawY = NewType("RawY", np.ndarray)

def get_x_y(raw_data: pd.DataFrame) -> Tuple[RawX, RawY]:
  """Split the raw table into loaded images and a label column vector.

  Column 0 supplies the labels and column 1 the image links. Every link is
  downloaded first; the files are read only after all downloads are done.

  Returns:
    (images, labels): one image per row, and the labels reshaped to a single
    column.
  """
  labels = raw_data.iloc[:, 0].to_numpy().reshape(-1, 1)

  links = raw_data.iloc[:, 1].tolist()
  image_paths = list(map(download_image, links))
  images = list(map(get_image, image_paths))

  return RawX(images), RawY(labels)

In [None]:
from sklearn.preprocessing import OneHotEncoder
import albumentations as A
import tensorflow as tf
import matplotlib.pyplot as plt
import random

# Seed Python's built-in random module so runs are repeatable.
# NOTE(review): this does not seed NumPy or TensorFlow — confirm whether they
# need seeding too.
random.seed(42)


def get_x_y_preprocessors(x: RawX, y: RawY):
  """Fit the label encoder and build the image transform pipeline.

  Args:
    x: raw images; statistics are taken over each image's first two axes and
      then averaged across images.
    y: raw labels, passed to the one-hot encoder's fit as given.

  Returns:
    (x_encoder, y_encoder): an albumentations Compose that resizes the longest
    side to 200, pads to at least 200x200 and normalises with the computed
    mean/std; and a fitted OneHotEncoder producing dense output.
  """
  # `sparse` was renamed to `sparse_output` in newer scikit-learn releases;
  # fall back to the old keyword when the new one is not accepted.
  try:
    y_encoder = OneHotEncoder(sparse_output=False)
  except TypeError:
    y_encoder = OneHotEncoder(sparse=False)
  y_encoder.fit(y)

  x_means = [np.mean(image, axis=(0, 1)) for image in x]
  x_stds = [np.std(image, axis=(0, 1)) for image in x]

  norm_mean = np.mean(x_means, axis=0)
  norm_std = np.mean(x_stds, axis=0)

  logger.info(f"Normalization Mean: {norm_mean}")
  logger.info(f"Normalization Std: {norm_std}")

  x_encoder = A.Compose([
      A.LongestMaxSize(200),
      A.PadIfNeeded(min_height=200, min_width=200),
      # The statistics above are on the raw pixel scale. Normalize multiplies
      # mean and std by max_pixel_value (255 by default), so it is set to 1.0
      # to apply (pixel - mean) / std directly.
      A.Normalize(mean=norm_mean, std=norm_std, max_pixel_value=1.0),
  ])

  return x_encoder, y_encoder

def get_processed_x_y(x: RawX, y: RawY):
  """Apply the fitted preprocessors to the raw images and labels.

  Args:
    x: raw images.
    y: raw labels.

  Returns:
    (x_encoded, y_transformed): a list with one transformed float32 image per
    input image, and the labels as returned by the encoder's transform.
  """
  x_encoder, y_encoder = get_x_y_preprocessors(x, y)

  # Keep the pipeline's floating-point output: normalised values are signed
  # and fractional, so casting them to uint8 would wrap and truncate them.
  x_encoded = [
      np.asarray(x_encoder(image=sample)["image"], dtype=np.float32)
      for sample in x
  ]

  y_transformed: np.ndarray = y_encoder.transform(y)

  return x_encoded, y_transformed

In [None]:
# Load the table, then download and read every image it links to.
data = get_data()
x, y = get_x_y(data)

In [None]:
# Preprocess everything, then inspect the last image's value range and show it.
image = get_processed_x_y(x, y)[0][-1]
# The stray ", " in the messages came from a print-style call and is removed.
logger.info(f"MIN: {np.min(image)}")
logger.info(f"MAX: {np.max(image)}")
plt.imshow(image)

# Data Processing