In [1]:
import cv2
import os
import pickle
import numpy as np
import PIL as pil
from tqdm.auto import tqdm

In [2]:
import os
from google.colab import drive
# Mount Google Drive under /content/drive so the shared folder is reachable
# as a regular path; force_remount=True is passed on every run.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Drive files

1. Aceptar invitación a la unidad compartida Patrones2022

2. Abrir la unidad, y sobre la carpeta `home`, hacer click derecho y luego "Añadir acceso directo a Drive".

Si eso funcionó, al ejecutar la siguiente celda debería verse `db_caracteristicas  pickled_database  raw_database  trained_models` en el output.

In [3]:
# Base folder of the Drive shortcut. It is kept with a trailing slash because
# later cells build paths by appending directly to it (f"{HOME}raw_database/...").
HOME = "/content/drive/My Drive/home/"
!ls "{HOME}"

db_caracteristicas  pickled_database  raw_database  trained_models


In [4]:
# List the entries stored under raw_database.
!ls "{HOME}/raw_database"

Bicicletas  Cachipun  Espinas  Letras  Lunares


In [5]:
def LoadImage(path: str, cmap: str = 'gray', echo: bool = True) -> np.ndarray:
    """
    Load an image from a path using OpenCV.

    Parameters
    ----------
    path : str
        Location of the image file.
    cmap : str
        'gray' reads with cv2.IMREAD_GRAYSCALE, 'rgb' with cv2.IMREAD_COLOR.
    echo : bool
        When True, print the path and the shape of the loaded image.

    Returns
    -------
    np.ndarray or None
        Whatever cv2.imread returns. That is None when the file cannot be
        decoded, and this function passes it through regardless of `echo`.

    Raises
    ------
    AttributeError
        If `cmap` is neither 'gray' nor 'rgb'.
    """
    if cmap == 'gray':
        cflag = cv2.IMREAD_GRAYSCALE
    elif cmap == 'rgb':
        # NOTE(review): OpenCV documents colour reads as BGR-ordered; confirm
        # that callers do not expect RGB channel order.
        cflag = cv2.IMREAD_COLOR
    else:
        # Carry the explanation in the exception instead of a separate print.
        raise AttributeError(f"{cmap} is not a valid option")
    if echo:
        print("Image: " + path)
    img = cv2.imread(path, cflag)
    if img is None:
        # Previously echo=True crashed on img.shape while echo=False silently
        # returned None; behave the same way in both modes.
        if echo:
            print("Image could not be read: " + path)
        return None
    if echo:
        print("Image size:", img.shape)
    return img


def NofClasses(path: str) -> int:
    """
    Count the classes stored under a dataset directory.

    The result is the number of entries os.listdir reports for `path`,
    minus one.
    """
    # NOTE(review): the "- 1" presumably discounts a single non-class entry
    # (for example a hidden file) - confirm against the real folder layout.
    entries = os.listdir(path)
    total = len(entries)
    return total - 1


def NofSamples(path: str) -> list:
    """
    Get the number of samples for each class.

    Parameters
    ----------
    path : str
        Dataset directory whose entries are the class folders. A trailing
        separator is optional.

    Returns
    -------
    list
        One count per entry of `path`, in os.listdir order. Files named
        '.DS_Store' are not counted, matching what BuildDataset skips when
        it loads the images.
    """
    samples = []
    for subdir in os.listdir(path):
        # os.path.join works whether or not `path` ends with a separator.
        entries = os.listdir(os.path.join(path, subdir))
        samples.append(sum(1 for name in entries if name != '.DS_Store'))
    return samples


def GetMinDim(path: str, cmap: str = 'gray', echo: bool = False,
              all_images: bool = False) -> tuple:
    """
    Return the smallest (width, height) found among the inspected images.

    Parameters
    ----------
    path : str
        Dataset directory containing one folder per class.
    cmap : str
        Colour mode forwarded to LoadImage ('gray' or 'rgb').
    echo : bool
        Forwarded to LoadImage to print per-image information.
    all_images : bool
        When False (the default, and the original behaviour) only the first
        readable image of each class folder is inspected. Set it to True to
        scan every image.

    Returns
    -------
    tuple
        (min_width, min_height), or (None, None) when no image was read.
    """
    minh = None
    minw = None
    for subdir in os.listdir(path):
        folder = os.path.join(path, subdir)
        for fil in os.listdir(folder):
            # Same placeholder file that BuildDataset ignores.
            if fil == '.DS_Store':
                continue
            img = LoadImage(os.path.join(folder, fil), cmap=cmap, echo=echo)
            if img is None:
                continue
            # Slice the shape so colour images (h, w, channels) also unpack.
            h, w = img.shape[:2]
            if minh is None or h < minh:
                minh = h
            if minw is None or w < minw:
                minw = w
            if not all_images:
                break
    return (minw, minh)


def BuildDataset(path: str, cmap: str = 'gray', echo: bool = False) -> tuple:
    """
    Build a dataset from a directory with one folder per class.

    Parameters
    ----------
    path : str
        Dataset directory; every entry is treated as a class folder.
    cmap : str
        Colour mode forwarded to LoadImage ('gray' or 'rgb').
    echo : bool
        When True, print the target image size and the details of the first
        image that is loaded.

    Returns
    -------
    tuple
        (X, y, #classes, [#samples per class]). X holds one resized image per
        row and y the class index of each row as a column vector. Only images
        that were actually loaded are included, so skipped files do not leave
        all-zero rows behind.
    """
    imdim = GetMinDim(path)
    if echo:
        print(f"Smallest image size: {imdim}")
    classes = NofClasses(path)
    # Upper bound on the number of rows; trimmed to the real count on return.
    capacity = sum(NofSamples(path))
    shape = (capacity, imdim[1], imdim[0])
    if cmap == 'rgb':
        # Colour images keep a trailing channel axis after cv2.resize.
        shape += (3,)
    Xsam = np.zeros(shape)
    Ysam = np.zeros((capacity, 1))
    samples = []
    row = 0
    # Report only the first image, and only if the caller asked for output.
    announce = echo
    for label, subdir in enumerate(os.listdir(path)):
        folder = os.path.join(path, subdir)
        loaded = 0
        for fil in tqdm(os.listdir(folder)):
            if fil == '.DS_Store':
                continue
            img = LoadImage(os.path.join(folder, fil), cmap=cmap, echo=announce)
            announce = False
            if img is None:
                # Unreadable file: skip it instead of crashing in cv2.resize.
                continue
            # cv2.resize takes the target size as (width, height).
            Xsam[row] = cv2.resize(img, imdim, interpolation=cv2.INTER_AREA)
            Ysam[row] = label
            row += 1
            loaded += 1
        samples.append(loaded)
    return (Xsam[:row], Ysam[:row], classes, samples)
    

In [7]:
%%script echo skipping

datasets = ["Bicicletas", "Cachipun", "Espinas", "Letras", "Lunares"]

for DSNAME in datasets:
  # BuildDataset returns (X, y, number of classes, samples per class).
  out = BuildDataset(f"{HOME}raw_database/{DSNAME}/", echo=True)
  Xsam, Ysam, clas, sam = out

  # One pickle per artifact, written under pickled_database/<dataset>/.
  artifacts = {'Xsam': Xsam, 'Ysam': Ysam, 'clas': clas, 'sam': sam}
  for stem, payload in artifacts.items():
    # The with-block closes the file even if pickling raises.
    with open(f'{HOME}pickled_database/{DSNAME}/{stem}.pkl', 'wb') as pick_insert:
      pickle.dump(payload, pick_insert)

skipping


## Bloque 4
- Split Train-Validation
- Aplicación secuencial de alguna selección/transformación
  - Clean
  - MinMax Scaling
  - SelectKBest
  - SFS
  - PCA
  - ICA

In [4]:
from IPython.display import clear_output
# Install the exact package versions pinned for this section.
!pip3 install scipy==1.2
!pip3 install pybalu==0.2.5
# Remove the installer log from the cell output.
clear_output()

In [5]:
from sklearn.model_selection import train_test_split
from pybalu.feature_selection import clean
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from pybalu.feature_selection import sfs
from sklearn.decomposition import PCA, FastICA

In [6]:
class CleanInterface:
  """
  Adapter that exposes pybalu's `clean` through a `transform` method.

  The value returned by `clean(X)` is stored in `self.model` and later used
  as a column index. Presumably it holds the indices of the features to keep;
  confirm against the pybalu documentation.
  """

  def __init__(self, X):
    selected = clean(X)
    self.model = selected

  def transform(self, X):
    """Return the columns of X selected by `self.model`."""
    columns = self.model
    return X[:, columns]

class SFSInterface:
  """
  Adapter that exposes pybalu's `sfs` through a `transform` method.

  The value returned by `sfs(X, y, s, show=False)` is stored in `self.model`
  and later used as a column index. Presumably it holds the indices of the
  `s` selected features; confirm against the pybalu documentation.
  """

  def __init__(self, X, y, s):
    chosen = sfs(X, y, s, show=False)
    self.model = chosen

  def transform(self, X):
    """Return the columns of X selected by `self.model`."""
    picked = self.model
    return X[:, picked]

In [66]:
class Block4:
    """
    Train/validation split followed by a sequence of feature transforms.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with np.array before splitting.
    y : array-like
        Labels, also used to stratify the split.
    sequence : list of str
        Steps applied in order. 'CLEAN' and 'MINMAX' take no parameter;
        'KBEST-k', 'SFS-k', 'PCA-k' and 'ICA-k' take an integer after the dash.
    ratio : float
        Fraction of the data held out for validation.

    Every step is fitted on the training split only and then applied to both
    splits. Results are exposed as Xtrain, Xval, ytrain and yval; the fitted
    steps are kept in `models`.
    """

    # Step names that require an integer parameter after the dash.
    _PARAM_STEPS = ('KBEST', 'SFS', 'PCA', 'ICA')

    def __init__(self, X, y, sequence, ratio=0.3):
        self.sequence = sequence
        self.models = []

        self.Xtrain, self.Xval, self.ytrain, self.yval = train_test_split(
            np.array(X), y, test_size=ratio, random_state=42, stratify=y)

        self.Xtrain = self.interative_fit()
        self.Xval = self.transform(self.Xval)

    def interative_fit(self):
        """
        Fit every step of `self.sequence` on the training data, in order.

        Each step is fitted on the output of the previous one. `self.models`
        is replaced only after all steps succeed, so a failed or repeated fit
        never leaves a partial or duplicated list behind.

        Returns the transformed training matrix.
        Raises ValueError for an unknown step or a missing/invalid parameter.
        """
        X = self.Xtrain
        fitted = []
        for seq in self.sequence:
            model = self._fit_step(seq, X)
            fitted.append(model)
            X = model.transform(X)
        self.models = fitted
        return X

    def _fit_step(self, seq, X):
        """Build and fit the single step described by `seq` on X."""
        parts = seq.split('-')
        name = parts[0]

        if name == 'CLEAN':
            return CleanInterface(X)
        if name == 'MINMAX':
            return MinMaxScaler().fit(X)
        if name not in self._PARAM_STEPS:
            # Fail here with a clear message instead of storing None and
            # crashing later on None.transform.
            raise ValueError(f'No existe el modelo {name}')
        if len(parts) < 2:
            raise ValueError(f"Step '{seq}' needs an integer parameter, e.g. '{name}-10'")
        param = int(parts[1])

        if name == 'KBEST':
            # NOTE(review): chi2 is documented to require non-negative
            # features, so KBEST presumably relies on MINMAX running earlier.
            return SelectKBest(chi2, k=param).fit(X, self.ytrain)
        if name == 'SFS':
            return SFSInterface(X, self.ytrain, param)
        if name == 'PCA':
            return PCA(n_components=param).fit(X)
        return FastICA(n_components=param, random_state=0).fit(X, self.ytrain)

    def transform(self, X):
        """Apply every fitted step, in order, to X and return the result."""
        for model in self.models:
            X = model.transform(X)
        return X

In [71]:
# DATOS DE PRUEBA
!gdown --id 1CA-l9_JjdjG_4kTuavKf8Wm27dt0jyqT
clear_output()

f = open('data.p', "rb")
data = pickle.load(f)
X = data['train']

y = np.array([0 if i < 7000 else 1 for i in range(0, 14000)])

X.shape, len(y)

((14000, 1844), 14000)

In [72]:
# Run the whole pipeline in order: CLEAN, MINMAX, KBEST-50, SFS-10, PCA-5, ICA-2.
block = Block4(np.array(X), y, ['CLEAN', 'MINMAX', 'KBEST-50', 'SFS-10', 'PCA-5', 'ICA-2'])
# Shapes of the transformed training and validation matrices.
block.Xtrain.shape, block.Xval.shape

(9800, 1844) (4200, 1844) (9800,) (4200,)


((9800, 2), (4200, 2))