# **Практическое задание №1**

In [None]:
!pip install -q tqdm
!pip install --upgrade --no-cache-dir gdown

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1


In [None]:
# Mount Google Drive under /content/drive (Colab only).
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# When True, the train-or-load cell loads a stored model instead of training.
EVALUATE_ONLY = False
# When True, the model is also evaluated on the full test set.
TEST_ON_LARGE_DATASET = True
# Class names; the number of entries defines the size of the model's output layer.
TISSUE_CLASSES = ('ADI', 'BACK', 'DEB', 'LYM', 'MUC', 'MUS', 'NORM', 'STR', 'TUM')
# Google Drive file ids of the .npz archives, keyed by dataset name.
DATASETS_LINKS = {
    # The commented-out entries are the ids from the original notebook.
    # Unfortunately the original links no longer work because of too many downloads.
    # 'train': '1XtQzVQ5XbrfxpLHJuL0XBGJ5U7CS-cLi',
    # https://drive.google.com/file/d/1ccAgGUs43hA6hf9rpV8fi84VLv_2uW8a/view?usp=sharing
    'train': '1ccAgGUs43hA6hf9rpV8fi84VLv_2uW8a',
    'train_small': '1qd45xXfDwdZjktLFwQb-et-mAaFeCzOR',
    # 'train_tiny': '1I-2ZOuXLd4QwhZQQltp817Kn3J0Xgbui',
    # https://drive.google.com/file/d/18jKz6GfnilfIYZHT-sASvPfU1BH6p2OU/view?usp=drive_link
    'train_tiny': '18jKz6GfnilfIYZHT-sASvPfU1BH6p2OU',
    # 'test': '1RfPou3pFKpuHDJZ-D9XDFzgvwpUBFlDr',
    # https://drive.google.com/file/d/1brH5TzbTNUPKz3yoWS_RD4FW1xJc-dEK/view?usp=sharing
    'test': '1brH5TzbTNUPKz3yoWS_RD4FW1xJc-dEK',
    'test_small': '1wbRsog0n7uGlHIPGLhyN-PMeT2kdQ2lI',
    # 'test_tiny': '1viiB0s041CNsAK4itvX8PnYthJ-MDnQc'
    # https://drive.google.com/file/d/1bOavoin0mTiBhx8AYZhIkAa3YhEinbLa/view?usp=sharing
    'test_tiny': '1bOavoin0mTiBhx8AYZhIkAa3YhEinbLa'
}
# Input image size used for the model's input_shape.
IMG_HEIGHT = 224
IMG_WIDTH = 224

# Number of images per batch produced by MySequence.
BATCH_SIZE = 16
# NOTE(review): not referenced by any code visible in this notebook.
SHUFFLE_BUFFER_SIZE = 100
# Fraction of a dataset's images assigned to the training subset.
TRAIN_RATIO = 0.8

In [None]:
from pathlib import Path
import numpy as np
from typing import List
from tqdm.notebook import tqdm
from time import sleep
from PIL import Image
import IPython.display
from sklearn.metrics import balanced_accuracy_score
import gdown
import cv2

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint

2023-11-27 09:29:01.442637: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
AUTOTUNE = tf.data.AUTOTUNE

class Dataset:
    """In-memory image dataset read from a ``<name>.npz`` archive.

    The archive must contain two arrays: ``data`` (the images, indexed as
    ``images[i, :, :, :]``) and ``labels`` (one entry per image).  A random
    train/validation index split is created on construction and can be
    re-drawn with :meth:`train_val_index_split`.
    """

    def __init__(self, name, local = True):
        """Load dataset ``name`` and build the train/validation index split.

        Args:
            name: Dataset name, e.g. ``'train'``.  Must be a key of
                ``DATASETS_LINKS`` when ``local`` is false.
            local: If true, read ``datasets/<name>.npz``; otherwise download
                ``<name>.npz`` from Google Drive with gdown first.
        """
        self.name = name
        self.is_loaded = False
        if local:
            npz_path = f'datasets/{name}.npz'
        else:
            url = f"https://drive.google.com/uc?export=download&confirm=pbef&id={DATASETS_LINKS[name]}"
            npz_path = f'{name}.npz'
            gdown.download(url, npz_path, quiet=False)
            print(f'Loading dataset {self.name} from npz.')
        # np.load keeps the archive open; the context manager closes it as
        # soon as both arrays have been read into memory.
        with np.load(npz_path) as np_obj:
            self.images = np_obj['data']
            self.labels = np_obj['labels']
        self.n_files = self.images.shape[0]
        self.is_loaded = True
        print(f'Done. Dataset {name} consists of {self.n_files} images.')
        self.train_val_index_split(TRAIN_RATIO)

    def image(self, i):
        """Return the i-th image as a numpy array, or ``None`` if nothing is loaded."""
        if self.is_loaded:
            return self.images[i, :, :, :]
        return None

    def images_seq(self, n=None):
        """Yield images in storage order (needed for testing).

        Args:
            n: Number of images to yield; ``None`` means all of them.  ``0``
                yields nothing, and values above the dataset size are clamped.
        """
        count = self.n_files if n is None else min(n, self.n_files)
        for i in range(count):
            yield self.image(i)

    def random_image_with_label(self):
        """Return a uniformly chosen ``(image, label)`` pair."""
        i = np.random.randint(self.n_files)
        return self.image_with_label(i)

    def _gather(self, indices):
        """Return ``(images, labels)`` selected by an integer index array."""
        return self.images[indices], self.labels[indices]

    def random_batch_with_labels(self, n):
        """Return ``n`` images and their labels, sampled with replacement from the whole dataset."""
        indices = np.random.choice(self.n_files, n)
        return self._gather(indices)

    def random_batch_from_train_val(self, n, set_name='train'):
        """Return ``n`` images and labels sampled with replacement from one subset.

        Args:
            n: Batch size.
            set_name: ``'train'`` samples from ``train_inds``; any other value
                samples from ``val_inds``.
        """
        pool = self.train_inds if set_name == 'train' else self.val_inds
        indices = np.random.choice(pool, n)
        return self._gather(indices)

    def image_with_label(self, i: int):
        """Return the i-th image together with its label."""
        return self.image(i), self.labels[i]

    def train_val_index_split(self, train_ratio=0.8, seed=None):
        """(Re)draw the random train/validation index split.

        Args:
            train_ratio: Fraction of the images assigned to ``train_inds``;
                the remaining indices become ``val_inds``.
            seed: Optional seed for a reproducible split.  ``None`` (default)
                draws from numpy's global random state.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.train_inds = rng.choice(self.n_files, int(self.n_files * train_ratio), replace=False)
        self.val_inds = np.setdiff1d(np.arange(self.n_files), self.train_inds)

    def train_val_split(self, seed: int):
        """Return all images and labels as one ``tf.data.Dataset``.

        NOTE(review): despite the name, no split is performed and ``seed`` is
        not used; the full, unbatched dataset is returned.
        """
        return tf.data.Dataset.from_tensor_slices((self.images, self.labels))


In [None]:
class MySequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves random batches from one subset of a ``Dataset``.

    Batches are sampled with replacement by
    ``Dataset.random_batch_from_train_val``; the batch index passed by Keras
    is therefore not used to pick the samples.
    """

    def __init__(self, dataset: Dataset, set_name='train') -> None:
        """
        Args:
            dataset: Source dataset providing ``train_inds`` / ``val_inds``.
            set_name: ``'train'`` for the training subset; any other value
                selects the validation subset.
        """
        super().__init__()
        self.dataset = dataset
        self.set_name = set_name
        # One "epoch" should cover the selected subset once, not the whole
        # dataset: sizing it by dataset.n_files made validation run as many
        # steps as training even though the validation subset is much smaller.
        subset = dataset.train_inds if set_name == 'train' else dataset.val_inds
        self.leny = max(1, len(subset) // BATCH_SIZE)

    def __len__(self):
        """Number of batches per epoch."""
        return self.leny

    def __getitem__(self, idx):
        """Return a random ``(images, labels)`` batch of ``BATCH_SIZE`` samples; ``idx`` is ignored."""
        return self.dataset.random_batch_from_train_val(BATCH_SIZE, self.set_name)

In [None]:
class Metrics:
    """Static helpers that score predicted class labels against ground truth."""

    @staticmethod
    def accuracy(gt: List[int], pred: List[int]):
        """Return the fraction of positions where ``gt`` and ``pred`` agree."""
        assert len(gt) == len(pred), 'gt and prediction should be of equal length'
        hits = 0
        for truth, guess in zip(gt, pred):
            hits += int(truth == guess)
        return hits / len(gt)

    @staticmethod
    def accuracy_balanced(gt: List[int], pred: List[int]):
        """Return scikit-learn's balanced accuracy score for ``gt`` vs ``pred``."""
        return balanced_accuracy_score(gt, pred)

    @staticmethod
    def print_all(gt: List[int], pred: List[int], info: str):
        """Print plain and balanced accuracy under a header naming ``info``."""
        print(f'metrics for {info}:')
        # Each metric is computed right before its own line is printed, so the
        # header is already visible if a metric raises.
        plain = Metrics.accuracy(gt, pred)
        print(f'\t accuracy {plain:.4f}:')
        balanced = Metrics.accuracy_balanced(gt, pred)
        print(f'\t balanced accuracy {balanced:.4f}:')

In [None]:
class Model:
    """EfficientNetB0-based image classifier for the classes in ``TISSUE_CLASSES``.

    The wrapped Keras model outputs one logit per class; the loss is
    sparse categorical cross-entropy computed from logits.
    """

    def __init__(self):
        """Build and compile the network, then print its summary."""
        num_classes = len(TISSUE_CLASSES)

        # Random augmentation layers placed in front of the backbone.
        data_augmentation = keras.Sequential(
            [
                layers.RandomFlip("horizontal",
                                input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
                layers.RandomRotation(0.1),
                layers.RandomZoom(0.1),
            ]
        )

        # NOTE(review): include_top=True keeps EfficientNet's own 1000-way
        # classification head, so the Dense layers below are stacked on that
        # head's output rather than on pooled features.  Left unchanged so the
        # architecture matches the checkpoints that were already trained.
        efficient_netb0 = keras.applications.EfficientNetB0(include_top=True, weights=None, input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))

        self.model = Sequential()
        self.model.add(data_augmentation)
        self.model.add(efficient_netb0)
        self.model.add(layers.Dropout(0.2))
        self.model.add(layers.Dense(128, activation='relu', kernel_regularizer='l2'))
        self.model.add(layers.Dense(num_classes))

        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

        self.model.summary()

    def save(self, name: str):
        """Save the Keras model to ``<name>.tf`` (the path that ``load`` reads)."""
        self.model.save(f"{name}.tf")

    def load(self, name: str):
        """Replace the wrapped Keras model with the one stored at ``<name>.tf``."""
        self.model = keras.models.load_model(f"{name}.tf")

    def train(self, dataset: Dataset, epochs=30, checkpoint_path="best_model27.tf"):
        """Train on the dataset's training subset and validate on its validation subset.

        Args:
            dataset: Dataset providing the train/validation index split.
            epochs: Number of training epochs.
            checkpoint_path: Where the best model seen so far is written.
        """
        train_seq = MySequence(dataset, set_name = 'train')
        val_seq = MySequence(dataset, set_name = 'val')

        # Validation data is available here, so the best checkpoint is chosen
        # by validation loss rather than by training loss.  The deprecated
        # `period` argument is dropped; the default already saves per epoch.
        checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1,
                    save_best_only=True, mode='auto')

        self.history = self.model.fit(
            train_seq,
            validation_data=val_seq,
            epochs=epochs,
            verbose=1,
            callbacks=[checkpoint]
        )

    def train_tmp(self, train_ds, epochs=1, checkpoint_path="best_model.tf"):
        """Train on ``train_ds`` without validation data.

        Args:
            train_ds: Training data in any form accepted by ``keras.Model.fit``.
            epochs: Number of training epochs.
            checkpoint_path: Where the model with the lowest training loss is written.
        """
        # No validation data in this variant, so training loss is monitored.
        checkpoint = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1,
                    save_best_only=True, mode='auto')

        self.history = self.model.fit(
            train_ds,
            epochs=epochs,
            verbose=1,
            callbacks=[checkpoint]
        )

    def continue_train(self, name: str, dataset: Dataset):
        """Load the model stored at ``<name>.tf`` and continue training it on ``dataset``."""
        self.load(name)
        self.train(dataset)

    def test_on_dataset(self, dataset: Dataset, limit=None):
        """Return the predicted class index for every image in ``dataset``.

        NOTE(review): ``limit`` is accepted for interface compatibility but is
        not applied; predictions always cover the whole dataset.  Cells in
        this notebook score the result against the full label array, so
        honouring ``limit`` has to be introduced together with those callers.
        """
        return self.model.predict(dataset.images).argmax(axis=-1)

    def test_on_image(self, img: np.ndarray):
        """Predict the class index for a single image.

        Args:
            img: One image of shape ``(H, W, 3)``.  An already batched array
                ``(N, H, W, 3)`` is also accepted.

        Returns:
            An ``int`` class index for a single image, or an array of class
            indices when a batch was passed.
        """
        single = img.ndim == 3
        # Keras expects a leading batch axis.
        batch = img[np.newaxis, ...] if single else img
        classes = self.model.predict(batch).argmax(axis=-1)
        return int(classes[0]) if single else classes


In [None]:
# Load the train and test sets. Dataset defaults to local=True, so the
# archives are read from datasets/train.npz and datasets/test.npz.
d_train = Dataset('train')
d_test = Dataset('test')

Done. Dataset train consists of 18000 images.
Done. Dataset test consists of 4500 images.


In [None]:
# Resume training: continue_train() first loads the model stored at
# 'best_model.tf' (replacing the freshly built network) and then trains on d_train.
new_model = Model()
new_model.continue_train('best_model', d_train)

2023-11-27 09:29:35.106268: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3489 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 224, 224, 3)       0         
                                                                 
 efficientnetb0 (Functional  (None, 1000)              5330571   
 )                                                               
                                                                 
 dropout (Dropout)           (None, 1000)              0         
                                                                 
 dense (Dense)               (None, 128)               128128    
                                                                 
 dense_1 (Dense)             (None, 9)                 1161      
                                                                 
Total params: 5459860 (20.83 MB)
Trainable params: 5417837 (20.67 MB)
Non-trainable params: 42023 (164.16 KB)
__________

2023-11-27 09:29:56.827197: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape insequential_1/efficientnetb0/block2b_drop/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2023-11-27 09:29:58.572048: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8906
2023-11-27 09:29:58.918197: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x9b18180 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-27 09:29:58.918223: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1050, Compute Capability 6.1
2023-11-27 09:29:58.922200: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-27 09:29:59.025574: I ./tensorflow/compiler/jit/device_com

Epoch 1: loss improved from inf to 0.68129, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 2/30
Epoch 2: loss did not improve from 0.68129
Epoch 3/30
Epoch 3: loss did not improve from 0.68129
Epoch 4/30
Epoch 4: loss improved from 0.68129 to 0.67893, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 5/30
Epoch 5: loss did not improve from 0.67893
Epoch 6/30
Epoch 6: loss improved from 0.67893 to 0.66977, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 7/30
Epoch 7: loss improved from 0.66977 to 0.66198, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 8/30
Epoch 8: loss improved from 0.66198 to 0.65224, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 9/30
Epoch 9: loss did not improve from 0.65224
Epoch 10/30
Epoch 10: loss did not improve from 0.65224
Epoch 11/30
Epoch 11: loss improved from 0.65224 to 0.64629, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 12/30
Epoch 12: loss did not improve from 0.64629
Epoch 13/30
Epoch 13: loss did not improve from 0.64629
Epoch 14/30
Epoch 14: loss did not improve from 0.64629
Epoch 15/30
Epoch 15: loss did not improve from 0.64629
Epoch 16/30
Epoch 16: loss improved from 0.64629 to 0.64432, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 17/30
Epoch 17: loss did not improve from 0.64432
Epoch 18/30
Epoch 18: loss improved from 0.64432 to 0.63447, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 19/30
Epoch 19: loss improved from 0.63447 to 0.62576, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 20/30
Epoch 20: loss improved from 0.62576 to 0.60848, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 21/30
Epoch 21: loss did not improve from 0.60848
Epoch 22/30
Epoch 22: loss did not improve from 0.60848
Epoch 23/30
Epoch 23: loss improved from 0.60848 to 0.59641, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 24/30
Epoch 24: loss improved from 0.59641 to 0.58070, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 25/30
Epoch 25: loss did not improve from 0.58070
Epoch 26/30
Epoch 26: loss improved from 0.58070 to 0.57720, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 27/30
Epoch 27: loss improved from 0.57720 to 0.57019, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 28/30
Epoch 28: loss did not improve from 0.57019
Epoch 29/30
Epoch 29: loss improved from 0.57019 to 0.56824, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets


Epoch 30/30
Epoch 30: loss improved from 0.56824 to 0.55551, saving model to best_model27.tf
INFO:tensorflow:Assets written to: best_model27.tf/assets


INFO:tensorflow:Assets written to: best_model27.tf/assets




In [None]:
# Evaluate the checkpoint written during training on the test set.
model = Model()
model.load('best_model27')
pred_1 = model.test_on_dataset(d_test, limit=0.1)
# Score against exactly as many labels as predictions were returned, so the
# metrics stay valid whether or not `limit` shortens the prediction array.
Metrics.print_all(d_test.labels[:len(pred_1)], pred_1, 'test')

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_2 (Sequential)   (None, 224, 224, 3)       0         
                                                                 
 efficientnetb0 (Functional  (None, 1000)              5330571   
 )                                                               
                                                                 
 dropout_1 (Dropout)         (None, 1000)              0         
                                                                 
 dense_2 (Dense)             (None, 128)               128128    
                                                                 
 dense_3 (Dense)             (None, 9)                 1161      
                                                                 
Total params: 5459860 (20.83 MB)
Trainable params: 5417837 (20.67 MB)
Non-trainable params: 42023 (164.16 KB)
__________

2023-11-27 13:48:52.001543: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 677376000 exceeds 10% of free system memory.
2023-11-27 13:48:52.490769: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 677376000 exceeds 10% of free system memory.


metrics for test:
	 accuracy 0.9600:
	 balanced accuracy 0.9600:


In [None]:
# Train a fresh model unless EVALUATE_ONLY is set; in that case load a stored one.
model = Model()
if not EVALUATE_ONLY:
    model.train(d_train)
    model.save('best_model')
else:
    #todo: your link goes here
    # load() reads the model from 'best_model.tf'.
    model.load('best_model')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb0 (Functional  (None, 1000)              5330571   
 )                                                               
                                                                 
 dropout (Dropout)           (None, 1000)              0         
                                                                 
 dense (Dense)               (None, 128)               128128    
                                                                 
 dense_1 (Dense)             (None, 9)                 1161      
                                                                 
Total params: 5459860 (20.83 MB)
Trainable params: 5417837 (20.67 MB)
Non-trainable params: 42023 (164.16 KB)
_________________________________________________________________


Пример тестирования модели на части набора данных:

In [None]:
# Request evaluation on 10% of the test dataset (limit=0.1) and score against
# the matching number of labels.
# NOTE(review): Model.test_on_dataset does not apply `limit` as written, so
# this currently scores the whole test set.

pred_1 = model.test_on_dataset(d_test, limit=0.1)
Metrics.print_all(d_test.labels[:len(pred_1)], pred_1, '10% of test')

metrics for 10% of test:


TypeError: ignored

In [None]:
pred_1 = model.test_on_dataset(d_test, limit=0.1)
# Score against exactly as many labels as predictions were returned, so the
# length check in Metrics.accuracy holds whether or not `limit` is applied.
Metrics.print_all(d_test.labels[:len(pred_1)], pred_1, 'test')

metrics for test:


TypeError: ignored

In [None]:
# Show only the first few predictions instead of dumping the whole array.
pred_1[:5]

array([[-5.0413021e-04, -2.4318264e-03, -1.7521996e-03,  3.1518488e-05,
         2.1762779e-04,  2.0734109e-03, -5.3923373e-04, -1.3230898e-03,
        -1.2664620e-03],
       [-5.0413015e-04, -2.4318269e-03, -1.7521998e-03,  3.1518837e-05,
         2.1762872e-04,  2.0734090e-03, -5.3923327e-04, -1.3230912e-03,
        -1.2664617e-03],
       [-5.0412992e-04, -2.4318260e-03, -1.7521998e-03,  3.1519710e-05,
         2.1762837e-04,  2.0734109e-03, -5.3923286e-04, -1.3230894e-03,
        -1.2664617e-03],
       [-5.0413038e-04, -2.4318271e-03, -1.7521987e-03,  3.1517353e-05,
         2.1762773e-04,  2.0734104e-03, -5.3923368e-04, -1.3230905e-03,
        -1.2664614e-03],
       [-5.0413178e-04, -2.4318264e-03, -1.7522005e-03,  3.1519274e-05,
         2.1762855e-04,  2.0734114e-03, -5.3923245e-04, -1.3230873e-03,
        -1.2664618e-03],
       [-5.0413166e-04, -2.4318276e-03, -1.7522029e-03,  3.1521864e-05,
         2.1763152e-04,  2.0734114e-03, -5.3923490e-04, -1.3230838e-03,
        -1.

Пример тестирования модели на полном наборе данных:

In [None]:
# evaluating model on full test dataset (may take time);
# skipped unless TEST_ON_LARGE_DATASET is set in the configuration cell
if TEST_ON_LARGE_DATASET:
    pred_2 = model.test_on_dataset(d_test)
    Metrics.print_all(d_test.labels, pred_2, 'test')

ValueError: ignored

Результат работы пайплайна обучения и тестирования выше тоже будет оцениваться. Поэтому не забудьте присылать на проверку ноутбук с выполненными ячейками кода с демонстрациями метрик обучения, графиками и т.п. В этом пайплайне Вам необходимо продемонстрировать работу всех реализованных дополнений, улучшений и т.п.

<font color="red">
Настоятельно рекомендуется после получения пайплайна с полными результатами обучения экспортировать ноутбук в pdf (файл -> печать) и прислать этот pdf вместе с самим ноутбуком.
</font>

### Тестирование модели на других наборах данных

Ваша модель должна поддерживать тестирование на других наборах данных. Для удобства, Вам предоставляется набор данных test_tiny, который представляет собой малую часть (2% изображений) набора test. Ниже приведен фрагмент кода, который будет осуществлять тестирование для оценивания Вашей модели на дополнительных тестовых наборах данных.

<font color="red">
Прежде чем отсылать задание на проверку, убедитесь в работоспособности фрагмента кода ниже.
</font>

In [None]:
# Load the submitted model and evaluate it on the test_tiny dataset.
final_model = Model()
final_model.load('best')
d_test_tiny = Dataset('test_tiny')
# Predict with the model that was just loaded; the previous code used the
# unrelated global `model`, so `final_model` was never actually evaluated.
pred = final_model.test_on_dataset(d_test_tiny)
Metrics.print_all(d_test_tiny.labels, pred, 'test-tiny')

Отмонтировать Google Drive.

In [None]:
# Unmount Google Drive; `drive` is the google.colab module imported in the mount cell.
drive.flush_and_unmount()

---
# Дополнительные "полезности"

Ниже приведены примеры использования различных функций и библиотек, которые могут быть полезны при выполнении данного практического задания.

### Измерение времени работы кода

Измерять время работы какой-либо функции можно легко и непринужденно при помощи функции timeit из соответствующего модуля:

In [None]:
import timeit


def factorial(n):
    """Return n! computed by repeated multiplication (1 when n < 2)."""
    product = 1
    # Multiplying by 1 changes nothing, so the factors start at 2.
    for factor in range(2, n + 1):
        product = product * factor
    return product


def f():
    """Zero-argument workload for timeit: the factorial of 1000."""
    return factorial(1000)


n_runs = 128
elapsed = timeit.timeit(f, number=n_runs)
print(f'Function f is caluclated {n_runs} times in {elapsed}s.')

### Scikit-learn

Для использования "классических" алгоритмов машинного обучения рекомендуется использовать библиотеку scikit-learn (https://scikit-learn.org/stable/). Пример классификации изображений цифр из встроенного в scikit-learn набора данных digits (уменьшенный аналог MNIST, изображения 8x8) при помощи классификатора SVM:

In [None]:
# Standard scientific Python imports
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split

# The digits dataset
digits = datasets.load_digits()

# The data that we are interested in is made of 8x8 images of digits, let's
# have a look at the first 4 images, stored in the `images` attribute of the
# dataset.  If we were working from image files, we could load them using
# matplotlib.pyplot.imread.  Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
_, axes = plt.subplots(2, 4)
images_and_labels = list(zip(digits.images, digits.target))
for ax, (image, label) in zip(axes[0, :], images_and_labels[:4]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Training: %i' % label)

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)

# Split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
    data, digits.target, test_size=0.5, shuffle=False)

# We learn the digits on the first half of the digits
classifier.fit(X_train, y_train)

# Now predict the value of the digit on the second half:
predicted = classifier.predict(X_test)

images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for ax, (image, prediction) in zip(axes[1, :], images_and_predictions[:4]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Prediction: %i' % prediction)

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(y_test, predicted)))
# metrics.plot_confusion_matrix was removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement and returns a
# display object with the same `figure_` and `confusion_matrix` attributes.
disp = metrics.ConfusionMatrixDisplay.from_estimator(classifier, X_test, y_test)
disp.figure_.suptitle("Confusion Matrix")
print("Confusion matrix:\n%s" % disp.confusion_matrix)

plt.show()

### Scikit-image

Реализовывать различные операции для работы с изображениями можно как самостоятельно, работая с массивами numpy, так и используя специализированные библиотеки, например, scikit-image (https://scikit-image.org/). Ниже приведен пример использования Canny edge detector.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage as ndi

from skimage import feature


# Build the test picture: a white square on black, rotated by 15 degrees,
# blurred with a Gaussian (sigma 4) and finally corrupted by uniform noise.
square = np.zeros((128, 128))
square[32:-32, 32:-32] = 1
rotated = ndi.rotate(square, 15, mode='constant')
blurred = ndi.gaussian_filter(rotated, 4)
im = blurred + 0.2 * np.random.random(blurred.shape)

# Canny edge maps for the default sigma and for sigma=3
edges1 = feature.canny(im)
edges2 = feature.canny(im, sigma=3)

# Show the noisy input and both edge maps side by side
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(8, 3),
                                    sharex=True, sharey=True)

panels = (
    (ax1, im, 'noisy image'),
    (ax2, edges1, r'Canny filter, $\sigma=1$'),
    (ax3, edges2, r'Canny filter, $\sigma=3$'),
)
for axis, picture, caption in panels:
    axis.imshow(picture, cmap=plt.cm.gray)
    axis.axis('off')
    axis.set_title(caption, fontsize=20)

fig.tight_layout()

plt.show()

### Tensorflow 2

Для создания и обучения нейросетевых моделей можно использовать фреймворк глубокого обучения Tensorflow 2. Ниже приведен пример простейшей нейронной сети, использующейся для классификации изображений из набора данных MNIST.

In [None]:
# TensorFlow is imported here so that the example is self-contained
# (nothing is installed by this cell).

import tensorflow as tf

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Scale pixel values from [0, 255] to [0, 1].
x_train, x_test = x_train / 255.0, x_test / 255.0

# A dedicated name is used so that this demo does not overwrite the global
# `model` that holds the assignment's Model instance.
mnist_model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10, activation='softmax')
])

mnist_model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

mnist_model.fit(x_train, y_train, epochs=5)

mnist_model.evaluate(x_test,  y_test, verbose=2)

<font color="red">
Для эффективной работы с моделями глубокого обучения убедитесь в том, что в текущей среде Google Colab используется аппаратный ускоритель GPU или TPU. Для смены среды выберите "среда выполнения" -> "сменить среду выполнения".
</font>

Большое количество туториалов и примеров с кодом на Tensorflow 2 можно найти на официальном сайте https://www.tensorflow.org/tutorials?hl=ru.

Также, Вам может понадобиться написать собственный генератор данных для Tensorflow 2. Скорее всего он будет достаточно простым, и его легко можно будет реализовать, используя официальную документацию TensorFlow 2. Но, на всякий случай (если не удалось сразу разобраться или хочется вникнуть в тему более глубоко), можете посмотреть следующий отличный туториал: https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.

### Numba

В некоторых ситуациях, при ручных реализациях графовых алгоритмов, выполнение многократных вложенных циклов for в python можно существенно ускорить, используя JIT-компилятор Numba (https://numba.pydata.org/).
Примеры использования Numba в Google Colab можно найти тут:
1. https://colab.research.google.com/github/cbernet/maldives/blob/master/numba/numba_cuda.ipynb
2. https://colab.research.google.com/github/evaneschneider/parallel-programming/blob/master/COMPASS_gpu_intro.ipynb

> Пожалуйста, если Вы решили использовать Numba для решения этого практического задания, еще раз подумайте, нужно ли это Вам, и есть ли возможность реализовать требуемую функциональность иным способом. Используйте Numba только при реальной необходимости.



### Работа с zip архивами в Google Drive

Запаковка и распаковка zip архивов может пригодиться при сохранении и загрузке Вашей модели. Ниже приведен фрагмент кода, иллюстрирующий помещение нескольких файлов в zip архив с последующим чтением файлов из него. Все действия с директориями, файлами и архивами должны осуществляться с примонтированным Google Drive.


Создадим 2 изображения, поместим их в директорию tmp внутри PROJECT_DIR, запакуем директорию tmp в архив tmp.zip.

In [None]:
PROJECT_DIR = "/dev/prak_nn_1/"
arr1 = np.random.rand(100, 100, 3) * 255
arr2 = np.random.rand(100, 100, 3) * 255

img1 = Image.fromarray(arr1.astype('uint8'))
img2 = Image.fromarray(arr2.astype('uint8'))

p = "/content/drive/MyDrive/" + PROJECT_DIR

if not (Path(p) / 'tmp').exists():
    (Path(p) / 'tmp').mkdir()

img1.save(str(Path(p) / 'tmp' / 'img1.png'))
img2.save(str(Path(p) / 'tmp' / 'img2.png'))

%cd $p
!zip -r "tmp.zip" "tmp"

Распакуем архив tmp.zip в директорию tmp2 в PROJECT_DIR. Теперь внутри директории tmp2 содержится директория tmp, внутри которой находятся 2 изображения.

In [None]:
# Change into the project directory and unpack tmp.zip into the tmp2 directory.
p = "/content/drive/MyDrive/" + PROJECT_DIR
%cd $p
!unzip -uq "tmp.zip" -d "tmp2"

In [None]:
# Pack the saved model directory /content/best_model.tf recursively into /content/file.zip.
!zip -r /content/file.zip /content/best_model.tf

  adding: content/best_model.tf/ (stored 0%)
  adding: content/best_model.tf/saved_model.pb (deflated 91%)
  adding: content/best_model.tf/assets/ (stored 0%)
  adding: content/best_model.tf/variables/ (stored 0%)
  adding: content/best_model.tf/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: content/best_model.tf/variables/variables.index (deflated 76%)
  adding: content/best_model.tf/keras_metadata.pb (deflated 96%)
  adding: content/best_model.tf/fingerprint.pb (stored 0%)


In [None]:
# Unpack a previously archived model from best_model26.zip.
!unzip "best_model26.zip"

Archive:  best_model26.zip
   creating: content/best_model.tf/
  inflating: content/best_model.tf/saved_model.pb  
   creating: content/best_model.tf/assets/
   creating: content/best_model.tf/variables/
  inflating: content/best_model.tf/variables/variables.data-00000-of-00001  
  inflating: content/best_model.tf/variables/variables.index  
  inflating: content/best_model.tf/keras_metadata.pb  
 extracting: content/best_model.tf/fingerprint.pb  
