# Flowerdata-module5 Classifier

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

import torch
from torch.nn import functional as F

import torchvision
from torchvision import models
from torchvision import transforms
from torchvision.datasets import ImageFolder

import pytorch_lightning as pl

import os
import random
from tqdm import tqdm
from copy import copy

In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — confirm that is intended.
warnings.filterwarnings("ignore")

# IPython magics: draw matplotlib figures inline and load the TensorBoard extension.
%matplotlib inline
%load_ext tensorboard

# Record the library versions this notebook was run with.
print(f'pl={pl.__version__}')
print(f'torch={torch.__version__}')
print(f'torchvision={torchvision.__version__}')

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [5]:
# Root folder of the training images, handed to the dataset loaders below.
DATA_DIR = '/kaggle/input/flowerdata-module5/train/train'

# Mini-batch size used by the data loaders.
BATCH_SIZE = 16
# Seed that keeps the train/validation split reproducible.
SEED = 20220421

In [6]:
# Index the images under DATA_DIR. No transform is attached here; the
# per-split transforms are assigned in a later cell.
dataset = ImageFolder(DATA_DIR)
dataset

In [7]:
# File paths and class indices of every sample, in dataset order.
X = [path for path, _ in dataset.samples]
y = [class_index for _, class_index in dataset.samples]

# One stratified shuffle split that holds out 1000 samples for validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=1000, random_state=SEED)
train_idx, valid_idx = next(sss.split(X, y))

print(len(train_idx), len(valid_idx))

In [8]:
def _restrict_to(source, indices):
    """Return a shallow copy of ``source`` that only exposes the samples at ``indices``."""
    subset = copy(source)
    subset.samples = [source.samples[i] for i in indices]
    subset.targets = [source.targets[i] for i in indices]
    # ``imgs`` is kept pointing at the same list as ``samples``
    subset.imgs = subset.samples
    return subset


train_dataset = _restrict_to(dataset, train_idx)
valid_dataset = _restrict_to(dataset, valid_idx)

print(len(train_dataset), len(valid_dataset))

In [9]:
def plot_samples(dataset, num_samples: int):
    """Show ``num_samples`` randomly picked images from ``dataset`` in a single row.

    Items are drawn with ``random.choice``, so the same image can appear more
    than once. Each item must unpack into ``(image, class_index)``; the image
    is passed to ``imshow`` unchanged, and the title is the matching entry of
    ``dataset.classes`` with underscores replaced by spaces.

    Args:
        dataset: indexable dataset with a ``classes`` attribute.
        num_samples: number of images (and subplot columns) to draw.
    """
    # squeeze=False keeps ``axes`` two-dimensional even for a single column,
    # so ``num_samples == 1`` no longer fails on indexing a bare Axes object.
    fig, axes = plt.subplots(1, num_samples, figsize=(16, 8), squeeze=False)
    for ax in axes[0]:
        random_image, random_class = random.choice(dataset)
        random_label = dataset.classes[random_class]
        ax.imshow(random_image)
        ax.set_title(random_label.replace("_", " "))
        ax.axis("off")
    plt.show()

In [10]:
# Preview ten random training images.
plot_samples(train_dataset, num_samples=10)

In [11]:
# Preview ten random validation images.
plot_samples(valid_dataset, num_samples=10)

In [12]:
# Per-channel mean / std applied after the image has been converted to a tensor.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)


def _to_model_input(*image_ops):
    """Chain the given image-level ops with tensor conversion and normalisation."""
    return transforms.Compose([*image_ops, transforms.ToTensor(), normalize])


# Training: random crop to 224 plus random horizontal flip.
train_dataset.transform = _to_model_input(
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
)

# Validation and test: deterministic resize followed by a 224 centre crop.
valid_dataset.transform = _to_model_input(
    transforms.Resize(256),
    transforms.CenterCrop(224),
)

test_transformations = _to_model_input(
    transforms.Resize(256),
    transforms.CenterCrop(224),
)

In [13]:
NUM_WORKERS = 4 # or os.cpu_count()

# Settings common to both loaders; only the shuffling differs.
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

# Training batches are reshuffled, validation batches keep dataset order.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **_loader_options)
valid_loader = torch.utils.data.DataLoader(valid_dataset, shuffle=False, **_loader_options)

In [14]:
class FlowersModel(pl.LightningModule):
    """Flower classifier: a pretrained ResNet-34 with a new output layer.

    The pretrained layers are frozen, so only the new fully connected layer
    is trained.

    Args:
        num_classes: Size of the output layer. When ``None`` (the default) it
            is taken from ``len(train_dataset.classes)``, so ``FlowersModel()``
            keeps working as before.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = len(train_dataset.classes)

        # use the pretrained ResNet-34 model
        self.net = models.resnet34(pretrained=True)
        # freeze the pretrained weights; the attribute must be spelled
        # ``requires_grad`` or nothing is frozen
        for parameter in self.net.parameters():
            parameter.requires_grad = False
        # replace the fully connected layer with one sized for our classes;
        # it is created after the freeze, so its parameters stay trainable
        self.net.fc = torch.nn.Linear(self.net.fc.in_features, num_classes)

    def forward(self, x):
        """Return the network output (class logits) for a batch of images."""
        return self.net(x)

    def _loss_and_top1(self, batch):
        """Return ``(cross-entropy loss tensor, top-1 accuracy as a float)`` for a batch."""
        images, target = batch
        output = self(images)
        # use the functional form of the loss instead of instantiating a module
        loss = F.cross_entropy(output, target)
        with torch.no_grad():
            top1 = torch.mean((output.argmax(1) == target).float()).item()
        return loss, top1

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss / accuracy; return the loss."""
        loss, top1 = self._loss_and_top1(batch)

        # log the metrics, averaged per epoch; top-1 is also shown in the progress bar
        self.log('loss/train', loss, on_step=False, on_epoch=True)
        self.log('top1/train', top1, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss / accuracy."""
        loss, top1 = self._loss_and_top1(batch)

        self.log('loss/val', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('top1/val', top1, on_step=False, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        """Return an SGD optimizer over the parameters that are still trainable."""
        trainable = [p for p in self.net.parameters() if p.requires_grad]
        return torch.optim.SGD(trainable, lr=1e-2, weight_decay=1e-4)

In [15]:
# Build the classifier; the output size is taken from train_dataset.classes.
model = FlowersModel()

In [None]:
# Train for up to 30 epochs, validating on valid_loader.
trainer = pl.Trainer(
    # one GPU when CUDA is available, otherwise 0 so the run stays on the CPU
    # instead of failing on a machine without a GPU
    gpus=int(torch.cuda.is_available()),
    max_epochs=30,
    progress_bar_refresh_rate=1,
)
trainer.fit(model, train_loader, valid_loader)

In [None]:
# Both subsets must be built with identical shuffle / seed / validation_split
# settings, so the shared arguments are declared once.
# label_mode is left unset (the 'categorical' variant stays disabled).
_split_kwargs = dict(
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    shuffle=True,
    seed=SEED,
    validation_split=VALIDATION_SPLIT,
)

# NOTE: this rebinds ``train_dataset``, which earlier cells used for the
# torchvision dataset.
train_dataset = tf.keras.utils.image_dataset_from_directory(
    DATA_DIR, subset='training', **_split_kwargs
)
validation_dataset = tf.keras.utils.image_dataset_from_directory(
    DATA_DIR, subset='validation', **_split_kwargs
)

In [None]:
class_names = train_dataset.class_names

# Show the first nine images of one training batch in a 3x3 grid.
plt.figure(figsize=(10, 10))
for images, labels in train_dataset.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        img = images[i].numpy()
        label = labels[i].numpy()
        plt.imshow(img.astype("uint8"))
        # title with the class name rather than the bare label index
        plt.title(class_names[label])
        plt.axis("off")

In [None]:
# Number of classes, used as the size of the model's output layer.
NUM_CLASSES = len(train_dataset.class_names)
print(f'NUM_CLASSES={NUM_CLASSES}')

In [None]:
# Augmentation stack placed at the front of the model: random flips in both
# directions followed by a random rotation (factor 0.2).
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.2),
])

In [None]:
# Draw nine independently augmented versions of the first image of one batch.
for batch, _ in train_dataset.take(1):
    plt.figure(figsize=(10, 10))
    for cell in range(1, 10):
        plt.subplot(3, 3, cell)
        # the whole batch is augmented each time; only its first image is shown
        first_augmented = data_augmentation(batch)[0]
        # scale to [0, 1] for display
        plt.imshow(first_augmented / 255)
        plt.axis('off')

## Create Model

In [None]:
# Input preprocessing function of the EfficientNet family; applied inside the
# model graph built below.
preprocess_input = tf.keras.applications.efficientnet.preprocess_input

In [None]:
# EfficientNetB4 backbone with ImageNet weights and without its classification head.
base_model = tf.keras.applications.EfficientNetB4(
    input_shape=IMAGE_SIZE + (3,),
    include_top=False,
    weights='imagenet',
)
# Frozen here; a later cell re-enables training for the last quarter of its layers.
base_model.trainable = False

# Functional model: augmentation -> preprocessing -> backbone -> pooled classifier head.
inputs = tf.keras.Input(shape=IMAGE_SIZE + (3,))
x = data_augmentation(inputs)

# No explicit rescaling: pixels are passed on in the range the dataset delivers.
# x = layers.Rescaling(1.0 / 255)(x)
x = preprocess_input(x)

# training=False runs the backbone in inference mode.
x = base_model(x, training=False)
# x = global_average_layer(x)
# x = tf.keras.layers.Dropout(0.2)(x)
# outputs = prediction_layer(x)

x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dropout(0.5)(x)
# Softmax head: the model outputs class probabilities, not logits.
outputs = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(x)

model = tf.keras.Model(inputs, outputs)

# Render the architecture diagram with tensor shapes.
tf.keras.utils.plot_model(model, show_shapes = True)

In [None]:
# All three callbacks watch the same validation metric, where higher is better.
# Switch to 'val_categorical_accuracy' if the model is trained on one-hot labels.
MONITORED_METRIC = 'val_sparse_categorical_accuracy'

# Stop once the metric has not improved for 7 epochs.
# (The misspelt name ``early_stoping`` is kept because the fit cell uses it.)
early_stoping = tf.keras.callbacks.EarlyStopping(
    monitor=MONITORED_METRIC,
    mode='max',
    verbose=1,
    patience=7,
)

# Keep only the best model seen so far on disk.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_model.hdf5",
    monitor=MONITORED_METRIC,
    mode='max',
    verbose=1,
    save_best_only=True,
)

# Multiply the learning rate by 0.6 when the metric plateaus, down to 1e-7.
# mode is stated explicitly so it matches the other two callbacks.
learning_rate_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor=MONITORED_METRIC,
    mode='max',
    factor=0.6,
    min_lr=1e-7,
)

## Configure fine-tuning parameters

In [None]:
# Unfreeze the backbone, then freeze its first three quarters again so that
# only the last quarter of the layers is fine-tuned.
base_model.trainable = True

base_model_layers = len(base_model.layers)
no_fine_tune_layers = base_model_layers * 3 // 4
_trainable_layers = base_model_layers - no_fine_tune_layers

print(f'{_trainable_layers} trainable of {base_model_layers} layers')

for frozen_layer in base_model.layers[:no_fine_tune_layers]:
    frozen_layer.trainable = False

In [None]:
# Compile with the sparse (integer-label) loss and accuracy metric; the
# callbacks monitor the validation version of this metric.
# STEP is passed to Adam as its first argument — presumably the learning
# rate; it is not defined in the visible cells (TODO confirm).
model.compile(
    optimizer=tf.keras.optimizers.Adam(STEP),
    loss="sparse_categorical_crossentropy",
    metrics=["SparseCategoricalAccuracy"]
)
# Alternative configuration for one-hot labels:
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(STEP),
#     loss="categorical_crossentropy",
#     metrics=["CategoricalAccuracy"]
# )

In [None]:
# Train the model and keep the per-epoch history for plotting.
# NOTE(review): the early-stopping callback is configured with patience=7,
# so with only 6 epochs it can never trigger.
EPOCHS = 6
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=validation_dataset,
    callbacks=[early_stoping, learning_rate_reduction, model_checkpoint],
)

In [None]:
# Predict a class index for every image listed in ``submission['Id']``.
# NOTE(review): within this file ``submission`` is only loaded in a later
# cell, so that cell has to run first.
TEST_DIR = '/kaggle/input/flowerdata-module5/test/test'

records = []
for PATH in submission['Id']:
    # the ids are bare file names, so they are resolved against the test folder
    img = tf.keras.preprocessing.image.load_img(
        f'{TEST_DIR}/{PATH}', target_size=IMAGE_SIZE
    )
    # keep the 0-255 pixel range: the training pipeline feeds the model
    # unscaled images, so dividing by 255 here would skew the predictions
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    img_array = tf.expand_dims(img_array, 0)

    predictions = model.predict(img_array)
    category = int(np.argmax(predictions, axis=1)[0])
    records.append({'Id': PATH, 'Category': category})

# build the frame once instead of calling DataFrame.append per row
model_answers = pd.DataFrame(records, columns=['Id', 'Category'])

model_answers

In [None]:
# Evaluate on the validation split; returns the loss and the compiled accuracy metric.
loss, accuracy = model.evaluate(validation_dataset)
print("Accuracy", accuracy)

In [None]:
def plot_loss(history):
    """Plot the training and validation loss curves of a finished training run.

    Args:
        history: object whose ``history`` dict holds the per-epoch ``'loss'``
            and ``'val_loss'`` sequences (e.g. the value returned by ``model.fit``).
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    # fixed y-range; values above 3 are clipped from view
    plt.ylim([0, 3])
    plt.xlabel('Epoch')
    # the curves are the classification loss, so label the axis accordingly
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

In [None]:
# Plot the loss curves of the training run above.
plot_loss(history)

In [None]:
def predict(model, path: str) -> int:
    """Return the predicted class index for the image stored at ``path``.

    The image is loaded at ``IMAGE_SIZE`` and passed to the model as a batch
    of one, with pixel values left in the 0-255 range. The training pipeline
    feeds the model unscaled images, so no division by 255 is done here.

    Args:
        model: model exposing ``predict`` that returns one score row per image.
        path: location of the image file.

    Returns:
        Index of the highest-scoring class as a plain ``int``.
    """
    img = tf.keras.preprocessing.image.load_img(
        path, target_size=IMAGE_SIZE
    )
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    # add the batch dimension expected by ``model.predict``
    img_array = tf.expand_dims(img_array, 0)

    predictions = model.predict(img_array)
    return int(np.argmax(predictions[0]))

In [None]:
# Manual sanity check: run one training image through the model with raw
# TensorFlow ops and show the predicted class index.
x = '/kaggle/input/flowerdata-module5/train/train/73/image_00431.jpg'
x = tf.io.read_file(x)
x = tf.image.decode_jpeg(x, channels=3)
# Plain cast keeps the 0-255 range the model was trained on;
# convert_image_dtype would rescale the pixels to [0, 1].
x = tf.cast(x, tf.float32)
x = tf.image.resize(x, IMAGE_SIZE)
# add the batch dimension
x = tf.expand_dims(x, 0)
x = model.predict(x)
i = np.argmax(x[0])
i

In [None]:
# ``model`` already ends in a softmax layer, so its outputs are probabilities.
# Stacking another Softmax on top would distort them, so the name is kept as
# a plain alias.
probability_model = model

predict(model, '/kaggle/input/flowerdata-module5/test/test/image_00028.jpg')

In [None]:
# Load the sample submission; its 'Id' column lists the test images to predict.
submission = pd.read_csv('/kaggle/input/flowerdata-module5/sample_submission.csv')
submission.head()

In [None]:
# Inspect the predicted labels.
# NOTE(review): within this file ``df`` is only created in the following
# cell, so this cell fails on a fresh top-to-bottom run.
df['Category']

In [None]:
# Predict every test image listed in the sample submission. The rows are
# collected first and the frame is built once: DataFrame.append copies the
# frame on every call and no longer exists in pandas 2.x.
records = [
    {
        'Id': file,
        'Category': int(predict(model, f'/kaggle/input/flowerdata-module5/test/test/{file}')),
    }
    for file in submission['Id']
]
# explicit column order so the CSV is always written as Id, Category
df = pd.DataFrame(records, columns=['Id', 'Category'])
df.head()

In [None]:
# Write the predictions to disk without the index column.
df.to_csv('submission.csv', index=False)