In [1]:
import os
os.chdir('../')

In [2]:
from Emotion_Detector.utils import *
from Emotion_Detector.utils.model_store import ResNet18
from Emotion_Detector.constants import PARAMS_FILE_PATH, CONFIG_FILE_PATH
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [3]:
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    # Set memory growth to False to allocate the entire GPU memory
    tf.config.experimental.set_memory_growth(physical_devices[0], False)

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ResnetModelTrainingConfig:
    root_dir: Path
    checkpoints_callback_dir : Path
    tensorboard_log_dir : Path
    model_history_dir : Path
    model_dir : Path
    model_file_path : str
    checkpoints_file_path : str
    model_history_file_path : Path
    model_accuracy_plot_path : Path
    model_loss_plot_path : Path
    train_dir: Path
    val_dir: Path
    param_resnet_epoch: int
    param_batch_size: int
    all_params : dict

In [5]:
class ConfigurationManager:
    def __init__(
            self,
            config_path = CONFIG_FILE_PATH,
            params_path = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        create_directories([self.config.artifacts_root])

    def get_resnet_model_training_config(self) -> ResnetModelTrainingConfig:
        config = self.config.resnet_model_training


        create_directories([config.root_dir])
        create_directories([config.checkpoints_callback_dir])
        create_directories([config.tensorboard_log_dir])
        create_directories([config.model_history_dir])
        create_directories([config.model_dir])

        model_training_config = ResnetModelTrainingConfig(
            root_dir = Path(config.root_dir),
            checkpoints_callback_dir = Path(config.checkpoints_callback_dir),
            tensorboard_log_dir = Path(config.tensorboard_log_dir),
            model_history_dir = Path(config.model_history_dir),
            model_dir = Path(config.model_dir),
            model_file_path =  config.model_file_path,
            checkpoints_file_path = config.checkpoints_file_path,
            model_history_file_path = Path(config.model_history_file_path),
            model_accuracy_plot_path = Path(config.model_accuracy_plot_path),
            model_loss_plot_path = Path(config.model_loss_plot_path),
            train_dir = Path(config.train_dir),
            val_dir = Path(config.val_dir),
            param_resnet_epoch= self.params.RESNET_EPOCH,
            param_batch_size  = self.params.RESNET_TRAIN_BATCH,
            all_params = self.params
        )
        return model_training_config

In [6]:
class Resnet_Model_Training:
    def __init__(self, config: ResnetModelTrainingConfig):
        self.config = config

    def _get_data(self):
        train_data, val_data = get_augmented_data(train_dir_path= self.config.train_dir,
                                                  val_dir_path= self.config.val_dir,
                                                  params= self.config.all_params
                                                  )
        return train_data, val_data
        
    def _get_model(self):
        input_shape = (1, self.config.all_params['IMAGE_SIZE'], self.config.all_params['IMAGE_SIZE'], 3)
        print(self.config.all_params)
        lenet_model = ResNet18()
        lenet_model.build(input_shape = input_shape)
        return lenet_model
    
    def train_model(self):
        
        logger.info(f"Reconstructing the data from tensorflow records...")
        train_data, val_data = self._get_data()
        logger.info(f"Data loaded successfully.")

        logger.info(f"Geting the Sub Classed Lenet Model...")
        model = self._get_model()
        logger.info(f"Model Loaded Successfully...")

        logger.info(f"Preparing Checkpoint Callback...")
        checkpoint_callback = ModelCheckpoint(
            filepath= self.config.checkpoints_file_path,
            save_weights_only=True,
            save_best_only=True,
            monitor='val_loss',
            verbose=1
        )

        logger.info(f"Preparing Tensorboard Callback...")
        tensorboard_callback = TensorBoard(log_dir=self.config.tensorboard_log_dir, histogram_freq=1)


        loss_function = CategoricalCrossentropy()
        metrics = [CategoricalAccuracy(name = "accuracy")]

        logger.info(f"Compiling the model...")
        model.compile(
                optimizer = Adam(learning_rate= 0.001*10),
                loss = loss_function,
                metrics = metrics
            )
        
        logger.info(f"Starting training with {self.config.param_resnet_epoch} epochs and validation data...")
        history = model.fit(train_data,batch_size = 8, epochs = self.config.param_resnet_epoch,validation_data = val_data, callbacks = [checkpoint_callback, tensorboard_callback], verbose = 1)
        logger.info(f"Model Training Completed Successfully.")
        logger.info(f"Model Summary /n {model.summary()}")

        logger.info(f"Saving the model at {self.config.model_dir}...")
        model.save(self.config.model_dir, save_format = 'tf')
        logger.info(f"Model successfully saved at {self.config.model_dir}.")

        logger.info(f"Saving the model history at {self.config.model_history_file_path}...")
        save_json(path = self.config.model_history_file_path, data = history.history)
        logger.info(f"Model history successfully saved at {self.config.model_history_file_path}.")

        logger.info(f"Saving the model accuracy plot at {self.config.model_accuracy_plot_path}...")
        save_plt_fig(x = history.history['accuracy'],
                     y = history.history['val_accuracy'],
                     title = "Model Accuracy",
                     xlabel ='Epochs',
                     ylabel = "Accuracy",
                     legends= ['Train', 'Validation'],
                     fig_path = self.config.model_accuracy_plot_path)
        
        logger.info(f"Saving the model loss plot at {self.config.model_loss_plot_path}...")
        save_plt_fig(x = history.history['loss'],
                     y = history.history['val_loss'],
                     title = "Model loss",
                     xlabel ='Epochs',
                     ylabel = "Loss",
                     legends= ['Train', 'Validation'],
                     fig_path = self.config.model_loss_plot_path)
        logger.info(f"Model results saved successfully.")

In [7]:
try:
    config = ConfigurationManager()
    resnet_model_training_config = config.get_resnet_model_training_config()
    model_trainer = Resnet_Model_Training(config = resnet_model_training_config)
    model_trainer.train_model() 
except Exception as e:
    raise e

Found 28709 files belonging to 7 classes.
Found 7178 files belonging to 7 classes.
{'CLASS_NAMES': ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'], 'RANDOM_SEED': 22, 'BATCH_SIZE': 32, 'IMAGE_SIZE': 256, 'LEARNING_RATE': 0.001, 'EPOCHS': 50, 'DROPOUT_RATE': 0.0, 'REGULARIZATION_RATE': 0.0, 'N_FILTERS': 6, 'KERNAL_SIZE': 3, 'N_STRIDES': 1, 'POOL_SIZE': 2, 'N_DENSE_1': 100, 'N_DENSE_2': 10, 'NUM_CLASSES': 7, 'SHUFFLE': True, 'TRAIN_NUM_SHARDS': 10, 'TEST_NUM_SHARDS': 5, 'RESNET_EPOCH': 25, 'RESNET_TRAIN_BATCH': 4, 'RANDOM_ROTATION_LEFT_FACTOR': -0.025, 'RANDOM_ROTATION_RIGHT_FACTOR': 0.025, 'RANDOM_FLIP_MODE': 'horizontal', 'RANDOMTRANSLATION_HEIGHT_FACTOR_LEFT': -0.1, 'RANDOMTRANSLATION_HEIGHT_FACTOR_RIGHT': 0.1, 'RANDOMTRANSLATION_WIDTH_FACTOR_LEFT': -0.1, 'RANDOMTRANSLATION_WIDTH_FACTOR_RIGHT': 0.1, 'RANDOM_ZOOM_FACTOR': 0.2}
Epoch 1/25
Epoch 1: val_loss improved from inf to 1.81195, saving model to artifacts/resnet_model_training/model_checkpoints\model_01.ckpt
Ep

ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/resnet_18/custom_conv2d/batch_normalization/FusedBatchNormGradV3' defined at (most recent call last):
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\traitlets\config\application.py", line 1043, in launch_instance
      app.start()
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\asyncio\base_events.py", line 539, in run_forever
      self._run_once()
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\asyncio\base_events.py", line 1775, in _run_once
      handle._run()
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\asyncio\events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel\ipkernel.py", line 387, in do_execute
      cell_id=cell_id,
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\IPython\core\interactiveshell.py", line 2975, in run_cell
      raw_cell, store_history, silent, shell_futures, cell_id
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\IPython\core\interactiveshell.py", line 3029, in _run_cell
      return runner(coro)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\IPython\core\async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\IPython\core\interactiveshell.py", line 3257, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\IPython\core\interactiveshell.py", line 3472, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\IPython\core\interactiveshell.py", line 3552, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\sv255\AppData\Local\Temp\ipykernel_14836\225592097.py", line 5, in <module>
      model_trainer.train_model()
    File "C:\Users\sv255\AppData\Local\Temp\ipykernel_14836\1902641002.py", line 53, in train_model
      history = model.fit(train_data,batch_size = 8, epochs = self.config.param_resnet_epoch,validation_data = val_data, callbacks = [checkpoint_callback, tensorboard_callback], verbose = 1)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\engine\training.py", line 997, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 577, in minimize
      loss, var_list=var_list, grad_loss=grad_loss, tape=tape
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 635, in _compute_gradients
      tape, loss, var_list, grad_loss
    File "c:\Users\sv255\anaconda3\envs\py37gpu\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 510, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/resnet_18/custom_conv2d/batch_normalization/FusedBatchNormGradV3'
OOM when allocating tensor with shape[32,64,128,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node gradient_tape/resnet_18/custom_conv2d/batch_normalization/FusedBatchNormGradV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_13168]

In [8]:
model = tf.saved_model.load("artifacts/resnet_model_training/model_checkpoints\model_14.ckpt")

OSError: SavedModel file does not exist at: artifacts/resnet_model_training/model_checkpoints\model_14.ckpt\{saved_model.pbtxt|saved_model.pb}

In [None]:
from Emotion_Detector.utils import reconstruct_data_from_tfrecords

In [None]:
import os
os.chdir('../')

In [None]:
path = 'artifacts/data_preprocessing/train_tfrecords/shard_{:02d}.tfrecord'
num_shards = 10
batch_size = 32

In [None]:
parsed_data = reconstruct_data_from_tfrecords(path, num_shards, batch_size )