In [1]:
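# Reference only, until this moves to the setup instructions: shell exports that add the cuDNN and TensorRT library
# directories found under the Python environment to LD_LIBRARY_PATH.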
#export CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn; print(nvidia.cudnn.__file__)"))
#export SITE_PACKAGES_PATH=$(python -c "import site; print(site.getsitepackages()[0])")
#export LD_LIBRARY_PATH=$CUDNN_PATH/lib:$SITE_PACKAGES_PATH/tensorrt_libs/:$LD_LIBRARY_PATH

In [2]:
# Shell escape: list the packages (name, version, build, channel) installed in the active conda environment,
# kept in the notebook as a record of the versions it was run with.
!conda list

# packages in environment at /home/flaniganp/mambaforge/envs/tensorflow-exercise-0:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
absl-py                   2.1.0                    pypi_0    pypi
anyio                     4.3.0                    pypi_0    pypi
argon2-cffi               23.1.0                   pypi_0    pypi
argon2-cffi-bindings      21.2.0                   pypi_0    pypi
arrow                     1.3.0                    pypi_0    pypi
asttokens                 2.4.1                    pypi_0    pypi
async-lru                 2.0.4                    pypi_0    pypi
attrs                     23.2.0                   pypi_0    pypi
babel                     2.14.0                   pypi_0    pypi
beautifulsoup4            4.12.3                   pypi_0    pypi
bleach                    6.1.0          

In [3]:
# The os module in Python provides a way of using operating system dependent functionality. It allows you to interface
# with the underlying operating system that Python is running on – be it Windows, Mac or Linux. You can use the os module
# to handle file and directory paths, create folders, list contents of a directory, manage environment variables, execute
# shell commands, and more.
import os

# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and
# matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
import numpy as np

# TensorFlow is an open-source machine learning library developed by Google. It's used for both research and production
# at Google.
# * keras: Originally an independent neural network library, now integrated into TensorFlow, simplifies the creation and
#   training of deep learning models. Keras is known for its user-friendliness and modular approach, allowing for easy
#   and fast prototyping. It provides high-level building blocks for developing deep learning models while still
#   enabling users to dive into lower-level operations if needed.
from tensorflow import keras
# * tensorflow.python.client: Provides functionalities to query the properties of the hardware devices TensorFlow can
#   access. Specifically, this module is often used to list and get detailed information about the system's available
#   CPUs, GPUs, and other hardware accelerators compatible with TensorFlow.
from tensorflow.python.client import device_lib

# Versioning sourcing
from tensorflow import __version__ as tf_version

# TensorFlow Lite provides tools and classes for converting TensorFlow models into a highly optimized format suitable
# for deployment on mobile devices, embedded systems, or other platforms with limited computational capacity. This
# module includes functionalities for model conversion, optimization, and inference. By importing `lite`, you gain
# access to the TFLiteConverter class for model conversion, optimization options like quantization, and utilities for
# running TFLite models on target devices.
from tensorflow import lite

# Importing specific modules from keras, which is now part of TensorFlow
# Callbacks are utilities called at certain points during model training. EarlyStopping stops training when a monitored
# metric has stopped improving, and ModelCheckpoint saves the model after every epoch.
from keras.callbacks import EarlyStopping, ModelCheckpoint
# load_model is used to load a saved model. Sequential is a linear stack of layers.
from keras.models import load_model, Sequential
# Dense is a standard layer type that is used in many neural networks.
from keras.layers import Dense

# Regular Expressions
# 1. search: This function is used to perform a search for a pattern in a string and returns a match object if the
# pattern is found, otherwise None. It's particularly useful for string pattern matching and extracting specific
# segments from text.
from re import search

# Key aspects of 'check_output':
# 1. **Process Execution**: The 'check_output' function is used to run a command in the subprocess/external process and
#    capture its output. This is especially useful for running system commands and capturing their output directly
#    within a Python script.
# 2. **Return Output**: It returns the output of the command, making it available to the Python environment. If the
#    called command results in an error (non-zero exit status), it raises a CalledProcessError.
# 3. **Use Cases**: Common use cases include executing a shell command, reading the output of a command, automating
#    scripts that interact with the command line, and integrating external tools into a Python workflow.
# Example Usage:
# Suppose you want to capture the output of the 'ls' command in a Unix/Linux system. You can use 'check_output' like
# this:
# output = check_output(['ls', '-l'])
from subprocess import check_output
# Key aspects of 'CalledProcessError':
#  1. Error Handling: CalledProcessError is an exception raised by check_output when the command it tries to execute
#   returns a non-zero exit status, indicating failure. This exception is particularly useful for error handling in
#   scripts where the success of an external command is crucial.
#  2. Exception Details: The exception object contains information about the error, including the return code, command
#  executed, and output (if any). This aids in debugging by providing clear insights into why the external command
#  failed.
#  3. Handling the Exception: In practical use, it is often caught in a try-except block, allowing the script to respond
#  appropriately to the failure of the external command, like logging the error or trying a fallback operation.
from subprocess import CalledProcessError

2024-02-27 21:13:29.135115: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-27 21:13:29.152459: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-27 21:13:29.152475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-27 21:13:29.152928: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-27 21:13:29.155963: I tensorflow/core/platform/cpu_feature_guar

In [4]:
# Pertinent declarations
# `offset` is the constant added to every input to form its training target; `training_depth` bounds the
# training inputs, which span the integers from -training_depth up to (but not including) training_depth.
offset, training_depth = 7, 10_000

# Destination files for the Keras model and for its TensorFlow Lite conversion.
model_path = '../../models/exercise_1.h5'
quantized_model_path = '../../models/exercise_1.tflite'

In [5]:
def create_model(model_path, offset, training_depth):
    """Train a single-neuron regression model on the relationship ``y = x + offset``.

    Args:
        model_path: File path handed to the ModelCheckpoint callback, which writes the model there whenever the
            monitored metric improves.
        offset: Constant added to every input value to produce its target value.
        training_depth: Training inputs are the integers from ``-training_depth`` up to (not including)
            ``training_depth``.

    Returns:
        The fitted Sequential model.
    """
    # Both arrays are cast to 32-bit floats before being handed to Keras.
    float_dtype = 'float32'
    # Mean absolute error is reported during training and watched by both callbacks.
    tracked_metric = 'mae'

    # Inputs are consecutive integers; targets are the same integers shifted by `offset`.
    inputs = np.arange(-training_depth, training_depth)
    targets = inputs + offset

    # reshape(-1, 1) turns each 1-D array into a single column (one feature per row).
    x_train = inputs.reshape(-1, 1).astype(float_dtype)
    y_train = targets.reshape(-1, 1).astype(float_dtype)

    # One Dense unit fed by one input feature.
    regressor = Sequential()
    regressor.add(Dense(1, input_shape=(1,)))

    training_callbacks = [
        # Stop once the metric has gone 5 epochs without improving.
        EarlyStopping(monitor=tracked_metric, patience=5),
        # Keep only the lowest-metric model on disk at `model_path`.
        ModelCheckpoint(
            filepath=model_path,
            monitor=tracked_metric,
            mode='min',
            save_best_only=True,
        ),
    ]

    regressor.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.01),
        loss=keras.losses.mean_squared_error,
        metrics=[tracked_metric],
    )

    # At most 35 passes over the data, 32 samples per gradient update.
    regressor.fit(
        x_train,
        y_train,
        epochs=35,
        batch_size=32,
        callbacks=training_callbacks,
    )

    return regressor

In [6]:
# This function `print_gpu_info` is designed to display detailed information about the available GPUs on the system.
# It utilizes TensorFlow's `device_lib.list_local_devices()` method to enumerate all computing devices recognized by
# TensorFlow. For each device identified as a GPU, the function extracts and prints relevant details including the GPU's
# ID, name, memory limit (converted to megabytes), and compute capability. The extraction of GPU information involves
# parsing the device's description string using regular expressions to find specific pieces of information. This
# function can be particularly useful for debugging or for setting up configurations in environments with multiple GPUs,
# ensuring that TensorFlow is utilizing the GPUs as expected.

def print_gpu_info():
    """Print one tab-indented summary line for every GPU that TensorFlow reports.

    Each line shows the GPU ID, name, memory limit in megabytes and compute capability. The ID, name and compute
    capability are parsed out of the device's ``physical_device_desc`` string; any field that cannot be parsed is
    shown as ``unknown`` instead of the whole device being skipped. When no GPU device is reported, a single
    "No GPU devices found" line is printed.

    Returns:
        None. All results are written to standard output.
    """
    # Undocumented Method
    # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow
    gpu_devices = [device for device in device_lib.list_local_devices() if device.device_type == 'GPU']

    if not gpu_devices:
        print("\tNo GPU devices found")
        return

    def _first_group(pattern, text):
        # Return the first capture group of `pattern` in `text`, or 'unknown' when there is no match.
        match = search(pattern, text)
        return match.group(1) if match else 'unknown'

    for device in gpu_devices:
        # The description is a comma-separated string such as "device: 0, name: ..., compute capability: 8.6".
        desc = device.physical_device_desc

        gpu_id = _first_group(r'device: (\d+)', desc)
        gpu_name = _first_group(r'name: (.*?),', desc)
        compute_capability = _first_group(r'compute capability: (\d+\.\d+)', desc)

        # Convert the memory limit from bytes to megabytes (1024 ** 2 bytes each) and round it.
        memory_limit_mb = round(device.memory_limit / (1024 ** 2))

        print(
            f"\tGPU ID {gpu_id} --> {gpu_name} --> "
            f"Memory Limit {memory_limit_mb} MB --> "
            f"Compute Capability {compute_capability}")

In [7]:
# Hardware
# Print a heading, then call print_gpu_info(), which writes its tab-indented per-GPU summary lines underneath it.
print("Hardware Found:")
print_gpu_info()

Hardware Found:
	GPU ID 0 --> NVIDIA GeForce RTX 3090 --> Memory Limit 22018 MB --> Compute Capability 8.6


2024-02-27 21:13:29.806161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /device:GPU:0 with 22018 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6


In [8]:
# NVIDIA Driver
# Run `nvidia-smi` and report the driver version and the maximum CUDA version it advertises. Both values sit on the
# same banner line, e.g. "| NVIDIA-SMI 545.23.08   Driver Version: 545.23.08   CUDA Version: 12.3     |".
try:
    # No shell is needed for a fixed command; a missing executable surfaces as OSError (FileNotFoundError) and a
    # non-zero exit status as CalledProcessError.
    nvidia_smi_output = check_output(["nvidia-smi"]).decode(errors="replace")
except (CalledProcessError, OSError) as e:
    print("Error fetching NVIDIA Driver Version or CUDA Version:", e)
else:
    # Split the output into lines
    lines = nvidia_smi_output.splitlines()

    # Find the line containing the driver version
    driver_line = next((line for line in lines if "Driver Version" in line), None)

    if driver_line is None:
        print("NVIDIA Driver Version or CUDA Version not found.")
    else:
        # Capture the token after each label, stopping at whitespace or the table's "|" border so the reported
        # value carries no trailing padding.
        driver_match = search(r"Driver Version:\s*([^\s|]+)", driver_line)
        cuda_match = search(r"CUDA Version:\s*([^\s|]+)", driver_line)

        if driver_match:
            driver_version = driver_match.group(1)
            print("NVIDIA Driver:", driver_version)
        else:
            print("NVIDIA Driver Version not found.")

        if cuda_match:
            cuda_version = cuda_match.group(1)
            print("Maximum Supported CUDA Version:", cuda_version)
        else:
            print("Maximum Supported CUDA Version not found.")

NVIDIA Driver: 545.23.08
Maximum Supported CUDA Version: 12.3     


In [9]:
print("Software Versions:")

# CUDA
# Ask the CUDA compiler driver for its version banner. Only the command itself is guarded: a non-zero exit status
# raises CalledProcessError, which is reported instead of aborting the cell.
try:
    nvcc_output = check_output("nvcc --version", shell=True).decode()
except CalledProcessError as e:
    print("Error executing nvcc --version:", e)
else:
    # The release appears in the banner as "V<major>.<minor>.<patch>".
    match = search(r"V(\d+\.\d+\.\d+)", nvcc_output)
    if match is None:
        print("CUDA Version not found")
    else:
        cuda_version = match.group(1)
        print("CUDA Version", cuda_version)

Software Versions:
CUDA Version 11.8.89


In [11]:
# Create the model
# Training runs inside create_model(); its ModelCheckpoint callback (save_best_only=True) already writes the
# best-scoring epoch to `model_path` as training progresses.
model = create_model(model_path, offset, training_depth)

# Do not save unconditionally here: that would overwrite the best checkpoint with the final-epoch weights.
# Only fall back to saving the in-memory model when the callback never produced a file.
if not os.path.exists(model_path):
    model.save(model_path)

2024-02-27 21:13:29.883823: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22018 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/35

2024-02-27 21:13:30.356513: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f0f114032d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-27 21:13:30.356525: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-02-27 21:13:30.358908: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-27 21:13:30.366769: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1709086410.405691   58933 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/35

  saving_api.save_model(


Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [12]:
# Reload the model from the file at `model_path`, so the predictions that follow come from the on-disk copy rather
# than from the in-memory `model` object.
saved_model = load_model(model_path)

In [13]:
# Using the model for prediction
predicted_depth = 100

# Evaluation inputs: every 10th integer from -predicted_depth to +predicted_depth inclusive, arranged as a single
# column because the model takes one feature per sample.
base_x = np.arange(-predicted_depth, predicted_depth + 1, 10)
new_x_values = base_x.reshape(-1, 1)
predicted_y = saved_model.predict(new_x_values)

# Show each input next to the value the model predicted for it
print(f"Predicted Depth = {predicted_depth} with offset of {offset}")
for x_value, y_value in zip(new_x_values.flatten(), predicted_y.flatten()):
    print(f"Predicted {y_value} for {x_value}")

Predicted Depth = 100 with offset of 7
Predicted -93.0 for -100
Predicted -83.0 for -90
Predicted -73.0 for -80
Predicted -63.0 for -70
Predicted -53.0 for -60
Predicted -43.0 for -50
Predicted -33.0 for -40
Predicted -23.0 for -30
Predicted -13.000000953674316 for -20
Predicted -3.0000009536743164 for -10
Predicted 6.999999046325684 for 0
Predicted 17.0 for 10
Predicted 27.0 for 20
Predicted 37.0 for 30
Predicted 47.0 for 40
Predicted 57.0 for 50
Predicted 67.0 for 60
Predicted 77.0 for 70
Predicted 87.0 for 80
Predicted 97.0 for 90
Predicted 107.0 for 100


In [14]:
# On-disk size of the saved Keras model, in bytes
model_size = os.stat(model_path).st_size

# The same size expressed in units of 1024 * 1024 bytes for readability
model_size_mb = model_size / 1024 ** 2

print(f"Model size: {model_size} bytes, or {model_size_mb:.2f} MB")

Model size: 20240 bytes, or 0.02 MB


In [15]:
# Build a TensorFlow Lite converter from the in-memory Keras `model` and switch on the default optimization set
converter = lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [lite.Optimize.DEFAULT]

# Run the conversion; the result is written to disk below as raw bytes
tflite_model = converter.convert()

# Persist the converted model at `quantized_model_path`
with open(quantized_model_path, mode='wb') as tflite_file:
    tflite_file.write(tflite_model)

INFO:tensorflow:Assets written to: /tmp/tmpx367194k/assets


INFO:tensorflow:Assets written to: /tmp/tmpx367194k/assets
2024-02-27 21:13:41.016456: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:378] Ignored output_format.
2024-02-27 21:13:41.016467: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:381] Ignored drop_control_dependency.
2024-02-27 21:13:41.016649: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpx367194k
2024-02-27 21:13:41.016918: I tensorflow/cc/saved_model/reader.cc:51] Reading meta graph with tags { serve }
2024-02-27 21:13:41.016922: I tensorflow/cc/saved_model/reader.cc:146] Reading SavedModel debug info (if present) from: /tmp/tmpx367194k
2024-02-27 21:13:41.017822: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
2024-02-27 21:13:41.018033: I tensorflow/cc/saved_model/loader.cc:233] Restoring SavedModel bundle.
2024-02-27 21:13:41.030240: I tensorflow/cc/saved_model/loader.cc:217] Running initializatio

In [16]:
# On-disk size of the converted TensorFlow Lite model, in bytes
model_size = os.stat(quantized_model_path).st_size

# The same size expressed in units of 1024 * 1024 bytes for readability
model_size_mb = model_size / 1024 ** 2

print(f"Quantized Model size: {model_size} bytes, or {model_size_mb:.2f} MB")

Quantized Model size: 1084 bytes, or 0.00 MB


In [17]:
# Initialize the TensorFlow Lite interpreter from the converted model file and allocate its tensors
interpreter = lite.Interpreter(model_path=quantized_model_path)
interpreter.allocate_tensors()

# Get the input and output details for the interpreter; this model is addressed through its first input and
# first output tensor.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
input_index = input_details[0]['index']
output_index = output_details[0]['index']

# `predicted_depth` and `offset` are deliberately NOT re-declared here: they come from the earlier cells, so the
# values printed below always match the ones used for training and for the Keras predictions.

# Generate the evaluation inputs as a single column, one sample per row
base_x = np.arange(-predicted_depth, predicted_depth + 1, 10).reshape(-1, 1)

# Ensure the input data type matches the model's expected input type
base_x = base_x.astype(input_details[0]['dtype'])

# Show the new dataset and the associated predictions
print(f"Quantized Predicted Depth = {predicted_depth} with offset of {offset}")

for i in range(base_x.shape[0]):
    # Feed one sample at a time; the slice i:i+1 keeps the two-dimensional (1, 1) shape
    interpreter.set_tensor(input_index, base_x[i:i+1])

    # Run the interpreter
    interpreter.invoke()

    # Retrieve the prediction from the output tensor
    output_data = interpreter.get_tensor(output_index)

    # Print the prediction along with the corresponding input
    print(f"Quantized Predicted {output_data[0][0]} for {base_x[i][0]}")

Quantized Predicted Depth = 100 with offset of 7
Quantized Predicted -93.0 for -100.0
Quantized Predicted -83.0 for -90.0
Quantized Predicted -73.0 for -80.0
Quantized Predicted -63.0 for -70.0
Quantized Predicted -53.0 for -60.0
Quantized Predicted -43.0 for -50.0
Quantized Predicted -33.0 for -40.0
Quantized Predicted -23.0 for -30.0
Quantized Predicted -13.000000953674316 for -20.0
Quantized Predicted -3.0000009536743164 for -10.0
Quantized Predicted 6.999999046325684 for 0.0
Quantized Predicted 17.0 for 10.0
Quantized Predicted 27.0 for 20.0
Quantized Predicted 37.0 for 30.0
Quantized Predicted 47.0 for 40.0
Quantized Predicted 57.0 for 50.0
Quantized Predicted 67.0 for 60.0
Quantized Predicted 77.0 for 70.0
Quantized Predicted 87.0 for 80.0
Quantized Predicted 97.0 for 90.0
Quantized Predicted 107.0 for 100.0


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
