https://machinelearningmastery.com/practical-guide-to-gan-failure-modes/

In [1]:
# example of training a stable gan for generating a handwritten digit
from os import makedirs
from numpy import expand_dims
from numpy import zeros
from numpy import ones
from numpy.random import randn
from numpy.random import randint
from keras.datasets.mnist import load_data
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Reshape
from keras.layers import Flatten
from keras.layers import Conv2D
from keras.layers import Conv2DTranspose
from keras.layers import LeakyReLU
from keras.initializers import RandomNormal
from matplotlib import pyplot
import os

2024-04-22 10:42:19.207497: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 10:42:19.207522: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 10:42:19.208443: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-22 10:42:19.213821: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
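# Optional: uncomment to restrict TensorFlow to the GPU with index 1.
# NOTE(review): the device log further down reports GPU:0 with only 182 MB available,
# followed by a cuDNN initialisation failure; presumably this must run before
# TensorFlow first touches the GPUs to have any effect — verify.
# os.environ["CUDA_VISIBLE_DEVICES"]="1"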

In [3]:
# The argument is the literal string "__file__", not the __file__ variable, so it
# is resolved against the current working directory; the directory part of that
# absolute path is what gets stored (and printed) as the base for all outputs.
FILE_PATH = os.path.split(os.path.abspath("__file__"))[0]
print(FILE_PATH)


/home/thien/Code/Torch_TensorFlow_tutor/Torch_more_Advanced/GANs_tutorial/Web2_TF


In [4]:
# define the standalone discriminator model
def define_discriminator(in_shape=(28,28,1)):
	"""Build and compile the standalone discriminator (real-vs-fake classifier).

	Two stride-2 convolutions, each followed by LeakyReLU, feed a single
	sigmoid unit. The model is compiled with binary cross-entropy, Adam and
	an accuracy metric.

	Args:
		in_shape: shape of one input image as (height, width, channels).

	Returns:
		The compiled Sequential model.
	"""
	def new_init():
		# One initializer object per layer: recent Keras releases warn that an
		# unseeded initializer instance reused for several layers produces the
		# same random values on every call.
		return RandomNormal(stddev=0.02)
	# define model
	model = Sequential()
	# first stride-2 convolution (14x14 for the default 28x28 input)
	model.add(Conv2D(64, (4,4), strides=(2,2), padding='same', kernel_initializer=new_init(), input_shape=in_shape))
	model.add(LeakyReLU(alpha=0.2))
	# second stride-2 convolution (7x7 for the default input)
	model.add(Conv2D(64, (4,4), strides=(2,2), padding='same', kernel_initializer=new_init()))
	model.add(LeakyReLU(alpha=0.2))
	# classifier head: one probability per image
	model.add(Flatten())
	model.add(Dense(1, activation='sigmoid'))
	# `learning_rate` is the supported keyword. The old `lr` alias is deprecated:
	# recent Keras optimizers warn about it without applying the value (and
	# Keras 3 rejects it), which would leave Adam at its default step size.
	opt = Adam(learning_rate=0.0002, beta_1=0.5)
	model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
	return model



In [5]:
# define the standalone generator model
def define_generator(latent_dim):
	"""Build the standalone generator mapping latent vectors to 28x28x1 images.

	A dense layer produces a 7x7x128 feature map that is upsampled twice with
	stride-2 transposed convolutions; a final tanh convolution emits one
	channel. The model is returned uncompiled.

	Args:
		latent_dim: length of each latent input vector.

	Returns:
		The uncompiled Sequential model.
	"""
	def new_init():
		# One initializer object per layer. The two transposed convolutions
		# below have identically shaped kernels, and recent Keras releases
		# warn that a reused unseeded initializer instance returns the same
		# values on every call, which would start both layers from identical
		# weights.
		return RandomNormal(stddev=0.02)
	# define model
	model = Sequential()
	# foundation for 7x7 image
	n_nodes = 128 * 7 * 7
	model.add(Dense(n_nodes, kernel_initializer=new_init(), input_dim=latent_dim))
	model.add(LeakyReLU(alpha=0.2))
	model.add(Reshape((7, 7, 128)))
	# upsample to 14x14
	model.add(Conv2DTranspose(128, (4,4), strides=(2,2), padding='same', kernel_initializer=new_init()))
	model.add(LeakyReLU(alpha=0.2))
	# upsample to 28x28
	model.add(Conv2DTranspose(128, (4,4), strides=(2,2), padding='same', kernel_initializer=new_init()))
	model.add(LeakyReLU(alpha=0.2))
	# output 28x28x1; tanh keeps pixel values in [-1,1]
	model.add(Conv2D(1, (7,7), activation='tanh', padding='same', kernel_initializer=new_init()))
	return model



In [6]:
# define the combined generator and discriminator model, for updating the generator
def define_gan(generator, discriminator):
	"""Stack generator and discriminator into the model used to update the generator.

	Side effect: sets ``discriminator.trainable = False`` on the object passed
	in, before the stacked model is compiled.

	Args:
		generator: model producing images from latent vectors.
		discriminator: model scoring images as real or fake.

	Returns:
		The compiled Sequential model generator -> discriminator, using
		binary cross-entropy and Adam.
	"""
	# make weights in the discriminator not trainable
	discriminator.trainable = False
	# connect them: generator output feeds the discriminator
	model = Sequential()
	model.add(generator)
	model.add(discriminator)
	# `learning_rate` is the supported keyword. The old `lr` alias is deprecated:
	# recent Keras optimizers warn about it without applying the value (and
	# Keras 3 rejects it), which would leave Adam at its default step size.
	opt = Adam(learning_rate=0.0002, beta_1=0.5)
	model.compile(loss='binary_crossentropy', optimizer=opt)
	return model



In [7]:
# load mnist images
def load_real_samples():
	"""Load the MNIST training images labelled 8, scaled for a tanh generator.

	Returns:
		float32 array of the selected images with a trailing channel axis
		added and pixel values mapped from [0,255] to [-1,1].
	"""
	# only the training split is kept; the test split is discarded
	(train_images, train_labels), (_, _) = load_data()
	# boolean mask picking out the single class of interest
	is_eight = train_labels == 8
	# append the channel axis, keep the masked rows, then switch to float32
	images = expand_dims(train_images, axis=-1)[is_eight].astype('float32')
	# map [0,255] onto [-1,1]
	return (images - 127.5) / 127.5



In [8]:
# select real samples
def generate_real_samples(dataset, n_samples):
	"""Draw a random batch of real images and label every one as class 1.

	Args:
		dataset: array of real images, indexed along its first axis.
		n_samples: number of images to draw (indices are drawn independently,
			so the same image may appear more than once).

	Returns:
		(X, y): the selected images and an (n_samples, 1) array of ones.
	"""
	# target labels: every real image is marked 1
	labels = ones((n_samples, 1))
	# random row indices in [0, number of images)
	picks = randint(0, dataset.shape[0], n_samples)
	return dataset[picks], labels



In [9]:
# generate points in latent space as input for the generator
def generate_latent_points(latent_dim, n_samples):
	"""Sample standard-normal latent vectors to feed the generator.

	Args:
		latent_dim: length of each latent vector.
		n_samples: number of vectors to draw.

	Returns:
		Array of shape (n_samples, latent_dim).
	"""
	total_values = latent_dim * n_samples
	# draw everything as one flat vector, then fold it into one row per sample
	return randn(total_values).reshape((n_samples, latent_dim))



In [10]:
# use the generator to generate n fake examples, with class labels
def generate_fake_samples(generator, latent_dim, n_samples):
	"""Generate a batch of fake images and label every one as class 0.

	Args:
		generator: model exposing ``predict``; called on the latent batch.
		latent_dim: length of each latent vector.
		n_samples: number of images to generate.

	Returns:
		(X, y): the generator output and an (n_samples, 1) array of zeros.
	"""
	# generate points in latent space
	x_input = generate_latent_points(latent_dim, n_samples)
	# verbose=0: this runs once per training step, and current Keras versions
	# otherwise print a progress bar for every predict call, burying the
	# per-step loss line printed by the training loop
	X = generator.predict(x_input, verbose=0)
	# create class labels: every generated image is marked 0
	y = zeros((n_samples, 1))
	return X, y



In [11]:
# generate samples and save as a plot and save the model
def summarize_performance(step, g_model, latent_dim, n_samples=100):
	"""Save a grid of generated images and a snapshot of the generator.

	Writes ``generated_plot_NNN.png`` and ``model_NNN.h5`` (NNN = step+1,
	zero-padded to three digits) under ``FILE_PATH/results_baseline``.

	Args:
		step: zero-based training step; step+1 is used in the file names.
		g_model: generator to sample from and to save.
		latent_dim: length of each latent vector.
		n_samples: number of images to generate; at most 100 are plotted.
	"""
	# prepare fake examples
	X, _ = generate_fake_samples(g_model, latent_dim, n_samples)
	# scale from [-1,1] to [0,1]
	X = (X + 1) / 2.0
	# the grid has 10x10 cells; never index past the images actually generated
	# (a fixed range(100) raised IndexError whenever n_samples < 100)
	n_plots = min(10 * 10, X.shape[0])
	for i in range(n_plots):
		# define subplot
		pyplot.subplot(10, 10, 1 + i)
		# turn off axis
		pyplot.axis('off')
		# plot raw pixel data of the single channel
		pyplot.imshow(X[i, :, :, 0], cmap='gray_r')
	# save plot to file (the % formatting applies to the literal only, before concatenation)
	pyplot.savefig(FILE_PATH+'/results_baseline/generated_plot_%03d.png' % (step+1))
	pyplot.close()
	# save the generator model
	g_model.save(FILE_PATH+'/results_baseline/model_%03d.h5' % (step+1))



In [12]:
# create a line plot of loss for the gan and save to file
def plot_history(d1_hist, d2_hist, g_hist, a1_hist, a2_hist):
	"""Save a two-panel line plot of the recorded training statistics.

	The top panel shows the three loss series, the bottom panel the two
	discriminator accuracy series. The figure is written to
	``FILE_PATH/results_baseline/plot_line_plot_loss.png`` and then closed.

	Args:
		d1_hist: discriminator loss on real batches.
		d2_hist: discriminator loss on fake batches.
		g_hist: generator loss.
		a1_hist: discriminator accuracy on real batches.
		a2_hist: discriminator accuracy on fake batches.
	"""
	loss_curves = ((d1_hist, 'd-real'), (d2_hist, 'd-fake'), (g_hist, 'gen'))
	accuracy_curves = ((a1_hist, 'acc-real'), (a2_hist, 'acc-fake'))
	# panel 1 holds the losses, panel 2 the accuracies
	for panel, curves in enumerate((loss_curves, accuracy_curves), start=1):
		pyplot.subplot(2, 1, panel)
		for values, name in curves:
			pyplot.plot(values, label=name)
		pyplot.legend()
	# write the figure out and release it
	pyplot.savefig(FILE_PATH+'/results_baseline/plot_line_plot_loss.png')
	pyplot.close()



In [13]:
# train the generator and discriminator
def train(g_model, d_model, gan_model, dataset, latent_dim, n_epochs=10, n_batch=128):
	"""Run the alternating GAN training loop and plot the collected statistics.

	Each step updates the discriminator on half a batch of real images, then
	on half a batch of generated images, and finally updates the generator
	through ``gan_model`` on a full batch of latent points labelled 1.
	After every ``bat_per_epo`` steps a sample grid and the generator are
	saved via ``summarize_performance``; ``plot_history`` runs once at the end.

	Args:
		g_model: standalone generator.
		d_model: compiled discriminator whose ``train_on_batch`` returns (loss, accuracy).
		gan_model: compiled stacked model whose ``train_on_batch`` returns the loss.
		dataset: array of real images, indexed along its first axis.
		latent_dim: length of each latent vector.
		n_epochs: number of passes used to compute the total step count.
		n_batch: full batch size; the discriminator sees half of it per update.
	"""
	# whole batches that fit into the dataset, and the resulting step budget
	batches_per_epoch = int(dataset.shape[0] / n_batch)
	total_steps = batches_per_epoch * n_epochs
	half = int(n_batch / 2)
	# per-step statistics
	real_losses, fake_losses, gen_losses, real_accs, fake_accs = [], [], [], [], []
	# steps are counted from 1 here
	for step in range(1, total_steps + 1):
		# discriminator update on real images
		real_images, real_labels = generate_real_samples(dataset, half)
		real_loss, real_acc = d_model.train_on_batch(real_images, real_labels)
		# discriminator update on generated images
		fake_images, fake_labels = generate_fake_samples(g_model, latent_dim, half)
		fake_loss, fake_acc = d_model.train_on_batch(fake_images, fake_labels)
		# generator update: latent points labelled as real (inverted labels)
		latent_batch = generate_latent_points(latent_dim, n_batch)
		inverted_labels = ones((n_batch, 1))
		gen_loss = gan_model.train_on_batch(latent_batch, inverted_labels)
		# one log line per step
		print('>%d, d1=%.3f, d2=%.3f g=%.3f, a1=%d, a2=%d' %
			(step, real_loss, fake_loss, gen_loss, int(100*real_acc), int(100*fake_acc)))
		# record history
		real_losses.append(real_loss)
		fake_losses.append(fake_loss)
		gen_losses.append(gen_loss)
		real_accs.append(real_acc)
		fake_accs.append(fake_acc)
		# end of an 'epoch': summarize_performance expects the zero-based step
		if step % batches_per_epoch == 0:
			summarize_performance(step - 1, g_model, latent_dim)
	plot_history(real_losses, fake_losses, gen_losses, real_accs, fake_accs)



In [14]:
# output directory for sample grids, saved generators and the loss plot
makedirs(FILE_PATH+'/results_baseline', exist_ok=True)
# length of each latent vector fed to the generator
latent_dim = 50
# real training images (loading does not depend on the models built below)
dataset = load_real_samples()
# build the discriminator first, then the generator
discriminator = define_discriminator()
generator = define_generator(latent_dim)
# stacked model used to update the generator through the discriminator
gan_model = define_gan(generator, discriminator)
# report how many real images are available and their shape
print(dataset.shape)


2024-04-22 10:42:25.547487: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 182 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:1a:00.0, compute capability: 7.5
2024-04-22 10:42:25.548079: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 8768 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:67:00.0, compute capability: 7.5
2024-04-22 10:42:25.548592: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 8704 MB memory:  -> device: 2, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:68:00.0, compute capability: 7.5
2024-04-22 10:42:25.562169: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1101] failed to allocate 182.69MiB (191561728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory


(5851, 28, 28, 1)


In [15]:
# run the training loop with the default epoch count and batch size
train(
	g_model=generator,
	d_model=discriminator,
	gan_model=gan_model,
	dataset=dataset,
	latent_dim=latent_dim,
)

2024-04-22 10:42:28.947172: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:459] Could not create cudnn handle: CUDNN_STATUS_NOT_INITIALIZED
2024-04-22 10:42:28.947228: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:463] Memory usage: 2818048 bytes free, 11546394624 bytes total.
2024-04-22 10:42:28.947261: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Possibly insufficient driver version: 525.85.12
2024-04-22 10:42:28.947281: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at conv_ops_impl.h:1199 : UNIMPLEMENTED: DNN library is not found.


UnimplementedError: Graph execution error:

Detected at node sequential/conv2d/Conv2D defined at (most recent call last):
  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_252643/1429010400.py", line 2, in <module>

  File "/tmp/ipykernel_252643/927828072.py", line 16, in train

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/training.py", line 2787, in train_on_batch

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/sequential.py", line 398, in call

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/functional.py", line 515, in call

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/layers/convolutional/base_conv.py", line 290, in call

  File "/home/thien/miniconda3/envs/TF310_GPU/lib/python3.10/site-packages/keras/src/layers/convolutional/base_conv.py", line 262, in convolution_op

DNN library is not found.
	 [[{{node sequential/conv2d/Conv2D}}]] [Op:__inference_train_function_1154]