In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Load the MNIST digit dataset (downloaded and cached by Keras on first use).
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize pixel values to range [0, 1].
# Cast to float32 first: dividing the raw arrays by a Python float would
# otherwise produce float64 copies that take twice the memory.
train_images = train_images.astype("float32") / 255.0
test_images = test_images.astype("float32") / 255.0

# Reshape the images to add a channel dimension (required for Conv2D).
# -1 lets NumPy infer the number of samples from the array itself.
train_images = train_images.reshape((-1, 28, 28, 1))
test_images = test_images.reshape((-1, 28, 28, 1))

# Create a simple CNN model for image classification:
# three convolution stages (the first two followed by 2x2 max pooling),
# then a dense head ending in a 10-way softmax.
model = models.Sequential([
    # Declare the input shape with an explicit Input object. Passing
    # `input_shape=` to the first layer is deprecated in Keras 3 and
    # triggers a UserWarning when the layer is constructed.
    layers.Input(shape=(28, 28, 1)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax')
])

# Configure training: Adam optimizer, integer-label cross-entropy loss,
# and accuracy reported as the tracked metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training images for 5 epochs in mini-batches of 64.
model.fit(x=train_images, y=train_labels, batch_size=64, epochs=5)

# Score the trained model on the held-out test images.
test_metrics = model.evaluate(x=test_images, y=test_labels)
test_loss, test_accuracy = test_metrics
print("Test Accuracy:", test_accuracy)


  super().__init__(


Epoch 1/5
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.8745 - loss: 0.4264
Epoch 2/5
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9841 - loss: 0.0524
Epoch 3/5
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9880 - loss: 0.0369
Epoch 4/5
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9916 - loss: 0.0277
Epoch 5/5
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.9929 - loss: 0.0232
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9874 - loss: 0.0388
Test Accuracy: 0.9904999732971191


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, Model

# Define a model for processing image data
def create_image_model(input_shape):
    """Build the image branch: a frozen MobileNetV2 followed by average pooling.

    Args:
        input_shape: Shape of one input image, forwarded to MobileNetV2.

    Returns:
        A `tf.keras.Sequential` model that runs the MobileNetV2 backbone
        (ImageNet weights, no classification top) and then averages its
        feature maps over the spatial dimensions.
    """
    backbone = tf.keras.applications.MobileNetV2(
        input_shape=input_shape,
        include_top=False,
        weights='imagenet',
    )
    # Freeze the pretrained backbone so its weights are not updated in training.
    backbone.trainable = False

    return tf.keras.Sequential([backbone, layers.GlobalAveragePooling2D()])

# Define a model for processing text data
def create_text_model(vocab_size, embedding_dim, max_length):
    """Build the text branch: a token embedding followed by average pooling.

    Args:
        vocab_size: Size of the embedding table (first Embedding argument).
        embedding_dim: Width of each embedding vector.
        max_length: Number of token ids in each input sequence.

    Returns:
        A `tf.keras.Sequential` model that embeds each token id and then
        averages the embeddings over the sequence dimension.
    """
    text_model = tf.keras.Sequential([
        # Declare the sequence length with an explicit Input object. The
        # Embedding `input_length` argument is deprecated in Keras 3, where
        # it is ignored and only emits a warning.
        layers.Input(shape=(max_length,)),
        layers.Embedding(vocab_size, embedding_dim),
        layers.GlobalAveragePooling1D(),
    ])

    return text_model

# Combine image and text models
def create_multimodal_model(image_input_shape, vocab_size, embedding_dim, max_length):
    """Assemble a two-input model that fuses image and text features.

    Args:
        image_input_shape: Shape of one input image, passed to the image branch.
        vocab_size: Vocabulary size, passed to the text branch.
        embedding_dim: Embedding width, passed to the text branch.
        max_length: Length of each text input sequence.

    Returns:
        A Keras `Model` taking `[image_input, text_input]` and producing the
        output of a single sigmoid unit applied to the concatenated features.
    """
    # Build the two branches first (same creation order as the inputs below,
    # so auto-generated layer names stay stable).
    image_branch = create_image_model(image_input_shape)
    text_branch = create_text_model(vocab_size, embedding_dim, max_length)

    # Symbolic inputs for each modality.
    image_input = layers.Input(shape=image_input_shape)
    text_input = layers.Input(shape=(max_length,))

    # Run each input through its branch, then join the feature vectors.
    image_features = image_branch(image_input)
    text_features = text_branch(text_input)
    fused = layers.Concatenate()([image_features, text_features])

    # Single sigmoid unit on top of the fused features.
    prediction = layers.Dense(1, activation='sigmoid')(fused)

    return Model(inputs=[image_input, text_input], outputs=prediction)

# Example usage: illustrative sizes for each modality.
image_input_shape = (224, 224, 3)  # height, width, channels of one image
vocab_size = 10000                 # number of distinct token ids
embedding_dim = 16                 # width of each token embedding
max_length = 100                   # token ids per text sequence

# Build the combined model and print its layer-by-layer summary.
model = create_multimodal_model(
    image_input_shape=image_input_shape,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_length=max_length,
)
model.summary()




In [3]:
import numpy as np

# Generate random image and text data for demonstration purposes.
# The labels are random too, so a model trained on this data is only a
# smoke test of the pipeline, not a meaningful benchmark.
num_samples = 1000
image_input_shape = (224, 224, 3)  # Example input shape for images
max_length = 100  # Example maximum length of text input
vocab_size = 10000  # Example vocabulary size for text
embedding_dim = 16  # Example embedding dimension for text

# Seeded generator so a fresh "Restart & Run All" reproduces the same arrays.
rng = np.random.default_rng(42)

# Random image data in [0, 1). float32 halves the memory of the float64
# default (about 0.6 GB instead of 1.2 GB at these sizes).
image_data = rng.random((num_samples, *image_input_shape), dtype=np.float32)

# Random token ids in [0, vocab_size).
text_data = rng.integers(0, vocab_size, size=(num_samples, max_length))

# Random labels (binary classification task).
labels = rng.integers(0, 2, size=(num_samples, 1))

# Split data into training and validation sets (80% / 20%), with the split
# point derived from num_samples instead of a hard-coded index.
num_train = num_samples * 4 // 5
train_image_data, val_image_data = image_data[:num_train], image_data[num_train:]
train_text_data, val_text_data = text_data[:num_train], text_data[num_train:]
train_labels, val_labels = labels[:num_train], labels[num_train:]

# Build a fresh multimodal model and configure it for training with the
# Adam optimizer, binary cross-entropy loss, and accuracy as the metric.
model = create_multimodal_model(image_input_shape, vocab_size, embedding_dim, max_length)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Group the two modalities in the order the model's inputs were declared:
# images first, then text.
train_inputs = [train_image_data, train_text_data]
val_inputs = [val_image_data, val_text_data]

# Train for 10 epochs in mini-batches of 32, validating after each epoch.
history = model.fit(
    train_inputs,
    train_labels,
    batch_size=32,
    epochs=10,
    validation_data=(val_inputs, val_labels),
)

# Report the final metrics on the validation split.
val_metrics = model.evaluate(val_inputs, val_labels)
loss, accuracy = val_metrics
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)


Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 643ms/step - accuracy: 0.4945 - loss: 0.6976 - val_accuracy: 0.4850 - val_loss: 0.7161
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 643ms/step - accuracy: 0.5131 - loss: 0.6951 - val_accuracy: 0.5200 - val_loss: 0.7097
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 538ms/step - accuracy: 0.5182 - loss: 0.6931 - val_accuracy: 0.4800 - val_loss: 0.7201
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 564ms/step - accuracy: 0.5379 - loss: 0.6895 - val_accuracy: 0.5000 - val_loss: 0.6971
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 557ms/step - accuracy: 0.5583 - loss: 0.6868 - val_accuracy: 0.5200 - val_loss: 0.6948
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 553ms/step - accuracy: 0.5810 - loss: 0.6712 - val_accuracy: 0.5200 - val_loss: 0.7120
Epoch 7/10
[1m25/25[