In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import cifar100
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, GlobalAveragePooling2D, Concatenate
from tensorflow.keras.models import Model
import numpy as np

# Load CIFAR-100 dataset
(x_train, y_train), (x_test, y_test) = cifar100.load_data()

# Scale pixel values to [0, 1].
# Cast to float32 before dividing: a plain `/ 255.0` promotes the image arrays
# to float64, doubling their memory footprint, while Keras computes in float32
# by default anyway.
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

# Print dataset shape
print("Train images shape:", x_train.shape)
print("Train labels shape:", y_train.shape)
print("Test images shape:", x_test.shape)
print("Test labels shape:", y_test.shape)

# Convert CIFAR-100 labels to sentiment labels
def map_to_sentiment(y):
    """Bucket numeric class labels into three sentiment classes.

    Labels below 34 map to 0 (Negative), labels from 34 up to 66 map to
    1 (Neutral), and labels of 67 or more map to 2 (Positive).

    Args:
        y: Array-like of numeric class labels.

    Returns:
        Integer NumPy array, same shape as ``y``, holding the sentiment
        class (0, 1 or 2) of each label. An empty input yields an empty
        integer array.
    """
    # For increasing bin edges, np.digitize returns the number of edges that
    # are <= each value, which is exactly the 0/1/2 bucket index here. This
    # replaces the per-element Python loop with a single vectorised call.
    return np.digitize(np.asarray(y), bins=[34, 67])

# Flatten each CIFAR label array to 1-D and bucket it into sentiment classes
y_train_sentiment, y_test_sentiment = (
    map_to_sentiment(labels.flatten()) for labels in (y_train, y_test)
)

# Report how many training samples fall into each sentiment class
unique, counts = np.unique(y_train_sentiment, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Sentiment Distribution in Training Set:", class_distribution)

# IMDB text settings: vocabulary cap passed to the loader and the fixed
# sequence length every review is padded/truncated to
vocab_size, max_length = 10000, 200

# Load IMDB dataset
imdb_train_split, imdb_test_split = imdb.load_data(num_words=vocab_size)
x_train_text, y_train_text = imdb_train_split
x_test_text, y_test_text = imdb_test_split

# Bring every review to exactly `max_length` tokens, appending padding at the end
x_train_text = pad_sequences(
    x_train_text,
    maxlen=max_length,
    padding='post',
)
x_test_text = pad_sequences(
    x_test_text,
    maxlen=max_length,
    padding='post',
)

print("Text dataset shape:", x_train_text.shape)

# The image and text arrays are fed to the model as parallel inputs, so each
# split is truncated to the length of its shorter modality.

# Training split: keep only the first `min_train_size` rows of every array
min_train_size = min(len(x_train), len(x_train_text))
x_train, y_train_sentiment, x_train_text = (
    split[:min_train_size]
    for split in (x_train, y_train_sentiment, x_train_text)
)

# Test split: same truncation with `min_test_size`
min_test_size = min(len(x_test), len(x_test_text))
x_test, y_test_sentiment, x_test_text = (
    split[:min_test_size]
    for split in (x_test, y_test_sentiment, x_test_text)
)

# Image Model (MobileNetV2)
image_input = Input(shape=(32, 32, 3))
# weights=None: no pre-trained weights are loaded, the backbone trains from scratch
base_model = MobileNetV2(include_top=False, input_tensor=image_input, weights=None)
# Collapse the spatial feature map into one feature vector per image
image_features = GlobalAveragePooling2D()(base_model.output)

# Text Model (LSTM)
text_input = Input(shape=(max_length,))
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# the sequence length is already fixed by `text_input`.
# mask_zero=True marks token id 0 as padding so the LSTM skips the zeros that
# pad_sequences(padding='post') appends, instead of reading them as tokens.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)(text_input)
lstm_layer = LSTM(64)(embedding_layer)

# Concatenate Text + Image Features
merged = Concatenate()([image_features, lstm_layer])
output = Dense(3, activation='softmax')(merged)  # 3 sentiment classes: Negative, Neutral, Positive

# Define Model
fusion_model = Model(inputs=[image_input, text_input], outputs=output)

# Compile Model
# Sparse loss: the targets are integer class ids (0/1/2), not one-hot vectors
fusion_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print Model Summary
fusion_model.summary()

# Train the model
# Stop once validation loss has not improved for a few epochs and roll back to
# the best weights; in the recorded run val_loss rose every epoch after the
# first while training loss kept falling.
# NOTE(review): the test split doubles as validation data here, so it also
# drives early stopping -- hold out a separate validation split if the test
# metrics need to stay unbiased.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Keep the History object so per-epoch metrics can be inspected or plotted later
history = fusion_model.fit(
    [x_train, x_train_text], y_train_sentiment,
    validation_data=([x_test, x_test_text], y_test_sentiment),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping],
)

Train images shape: (50000, 32, 32, 3)
Train labels shape: (50000, 1)
Test images shape: (10000, 32, 32, 3)
Test labels shape: (10000, 1)
Sentiment Distribution in Training Set: {0: 17000, 1: 16500, 2: 16500}
Text dataset shape: (25000, 200)




Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 392ms/step - accuracy: 0.3450 - loss: 1.2700 - val_accuracy: 0.3400 - val_loss: 1.0991
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 403ms/step - accuracy: 0.3992 - loss: 1.0981 - val_accuracy: 0.3251 - val_loss: 1.1104
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 358ms/step - accuracy: 0.4751 - loss: 1.0221 - val_accuracy: 0.3289 - val_loss: 1.1602
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 405ms/step - accuracy: 0.5491 - loss: 0.8990 - val_accuracy: 0.3283 - val_loss: 1.2762
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 345ms/step - accuracy: 0.5963 - loss: 0.7946 - val_accuracy: 0.3259 - val_loss: 1.4524
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 338ms/step - accuracy: 0.6437 - loss: 0.7041 - val_accuracy: 0.3261 - val_loss: 1.6538
Epoc

<keras.src.callbacks.history.History at 0x783b67fb8790>